aboutsummaryrefslogtreecommitdiff
path: root/utils.py
diff options
context:
space:
mode:
authorcedricbonhomme <devnull@localhost>2010-02-24 09:08:06 +0100
committercedricbonhomme <devnull@localhost>2010-02-24 09:08:06 +0100
commit3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7 (patch)
treeaaaaa1b07cad5e7501083102cca7da0aa3f90578 /utils.py
parentStatistics on words are only processed on articles content. (diff)
downloadnewspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.gz
newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.bz2
newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.zip
Added utils.py and a lot of improvements.
Diffstat (limited to 'utils.py')
-rw-r--r--utils.py156
1 files changed, 156 insertions, 0 deletions
diff --git a/utils.py b/utils.py
new file mode 100644
index 00000000..7193fde3
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,156 @@
+#! /usr/local/bin/python
+#-*- coding: utf-8 -*-
+
+__author__ = "Cedric Bonhomme"
+__version__ = "$Revision: 0.1 $"
+__date__ = "$Date: 2010/02/24 $"
+__copyright__ = "Copyright (c) 2010 Cedric Bonhomme"
+__license__ = "GPLv3"
+
+import re
+import pylab
+import sqlite3
+import hashlib
+
+from datetime import datetime
+from string import punctuation
+from collections import defaultdict
+
+
def remove_html_tags(data):
    """
    Strip HTML tags from the given string, leaving plain text for the search.
    """
    return re.sub(r'<[^<]*?/?>', '', data)
+
def top_words(dic_articles, n=10):
    """
    Return the n most frequent words across all article contents.

    dic_articles maps a feed id to a list of articles; each article's
    content is expected at index 4 (see load_feed()).  HTML tags are
    stripped and words shorter than 5 characters — measured after
    surrounding punctuation is removed — are ignored.

    Returns a list of (word, count) tuples sorted by decreasing count,
    ties broken alphabetically.
    """
    # Collect each article's cleaned text separately and join with a
    # space: avoids the quadratic cost of repeated += concatenation and
    # stops the last word of one article merging with the first word of
    # the next (the original concatenated with no separator).
    contents = []
    for articles in dic_articles.values():
        for article in articles:
            contents.append(remove_html_tags(article[4]))

    words = defaultdict(int)
    for word in " ".join(contents).split():
        word = word.strip(punctuation).lower()
        # Length is checked after stripping so that e.g. '"data"' is not
        # counted as a 6-character word.
        if len(word) >= 5:
            words[word] += 1

    # Honour the n parameter — the original ignored it in favour of a
    # hard-coded N = 10.
    return sorted(words.items(), key=lambda wc: (-wc[1], wc[0]))[:n]
+
def create_histogram(words, file_name="./var/histogram.png"):
    """
    Plot a bar chart of word frequencies and save it as a PNG.

    words is a list of (word, count) pairs, e.g. the result of
    top_words().  Bars are labelled 1..len(words) on the abscissa and
    the ordinate scale is adapted to the largest count.
    """
    if not words:
        # Nothing to plot — also avoids max() on an empty sequence.
        return

    counts = [int(pair[1]) for pair in words]
    labels = range(1, len(words) + 1)

    # One bar position per word: the original hard-coded 10 positions,
    # which broke whenever len(words) != 10.
    ind = pylab.arange(len(words))  # abscissa
    width = 0.35                    # bars width
    max_count = max(counts)         # maximal weight

    pylab.bar(ind, counts, width, color='r')

    pylab.ylabel("Count")
    pylab.title("Most frequent words")
    pylab.xticks(ind + (width / 2), labels)
    pylab.xlim(-width, len(ind))

    # Change the ordinate scale (and tick step) according to the max;
    # above 800 pylab's default autoscaling is kept, as before.
    for limit, step in ((100, 5), (200, 10), (600, 25), (800, 50)):
        if max_count <= limit:
            pylab.ylim(0, max_count + step)
            pylab.yticks(pylab.arange(0, max_count + step, step))
            break

    pylab.savefig(file_name, dpi=80)
    pylab.close()
+
def compare(stringtime1, stringtime2):
    """
    Compare two timestamps in the 'yyyy-mm-dd hh:mm:ss' format.

    Returns -1 if the first is earlier, 1 if it is later and 0 when
    both represent the same instant.
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    first = datetime.strptime(stringtime1, fmt)
    second = datetime.strptime(stringtime2, fmt)

    if first < second:
        return -1
    if first > second:
        return 1
    return 0
+
+
def load_feed():
    """
    Load all articles from the SQLite database into two dictionaries.

    Returns a (dic, dic_info) tuple where:
      dic[feed_id]      -> list of [article_id, date, title, link,
                           description, feed_title, feed_link, readed]
                           lists, sorted newest first;
      dic_info[feed_id] -> (number of articles, number of unread ones).

    Both dictionaries are empty when the database cannot be read.
    """
    list_of_articles = None
    try:
        conn = sqlite3.connect("./var/feed.db", isolation_level=None)
        try:
            c = conn.cursor()
            list_of_articles = c.execute("SELECT * FROM rss_feed").fetchall()
            c.close()
        finally:
            # The original leaked the connection; always release it.
            conn.close()
    except sqlite3.Error:
        # Database missing or unreadable: fall through with no articles.
        pass

    dic, dic_info = {}, {}
    if list_of_articles is not None:
        for article in list_of_articles:
            # NOTE(review): the hash object is reused on purpose, so the
            # article id digests feed title *plus* article title.  Kept
            # as-is so existing ids stay stable — confirm this is the
            # intended scheme before changing it.
            sha256_hash = hashlib.sha256()
            sha256_hash.update(article[5].encode('utf-8'))
            feed_id = sha256_hash.hexdigest()
            sha256_hash.update(article[2].encode('utf-8'))
            article_id = sha256_hash.hexdigest()

            article_list = [article_id, article[0], article[1],
                            article[2], article[3], article[4],
                            article[5], article[6]]
            dic.setdefault(feed_id, []).append(article_list)

        # Sort each feed's articles newest first.  The dates are
        # 'yyyy-mm-dd hh:mm:ss' strings (see compare()), which sort
        # lexicographically in chronological order, so a plain reverse
        # string sort replaces the Py2-only cmp-style comparator.
        for articles in dic.values():
            articles.sort(key=lambda art: art[1], reverse=True)

        for feed_id, articles in dic.items():
            unread = len([a for a in articles if a[7] == "0"])
            dic_info[feed_id] = (len(articles), unread)

    return (dic, dic_info)
bgstack15