diff options
author | cedricbonhomme <devnull@localhost> | 2010-02-24 09:08:06 +0100 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-02-24 09:08:06 +0100 |
commit | 3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7 (patch) | |
tree | aaaaa1b07cad5e7501083102cca7da0aa3f90578 /utils.py | |
parent | Statistics on words are only processed on articles content. (diff) | |
download | newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.gz newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.bz2 newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.zip |
Added utils.py and lot of improvements.
Diffstat (limited to 'utils.py')
-rw-r--r-- | utils.py | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/utils.py b/utils.py new file mode 100644 index 00000000..7193fde3 --- /dev/null +++ b/utils.py @@ -0,0 +1,156 @@ +#! /usr/local/bin/python +#-*- coding: utf-8 -*- + +__author__ = "Cedric Bonhomme" +__version__ = "$Revision: 0.1 $" +__date__ = "$Date: 2010/02/24 $" +__copyright__ = "Copyright (c) 2010 Cedric Bonhomme" +__license__ = "GPLv3" + +import re +import pylab +import sqlite3 +import hashlib + +from datetime import datetime +from string import punctuation +from collections import defaultdict + + +def remove_html_tags(data): + """ + Remove HTML tags for the search. + """ + p = re.compile(r'<[^<]*?/?>') + return p.sub('', data) + +def top_words(dic_articles, n=10): + """ + """ + N = 10 + words = {} + articles_content = "" + for rss_feed_id in dic_articles.keys(): + for article in dic_articles[rss_feed_id]: + articles_content += remove_html_tags(article[4].encode('utf-8')) + words_gen = (word.strip(punctuation).lower() \ + for word in articles_content.split() \ + if len(word) >= 5) + words = defaultdict(int) + for word in words_gen: + words[word] += 1 + top_words = sorted(words.iteritems(), + key=lambda(word, count): (-count, word))[:N] + return top_words + +def create_histogram(words, file_name="./var/histogram.png"): + """ + Create a histogram. + """ + length = 10 + ind = pylab.arange(length) # abscissa + width = 0.35 # bars width + + w = [elem[0] for elem in words] + count = [int(elem[1]) for elem in words] + + max_count = max(count) # maximal weight + + p = pylab.bar(ind, count, width, color='r') + + pylab.ylabel("Count") + pylab.title("Most frequent words") + pylab.xticks(ind + (width / 2), range(1, len(w)+1)) + pylab.xlim(-width, len(ind)) + + # changing the ordinate scale according to the max. + if max_count <= 100: + pylab.ylim(0, max_count + 5) + pylab.yticks(pylab.arange(0, max_count + 5, 5)) + elif max_count <= 200: + pylab.ylim(0, max_count + 10) + pylab.yticks(pylab.arange(0, max_count + 10, 10)) + elif max_count <= 600: + pylab.ylim(0, max_count + 25) + pylab.yticks(pylab.arange(0, max_count + 25, 25)) + elif max_count <= 800: + pylab.ylim(0, max_count + 50) + pylab.yticks(pylab.arange(0, max_count + 50, 50)) + + pylab.savefig(file_name, dpi = 80) + pylab.close() + +def compare(stringtime1, stringtime2): + """ + Compare two dates in the format 'yyyy-mm-dd hh:mm:ss'. + """ + date1, time1 = stringtime1.split(' ') + date2, time2 = stringtime2.split(' ') + + year1, month1, day1 = date1.split('-') + year2, month2, day2 = date2.split('-') + + hour1, minute1, second1 = time1.split(':') + hour2, minute2, second2 = time2.split(':') + + datetime1 = datetime(year=int(year1), month=int(month1), day=int(day1), \ + hour=int(hour1), minute=int(minute1), second=int(second1)) + + datetime2 = datetime(year=int(year2), month=int(month2), day=int(day2), \ + hour=int(hour2), minute=int(minute2), second=int(second2)) + + if datetime1 < datetime2: + return -1 + elif datetime1 > datetime2: + return 1 + else: + return 0 + + +def load_feed(): + """ + Load feeds in a dictionary. + """ + list_of_articles = None + try: + conn = sqlite3.connect("./var/feed.db", isolation_level = None) + c = conn.cursor() + list_of_articles = c.execute("SELECT * FROM rss_feed").fetchall() + c.close() + except: + pass + + # The key of dic is the id of the feed: + # dic[feed_id] = (article_id, article_date, article_title, + # article_link, article_description, feed_title, + # feed_link, article_readed) + # dic_info[feed_id] = (nb_article, nb_article_unreaded) + dic, dic_info = {}, {} + if list_of_articles is not None: + for article in list_of_articles: + sha256_hash = hashlib.sha256() + sha256_hash.update(article[5].encode('utf-8')) + feed_id = sha256_hash.hexdigest() + sha256_hash.update(article[2].encode('utf-8')) + article_id = sha256_hash.hexdigest() + + article_list = [article_id, article[0], article[1], \ + article[2], article[3], article[4], article[5], article[6]] + + if feed_id not in dic: + dic[feed_id] = [article_list] + else: + dic[feed_id].append(article_list) + + # sort articles by date for each feeds + for feeds in dic.keys(): + dic[feeds].sort(lambda x,y: compare(y[1], x[1])) + + for rss_feed_id in dic.keys(): + dic_info[rss_feed_id] = (len(dic[rss_feed_id]), \ + len([article for article in dic[rss_feed_id] \ + if article[7]=="0"]) \ + ) + + return (dic, dic_info) + return (dic, dic_info)
\ No newline at end of file |