#! /usr/local/bin/python
#-*- coding: utf-8 -*-

"""Helpers for loading feeds and articles from the SQLite database and for
computing simple statistics (most frequent words, histogram) on them.
"""

__author__ = "Cedric Bonhomme"
__version__ = "$Revision: 0.1 $"
__date__ = "$Date: 2010/02/24 $"
__copyright__ = "Copyright (c) 2010 Cedric Bonhomme"
__license__ = "GPLv3"

import re
import sqlite3
import hashlib

from datetime import datetime
from string import punctuation
from collections import defaultdict

try:
    import pylab
except ImportError:
    # Plotting is only needed by create_histogram(); keep the other helpers
    # usable when matplotlib is not installed.
    pylab = None

# Pattern used to strip HTML tags before searching/counting words.
_HTML_TAG_RE = re.compile(r'<[^<]*?/?>')

# Format of the dates handled by compare(): 'yyyy-mm-dd hh:mm:ss'.
_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

# Words shorter than this (after stripping punctuation) are not counted.
_MIN_WORD_LENGTH = 5

# (largest maximal count, tick step) pairs used to scale the ordinate of the
# histogram. Above the last bound the scale is left to pylab.
_Y_SCALE_STEPS = ((100, 5), (200, 10), (600, 25), (800, 50))


def remove_html_tags(data):
    """
    Remove HTML tags for the search.

    Return 'data' with everything matching an HTML tag removed.
    """
    return _HTML_TAG_RE.sub('', data)


def top_words(dic_articles, n=10):
    """
    Return the 'n' most frequent words of the articles.

    'dic_articles' maps a feed id to a list of articles; the text of an
    article is read from its index 4. HTML tags are removed, words are
    stripped of surrounding punctuation and lowercased, and only words of
    at least 5 characters are counted.

    The result is a list of (word, count) tuples sorted by decreasing count,
    ties being broken alphabetically.
    """
    words = defaultdict(int)
    for articles in dic_articles.values():
        for article in articles:
            description = article[4]
            if not description:
                continue
            # Each article is tokenized on its own so that the last word of
            # an article is never glued to the first word of the next one.
            for token in remove_html_tags(description).split():
                word = token.strip(punctuation).lower()
                # The length is checked on the cleaned word: tokens made
                # only of punctuation must not be counted as empty words.
                if len(word) >= _MIN_WORD_LENGTH:
                    words[word] += 1
    ranking = sorted(words.items(), key=lambda item: (-item[1], item[0]))
    return ranking[:n]


def create_histogram(words, file_name="./var/histogram.png"):
    """
    Create a histogram.

    'words' is a list of (word, count) tuples, as returned by top_words().
    One bar is drawn per word and the picture is saved to 'file_name'.
    Nothing is drawn when 'words' is empty.

    Raise ImportError if pylab (matplotlib) is not available.
    """
    if not words:
        return
    if pylab is None:
        raise ImportError("pylab (matplotlib) is required to create a histogram")

    count = [int(elem[1]) for elem in words]
    # One bar per word: the number of bars follows the input instead of
    # being fixed to 10.
    ind = pylab.arange(len(count))  # abscissa
    width = 0.35                    # bars width
    max_count = max(count)          # maximal weight

    pylab.bar(ind, count, width, color='r')
    pylab.ylabel("Count")
    pylab.title("Most frequent words")
    pylab.xticks(ind + (width / 2), list(range(1, len(count) + 1)))
    pylab.xlim(-width, len(ind))

    # changing the ordinate scale according to the max.
    for upper_bound, step in _Y_SCALE_STEPS:
        if max_count <= upper_bound:
            pylab.ylim(0, max_count + step)
            pylab.yticks(pylab.arange(0, max_count + step, step))
            break

    pylab.savefig(file_name, dpi=80)
    pylab.close()


def _parse_timestamp(stringtime):
    """Convert a 'yyyy-mm-dd hh:mm:ss' string to a datetime object."""
    return datetime.strptime(stringtime, _TIMESTAMP_FORMAT)


def compare(stringtime1, stringtime2):
    """
    Compare two dates in the format 'yyyy-mm-dd hh:mm:ss'.

    Return -1 if the first date is earlier than the second one, 1 if it is
    later and 0 if both are equal. Raise ValueError if a date is malformed.
    """
    datetime1 = _parse_timestamp(stringtime1)
    datetime2 = _parse_timestamp(stringtime2)
    return (datetime1 > datetime2) - (datetime1 < datetime2)


def load_feed(db_path="./var/feed.db"):
    """
    Load feeds in a dictionary.

    'db_path' is the path of the SQLite database to read.

    Return a tuple (dic, dic_info) where:
      dic[feed_id] = list of [article_id, article_date, article_title,
                              article_link, article_description, feed_title,
                              feed_link, article_readed],
                     sorted from the most recent to the oldest article;
      dic_info[feed_id] = (nb_article, nb_article_unreaded, feed_image).

    Both dictionaries are empty when the database can not be opened or when
    the list of feeds can not be read.
    """
    dic, dic_info = {}, {}
    try:
        conn = sqlite3.connect(db_path, isolation_level=None)
    except sqlite3.Error:
        return (dic, dic_info)

    try:
        c = conn.cursor()
        try:
            list_of_feeds = c.execute("SELECT * FROM feeds").fetchall()
        except sqlite3.Error:
            # Best effort: without the list of feeds there is nothing to load.
            return (dic, dic_info)

        for feed in list_of_feeds:
            feed_title = feed[0]
            feed_link = feed[2]
            feed_image = feed[3]

            # Bound parameter: a quote in the link must not break the query.
            list_of_articles = c.execute(
                "SELECT * FROM articles WHERE feed_link=?",
                (feed_link,)).fetchall()

            # Ids of the feeds that received articles during this iteration.
            loaded_ids = []
            for article in list_of_articles:
                sha256_hash = hashlib.sha256()
                sha256_hash.update(article[5].encode('utf-8'))
                feed_id = sha256_hash.hexdigest()
                # The article id is the digest of article[5] followed by
                # article[2]: the same hash object keeps being fed.
                sha256_hash.update(article[2].encode('utf-8'))
                article_id = sha256_hash.hexdigest()

                article_list = [article_id, article[0], article[1],
                                article[2], article[3], feed_title,
                                feed_link, article[4]]

                if feed_id not in dic:
                    dic[feed_id] = [article_list]
                else:
                    dic[feed_id].append(article_list)
                if feed_id not in loaded_ids:
                    loaded_ids.append(feed_id)

            # A feed without article registers nothing: its statistics must
            # not be written under the id of a previously loaded feed.
            for feed_id in loaded_ids:
                articles = dic[feed_id]
                # sort articles by date, the most recent first
                articles.sort(key=lambda item: _parse_timestamp(item[1]),
                              reverse=True)
                nb_unread = sum(1 for item in articles if item[7] == "0")
                dic_info[feed_id] = (len(articles), nb_unread, feed_image)
    finally:
        # Closing the connection also releases its cursor.
        conn.close()
    return (dic, dic_info)