diff options
author | cedricbonhomme <devnull@localhost> | 2010-02-24 09:08:06 +0100 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-02-24 09:08:06 +0100 |
commit | 3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7 (patch) | |
tree | aaaaa1b07cad5e7501083102cca7da0aa3f90578 | |
parent | Statistics on words are only processed on articles content. (diff) | |
download | newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.gz newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.bz2 newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.zip |
Added utils.py and lot of improvements.
-rw-r--r-- | pyAggr3g470r.py | 189 | ||||
-rw-r--r-- | utils.py | 156 |
2 files changed, 191 insertions, 154 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py index d72a6008..87be077b 100644 --- a/pyAggr3g470r.py +++ b/pyAggr3g470r.py @@ -7,19 +7,13 @@ __date__ = "$Date: 2010/02/23 $" __copyright__ = "Copyright (c) 2010 Cedric Bonhomme" __license__ = "GPLv3" -import re import os -import pylab -import sqlite3 -import hashlib import cherrypy import ConfigParser -from datetime import datetime -from string import punctuation -from collections import defaultdict from cherrypy.lib.static import serve_file +import utils import feedgetter config = ConfigParser.RawConfigParser() @@ -41,10 +35,10 @@ htmlheader = """<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" <title>pyAggr3g470r - RSS Feed Reader</title> </head>""" htmlfooter = """<p>This software is under GPLv3 license. You are welcome to copy, modify or - redistribute the source code according to the GPLv3 license.</p></div> - </body></html>""" + redistribute the source code according to the GPLv3 license.</p></div>\n + </body>\n</html>""" -htmlnav = """<body><h1><a name="top"><a href="/">pyAggr3g470r - RSS Feed Reader</a></a></h1><a +htmlnav = """<body>\n<h1><a name="top"><a href="/">pyAggr3g470r - RSS Feed Reader</a></a></h1>\n<a href="http://bitbucket.org/cedricbonhomme/pyaggr3g470r/" rel="noreferrer" target="_blank"> pyAggr3g470r (source code)</a> """ @@ -55,7 +49,7 @@ class Root: """ Main page containing the list of feeds and articles. """ - self.dic, self.dic_info = self.load_feed() + self.dic, self.dic_info = utils.load_feed() html = htmlheader html += htmlnav html += """<div class="right inner">\n""" @@ -113,10 +107,13 @@ class Root: html += htmlfooter return html + index.exposed = True + + def management(self): """ """ - self.dic, self.dic_info = self.load_feed() + self.dic, self.dic_info = utils.load_feed() html = htmlheader html += htmlnav html += """</div> <div class="left inner">\n""" @@ -147,35 +144,24 @@ class Root: html += "<hr />\n" html += "<h1>Statistics</h1>\n" - N = 10 - words = {} - article_content = "" - for rss_feed_id in self.dic.keys(): - for article in self.dic[rss_feed_id]: - article_content += remove_html_tags(article[4].encode('utf-8')) - - words_gen = (word.strip(punctuation).lower() \ - for word in article_content.split() \ - if len(word) >= 5) - words = defaultdict(int) - for word in words_gen: - words[word] += 1 + top_words = utils.top_words(self.dic, 10) - top_words = sorted(words.iteritems(), - key=lambda(word, count): (-count, word))[:N] html += "<table border=0>\n<tr><td>" html += "<ol>\n" for word, frequency in top_words: html += """\t<li><a href="/q/?querystring=%s">%s</a>: %s</li>\n""" % \ (word, word, frequency) html += "</ol>\n</td><td>" - create_histogram(top_words) + utils.create_histogram(top_words) html += """<img src="/var/histogram.png" /></td></tr></table>""" html += "<hr />\n" html += htmlfooter return html + management.exposed = True + + def q(self, querystring=None): """ Search for a feed. Simply search for the string 'querystring' @@ -193,7 +179,7 @@ class Root: if feed_id is not None: for article in self.dic[rss_feed_id]: - article_content = remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8')) + article_content = utils.remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8')) if querystring.lower() in article_content.lower(): if article[7] == "0": # not readed articles are in bold @@ -211,7 +197,7 @@ class Root: else: for rss_feed_id in self.dic.keys(): for article in self.dic[rss_feed_id]: - article_content = remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8')) + article_content = utils.remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8')) if querystring.lower() in article_content.lower(): if article[7] == "0": # not readed articles are in bold @@ -231,6 +217,9 @@ class Root: html += htmlfooter return html + q.exposed = True + + def fetch(self): """ Fetch all feeds @@ -239,6 +228,9 @@ class Root: feed_getter.retrieve_feed() return self.index() + fetch.exposed = True + + def description(self, article_id): """ Display the description of an article in a new Web page. @@ -260,10 +252,16 @@ class Root: html += description else: html += "No description available." - html += """<hr />\n<a href="%s">Complete story</a>\n""" % (article[3].encode('utf-8'),) + html += """<hr />\n<a href="%s">Complete story</a>\n<br />\n""" % \ + (article[3].encode('utf-8'),) + html += """<a href="http://delicious.com/%s">Delicious</a>""" % \ + (article[3].encode('utf-8'),) html += "<hr />\n" + htmlfooter return html + description.exposed = True + + def all_articles(self, feed_id): """ Display all articles of a feed ('feed_title'). @@ -309,6 +307,9 @@ class Root: html += htmlfooter return html + all_articles.exposed = True + + def unread(self, feed_id): """ Display all unread articles of a feed ('feed_title'). @@ -334,53 +335,8 @@ class Root: html += htmlfooter return html - def load_feed(self): - """ - Load feeds in a dictionary. - """ - list_of_articles = None - try: - conn = sqlite3.connect("./var/feed.db", isolation_level = None) - c = conn.cursor() - list_of_articles = c.execute("SELECT * FROM rss_feed").fetchall() - c.close() - except: - pass - - # The key of dic is the id of the feed: - # dic[feed_id] = (article_id, article_date, article_title, - # article_link, article_description, feed_title, - # feed_link, article_readed) - # dic_info[feed_id] = (nb_article, nb_article_unreaded) - dic, dic_info = {}, {} - if list_of_articles is not None: - for article in list_of_articles: - sha256_hash = hashlib.sha256() - sha256_hash.update(article[5].encode('utf-8')) - feed_id = sha256_hash.hexdigest() - sha256_hash.update(article[2].encode('utf-8')) - article_id = sha256_hash.hexdigest() - - article_list = [article_id, article[0], article[1], \ - article[2], article[3], article[4], article[5], article[6]] - - if feed_id not in dic: - dic[feed_id] = [article_list] - else: - dic[feed_id].append(article_list) - - # sort articles by date for each feeds - for feeds in dic.keys(): - dic[feeds].sort(lambda x,y: compare(y[1], x[1])) - - for rss_feed_id in dic.keys(): - dic_info[rss_feed_id] = (len(dic[rss_feed_id]), \ - len([article for article in dic[rss_feed_id] \ - if article[7]=="0"]) \ - ) + unread.exposed = True - return (dic, dic_info) - return (dic, dic_info) def mark_as_read(self, target): """ @@ -405,91 +361,16 @@ class Root: except Exception, e: pass - self.dic, self.dic_info = self.load_feed() + self.dic, self.dic_info = utils.load_feed() if param == "All" or param == "Feed_FromMainPage": return self.index() elif param == "Feed": return self.all_articles(identifiant) - index.exposed = True - management.exposed = True - fetch.exposed = True - q.exposed = True - description.exposed = True - all_articles.exposed = True mark_as_read.exposed = True - unread.exposed = True -def remove_html_tags(data): - """ - Remove HTML tags for the search. - """ - p = re.compile(r'<[^<]*?/?>') - return p.sub('', data) - -def create_histogram(words, file_name="./var/histogram.png"): - """ - Create a histogram. - """ - length = 10 - ind = pylab.arange(length) # abscissa - width = 0.35 # bars width - - w = [elem[0] for elem in words] - count = [int(elem[1]) for elem in words] - - max_count = max(count) # maximal weight - - p = pylab.bar(ind, count, width, color='r') - - pylab.ylabel("Count") - pylab.title("Most frequent words") - pylab.xticks(ind + (width / 2), range(1, len(w)+1)) - pylab.xlim(-width, len(ind)) - - # changing the ordinate scale according to the max. - if max_count <= 100: - pylab.ylim(0, max_count + 5) - pylab.yticks(pylab.arange(0, max_count + 5, 5)) - elif max_count <= 200: - pylab.ylim(0, max_count + 10) - pylab.yticks(pylab.arange(0, max_count + 10, 10)) - elif max_count <= 600: - pylab.ylim(0, max_count + 25) - pylab.yticks(pylab.arange(0, max_count + 25, 25)) - elif max_count <= 800: - pylab.ylim(0, max_count + 50) - pylab.yticks(pylab.arange(0, max_count + 50, 50)) - - pylab.savefig(file_name, dpi = 80) - pylab.close() - -def compare(stringtime1, stringtime2): - """ - Compare two dates in the format 'yyyy-mm-dd hh:mm:ss'. - """ - date1, time1 = stringtime1.split(' ') - date2, time2 = stringtime2.split(' ') - - year1, month1, day1 = date1.split('-') - year2, month2, day2 = date2.split('-') - - hour1, minute1, second1 = time1.split(':') - hour2, minute2, second2 = time2.split(':') - - datetime1 = datetime(year=int(year1), month=int(month1), day=int(day1), \ - hour=int(hour1), minute=int(minute1), second=int(second1)) - - datetime2 = datetime(year=int(year2), month=int(month2), day=int(day2), \ - hour=int(hour2), minute=int(minute2), second=int(second2)) - - if datetime1 < datetime2: - return -1 - elif datetime1 > datetime2: - return 1 - else: - return 0 + if __name__ == '__main__': diff --git a/utils.py b/utils.py new file mode 100644 index 00000000..7193fde3 --- /dev/null +++ b/utils.py @@ -0,0 +1,156 @@ +#! /usr/local/bin/python +#-*- coding: utf-8 -*- + +__author__ = "Cedric Bonhomme" +__version__ = "$Revision: 0.1 $" +__date__ = "$Date: 2010/02/24 $" +__copyright__ = "Copyright (c) 2010 Cedric Bonhomme" +__license__ = "GPLv3" + +import re +import pylab +import sqlite3 +import hashlib + +from datetime import datetime +from string import punctuation +from collections import defaultdict + + +def remove_html_tags(data): + """ + Remove HTML tags for the search. + """ + p = re.compile(r'<[^<]*?/?>') + return p.sub('', data) + +def top_words(dic_articles, n=10): + """ + """ + N = 10 + words = {} + articles_content = "" + for rss_feed_id in dic_articles.keys(): + for article in dic_articles[rss_feed_id]: + articles_content += remove_html_tags(article[4].encode('utf-8')) + words_gen = (word.strip(punctuation).lower() \ + for word in articles_content.split() \ + if len(word) >= 5) + words = defaultdict(int) + for word in words_gen: + words[word] += 1 + top_words = sorted(words.iteritems(), + key=lambda(word, count): (-count, word))[:N] + return top_words + +def create_histogram(words, file_name="./var/histogram.png"): + """ + Create a histogram. + """ + length = 10 + ind = pylab.arange(length) # abscissa + width = 0.35 # bars width + + w = [elem[0] for elem in words] + count = [int(elem[1]) for elem in words] + + max_count = max(count) # maximal weight + + p = pylab.bar(ind, count, width, color='r') + + pylab.ylabel("Count") + pylab.title("Most frequent words") + pylab.xticks(ind + (width / 2), range(1, len(w)+1)) + pylab.xlim(-width, len(ind)) + + # changing the ordinate scale according to the max. + if max_count <= 100: + pylab.ylim(0, max_count + 5) + pylab.yticks(pylab.arange(0, max_count + 5, 5)) + elif max_count <= 200: + pylab.ylim(0, max_count + 10) + pylab.yticks(pylab.arange(0, max_count + 10, 10)) + elif max_count <= 600: + pylab.ylim(0, max_count + 25) + pylab.yticks(pylab.arange(0, max_count + 25, 25)) + elif max_count <= 800: + pylab.ylim(0, max_count + 50) + pylab.yticks(pylab.arange(0, max_count + 50, 50)) + + pylab.savefig(file_name, dpi = 80) + pylab.close() + +def compare(stringtime1, stringtime2): + """ + Compare two dates in the format 'yyyy-mm-dd hh:mm:ss'. + """ + date1, time1 = stringtime1.split(' ') + date2, time2 = stringtime2.split(' ') + + year1, month1, day1 = date1.split('-') + year2, month2, day2 = date2.split('-') + + hour1, minute1, second1 = time1.split(':') + hour2, minute2, second2 = time2.split(':') + + datetime1 = datetime(year=int(year1), month=int(month1), day=int(day1), \ + hour=int(hour1), minute=int(minute1), second=int(second1)) + + datetime2 = datetime(year=int(year2), month=int(month2), day=int(day2), \ + hour=int(hour2), minute=int(minute2), second=int(second2)) + + if datetime1 < datetime2: + return -1 + elif datetime1 > datetime2: + return 1 + else: + return 0 + + +def load_feed(): + """ + Load feeds in a dictionary. + """ + list_of_articles = None + try: + conn = sqlite3.connect("./var/feed.db", isolation_level = None) + c = conn.cursor() + list_of_articles = c.execute("SELECT * FROM rss_feed").fetchall() + c.close() + except: + pass + + # The key of dic is the id of the feed: + # dic[feed_id] = (article_id, article_date, article_title, + # article_link, article_description, feed_title, + # feed_link, article_readed) + # dic_info[feed_id] = (nb_article, nb_article_unreaded) + dic, dic_info = {}, {} + if list_of_articles is not None: + for article in list_of_articles: + sha256_hash = hashlib.sha256() + sha256_hash.update(article[5].encode('utf-8')) + feed_id = sha256_hash.hexdigest() + sha256_hash.update(article[2].encode('utf-8')) + article_id = sha256_hash.hexdigest() + + article_list = [article_id, article[0], article[1], \ + article[2], article[3], article[4], article[5], article[6]] + + if feed_id not in dic: + dic[feed_id] = [article_list] + else: + dic[feed_id].append(article_list) + + # sort articles by date for each feeds + for feeds in dic.keys(): + dic[feeds].sort(lambda x,y: compare(y[1], x[1])) + + for rss_feed_id in dic.keys(): + dic_info[rss_feed_id] = (len(dic[rss_feed_id]), \ + len([article for article in dic[rss_feed_id] \ + if article[7]=="0"]) \ + ) + + return (dic, dic_info) + return (dic, dic_info)
\ No newline at end of file |