diff options
author | cedricbonhomme <devnull@localhost> | 2010-02-23 21:39:59 +0100 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-02-23 21:39:59 +0100 |
commit | 9480aff3040c0f494027944ce6c0f8ef111f1001 (patch) | |
tree | bf3ad38703d50e0ff1d3147e0a74a27e5f30dc53 /pyAggr3g470r.py | |
parent | Minor improvements. (diff) | |
download | newspipe-9480aff3040c0f494027944ce6c0f8ef111f1001.tar.gz newspipe-9480aff3040c0f494027944ce6c0f8ef111f1001.tar.bz2 newspipe-9480aff3040c0f494027944ce6c0f8ef111f1001.zip |
Bug fix: HTML tags are now stripped from article content before searching. Added a _Management of feed_ page with word-frequency statistics. Search can now be restricted to a single feed. Plus some minor enhancements.
Diffstat (limited to 'pyAggr3g470r.py')
-rw-r--r-- | pyAggr3g470r.py | 97 |
1 files changed, 88 insertions, 9 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py index 15ff7299..1ab9448b 100644 --- a/pyAggr3g470r.py +++ b/pyAggr3g470r.py @@ -2,17 +2,22 @@ #-*- coding: utf-8 -*- __author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.7 $" -__date__ = "$Date: 2010/02/15 $" +__version__ = "$Revision: 0.8 $" +__date__ = "$Date: 2010/02/23 $" __copyright__ = "Copyright (c) 2010 Cedric Bonhomme" __license__ = "GPLv3" +import re +import os +import pylab import sqlite3 import hashlib import cherrypy import ConfigParser from datetime import datetime +from string import punctuation +from collections import defaultdict from cherrypy.lib.static import serve_file import feedgetter @@ -26,15 +31,17 @@ bindhost = "0.0.0.0" cherrypy.config.update({ 'server.socket_port': 12556, 'server.socket_host': bindhost}) path = { '/css/style.css': {'tools.staticfile.on': True, \ - 'tools.staticfile.filename':path+'css/style.css'}} + 'tools.staticfile.filename':path+'css/style.css'}, \ + '/var/histogram.png':{'tools.staticfile.on': True, \ + 'tools.staticfile.filename':path+'var/histogram.png'}} htmlheader = """<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n<head>\n<link rel="stylesheet" type="text/css" href="/css/style.css" />\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n <title>pyAggr3g470r - RSS Feed Reader</title> </head>""" -htmlfooter = """This software is under GPLv3 license. You are welcome to copy, modify or - redistribute the source code according to the GPLv3 license.</div> +htmlfooter = """<p>This software is under GPLv3 license. 
You are welcome to copy, modify or + redistribute the source code according to the GPLv3 license.</p></div> </body></html>""" htmlnav = """<body><h1><a name="top"><a href="/">pyAggr3g470r - RSS Feed Reader</a></a></h1><a @@ -126,10 +133,12 @@ class Root: html += "<hr />\n" - html += """The database contains a total of %s articles with - %s unread articles.<br /><br />""" % \ + html += """<p>The database contains a total of %s articles with + %s unread articles.<br />""" % \ (sum([feed[0] for feed in self.dic_info.values()]), sum([feed[1] for feed in self.dic_info.values()])) + html += """Database: %s.\n<br />Size: %s bytes.</p>\n""" % \ + (os.path.abspath("./var/feed.db"), os.path.getsize("./var/feed.db")) html += """<form method=get action="/fetch/">\n<input type="submit" value="Fetch all feeds"></form>\n""" @@ -137,6 +146,33 @@ class Root: type="submit" value="Delete all articles"></form>\n""" html += "<hr />\n" + html += "<h1>Statistics</h1>\n" + N = 10 + words = {} + article_content = "" + for rss_feed_id in self.dic.keys(): + for article in self.dic[rss_feed_id]: + article_content += remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8')) + + words_gen = (word.strip(punctuation).lower() \ + for word in article_content.split() \ + if len(word) >= 5) + words = defaultdict(int) + for word in words_gen: + words[word] += 1 + + top_words = sorted(words.iteritems(), + key=lambda(word, count): (-count, word))[:N] + html += "<table border=0>\n<tr><td>" + html += "<ol>\n" + for word, frequency in top_words: + html += """\t<li><a href="/q/?querystring=%s">%s</a>: %s</li>\n""" % \ + (word, word, frequency) + html += "</ol>\n</td><td>" + create_histogram(top_words) + html += """<img src="/var/histogram.png" /></td></tr></table>""" + + html += "<hr />\n" html += htmlfooter return html @@ -157,7 +193,7 @@ class Root: if feed_id is not None: for article in self.dic[rss_feed_id]: - article_content = article[4].encode('utf-8') + article[2].encode('utf-8') + 
article_content = remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8')) if querystring.lower() in article_content.lower(): if article[7] == "0": # not readed articles are in bold @@ -175,7 +211,7 @@ class Root: else: for rss_feed_id in self.dic.keys(): for article in self.dic[rss_feed_id]: - article_content = article[4].encode('utf-8') + article[2].encode('utf-8') + article_content = remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8')) if querystring.lower() in article_content.lower(): if article[7] == "0": # not readed articles are in bold @@ -385,6 +421,49 @@ class Root: mark_as_read.exposed = True unread.exposed = True +def remove_html_tags(data): + """ + Remove HTML tags for the search. + """ + p = re.compile(r'<[^<]*?/?>') + return p.sub('', data) + +def create_histogram(words, file_name="./var/histogram.png"): + """ + Create a histogram. + """ + length = 10 + ind = pylab.arange(length) # abscissa + width = 0.35 # bars width + + w = [elem[0] for elem in words] + count = [int(elem[1]) for elem in words] + + max_count = max(count) # maximal weight + + p = pylab.bar(ind, count, width, color='r') + + pylab.ylabel("Count") + pylab.title("Most frequent words") + pylab.xticks(ind + (width / 2), range(1, len(w)+1)) + pylab.xlim(-width, len(ind)) + + # changing the ordinate scale according to the max. + if max_count <= 100: + pylab.ylim(0, max_count + 5) + pylab.yticks(pylab.arange(0, max_count + 5, 5)) + elif max_count <= 200: + pylab.ylim(0, max_count + 10) + pylab.yticks(pylab.arange(0, max_count + 10, 10)) + elif max_count <= 600: + pylab.ylim(0, max_count + 25) + pylab.yticks(pylab.arange(0, max_count + 25, 25)) + elif max_count <= 800: + pylab.ylim(0, max_count + 50) + pylab.yticks(pylab.arange(0, max_count + 50, 50)) + + pylab.savefig(file_name, dpi = 80) + pylab.close() def compare(stringtime1, stringtime2): """ |