diff options
-rwxr-xr-x | pyAggr3g470r.py | 21 |
-rwxr-xr-x | utils.py | 14 |
2 files changed, 26 insertions, 9 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py index 8c33f7dc..42536371 100755 --- a/pyAggr3g470r.py +++ b/pyAggr3g470r.py @@ -11,6 +11,7 @@ import os import time import sqlite3 import cherrypy +import operator import threading from cherrypy.lib.static import serve_file @@ -173,7 +174,7 @@ class Root: return html - def management(self): + def management(self, word_size=6): """ Management of articles. """ @@ -211,6 +212,9 @@ class Root: html += "<hr />\n" if self.articles: + self.top_words = utils.top_words(self.articles, n=50, size=int(word_size)) + if "pylab" not in utils.IMPORT_ERROR: + utils.create_histogram(self.top_words[:10]) html += "<h1>Statistics</h1>\n<br />\n" if "oice" not in utils.IMPORT_ERROR: nb_french = 0 @@ -223,16 +227,25 @@ class Root: nb_english += 1 nb_other = self.nb_articles - nb_french - nb_english + html += "Minimum size of a word: " + html += """<form method=get action="/management/"><select name="word_size">\n""" + for size in range(1,16): + if size == int(word_size): + select = " selected='selected'" + else: + select = "" + html += """\t<option value="%s" %s>%s</option>\n""" % (size, select,size) + html += """</select><input type="submit" value="OK"></form>\n""" html += "<table border=0>\n" html += '<tr><td colspan="2">' html += "<h3>Tag cloud</h3>\n" html += '<div style="width: 35%; overflow:hidden; text-align: justify">' + \ - utils.tag_cloud(utils.top_words(self.articles, 50)) + '</div>' + utils.tag_cloud(self.top_words) + '</div>' html += "<td></tr>" html += "<tr><td>" html += "<h3>Words count</h3>\n" html += "<ol>\n" - for word, frequency in self.top_words: + for word, frequency in sorted(self.top_words, key=operator.itemgetter(1), reverse=True)[:10]: html += """\t<li><a href="/q/?querystring=%s">%s</a>: %s</li>\n""" % \ (word, word, frequency) html += "</ol>\n" @@ -740,7 +753,7 @@ class Root: self.articles, self.feeds = utils.load_feed() self.nb_articles = sum([feed[0] for feed in self.feeds.values()]) if self.articles != {}: - 
self.top_words = utils.top_words(self.articles, 10) + self.top_words = utils.top_words(self.articles, 10, size=6) if "pylab" not in utils.IMPORT_ERROR: utils.create_histogram(self.top_words) print "Base (%s) loaded" % utils.sqlite_base diff --git a/utils.py b/utils.py --- a/utils.py +++ b/utils.py @@ -1,6 +1,8 @@ #! /usr/local/bin/python #-*- coding: utf-8 -*- +from __future__ import with_statement + __author__ = "Cedric Bonhomme" __version__ = "$Revision: 0.5 $" __date__ = "$Date: 2010/04/15 $" @@ -14,8 +16,9 @@ try: import pylab except: IMPORT_ERROR.append("pylab") -import sqlite3 +import string import hashlib +import sqlite3 import operator import smtplib @@ -75,7 +78,7 @@ def remove_html_tags(data): p = re.compile(r'<[^<]*?/?>') return p.sub('', data) -def top_words(dic_articles, n=10): +def top_words(dic_articles, n=10, size=5): """ Return the n most frequent words in a list. """ @@ -84,9 +87,10 @@ def top_words(dic_articles, n=10): for rss_feed_id in dic_articles.keys(): for article in dic_articles[rss_feed_id]: articles_content += remove_html_tags(article[4].encode('utf-8')) - words_gen = (word.strip(punctuation).lower() \ - for word in articles_content.split() \ - if len(word) >= 6) + + words_gen = [word for word in articles_content.split() if len(word) > size] + words_gen = [word.strip(punctuation).lower() for word in words_gen] + words = defaultdict(int) for word in words_gen: words[word] += 1 |