2 files changed, 26 insertions, 9 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index 8c33f7dc..42536371 100755
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -11,6 +11,7 @@ import os
 import time
 import sqlite3
 import cherrypy
+import operator
 import threading
 
 from cherrypy.lib.static import serve_file
@@ -173,7 +174,7 @@ class Root:
         return html
 
 
-    def management(self):
+    def management(self, word_size=6):
         """
         Management of articles.
         """
@@ -211,6 +212,9 @@ class Root:
 
         html += "<hr />\n"
         if self.articles:
+            self.top_words = utils.top_words(self.articles, n=50, size=int(word_size))
+            if "pylab" not in utils.IMPORT_ERROR:
+                utils.create_histogram(self.top_words[:10])
             html += "<h1>Statistics</h1>\n<br />\n"
             if "oice" not in utils.IMPORT_ERROR:
                 nb_french = 0
@@ -223,16 +227,25 @@ class Root:
                             nb_english += 1
                 nb_other = self.nb_articles - nb_french - nb_english
 
+            html += "Minimum size of a word: "
+            html += """<form method=get action="/management/"><select name="word_size">\n"""
+            for size in range(1,16):
+                if size == int(word_size):
+                    select = " selected='selected'"
+                else:
+                    select = ""
+                html += """\t<option value="%s" %s>%s</option>\n""" % (size, select,size)
+            html += """</select><input type="submit" value="OK"></form>\n"""
             html += "<table border=0>\n"
             html += '<tr><td colspan="2">'
             html += "<h3>Tag cloud</h3>\n"
             html += '<div style="width: 35%; overflow:hidden; text-align: justify">' + \
-                        utils.tag_cloud(utils.top_words(self.articles, 50)) + '</div>'
+                        utils.tag_cloud(self.top_words) + '</div>'
             html += "<td></tr>"
             html += "<tr><td>"
             html += "<h3>Words count</h3>\n"
             html += "<ol>\n"
-            for word, frequency in self.top_words:
+            for word, frequency in sorted(self.top_words, key=operator.itemgetter(1), reverse=True)[:10]:
                 html += """\t<li><a href="/q/?querystring=%s">%s</a>: %s</li>\n""" % \
                                 (word, word, frequency)
             html += "</ol>\n"
@@ -740,7 +753,7 @@ class Root:
         self.articles, self.feeds = utils.load_feed()
         self.nb_articles = sum([feed[0] for feed in self.feeds.values()])
         if self.articles != {}:
-            self.top_words = utils.top_words(self.articles, 10)
+            self.top_words = utils.top_words(self.articles, 10, size=6)
             if "pylab" not in utils.IMPORT_ERROR:
                 utils.create_histogram(self.top_words)
             print "Base (%s) loaded" % utils.sqlite_base
diff --git a/utils.py b/utils.py
index aaac0754..3c716c4b 100755
--- a/utils.py
+++ b/utils.py
@@ -1,6 +1,8 @@
 #! /usr/local/bin/python
 #-*- coding: utf-8 -*-
 
+from __future__ import with_statement
+
 __author__ = "Cedric Bonhomme"
 __version__ = "$Revision: 0.5 $"
 __date__ = "$Date: 2010/04/15 $"
@@ -14,8 +16,9 @@ try:
     import pylab
 except:
     IMPORT_ERROR.append("pylab")
-import sqlite3
+import string
 import hashlib
+import sqlite3
 import operator
 
 import smtplib
@@ -75,7 +78,7 @@ def remove_html_tags(data):
     p = re.compile(r'<[^<]*?/?>')
     return p.sub('', data)
 
-def top_words(dic_articles, n=10):
+def top_words(dic_articles, n=10, size=5):
     """
     Return the n most frequent words in a list.
     """
@@ -84,9 +87,10 @@ def top_words(dic_articles, n=10):
     for rss_feed_id in dic_articles.keys():
         for article in dic_articles[rss_feed_id]:
             articles_content += remove_html_tags(article[4].encode('utf-8'))
-    words_gen = (word.strip(punctuation).lower() \
-                        for word in articles_content.split() \
-                                if len(word) >= 6)
+
+    words_gen = [word for word in articles_content.split() if len(word) > size]
+    words_gen = [word.strip(punctuation).lower() for word in words_gen]
+
     words = defaultdict(int)
     for word in words_gen:
         words[word] += 1