about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x pyAggr3g470r.py 21
-rwxr-xr-x utils.py 14
2 files changed, 26 insertions, 9 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index 8c33f7dc..42536371 100755
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -11,6 +11,7 @@ import os
import time
import sqlite3
import cherrypy
+import operator
import threading
from cherrypy.lib.static import serve_file
@@ -173,7 +174,7 @@ class Root:
return html
- def management(self):
+ def management(self, word_size=6):
"""
Management of articles.
"""
@@ -211,6 +212,9 @@ class Root:
html += "<hr />\n"
if self.articles:
+ self.top_words = utils.top_words(self.articles, n=50, size=int(word_size))
+ if "pylab" not in utils.IMPORT_ERROR:
+ utils.create_histogram(self.top_words[:10])
html += "<h1>Statistics</h1>\n<br />\n"
if "oice" not in utils.IMPORT_ERROR:
nb_french = 0
@@ -223,16 +227,25 @@ class Root:
nb_english += 1
nb_other = self.nb_articles - nb_french - nb_english
+ html += "Minimum size of a word: "
+ html += """<form method=get action="/management/"><select name="word_size">\n"""
+ for size in range(1,16):
+ if size == int(word_size):
+ select = " selected='selected'"
+ else:
+ select = ""
+ html += """\t<option value="%s" %s>%s</option>\n""" % (size, select,size)
+ html += """</select><input type="submit" value="OK"></form>\n"""
html += "<table border=0>\n"
html += '<tr><td colspan="2">'
html += "<h3>Tag cloud</h3>\n"
html += '<div style="width: 35%; overflow:hidden; text-align: justify">' + \
- utils.tag_cloud(utils.top_words(self.articles, 50)) + '</div>'
+ utils.tag_cloud(self.top_words) + '</div>'
html += "<td></tr>"
html += "<tr><td>"
html += "<h3>Words count</h3>\n"
html += "<ol>\n"
- for word, frequency in self.top_words:
+ for word, frequency in sorted(self.top_words, key=operator.itemgetter(1), reverse=True)[:10]:
html += """\t<li><a href="/q/?querystring=%s">%s</a>: %s</li>\n""" % \
(word, word, frequency)
html += "</ol>\n"
@@ -740,7 +753,7 @@ class Root:
self.articles, self.feeds = utils.load_feed()
self.nb_articles = sum([feed[0] for feed in self.feeds.values()])
if self.articles != {}:
- self.top_words = utils.top_words(self.articles, 10)
+ self.top_words = utils.top_words(self.articles, 10, size=6)
if "pylab" not in utils.IMPORT_ERROR:
utils.create_histogram(self.top_words)
print "Base (%s) loaded" % utils.sqlite_base
diff --git a/utils.py b/utils.py
index aaac0754..3c716c4b 100755
--- a/utils.py
+++ b/utils.py
@@ -1,6 +1,8 @@
#! /usr/local/bin/python
#-*- coding: utf-8 -*-
+from __future__ import with_statement
+
__author__ = "Cedric Bonhomme"
__version__ = "$Revision: 0.5 $"
__date__ = "$Date: 2010/04/15 $"
@@ -14,8 +16,9 @@ try:
import pylab
except:
IMPORT_ERROR.append("pylab")
-import sqlite3
+import string
import hashlib
+import sqlite3
import operator
import smtplib
@@ -75,7 +78,7 @@ def remove_html_tags(data):
p = re.compile(r'<[^<]*?/?>')
return p.sub('', data)
-def top_words(dic_articles, n=10):
+def top_words(dic_articles, n=10, size=5):
"""
Return the n most frequent words in a list.
"""
@@ -84,9 +87,10 @@ def top_words(dic_articles, n=10):
for rss_feed_id in dic_articles.keys():
for article in dic_articles[rss_feed_id]:
articles_content += remove_html_tags(article[4].encode('utf-8'))
- words_gen = (word.strip(punctuation).lower() \
- for word in articles_content.split() \
- if len(word) >= 6)
+
+ words_gen = [word for word in articles_content.split() if len(word) > size]
+ words_gen = [word.strip(punctuation).lower() for word in words_gen]
+
words = defaultdict(int)
for word in words_gen:
words[word] += 1
bgstack15