From 5d7c90635605c7c3c1818bb9b16787db02299fc5 Mon Sep 17 00:00:00 2001 From: cedricbonhomme Date: Wed, 28 Apr 2010 08:11:19 +0200 Subject: Added tag cloud. Some improvements. --- pyAggr3g470r.py | 21 +++++++++++++++++---- utils.py | 14 +++++++++----- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py index 8c33f7dc..42536371 100755 --- a/pyAggr3g470r.py +++ b/pyAggr3g470r.py @@ -11,6 +11,7 @@ import os import time import sqlite3 import cherrypy +import operator import threading from cherrypy.lib.static import serve_file @@ -173,7 +174,7 @@ class Root: return html - def management(self): + def management(self, word_size=6): """ Management of articles. """ @@ -211,6 +212,9 @@ class Root: html += "
\n" if self.articles: + self.top_words = utils.top_words(self.articles, n=50, size=int(word_size)) + if "pylab" not in utils.IMPORT_ERROR: + utils.create_histogram(self.top_words[:10]) html += "

Statistics

\n
\n" if "oice" not in utils.IMPORT_ERROR: nb_french = 0 @@ -223,16 +227,25 @@ class Root: nb_english += 1 nb_other = self.nb_articles - nb_french - nb_english + html += "Minimum size of a word: " + html += """
\n""" html += "\n" html += '" html += "
' html += "

Tag cloud

\n" html += '
' + \ - utils.tag_cloud(utils.top_words(self.articles, 50)) + '
' + utils.tag_cloud(self.top_words) + '' html += "
" html += "

Words count

\n" html += "
    \n" - for word, frequency in self.top_words: + for word, frequency in sorted(self.top_words, key=operator.itemgetter(1), reverse=True)[:10]: html += """\t
  1. %s: %s
  2. \n""" % \ (word, word, frequency) html += "
\n" @@ -740,7 +753,7 @@ class Root: self.articles, self.feeds = utils.load_feed() self.nb_articles = sum([feed[0] for feed in self.feeds.values()]) if self.articles != {}: - self.top_words = utils.top_words(self.articles, 10) + self.top_words = utils.top_words(self.articles, 10, size=6) if "pylab" not in utils.IMPORT_ERROR: utils.create_histogram(self.top_words) print "Base (%s) loaded" % utils.sqlite_base diff --git a/utils.py b/utils.py index aaac0754..3c716c4b 100755 --- a/utils.py +++ b/utils.py @@ -1,6 +1,8 @@ #! /usr/local/bin/python #-*- coding: utf-8 -*- +from __future__ import with_statement + __author__ = "Cedric Bonhomme" __version__ = "$Revision: 0.5 $" __date__ = "$Date: 2010/04/15 $" @@ -14,8 +16,9 @@ try: import pylab except: IMPORT_ERROR.append("pylab") -import sqlite3 +import string import hashlib +import sqlite3 import operator import smtplib @@ -75,7 +78,7 @@ def remove_html_tags(data): p = re.compile(r'<[^<]*?/?>') return p.sub('', data) -def top_words(dic_articles, n=10): +def top_words(dic_articles, n=10, size=5): """ Return the n most frequent words in a list. """ @@ -84,9 +87,10 @@ def top_words(dic_articles, n=10): for rss_feed_id in dic_articles.keys(): for article in dic_articles[rss_feed_id]: articles_content += remove_html_tags(article[4].encode('utf-8')) - words_gen = (word.strip(punctuation).lower() \ - for word in articles_content.split() \ - if len(word) >= 6) + + words_gen = [word for word in articles_content.split() if len(word) > size] + words_gen = [word.strip(punctuation).lower() for word in words_gen] + words = defaultdict(int) for word in words_gen: words[word] += 1 -- cgit