Lots of performance improvements

author: cedricbonhomme <devnull@localhost> 2010-07-07 11:40:31 +0200
committer: cedricbonhomme <devnull@localhost> 2010-07-07 11:40:31 +0200
commit: ae508ad4d7968a03f1254d82b3da6bdf541fd515 (patch)
tree: 2ff5ec88926c5c585d16632797552bd92b741bd7
parent: Faster top_words function. (diff)
download: newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.tar.gz
newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.tar.bz2
newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.zip
4 files changed, 31 insertions, 85 deletions
diff --git a/feedgetter.py b/feedgetter.py
index 267246db..d2aca435 100755
--- a/feedgetter.py
+++ b/feedgetter.py
@@ -16,13 +16,6 @@ from datetime import datetime
 
 import utils
 
-url_finders = [ \
-    re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \
-    re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \
-    re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \
-    re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+"), \
-]
-
 feeds_list = []
 list_of_threads = []
 
@@ -47,7 +40,7 @@ class FeedGetter(object):
         with open("./var/feed.lst") as f:
             for a_feed in f:
                 # test if the URL is well formed
-                for url_regexp in url_finders:
+                for url_regexp in utils.url_finders:
                     if url_regexp.match(a_feed):
                         the_good_url = url_regexp.match(a_feed).group(0).replace("\n", "")
                         try:
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index cc649cf1..af6b772e 100755
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -81,14 +81,13 @@ class Root:
 
         if self.articles:
             html += """<a href="/list_favorites/"><img src="/css/img/heart.png" title="Your favorites (%s)" /></a>\n""" % \
-                (sum([len([article for article in self.articles[feed_id] if article[7] == "1"]) \
-                    for feed_id in self.feeds.keys()]),)
+                (self.nb_favorites,)
 
             html += """<a href="/list_notification"><img src="/css/img/email.png" title="Active e-mail notifications (%s)" /></a>\n""" % \
-                (len([feed for feed in self.feeds.values() if feed[6] == "1"]),)
+                (self.nb_mail_notifications,)
 
             html += """<a href="/unread/All">Unread article(s): %s</a>\n""" % \
-                (sum([feed[1] for feed in self.feeds.values()]),)
+                (self.nb_unread_articles,)
 
         for rss_feed_id in self.feeds.keys():
             html += """<h2><a name="%s"><a href="%s" rel="noreferrer"
@@ -200,15 +199,14 @@ class Root:
                         (feed_id, self.feeds[feed_id][3].encode('utf-8'))
             html += """</select><input type="submit" value="OK"></form>\n"""
             html += """<p>Active e-mail notifications: <a href="/list_notification">%s</a></p>\n""" % \
-                        (len([feed for feed in self.feeds.values() if feed[6] == "1"]),)
+                        (self.nb_mail_notifications,)
             html += """<p>You like <a href="/list_favorites/">%s</a> article(s).</p>\n""" % \
-                        (sum([len([article for article in self.articles[feed_id] if article[7] == "1"]) \
-                            for feed_id in self.feeds.keys()]), )
+                        (self.nb_favorites, )
 
         html += "<hr />\n"
         html += """<p>The database contains a total of %s article(s) with
                 <a href="/unread/All">%s unread article(s)</a>.<br />""" % \
-                    (self.nb_articles, sum([feed[1] for feed in self.feeds.values()]))
+                    (self.nb_articles, self.nb_unread_articles)
         html += """Database: %s.\n<br />Size: %s bytes.</p>\n""" % \
                     (os.path.abspath(utils.sqlite_base), os.path.getsize(utils.sqlite_base))
 
@@ -225,7 +223,6 @@ class Root:
         html += "<hr />\n"
         if self.articles:
             self.top_words = utils.top_words(self.articles, n=50, size=int(word_size))
-            utils.create_histogram(self.top_words[:10])
             html += "<h1>Statistics</h1>\n<br />\n"
             if "oice" not in utils.IMPORT_ERROR:
                 nb_french = 0
@@ -247,31 +244,19 @@ class Root:
                     select = ""
                 html += """\t<option value="%s" %s>%s</option>\n""" % (size, select,size)
             html += """</select><input type="submit" value="OK"></form>\n"""
-            html += "<table border=0>\n"
-            html += '<tr><td colspan="2">'
-            html += "<h3>Tag cloud</h3>\n"
+            html += "<br /><h3>Tag cloud</h3>\n"
             html += '<div style="width: 35%; overflow:hidden; text-align: justify">' + \
                         utils.tag_cloud(self.top_words) + '</div>'
-            html += "<td></tr>"
-            html += "<tr><td>"
-            html += "<h3>Words count</h3>\n"
-            html += "<ol>\n"
-            for word, frequency in sorted(self.top_words, key=operator.itemgetter(1), reverse=True)[:10]:
-                html += """\t<li><a href="/q/?querystring=%s">%s</a>: %s</li>\n""" % \
-                                (word, word, frequency)
-            html += "</ol>\n"
-            html += "<h3>Languages</h3>\n"
+            html += "<br /><h3>Languages</h3>\n"
             if "oice" in utils.IMPORT_ERROR:
                 html += "Install the module "
                 html += """<a href="http://pypi.python.org/pypi/oice.langdet/">oice.langdet</a>"""
-                html += "</td>\n<td>"
             else:
                 html += "<ul>\n"
                 for language in ['english', 'french', 'other']:
                     html += """\t<li>%s articles in <a href="/language/%s">%s</a></li>\n""" % \
                                     (locals()["nb_"+language], language, language)
-                html += "</ul>\n</td>\n<td>"
-            html += """<img src="/var/histogram.png" /></td></tr></table>\n<br />\n"""
+                html += "</ul>\n<br />"
 
             html += "<hr />\n"
         html += htmlfooter
@@ -349,7 +334,6 @@ class Root:
         """
         feed_getter = feedgetter.FeedGetter()
         feed_getter.retrieve_feed()
-        #self.update()
         return self.index()
 
     fetch.exposed = True
@@ -506,7 +490,8 @@ class Root:
                         html += article[1].encode('utf-8') + \
                                 """ - <a href="/description/%s:%s" rel="noreferrer" target="_blank">%s</a>
                                 from <i><a href="%s">%s</a></i><br />\n""" % \
-                                        (rss_feed_id, article[0].encode('utf-8'), article[2].encode('utf-8'), \
+                                        (rss_feed_id, article[0].encode('utf-8'), \
+                                        article[2].encode('utf-8'), \
                                         self.feeds[rss_feed_id][5].encode('utf-8'), \
                                         self.feeds[rss_feed_id][3].encode('utf-8'))
             html += """<hr />\n<a href="/mark_as_read/All:">Mark articles as read</a>\n"""
@@ -841,15 +826,19 @@ class Root:
 
     def update(self, path=None, event = None):
         """
-        Synchronizes transient objects with the database,
-        computes the list of most frequent words and generates the histogram.
-        Called when an article is marked as read or when new articles are fetched.
+        Synchronizes transient objects (dictionary of feed and articles)
+        with the database.
+        Called when a change in the database is detected.
         """
         self.articles, self.feeds = utils.load_feed()
         self.nb_articles = sum([feed[0] for feed in self.feeds.values()])
+        self.nb_unread_articles = sum([feed[1] for feed in self.feeds.values()])
+        self.nb_mail_notifications = len([feed for feed in self.feeds.values() \
+                                if feed[6] == "1"])
+        self.nb_favorites = sum([len([article for article in self.articles[feed_id] \
+                                if article[7] == "1"]) \
+                                    for feed_id in self.feeds.keys()])
         if self.articles != {}:
-            self.top_words = utils.top_words(self.articles, 10, size=6)
-            utils.create_histogram(self.top_words)
             print "Base (%s) loaded" % utils.sqlite_base
         else:
             print "Base (%s) empty!" % utils.sqlite_base
diff --git a/utils.py b/utils.py
index fc6d6891..a6822685 100755
--- a/utils.py
+++ b/utils.py
@@ -10,10 +10,6 @@ __license__ = "GPLv3"
 IMPORT_ERROR = []
 
 import re
-try:
-    import pylab
-except:
-    IMPORT_ERROR.append("pylab")
 import string
 import hashlib
 import sqlite3
@@ -55,6 +51,12 @@ smtp_server = config.get('mail','smtp')
 username =  config.get('mail','username')
 password =  config.get('mail','password')
 
+url_finders = [ \
+    re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \
+    re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \
+    re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \
+    re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+"), \
+]
 
 def detect_language(text):
     """
@@ -104,49 +106,10 @@ def tag_cloud(tags):
     Generates a tags cloud.
     """
     tags.sort(key=operator.itemgetter(0))
-    return ' '.join([('<font size="%d"><a href="/q/?querystring=%s">%s</a></font>\n' % \
-                    (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, word)) \
+    return ' '.join([('<font size="%d"><a href="/q/?querystring=%s" title="Count: %s">%s</a></font>\n' % \
+                    (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, count, word)) \
                         for (word, count) in tags])
 
-def create_histogram(words, file_name="./var/histogram.png"):
-    """
-    Create a histogram.
-    """
-    if "pylab" in IMPORT_ERROR:
-        return
-    length = 10
-    ind = pylab.arange(length) # abscissa
-    width = 0.35 # bars width
-
-    w = [elem[0] for elem in words]
-    count = [int(elem[1]) for elem in words]
-
-    max_count = max(count)  # maximal weight
-
-    p = pylab.bar(ind, count, width, color='r')
-
-    pylab.ylabel("Count")
-    pylab.title("Most frequent words")
-    pylab.xticks(ind + (width / 2), range(1, len(w)+1))
-    pylab.xlim(-width, len(ind))
-
-    # changing the ordinate scale according to the max.
-    if max_count <= 100:
-        pylab.ylim(0, max_count + 5)
-        pylab.yticks(pylab.arange(0, max_count + 5, 5))
-    elif max_count <= 200:
-        pylab.ylim(0, max_count + 10)
-        pylab.yticks(pylab.arange(0, max_count + 10, 10))
-    elif max_count <= 600:
-        pylab.ylim(0, max_count + 25)
-        pylab.yticks(pylab.arange(0, max_count + 25, 25))
-    elif max_count <= 800:
-        pylab.ylim(0, max_count + 50)
-        pylab.yticks(pylab.arange(0, max_count + 50, 50))
-
-    pylab.savefig(file_name, dpi = 80)
-    pylab.close()
-
 def send_mail(mfrom, mto, feed_title, message):
     """Send the warning via mail
     """
diff --git a/var/feed.lst b/var/feed.lst
index 29bb04e0..fbee3acf 100755
--- a/var/feed.lst
+++ b/var/feed.lst
@@ -24,4 +24,5 @@ http://www.jeffersonswheel.org/?feed=rss2
 http://www.laquadrature.net/en/rss.xml
 http://static.fsf.org/fsforg/rss/blogs.xml
 http://esr.ibiblio.org/?feed=rss2
-http://www.maitre-eolas.fr/feed/atom
-\ No newline at end of file
+http://www.maitre-eolas.fr/feed/atom
+http://linuxfr.org/backend/journaux/rss20.rss
author	cedricbonhomme <devnull@localhost>	2010-07-07 11:40:31 +0200
committer	cedricbonhomme <devnull@localhost>	2010-07-07 11:40:31 +0200
commit	ae508ad4d7968a03f1254d82b3da6bdf541fd515 (patch)
tree	2ff5ec88926c5c585d16632797552bd92b741bd7
parent	Faster top_words function. (diff)
download	newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.tar.gz newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.tar.bz2 newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.zip