diff options
author | cedricbonhomme <devnull@localhost> | 2010-07-07 11:40:31 +0200 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-07-07 11:40:31 +0200 |
commit | ae508ad4d7968a03f1254d82b3da6bdf541fd515 (patch) | |
tree | 2ff5ec88926c5c585d16632797552bd92b741bd7 | |
parent | Faster top_words function. (diff) | |
download | newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.tar.gz newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.tar.bz2 newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.zip |
Lot of performance improvements
-rwxr-xr-x | feedgetter.py | 9 | ||||
-rwxr-xr-x | pyAggr3g470r.py | 51 | ||||
-rwxr-xr-x | utils.py | 53 | ||||
-rwxr-xr-x | var/feed.lst | 3 |
4 files changed, 31 insertions, 85 deletions
diff --git a/feedgetter.py b/feedgetter.py index 267246db..d2aca435 100755 --- a/feedgetter.py +++ b/feedgetter.py @@ -16,13 +16,6 @@ from datetime import datetime import utils -url_finders = [ \ - re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \ - re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \ - re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \ - re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+"), \ -] - feeds_list = [] list_of_threads = [] @@ -47,7 +40,7 @@ class FeedGetter(object): with open("./var/feed.lst") as f: for a_feed in f: # test if the URL is well formed - for url_regexp in url_finders: + for url_regexp in utils.url_finders: if url_regexp.match(a_feed): the_good_url = url_regexp.match(a_feed).group(0).replace("\n", "") try: diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py index cc649cf1..af6b772e 100755 --- a/pyAggr3g470r.py +++ b/pyAggr3g470r.py @@ -81,14 +81,13 @@ class Root: if self.articles: html += """<a href="/list_favorites/"><img src="/css/img/heart.png" title="Your favorites (%s)" /></a>\n""" % \ - (sum([len([article for article in self.articles[feed_id] if article[7] == "1"]) \ - for feed_id in self.feeds.keys()]),) + (self.nb_favorites,) html += """<a href="/list_notification"><img src="/css/img/email.png" title="Active e-mail notifications (%s)" /></a>\n""" % \ - (len([feed for feed in self.feeds.values() if feed[6] == "1"]),) + (self.nb_mail_notifications,) html += """<a href="/unread/All">Unread article(s): %s</a>\n""" % \ - (sum([feed[1] for feed in self.feeds.values()]),) + (self.nb_unread_articles,) for rss_feed_id in self.feeds.keys(): html += """<h2><a name="%s"><a href="%s" rel="noreferrer" @@ -200,15 +199,14 @@ class Root: (feed_id, self.feeds[feed_id][3].encode('utf-8')) html += """</select><input type="submit" value="OK"></form>\n""" html += """<p>Active e-mail notifications: <a href="/list_notification">%s</a></p>\n""" % \ - (len([feed for feed in self.feeds.values() if feed[6] == "1"]),) + (self.nb_mail_notifications,) html += """<p>You like <a href="/list_favorites/">%s</a> article(s).</p>\n""" % \ - (sum([len([article for article in self.articles[feed_id] if article[7] == "1"]) \ - for feed_id in self.feeds.keys()]), ) + (self.nb_favorites, ) html += "<hr />\n" html += """<p>The database contains a total of %s article(s) with <a href="/unread/All">%s unread article(s)</a>.<br />""" % \ - (self.nb_articles, sum([feed[1] for feed in self.feeds.values()])) + (self.nb_articles, self.nb_unread_articles) html += """Database: %s.\n<br />Size: %s bytes.</p>\n""" % \ (os.path.abspath(utils.sqlite_base), os.path.getsize(utils.sqlite_base)) @@ -225,7 +223,6 @@ class Root: html += "<hr />\n" if self.articles: self.top_words = utils.top_words(self.articles, n=50, size=int(word_size)) - utils.create_histogram(self.top_words[:10]) html += "<h1>Statistics</h1>\n<br />\n" if "oice" not in utils.IMPORT_ERROR: nb_french = 0 @@ -247,31 +244,19 @@ class Root: select = "" html += """\t<option value="%s" %s>%s</option>\n""" % (size, select,size) html += """</select><input type="submit" value="OK"></form>\n""" - html += "<table border=0>\n" - html += '<tr><td colspan="2">' - html += "<h3>Tag cloud</h3>\n" + html += "<br /><h3>Tag cloud</h3>\n" html += '<div style="width: 35%; overflow:hidden; text-align: justify">' + \ utils.tag_cloud(self.top_words) + '</div>' - html += "<td></tr>" - html += "<tr><td>" - html += "<h3>Words count</h3>\n" - html += "<ol>\n" - for word, frequency in sorted(self.top_words, key=operator.itemgetter(1), reverse=True)[:10]: - html += """\t<li><a href="/q/?querystring=%s">%s</a>: %s</li>\n""" % \ - (word, word, frequency) - html += "</ol>\n" - html += "<h3>Languages</h3>\n" + html += "<br /><h3>Languages</h3>\n" if "oice" in utils.IMPORT_ERROR: html += "Install the module " html += """<a href="http://pypi.python.org/pypi/oice.langdet/">oice.langdet</a>""" - html += "</td>\n<td>" else: html += "<ul>\n" for language in ['english', 'french', 'other']: html += """\t<li>%s articles in <a href="/language/%s">%s</a></li>\n""" % \ (locals()["nb_"+language], language, language) - html += "</ul>\n</td>\n<td>" - html += """<img src="/var/histogram.png" /></td></tr></table>\n<br />\n""" + html += "</ul>\n<br />" html += "<hr />\n" html += htmlfooter @@ -349,7 +334,6 @@ class Root: """ feed_getter = feedgetter.FeedGetter() feed_getter.retrieve_feed() - #self.update() return self.index() fetch.exposed = True @@ -506,7 +490,8 @@ class Root: html += article[1].encode('utf-8') + \ """ - <a href="/description/%s:%s" rel="noreferrer" target="_blank">%s</a> from <i><a href="%s">%s</a></i><br />\n""" % \ - (rss_feed_id, article[0].encode('utf-8'), article[2].encode('utf-8'), \ + (rss_feed_id, article[0].encode('utf-8'), \ + article[2].encode('utf-8'), \ self.feeds[rss_feed_id][5].encode('utf-8'), \ self.feeds[rss_feed_id][3].encode('utf-8')) html += """<hr />\n<a href="/mark_as_read/All:">Mark articles as read</a>\n""" @@ -841,15 +826,19 @@ class Root: def update(self, path=None, event = None): """ - Synchronizes transient objects with the database, - computes the list of most frequent words and generates the histogram. - Called when an article is marked as read or when new articles are fetched. + Synchronizes transient objects (dictionary of feed and articles) + with the database. + Called when a changes in the database is detected. """ self.articles, self.feeds = utils.load_feed() self.nb_articles = sum([feed[0] for feed in self.feeds.values()]) + self.nb_unread_articles = sum([feed[1] for feed in self.feeds.values()]) + self.nb_mail_notifications = len([feed for feed in self.feeds.values() \ + if feed[6] == "1"]) + self.nb_favorites = sum([len([article for article in self.articles[feed_id] \ + if article[7] == "1"]) \ + for feed_id in self.feeds.keys()]) if self.articles != {}: - self.top_words = utils.top_words(self.articles, 10, size=6) - utils.create_histogram(self.top_words) print "Base (%s) loaded" % utils.sqlite_base else: print "Base (%s) empty!" % utils.sqlite_base @@ -10,10 +10,6 @@ __license__ = "GPLv3" IMPORT_ERROR = [] import re -try: - import pylab -except: - IMPORT_ERROR.append("pylab") import string import hashlib import sqlite3 @@ -55,6 +51,12 @@ smtp_server = config.get('mail','smtp') username = config.get('mail','username') password = config.get('mail','password') +url_finders = [ \ + re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \ + re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \ + re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \ + re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+"), \ +] def detect_language(text): """ @@ -104,49 +106,10 @@ def tag_cloud(tags): Generates a tags cloud. """ tags.sort(key=operator.itemgetter(0)) - return ' '.join([('<font size="%d"><a href="/q/?querystring=%s">%s</a></font>\n' % \ - (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, word)) \ + return ' '.join([('<font size="%d"><a href="/q/?querystring=%s" title="Count: %s">%s</a></font>\n' % \ + (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, count, word)) \ for (word, count) in tags]) -def create_histogram(words, file_name="./var/histogram.png"): - """ - Create a histogram. - """ - if "pylab" in IMPORT_ERROR: - return - length = 10 - ind = pylab.arange(length) # abscissa - width = 0.35 # bars width - - w = [elem[0] for elem in words] - count = [int(elem[1]) for elem in words] - - max_count = max(count) # maximal weight - - p = pylab.bar(ind, count, width, color='r') - - pylab.ylabel("Count") - pylab.title("Most frequent words") - pylab.xticks(ind + (width / 2), range(1, len(w)+1)) - pylab.xlim(-width, len(ind)) - - # changing the ordinate scale according to the max. - if max_count <= 100: - pylab.ylim(0, max_count + 5) - pylab.yticks(pylab.arange(0, max_count + 5, 5)) - elif max_count <= 200: - pylab.ylim(0, max_count + 10) - pylab.yticks(pylab.arange(0, max_count + 10, 10)) - elif max_count <= 600: - pylab.ylim(0, max_count + 25) - pylab.yticks(pylab.arange(0, max_count + 25, 25)) - elif max_count <= 800: - pylab.ylim(0, max_count + 50) - pylab.yticks(pylab.arange(0, max_count + 50, 50)) - - pylab.savefig(file_name, dpi = 80) - pylab.close() - def send_mail(mfrom, mto, feed_title, message): """Send the warning via mail """ diff --git a/var/feed.lst b/var/feed.lst index 29bb04e0..fbee3acf 100755 --- a/var/feed.lst +++ b/var/feed.lst @@ -24,4 +24,5 @@ http://www.jeffersonswheel.org/?feed=rss2 http://www.laquadrature.net/en/rss.xml http://static.fsf.org/fsforg/rss/blogs.xml http://esr.ibiblio.org/?feed=rss2 -http://www.maitre-eolas.fr/feed/atom
\ No newline at end of file +http://www.maitre-eolas.fr/feed/atom +http://linuxfr.org/backend/journaux/rss20.rss |