about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author cedricbonhomme <devnull@localhost> 2010-07-07 11:40:31 +0200
committer cedricbonhomme <devnull@localhost> 2010-07-07 11:40:31 +0200
commit ae508ad4d7968a03f1254d82b3da6bdf541fd515 (patch)
tree 2ff5ec88926c5c585d16632797552bd92b741bd7
parent Faster top_words function. (diff)
download newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.tar.gz
newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.tar.bz2
newspipe-ae508ad4d7968a03f1254d82b3da6bdf541fd515.zip
Lots of performance improvements
-rwxr-xr-x feedgetter.py 9
-rwxr-xr-x pyAggr3g470r.py 51
-rwxr-xr-x utils.py 53
-rwxr-xr-x var/feed.lst 3
4 files changed, 31 insertions, 85 deletions
diff --git a/feedgetter.py b/feedgetter.py
index 267246db..d2aca435 100755
--- a/feedgetter.py
+++ b/feedgetter.py
@@ -16,13 +16,6 @@ from datetime import datetime
import utils
-url_finders = [ \
- re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \
- re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \
- re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \
- re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+"), \
-]
-
feeds_list = []
list_of_threads = []
@@ -47,7 +40,7 @@ class FeedGetter(object):
with open("./var/feed.lst") as f:
for a_feed in f:
# test if the URL is well formed
- for url_regexp in url_finders:
+ for url_regexp in utils.url_finders:
if url_regexp.match(a_feed):
the_good_url = url_regexp.match(a_feed).group(0).replace("\n", "")
try:
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index cc649cf1..af6b772e 100755
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -81,14 +81,13 @@ class Root:
if self.articles:
html += """<a href="/list_favorites/"><img src="/css/img/heart.png" title="Your favorites (%s)" /></a>\n""" % \
- (sum([len([article for article in self.articles[feed_id] if article[7] == "1"]) \
- for feed_id in self.feeds.keys()]),)
+ (self.nb_favorites,)
html += """<a href="/list_notification"><img src="/css/img/email.png" title="Active e-mail notifications (%s)" /></a>\n""" % \
- (len([feed for feed in self.feeds.values() if feed[6] == "1"]),)
+ (self.nb_mail_notifications,)
html += """<a href="/unread/All">Unread article(s): %s</a>\n""" % \
- (sum([feed[1] for feed in self.feeds.values()]),)
+ (self.nb_unread_articles,)
for rss_feed_id in self.feeds.keys():
html += """<h2><a name="%s"><a href="%s" rel="noreferrer"
@@ -200,15 +199,14 @@ class Root:
(feed_id, self.feeds[feed_id][3].encode('utf-8'))
html += """</select><input type="submit" value="OK"></form>\n"""
html += """<p>Active e-mail notifications: <a href="/list_notification">%s</a></p>\n""" % \
- (len([feed for feed in self.feeds.values() if feed[6] == "1"]),)
+ (self.nb_mail_notifications,)
html += """<p>You like <a href="/list_favorites/">%s</a> article(s).</p>\n""" % \
- (sum([len([article for article in self.articles[feed_id] if article[7] == "1"]) \
- for feed_id in self.feeds.keys()]), )
+ (self.nb_favorites, )
html += "<hr />\n"
html += """<p>The database contains a total of %s article(s) with
<a href="/unread/All">%s unread article(s)</a>.<br />""" % \
- (self.nb_articles, sum([feed[1] for feed in self.feeds.values()]))
+ (self.nb_articles, self.nb_unread_articles)
html += """Database: %s.\n<br />Size: %s bytes.</p>\n""" % \
(os.path.abspath(utils.sqlite_base), os.path.getsize(utils.sqlite_base))
@@ -225,7 +223,6 @@ class Root:
html += "<hr />\n"
if self.articles:
self.top_words = utils.top_words(self.articles, n=50, size=int(word_size))
- utils.create_histogram(self.top_words[:10])
html += "<h1>Statistics</h1>\n<br />\n"
if "oice" not in utils.IMPORT_ERROR:
nb_french = 0
@@ -247,31 +244,19 @@ class Root:
select = ""
html += """\t<option value="%s" %s>%s</option>\n""" % (size, select,size)
html += """</select><input type="submit" value="OK"></form>\n"""
- html += "<table border=0>\n"
- html += '<tr><td colspan="2">'
- html += "<h3>Tag cloud</h3>\n"
+ html += "<br /><h3>Tag cloud</h3>\n"
html += '<div style="width: 35%; overflow:hidden; text-align: justify">' + \
utils.tag_cloud(self.top_words) + '</div>'
- html += "<td></tr>"
- html += "<tr><td>"
- html += "<h3>Words count</h3>\n"
- html += "<ol>\n"
- for word, frequency in sorted(self.top_words, key=operator.itemgetter(1), reverse=True)[:10]:
- html += """\t<li><a href="/q/?querystring=%s">%s</a>: %s</li>\n""" % \
- (word, word, frequency)
- html += "</ol>\n"
- html += "<h3>Languages</h3>\n"
+ html += "<br /><h3>Languages</h3>\n"
if "oice" in utils.IMPORT_ERROR:
html += "Install the module "
html += """<a href="http://pypi.python.org/pypi/oice.langdet/">oice.langdet</a>"""
- html += "</td>\n<td>"
else:
html += "<ul>\n"
for language in ['english', 'french', 'other']:
html += """\t<li>%s articles in <a href="/language/%s">%s</a></li>\n""" % \
(locals()["nb_"+language], language, language)
- html += "</ul>\n</td>\n<td>"
- html += """<img src="/var/histogram.png" /></td></tr></table>\n<br />\n"""
+ html += "</ul>\n<br />"
html += "<hr />\n"
html += htmlfooter
@@ -349,7 +334,6 @@ class Root:
"""
feed_getter = feedgetter.FeedGetter()
feed_getter.retrieve_feed()
- #self.update()
return self.index()
fetch.exposed = True
@@ -506,7 +490,8 @@ class Root:
html += article[1].encode('utf-8') + \
""" - <a href="/description/%s:%s" rel="noreferrer" target="_blank">%s</a>
from <i><a href="%s">%s</a></i><br />\n""" % \
- (rss_feed_id, article[0].encode('utf-8'), article[2].encode('utf-8'), \
+ (rss_feed_id, article[0].encode('utf-8'), \
+ article[2].encode('utf-8'), \
self.feeds[rss_feed_id][5].encode('utf-8'), \
self.feeds[rss_feed_id][3].encode('utf-8'))
html += """<hr />\n<a href="/mark_as_read/All:">Mark articles as read</a>\n"""
@@ -841,15 +826,19 @@ class Root:
def update(self, path=None, event = None):
"""
- Synchronizes transient objects with the database,
- computes the list of most frequent words and generates the histogram.
- Called when an article is marked as read or when new articles are fetched.
+ Synchronizes transient objects (dictionaries of feeds and articles)
+ with the database.
+ Called when a change in the database is detected.
"""
self.articles, self.feeds = utils.load_feed()
self.nb_articles = sum([feed[0] for feed in self.feeds.values()])
+ self.nb_unread_articles = sum([feed[1] for feed in self.feeds.values()])
+ self.nb_mail_notifications = len([feed for feed in self.feeds.values() \
+ if feed[6] == "1"])
+ self.nb_favorites = sum([len([article for article in self.articles[feed_id] \
+ if article[7] == "1"]) \
+ for feed_id in self.feeds.keys()])
if self.articles != {}:
- self.top_words = utils.top_words(self.articles, 10, size=6)
- utils.create_histogram(self.top_words)
print "Base (%s) loaded" % utils.sqlite_base
else:
print "Base (%s) empty!" % utils.sqlite_base
diff --git a/utils.py b/utils.py
index fc6d6891..a6822685 100755
--- a/utils.py
+++ b/utils.py
@@ -10,10 +10,6 @@ __license__ = "GPLv3"
IMPORT_ERROR = []
import re
-try:
- import pylab
-except:
- IMPORT_ERROR.append("pylab")
import string
import hashlib
import sqlite3
@@ -55,6 +51,12 @@ smtp_server = config.get('mail','smtp')
username = config.get('mail','username')
password = config.get('mail','password')
+url_finders = [ \
+ re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \
+ re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \
+ re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \
+ re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+"), \
+]
def detect_language(text):
"""
@@ -104,49 +106,10 @@ def tag_cloud(tags):
Generates a tags cloud.
"""
tags.sort(key=operator.itemgetter(0))
- return ' '.join([('<font size="%d"><a href="/q/?querystring=%s">%s</a></font>\n' % \
- (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, word)) \
+ return ' '.join([('<font size="%d"><a href="/q/?querystring=%s" title="Count: %s">%s</a></font>\n' % \
+ (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, count, word)) \
for (word, count) in tags])
-def create_histogram(words, file_name="./var/histogram.png"):
- """
- Create a histogram.
- """
- if "pylab" in IMPORT_ERROR:
- return
- length = 10
- ind = pylab.arange(length) # abscissa
- width = 0.35 # bars width
-
- w = [elem[0] for elem in words]
- count = [int(elem[1]) for elem in words]
-
- max_count = max(count) # maximal weight
-
- p = pylab.bar(ind, count, width, color='r')
-
- pylab.ylabel("Count")
- pylab.title("Most frequent words")
- pylab.xticks(ind + (width / 2), range(1, len(w)+1))
- pylab.xlim(-width, len(ind))
-
- # changing the ordinate scale according to the max.
- if max_count <= 100:
- pylab.ylim(0, max_count + 5)
- pylab.yticks(pylab.arange(0, max_count + 5, 5))
- elif max_count <= 200:
- pylab.ylim(0, max_count + 10)
- pylab.yticks(pylab.arange(0, max_count + 10, 10))
- elif max_count <= 600:
- pylab.ylim(0, max_count + 25)
- pylab.yticks(pylab.arange(0, max_count + 25, 25))
- elif max_count <= 800:
- pylab.ylim(0, max_count + 50)
- pylab.yticks(pylab.arange(0, max_count + 50, 50))
-
- pylab.savefig(file_name, dpi = 80)
- pylab.close()
-
def send_mail(mfrom, mto, feed_title, message):
"""Send the warning via mail
"""
diff --git a/var/feed.lst b/var/feed.lst
index 29bb04e0..fbee3acf 100755
--- a/var/feed.lst
+++ b/var/feed.lst
@@ -24,4 +24,5 @@ http://www.jeffersonswheel.org/?feed=rss2
http://www.laquadrature.net/en/rss.xml
http://static.fsf.org/fsforg/rss/blogs.xml
http://esr.ibiblio.org/?feed=rss2
-http://www.maitre-eolas.fr/feed/atom
\ No newline at end of file
+http://www.maitre-eolas.fr/feed/atom
+http://linuxfr.org/backend/journaux/rss20.rss
bgstack15