aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorcedricbonhomme <devnull@localhost>2010-02-24 09:08:06 +0100
committercedricbonhomme <devnull@localhost>2010-02-24 09:08:06 +0100
commit3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7 (patch)
treeaaaaa1b07cad5e7501083102cca7da0aa3f90578
parentStatistics on words are only processed on articles content. (diff)
downloadnewspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.gz
newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.bz2
newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.zip
Added utils.py and lot of improvements.
-rw-r--r--pyAggr3g470r.py189
-rw-r--r--utils.py156
2 files changed, 191 insertions, 154 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index d72a6008..87be077b 100644
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -7,19 +7,13 @@ __date__ = "$Date: 2010/02/23 $"
__copyright__ = "Copyright (c) 2010 Cedric Bonhomme"
__license__ = "GPLv3"
-import re
import os
-import pylab
-import sqlite3
-import hashlib
import cherrypy
import ConfigParser
-from datetime import datetime
-from string import punctuation
-from collections import defaultdict
from cherrypy.lib.static import serve_file
+import utils
import feedgetter
config = ConfigParser.RawConfigParser()
@@ -41,10 +35,10 @@ htmlheader = """<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"
<title>pyAggr3g470r - RSS Feed Reader</title> </head>"""
htmlfooter = """<p>This software is under GPLv3 license. You are welcome to copy, modify or
- redistribute the source code according to the GPLv3 license.</p></div>
- </body></html>"""
+ redistribute the source code according to the GPLv3 license.</p></div>\n
+ </body>\n</html>"""
-htmlnav = """<body><h1><a name="top"><a href="/">pyAggr3g470r - RSS Feed Reader</a></a></h1><a
+htmlnav = """<body>\n<h1><a name="top"><a href="/">pyAggr3g470r - RSS Feed Reader</a></a></h1>\n<a
href="http://bitbucket.org/cedricbonhomme/pyaggr3g470r/" rel="noreferrer" target="_blank">
pyAggr3g470r (source code)</a>
"""
@@ -55,7 +49,7 @@ class Root:
"""
Main page containing the list of feeds and articles.
"""
- self.dic, self.dic_info = self.load_feed()
+ self.dic, self.dic_info = utils.load_feed()
html = htmlheader
html += htmlnav
html += """<div class="right inner">\n"""
@@ -113,10 +107,13 @@ class Root:
html += htmlfooter
return html
+ index.exposed = True
+
+
def management(self):
"""
"""
- self.dic, self.dic_info = self.load_feed()
+ self.dic, self.dic_info = utils.load_feed()
html = htmlheader
html += htmlnav
html += """</div> <div class="left inner">\n"""
@@ -147,35 +144,24 @@ class Root:
html += "<hr />\n"
html += "<h1>Statistics</h1>\n"
- N = 10
- words = {}
- article_content = ""
- for rss_feed_id in self.dic.keys():
- for article in self.dic[rss_feed_id]:
- article_content += remove_html_tags(article[4].encode('utf-8'))
-
- words_gen = (word.strip(punctuation).lower() \
- for word in article_content.split() \
- if len(word) >= 5)
- words = defaultdict(int)
- for word in words_gen:
- words[word] += 1
+ top_words = utils.top_words(self.dic, 10)
- top_words = sorted(words.iteritems(),
- key=lambda(word, count): (-count, word))[:N]
html += "<table border=0>\n<tr><td>"
html += "<ol>\n"
for word, frequency in top_words:
html += """\t<li><a href="/q/?querystring=%s">%s</a>: %s</li>\n""" % \
(word, word, frequency)
html += "</ol>\n</td><td>"
- create_histogram(top_words)
+ utils.create_histogram(top_words)
html += """<img src="/var/histogram.png" /></td></tr></table>"""
html += "<hr />\n"
html += htmlfooter
return html
+ management.exposed = True
+
+
def q(self, querystring=None):
"""
Search for a feed. Simply search for the string 'querystring'
@@ -193,7 +179,7 @@ class Root:
if feed_id is not None:
for article in self.dic[rss_feed_id]:
- article_content = remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8'))
+ article_content = utils.remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8'))
if querystring.lower() in article_content.lower():
if article[7] == "0":
# not readed articles are in bold
@@ -211,7 +197,7 @@ class Root:
else:
for rss_feed_id in self.dic.keys():
for article in self.dic[rss_feed_id]:
- article_content = remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8'))
+ article_content = utils.remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8'))
if querystring.lower() in article_content.lower():
if article[7] == "0":
# not readed articles are in bold
@@ -231,6 +217,9 @@ class Root:
html += htmlfooter
return html
+ q.exposed = True
+
+
def fetch(self):
"""
Fetch all feeds
@@ -239,6 +228,9 @@ class Root:
feed_getter.retrieve_feed()
return self.index()
+ fetch.exposed = True
+
+
def description(self, article_id):
"""
Display the description of an article in a new Web page.
@@ -260,10 +252,16 @@ class Root:
html += description
else:
html += "No description available."
- html += """<hr />\n<a href="%s">Complete story</a>\n""" % (article[3].encode('utf-8'),)
+ html += """<hr />\n<a href="%s">Complete story</a>\n<br />\n""" % \
+ (article[3].encode('utf-8'),)
+ html += """<a href="http://delicious.com/%s">Delicious</a>""" % \
+ (article[3].encode('utf-8'),)
html += "<hr />\n" + htmlfooter
return html
+ description.exposed = True
+
+
def all_articles(self, feed_id):
"""
Display all articles of a feed ('feed_title').
@@ -309,6 +307,9 @@ class Root:
html += htmlfooter
return html
+ all_articles.exposed = True
+
+
def unread(self, feed_id):
"""
Display all unread articles of a feed ('feed_title').
@@ -334,53 +335,8 @@ class Root:
html += htmlfooter
return html
- def load_feed(self):
- """
- Load feeds in a dictionary.
- """
- list_of_articles = None
- try:
- conn = sqlite3.connect("./var/feed.db", isolation_level = None)
- c = conn.cursor()
- list_of_articles = c.execute("SELECT * FROM rss_feed").fetchall()
- c.close()
- except:
- pass
-
- # The key of dic is the id of the feed:
- # dic[feed_id] = (article_id, article_date, article_title,
- # article_link, article_description, feed_title,
- # feed_link, article_readed)
- # dic_info[feed_id] = (nb_article, nb_article_unreaded)
- dic, dic_info = {}, {}
- if list_of_articles is not None:
- for article in list_of_articles:
- sha256_hash = hashlib.sha256()
- sha256_hash.update(article[5].encode('utf-8'))
- feed_id = sha256_hash.hexdigest()
- sha256_hash.update(article[2].encode('utf-8'))
- article_id = sha256_hash.hexdigest()
-
- article_list = [article_id, article[0], article[1], \
- article[2], article[3], article[4], article[5], article[6]]
-
- if feed_id not in dic:
- dic[feed_id] = [article_list]
- else:
- dic[feed_id].append(article_list)
-
- # sort articles by date for each feeds
- for feeds in dic.keys():
- dic[feeds].sort(lambda x,y: compare(y[1], x[1]))
-
- for rss_feed_id in dic.keys():
- dic_info[rss_feed_id] = (len(dic[rss_feed_id]), \
- len([article for article in dic[rss_feed_id] \
- if article[7]=="0"]) \
- )
+ unread.exposed = True
- return (dic, dic_info)
- return (dic, dic_info)
def mark_as_read(self, target):
"""
@@ -405,91 +361,16 @@ class Root:
except Exception, e:
pass
- self.dic, self.dic_info = self.load_feed()
+ self.dic, self.dic_info = utils.load_feed()
if param == "All" or param == "Feed_FromMainPage":
return self.index()
elif param == "Feed":
return self.all_articles(identifiant)
- index.exposed = True
- management.exposed = True
- fetch.exposed = True
- q.exposed = True
- description.exposed = True
- all_articles.exposed = True
mark_as_read.exposed = True
- unread.exposed = True
-def remove_html_tags(data):
- """
- Remove HTML tags for the search.
- """
- p = re.compile(r'<[^<]*?/?>')
- return p.sub('', data)
-
-def create_histogram(words, file_name="./var/histogram.png"):
- """
- Create a histogram.
- """
- length = 10
- ind = pylab.arange(length) # abscissa
- width = 0.35 # bars width
-
- w = [elem[0] for elem in words]
- count = [int(elem[1]) for elem in words]
-
- max_count = max(count) # maximal weight
-
- p = pylab.bar(ind, count, width, color='r')
-
- pylab.ylabel("Count")
- pylab.title("Most frequent words")
- pylab.xticks(ind + (width / 2), range(1, len(w)+1))
- pylab.xlim(-width, len(ind))
-
- # changing the ordinate scale according to the max.
- if max_count <= 100:
- pylab.ylim(0, max_count + 5)
- pylab.yticks(pylab.arange(0, max_count + 5, 5))
- elif max_count <= 200:
- pylab.ylim(0, max_count + 10)
- pylab.yticks(pylab.arange(0, max_count + 10, 10))
- elif max_count <= 600:
- pylab.ylim(0, max_count + 25)
- pylab.yticks(pylab.arange(0, max_count + 25, 25))
- elif max_count <= 800:
- pylab.ylim(0, max_count + 50)
- pylab.yticks(pylab.arange(0, max_count + 50, 50))
-
- pylab.savefig(file_name, dpi = 80)
- pylab.close()
-
-def compare(stringtime1, stringtime2):
- """
- Compare two dates in the format 'yyyy-mm-dd hh:mm:ss'.
- """
- date1, time1 = stringtime1.split(' ')
- date2, time2 = stringtime2.split(' ')
-
- year1, month1, day1 = date1.split('-')
- year2, month2, day2 = date2.split('-')
-
- hour1, minute1, second1 = time1.split(':')
- hour2, minute2, second2 = time2.split(':')
-
- datetime1 = datetime(year=int(year1), month=int(month1), day=int(day1), \
- hour=int(hour1), minute=int(minute1), second=int(second1))
-
- datetime2 = datetime(year=int(year2), month=int(month2), day=int(day2), \
- hour=int(hour2), minute=int(minute2), second=int(second2))
-
- if datetime1 < datetime2:
- return -1
- elif datetime1 > datetime2:
- return 1
- else:
- return 0
+
if __name__ == '__main__':
diff --git a/utils.py b/utils.py
new file mode 100644
index 00000000..7193fde3
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,156 @@
+#! /usr/local/bin/python
+#-*- coding: utf-8 -*-
+
+__author__ = "Cedric Bonhomme"
+__version__ = "$Revision: 0.1 $"
+__date__ = "$Date: 2010/02/24 $"
+__copyright__ = "Copyright (c) 2010 Cedric Bonhomme"
+__license__ = "GPLv3"
+
+import re
+import pylab
+import sqlite3
+import hashlib
+
+from datetime import datetime
+from string import punctuation
+from collections import defaultdict
+
+
+def remove_html_tags(data):
+ """
+ Remove HTML tags for the search.
+ """
+ p = re.compile(r'<[^<]*?/?>')
+ return p.sub('', data)
+
+def top_words(dic_articles, n=10):
+ """
+ """
+ N = 10
+ words = {}
+ articles_content = ""
+ for rss_feed_id in dic_articles.keys():
+ for article in dic_articles[rss_feed_id]:
+ articles_content += remove_html_tags(article[4].encode('utf-8'))
+ words_gen = (word.strip(punctuation).lower() \
+ for word in articles_content.split() \
+ if len(word) >= 5)
+ words = defaultdict(int)
+ for word in words_gen:
+ words[word] += 1
+ top_words = sorted(words.iteritems(),
+ key=lambda(word, count): (-count, word))[:N]
+ return top_words
+
+def create_histogram(words, file_name="./var/histogram.png"):
+ """
+ Create a histogram.
+ """
+ length = 10
+ ind = pylab.arange(length) # abscissa
+ width = 0.35 # bars width
+
+ w = [elem[0] for elem in words]
+ count = [int(elem[1]) for elem in words]
+
+ max_count = max(count) # maximal weight
+
+ p = pylab.bar(ind, count, width, color='r')
+
+ pylab.ylabel("Count")
+ pylab.title("Most frequent words")
+ pylab.xticks(ind + (width / 2), range(1, len(w)+1))
+ pylab.xlim(-width, len(ind))
+
+ # changing the ordinate scale according to the max.
+ if max_count <= 100:
+ pylab.ylim(0, max_count + 5)
+ pylab.yticks(pylab.arange(0, max_count + 5, 5))
+ elif max_count <= 200:
+ pylab.ylim(0, max_count + 10)
+ pylab.yticks(pylab.arange(0, max_count + 10, 10))
+ elif max_count <= 600:
+ pylab.ylim(0, max_count + 25)
+ pylab.yticks(pylab.arange(0, max_count + 25, 25))
+ elif max_count <= 800:
+ pylab.ylim(0, max_count + 50)
+ pylab.yticks(pylab.arange(0, max_count + 50, 50))
+
+ pylab.savefig(file_name, dpi = 80)
+ pylab.close()
+
+def compare(stringtime1, stringtime2):
+ """
+ Compare two dates in the format 'yyyy-mm-dd hh:mm:ss'.
+ """
+ date1, time1 = stringtime1.split(' ')
+ date2, time2 = stringtime2.split(' ')
+
+ year1, month1, day1 = date1.split('-')
+ year2, month2, day2 = date2.split('-')
+
+ hour1, minute1, second1 = time1.split(':')
+ hour2, minute2, second2 = time2.split(':')
+
+ datetime1 = datetime(year=int(year1), month=int(month1), day=int(day1), \
+ hour=int(hour1), minute=int(minute1), second=int(second1))
+
+ datetime2 = datetime(year=int(year2), month=int(month2), day=int(day2), \
+ hour=int(hour2), minute=int(minute2), second=int(second2))
+
+ if datetime1 < datetime2:
+ return -1
+ elif datetime1 > datetime2:
+ return 1
+ else:
+ return 0
+
+
+def load_feed():
+ """
+ Load feeds in a dictionary.
+ """
+ list_of_articles = None
+ try:
+ conn = sqlite3.connect("./var/feed.db", isolation_level = None)
+ c = conn.cursor()
+ list_of_articles = c.execute("SELECT * FROM rss_feed").fetchall()
+ c.close()
+ except:
+ pass
+
+ # The key of dic is the id of the feed:
+ # dic[feed_id] = (article_id, article_date, article_title,
+ # article_link, article_description, feed_title,
+ # feed_link, article_readed)
+ # dic_info[feed_id] = (nb_article, nb_article_unreaded)
+ dic, dic_info = {}, {}
+ if list_of_articles is not None:
+ for article in list_of_articles:
+ sha256_hash = hashlib.sha256()
+ sha256_hash.update(article[5].encode('utf-8'))
+ feed_id = sha256_hash.hexdigest()
+ sha256_hash.update(article[2].encode('utf-8'))
+ article_id = sha256_hash.hexdigest()
+
+ article_list = [article_id, article[0], article[1], \
+ article[2], article[3], article[4], article[5], article[6]]
+
+ if feed_id not in dic:
+ dic[feed_id] = [article_list]
+ else:
+ dic[feed_id].append(article_list)
+
+ # sort articles by date for each feeds
+ for feeds in dic.keys():
+ dic[feeds].sort(lambda x,y: compare(y[1], x[1]))
+
+ for rss_feed_id in dic.keys():
+ dic_info[rss_feed_id] = (len(dic[rss_feed_id]), \
+ len([article for article in dic[rss_feed_id] \
+ if article[7]=="0"]) \
+ )
+
+ return (dic, dic_info)
+ return (dic, dic_info) \ No newline at end of file
bgstack15