Added utils.py and lot of improvements.

author: cedricbonhomme <devnull@localhost> 2010-02-24 09:08:06 +0100
committer: cedricbonhomme <devnull@localhost> 2010-02-24 09:08:06 +0100
commit: 3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7 (patch)
tree: aaaaa1b07cad5e7501083102cca7da0aa3f90578
parent: Statistics on words are only processed on articles content. (diff)
download: newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.gz
newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.bz2
newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.zip
2 files changed, 191 insertions, 154 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index d72a6008..87be077b 100644
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -7,19 +7,13 @@ __date__ = "$Date: 2010/02/23 $"
 __copyright__ = "Copyright (c) 2010 Cedric Bonhomme"
 __license__ = "GPLv3"
 
-import re
 import os
-import pylab
-import sqlite3
-import hashlib
 import cherrypy
 import ConfigParser
 
-from datetime import datetime
-from string import punctuation
-from collections import defaultdict
 from cherrypy.lib.static import serve_file
 
+import utils
 import feedgetter
 
 config = ConfigParser.RawConfigParser()
@@ -41,10 +35,10 @@ htmlheader = """<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"
                 <title>pyAggr3g470r - RSS Feed Reader</title> </head>"""
 
 htmlfooter =  """<p>This software is under GPLv3 license. You are welcome to copy, modify or
-                redistribute the source code according to the GPLv3 license.</p></div>
-                </body></html>"""
+                redistribute the source code according to the GPLv3 license.</p></div>\n
+                </body>\n</html>"""
 
-htmlnav = """<body><h1><a name="top"><a href="/">pyAggr3g470r - RSS Feed Reader</a></a></h1><a
+htmlnav = """<body>\n<h1><a name="top"><a href="/">pyAggr3g470r - RSS Feed Reader</a></a></h1>\n<a
 href="http://bitbucket.org/cedricbonhomme/pyaggr3g470r/" rel="noreferrer" target="_blank">
 pyAggr3g470r (source code)</a>
 """
@@ -55,7 +49,7 @@ class Root:
         """
         Main page containing the list of feeds and articles.
         """
-        self.dic, self.dic_info = self.load_feed()
+        self.dic, self.dic_info = utils.load_feed()
         html = htmlheader
         html += htmlnav
         html += """<div class="right inner">\n"""
@@ -113,10 +107,13 @@ class Root:
         html += htmlfooter
         return html
 
+    index.exposed = True
+
+
     def management(self):
         """
         """
-        self.dic, self.dic_info = self.load_feed()
+        self.dic, self.dic_info = utils.load_feed()
         html = htmlheader
         html += htmlnav
         html += """</div> <div class="left inner">\n"""
@@ -147,35 +144,24 @@ class Root:
 
         html += "<hr />\n"
         html += "<h1>Statistics</h1>\n"
-        N = 10
-        words = {}
-        article_content = ""
-        for rss_feed_id in self.dic.keys():
-                for article in self.dic[rss_feed_id]:
-                    article_content += remove_html_tags(article[4].encode('utf-8'))
-
-        words_gen = (word.strip(punctuation).lower() \
-                        for word in article_content.split() \
-                        if len(word) >= 5)
-        words = defaultdict(int)
-        for word in words_gen:
-            words[word] += 1
+        top_words = utils.top_words(self.dic, 10)
 
-        top_words = sorted(words.iteritems(),
-                        key=lambda(word, count): (-count, word))[:N]
         html += "<table border=0>\n<tr><td>"
         html += "<ol>\n"
         for word, frequency in top_words:
             html += """\t<li><a href="/q/?querystring=%s">%s</a>: %s</li>\n""" % \
                             (word, word, frequency)
         html += "</ol>\n</td><td>"
-        create_histogram(top_words)
+        utils.create_histogram(top_words)
         html += """<img src="/var/histogram.png" /></td></tr></table>"""
 
         html += "<hr />\n"
         html += htmlfooter
         return html
 
+    management.exposed = True
+
+
     def q(self, querystring=None):
         """
         Search for a feed. Simply search for the string 'querystring'
@@ -193,7 +179,7 @@ class Root:
 
         if feed_id is not None:
             for article in self.dic[rss_feed_id]:
-                article_content = remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8'))
+                article_content = utils.remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8'))
                 if querystring.lower() in article_content.lower():
                     if article[7] == "0":
                         # not readed articles are in bold
@@ -211,7 +197,7 @@ class Root:
         else:
             for rss_feed_id in self.dic.keys():
                 for article in self.dic[rss_feed_id]:
-                    article_content = remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8'))
+                    article_content = utils.remove_html_tags(article[4].encode('utf-8') + article[2].encode('utf-8'))
                     if querystring.lower() in article_content.lower():
                         if article[7] == "0":
                             # not readed articles are in bold
@@ -231,6 +217,9 @@ class Root:
         html += htmlfooter
         return html
 
+    q.exposed = True
+
+
     def fetch(self):
         """
         Fetch all feeds
@@ -239,6 +228,9 @@ class Root:
         feed_getter.retrieve_feed()
         return self.index()
 
+    fetch.exposed = True
+
+
     def description(self, article_id):
         """
         Display the description of an article in a new Web page.
@@ -260,10 +252,16 @@ class Root:
                         html += description
                     else:
                         html += "No description available."
-                    html += """<hr />\n<a href="%s">Complete story</a>\n""" % (article[3].encode('utf-8'),)
+                    html += """<hr />\n<a href="%s">Complete story</a>\n<br />\n""" % \
+                                    (article[3].encode('utf-8'),)
+                    html += """<a href="http://delicious.com/%s">Delicious</a>""" % \
+                                    (article[3].encode('utf-8'),)
         html += "<hr />\n" + htmlfooter
         return html
 
+    description.exposed = True
+
+
     def all_articles(self, feed_id):
         """
         Display all articles of a feed ('feed_title').
@@ -309,6 +307,9 @@ class Root:
         html += htmlfooter
         return html
 
+    all_articles.exposed = True
+
+
     def unread(self, feed_id):
         """
         Display all unread articles of a feed ('feed_title').
@@ -334,53 +335,8 @@ class Root:
         html += htmlfooter
         return html
 
-    def load_feed(self):
-        """
-        Load feeds in a dictionary.
-        """
-        list_of_articles = None
-        try:
-            conn = sqlite3.connect("./var/feed.db", isolation_level = None)
-            c = conn.cursor()
-            list_of_articles = c.execute("SELECT * FROM rss_feed").fetchall()
-            c.close()
-        except:
-            pass
-
-        # The key of dic is the id of the feed:
-        # dic[feed_id] = (article_id, article_date, article_title,
-        #               article_link, article_description, feed_title,
-        #               feed_link, article_readed)
-        # dic_info[feed_id] = (nb_article, nb_article_unreaded)
-        dic, dic_info = {}, {}
-        if list_of_articles is not None:
-            for article in list_of_articles:
-                sha256_hash = hashlib.sha256()
-                sha256_hash.update(article[5].encode('utf-8'))
-                feed_id = sha256_hash.hexdigest()
-                sha256_hash.update(article[2].encode('utf-8'))
-                article_id = sha256_hash.hexdigest()
-
-                article_list = [article_id, article[0], article[1], \
-                    article[2], article[3], article[4], article[5], article[6]]
-
-                if feed_id not in dic:
-                    dic[feed_id] = [article_list]
-                else:
-                    dic[feed_id].append(article_list)
-
-            # sort articles by date for each feeds
-            for feeds in dic.keys():
-                dic[feeds].sort(lambda x,y: compare(y[1], x[1]))
-
-            for rss_feed_id in dic.keys():
-                dic_info[rss_feed_id] = (len(dic[rss_feed_id]), \
-                                        len([article for article in dic[rss_feed_id] \
-                                                                if article[7]=="0"]) \
-                                        )
+    unread.exposed = True
 
-            return (dic, dic_info)
-        return (dic, dic_info)
 
     def mark_as_read(self, target):
         """
@@ -405,91 +361,16 @@ class Root:
         except Exception, e:
             pass
 
-        self.dic, self.dic_info = self.load_feed()
+        self.dic, self.dic_info = utils.load_feed()
 
         if param == "All" or param == "Feed_FromMainPage":
             return self.index()
         elif param == "Feed":
             return self.all_articles(identifiant)
 
-    index.exposed = True
-    management.exposed = True
-    fetch.exposed = True
-    q.exposed = True
-    description.exposed = True
-    all_articles.exposed = True
     mark_as_read.exposed = True
-    unread.exposed = True
 
-def remove_html_tags(data):
-    """
-    Remove HTML tags for the search.
-    """
-    p = re.compile(r'<[^<]*?/?>')
-    return p.sub('', data)
-
-def create_histogram(words, file_name="./var/histogram.png"):
-    """
-    Create a histogram.
-    """
-    length = 10
-    ind = pylab.arange(length) # abscissa
-    width = 0.35 # bars width
-
-    w = [elem[0] for elem in words]
-    count = [int(elem[1]) for elem in words]
-
-    max_count = max(count)  # maximal weight
-
-    p = pylab.bar(ind, count, width, color='r')
-
-    pylab.ylabel("Count")
-    pylab.title("Most frequent words")
-    pylab.xticks(ind + (width / 2), range(1, len(w)+1))
-    pylab.xlim(-width, len(ind))
-
-    # changing the ordinate scale according to the max.
-    if max_count <= 100:
-        pylab.ylim(0, max_count + 5)
-        pylab.yticks(pylab.arange(0, max_count + 5, 5))
-    elif max_count <= 200:
-        pylab.ylim(0, max_count + 10)
-        pylab.yticks(pylab.arange(0, max_count + 10, 10))
-    elif max_count <= 600:
-        pylab.ylim(0, max_count + 25)
-        pylab.yticks(pylab.arange(0, max_count + 25, 25))
-    elif max_count <= 800:
-        pylab.ylim(0, max_count + 50)
-        pylab.yticks(pylab.arange(0, max_count + 50, 50))
-
-    pylab.savefig(file_name, dpi = 80)
-    pylab.close()
-
-def compare(stringtime1, stringtime2):
-    """
-    Compare two dates in the format 'yyyy-mm-dd hh:mm:ss'.
-    """
-    date1, time1 = stringtime1.split(' ')
-    date2, time2 = stringtime2.split(' ')
-
-    year1, month1, day1 = date1.split('-')
-    year2, month2, day2 = date2.split('-')
-
-    hour1, minute1, second1 = time1.split(':')
-    hour2, minute2, second2 = time2.split(':')
-
-    datetime1 = datetime(year=int(year1), month=int(month1), day=int(day1), \
-                        hour=int(hour1), minute=int(minute1), second=int(second1))
-
-    datetime2 = datetime(year=int(year2), month=int(month2), day=int(day2), \
-                        hour=int(hour2), minute=int(minute2), second=int(second2))
-
-    if datetime1 < datetime2:
-        return -1
-    elif datetime1 > datetime2:
-        return 1
-    else:
-        return 0
+
 
 
 if __name__ == '__main__':
diff --git a/utils.py b/utils.py
new file mode 100644
index 00000000..7193fde3
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,156 @@
+#! /usr/local/bin/python
+#-*- coding: utf-8 -*-
+
+__author__ = "Cedric Bonhomme"
+__version__ = "$Revision: 0.1 $"
+__date__ = "$Date: 2010/02/24 $"
+__copyright__ = "Copyright (c) 2010 Cedric Bonhomme"
+__license__ = "GPLv3"
+
+import re
+import pylab
+import sqlite3
+import hashlib
+
+from datetime import datetime
+from string import punctuation
+from collections import defaultdict
+
+
+def remove_html_tags(data):
+    """
+    Remove HTML tags for the search.
+    """
+    p = re.compile(r'<[^<]*?/?>')
+    return p.sub('', data)
+
+def top_words(dic_articles, n=10):
+    """
+    """
+    N = 10
+    words = {}
+    articles_content = ""
+    for rss_feed_id in dic_articles.keys():
+                for article in dic_articles[rss_feed_id]:
+                    articles_content += remove_html_tags(article[4].encode('utf-8'))
+    words_gen = (word.strip(punctuation).lower() \
+                for word in articles_content.split() \
+                if len(word) >= 5)
+    words = defaultdict(int)
+    for word in words_gen:
+        words[word] += 1
+    top_words = sorted(words.iteritems(),
+                key=lambda(word, count): (-count, word))[:N]
+    return top_words
+
+def create_histogram(words, file_name="./var/histogram.png"):
+    """
+    Create a histogram.
+    """
+    length = 10
+    ind = pylab.arange(length) # abscissa
+    width = 0.35 # bars width
+
+    w = [elem[0] for elem in words]
+    count = [int(elem[1]) for elem in words]
+
+    max_count = max(count)  # maximal weight
+
+    p = pylab.bar(ind, count, width, color='r')
+
+    pylab.ylabel("Count")
+    pylab.title("Most frequent words")
+    pylab.xticks(ind + (width / 2), range(1, len(w)+1))
+    pylab.xlim(-width, len(ind))
+
+    # changing the ordinate scale according to the max.
+    if max_count <= 100:
+        pylab.ylim(0, max_count + 5)
+        pylab.yticks(pylab.arange(0, max_count + 5, 5))
+    elif max_count <= 200:
+        pylab.ylim(0, max_count + 10)
+        pylab.yticks(pylab.arange(0, max_count + 10, 10))
+    elif max_count <= 600:
+        pylab.ylim(0, max_count + 25)
+        pylab.yticks(pylab.arange(0, max_count + 25, 25))
+    elif max_count <= 800:
+        pylab.ylim(0, max_count + 50)
+        pylab.yticks(pylab.arange(0, max_count + 50, 50))
+
+    pylab.savefig(file_name, dpi = 80)
+    pylab.close()
+
+def compare(stringtime1, stringtime2):
+    """
+    Compare two dates in the format 'yyyy-mm-dd hh:mm:ss'.
+    """
+    date1, time1 = stringtime1.split(' ')
+    date2, time2 = stringtime2.split(' ')
+
+    year1, month1, day1 = date1.split('-')
+    year2, month2, day2 = date2.split('-')
+
+    hour1, minute1, second1 = time1.split(':')
+    hour2, minute2, second2 = time2.split(':')
+
+    datetime1 = datetime(year=int(year1), month=int(month1), day=int(day1), \
+                        hour=int(hour1), minute=int(minute1), second=int(second1))
+
+    datetime2 = datetime(year=int(year2), month=int(month2), day=int(day2), \
+                        hour=int(hour2), minute=int(minute2), second=int(second2))
+
+    if datetime1 < datetime2:
+        return -1
+    elif datetime1 > datetime2:
+        return 1
+    else:
+        return 0
+
+
+def load_feed():
+    """
+    Load feeds in a dictionary.
+    """
+    list_of_articles = None
+    try:
+        conn = sqlite3.connect("./var/feed.db", isolation_level = None)
+        c = conn.cursor()
+        list_of_articles = c.execute("SELECT * FROM rss_feed").fetchall()
+        c.close()
+    except:
+        pass
+
+    # The key of dic is the id of the feed:
+    # dic[feed_id] = (article_id, article_date, article_title,
+    #               article_link, article_description, feed_title,
+    #               feed_link, article_readed)
+    # dic_info[feed_id] = (nb_article, nb_article_unreaded)
+    dic, dic_info = {}, {}
+    if list_of_articles is not None:
+        for article in list_of_articles:
+            sha256_hash = hashlib.sha256()
+            sha256_hash.update(article[5].encode('utf-8'))
+            feed_id = sha256_hash.hexdigest()
+            sha256_hash.update(article[2].encode('utf-8'))
+            article_id = sha256_hash.hexdigest()
+
+            article_list = [article_id, article[0], article[1], \
+                article[2], article[3], article[4], article[5], article[6]]
+
+            if feed_id not in dic:
+                dic[feed_id] = [article_list]
+            else:
+                dic[feed_id].append(article_list)
+
+        # sort articles by date for each feeds
+        for feeds in dic.keys():
+            dic[feeds].sort(lambda x,y: compare(y[1], x[1]))
+
+        for rss_feed_id in dic.keys():
+            dic_info[rss_feed_id] = (len(dic[rss_feed_id]), \
+                                    len([article for article in dic[rss_feed_id] \
+                                                            if article[7]=="0"]) \
+                                    )
+
+        return (dic, dic_info)
+    return (dic, dic_info)
+\ No newline at end of file
author	cedricbonhomme <devnull@localhost>	2010-02-24 09:08:06 +0100
committer	cedricbonhomme <devnull@localhost>	2010-02-24 09:08:06 +0100
commit	3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7 (patch)
tree	aaaaa1b07cad5e7501083102cca7da0aa3f90578
parent	Statistics on words are only processed on articles content. (diff)
download	newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.gz newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.bz2 newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.zip