Added utils.py and a lot of improvements.

author: cedricbonhomme <devnull@localhost> 2010-02-24 09:08:06 +0100
committer: cedricbonhomme <devnull@localhost> 2010-02-24 09:08:06 +0100
commit: 3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7 (patch)
tree: aaaaa1b07cad5e7501083102cca7da0aa3f90578 /utils.py
parent: Statistics on words are only processed on articles content. (diff)
download: newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.gz
newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.bz2
newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.zip
1 files changed, 156 insertions, 0 deletions
diff --git a/utils.py b/utils.py
new file mode 100644
index 00000000..7193fde3
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,156 @@
+#! /usr/local/bin/python
+#-*- coding: utf-8 -*-
+
+__author__ = "Cedric Bonhomme"
+__version__ = "$Revision: 0.1 $"
+__date__ = "$Date: 2010/02/24 $"
+__copyright__ = "Copyright (c) 2010 Cedric Bonhomme"
+__license__ = "GPLv3"
+
+import re
+import pylab
+import sqlite3
+import hashlib
+
+from datetime import datetime
+from string import punctuation
+from collections import defaultdict
+
+
+def remove_html_tags(data):
+    """
+    Return 'data' with every HTML-like tag ('<...>') stripped out.
+    """
+    tag_pattern = r'<[^<]*?/?>'
+    return re.sub(tag_pattern, '', data)
+
+def top_words(dic_articles, n=10):
+    """
+    Return the 'n' most frequent words of the articles as (word, count)
+    pairs, most frequent first (ties are sorted alphabetically).
+    Only words of at least 5 characters, once lowercased and stripped of
+    surrounding punctuation, are counted.
+    """
+    # Join with a space so that words of adjacent articles do not fuse.
+    articles_content = " ".join(
+                remove_html_tags(article[4].encode('utf-8'))
+                for articles in dic_articles.values()
+                for article in articles)
+    words = defaultdict(int)
+    for raw_word in articles_content.split():
+        word = raw_word.strip(punctuation).lower()
+        if len(word) >= 5:
+            words[word] += 1
+    return sorted(words.items(), key=lambda item: (-item[1], item[0]))[:n]
+
+def create_histogram(words, file_name="./var/histogram.png"):
+    """
+    Draw a bar chart of word frequencies and save it as an image.
+
+    'words' is a sequence of (word, count) pairs, e.g. the result of
+    top_words(); the bars are labelled by rank (1, 2, ...), not by word.
+    The image is written to 'file_name'. Nothing is drawn nor written
+    when 'words' is empty.
+    """
+    if not words:
+        return
+
+    count = [int(elem[1]) for elem in words]
+    # One bar per word: a fixed abscissa of 10 would not match 'count'
+    # whenever fewer (or more) than 10 words are given.
+    ind = pylab.arange(len(count)) # abscissa
+    width = 0.35 # bars width
+    max_count = max(count)  # maximal weight
+
+    pylab.bar(ind, count, width, color='r')
+
+    pylab.ylabel("Count")
+    pylab.title("Most frequent words")
+    pylab.xticks(ind + (width / 2), range(1, len(count) + 1))
+    pylab.xlim(-width, len(ind))
+
+    # changing the ordinate scale according to the max: (upper bound, tick
+    # step) pairs; above the last bound pylab's default scale is kept.
+    for bound, step in ((100, 5), (200, 10), (600, 25), (800, 50)):
+        if max_count <= bound:
+            pylab.ylim(0, max_count + step)
+            pylab.yticks(pylab.arange(0, max_count + step, step))
+            break
+
+    pylab.savefig(file_name, dpi = 80)
+    pylab.close()
+
+def compare(stringtime1, stringtime2):
+    """
+    Order two timestamps written as 'yyyy-mm-dd hh:mm:ss'.
+
+    Return -1 when the first one is earlier than the second one, 1 when
+    it is later and 0 when both denote the same moment.
+    """
+    def build(fields):
+        # The six fields are still strings: convert, then assemble.
+        return datetime(*[int(field) for field in fields])
+
+    # Every split is unpacked into a fixed number of names, so that a
+    # malformed timestamp is rejected before any conversion is attempted.
+    date_a, time_a = stringtime1.split(' ')
+    date_b, time_b = stringtime2.split(' ')
+    year_a, month_a, day_a = date_a.split('-')
+    year_b, month_b, day_b = date_b.split('-')
+    hour_a, minute_a, second_a = time_a.split(':')
+    hour_b, minute_b, second_b = time_b.split(':')
+
+    moment_a = build((year_a, month_a, day_a, hour_a, minute_a, second_a))
+    moment_b = build((year_b, month_b, day_b, hour_b, minute_b, second_b))
+
+    # Same result as the classic three-way comparison: -1, 0 or 1.
+    return (moment_a > moment_b) - (moment_a < moment_b)
+
+
+def load_feed():
+    """
+    Load the articles of the database './var/feed.db' into dictionaries.
+
+    Return the tuple (dic, dic_info), both keyed by feed id:
+      dic[feed_id] = list of articles, most recent first, each one being
+                     [article_id, article_date, article_title,
+                      article_link, article_description, feed_title,
+                      feed_link, article_read]
+      dic_info[feed_id] = (nb_article, nb_article_unread)
+    Both dictionaries are empty when the database can not be read.
+    """
+    list_of_articles = []
+    try:
+        conn = sqlite3.connect("./var/feed.db", isolation_level = None)
+        try:
+            c = conn.cursor()
+            list_of_articles = c.execute("SELECT * FROM rss_feed").fetchall()
+            c.close()
+        finally:
+            # Do not leak the connection, whatever happened to the query.
+            conn.close()
+    except sqlite3.Error:
+        # Best effort: a missing or unreadable database gives empty results.
+        pass
+
+    # Group the articles by feed.
+    dic, dic_info = {}, {}
+    for article in list_of_articles:
+        # The feed id is the hash of field 5; the article id extends the same
+        # hash object with field 2, so it depends on both fields.
+        sha256_hash = hashlib.sha256(article[5].encode('utf-8'))
+        feed_id = sha256_hash.hexdigest()
+        sha256_hash.update(article[2].encode('utf-8'))
+        article_id = sha256_hash.hexdigest()
+
+        article_list = [article_id, article[0], article[1], article[2], \
+                article[3], article[4], article[5], article[6]]
+        dic.setdefault(feed_id, []).append(article_list)
+
+    for feed_id, articles in dic.items():
+        # sort articles by date for each feeds, the most recent first
+        articles.sort(lambda x, y: compare(y[1], x[1]))
+        unread = [article for article in articles if article[7] == "0"]
+        dic_info[feed_id] = (len(articles), len(unread))
+
+    return (dic, dic_info)
+\ No newline at end of file
author	cedricbonhomme <devnull@localhost>	2010-02-24 09:08:06 +0100
committer	cedricbonhomme <devnull@localhost>	2010-02-24 09:08:06 +0100
commit	3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7 (patch)
tree	aaaaa1b07cad5e7501083102cca7da0aa3f90578 /utils.py
parent	Statistics on words are only processed on articles content. (diff)
download	newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.gz newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.tar.bz2 newspipe-3ee7e79c6e9f2569bebc34ad02c74b2e541fb2d7.zip