Diffstat (limited to 'pyaggr3g470r/utils.py')
-rwxr-xr-x  pyaggr3g470r/utils.py  146
1 file changed, 86 insertions, 60 deletions
diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 3ed89f55..ea8a87bf 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -4,7 +4,7 @@
 # pyAggr3g470r - A Web based news aggregator.
 # Copyright (C) 2010-2015 Cédric Bonhomme - https://www.cedricbonhomme.org
 #
-# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/
+# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -20,15 +20,16 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 __author__ = "Cedric Bonhomme"
-__version__ = "$Revision: 1.6 $"
+__version__ = "$Revision: 1.7 $"
 __date__ = "$Date: 2010/12/07 $"
-__revision__ = "$Date: 2013/11/17 $"
+__revision__ = "$Date: 2015/03/28 $"
 __copyright__ = "Copyright (c) Cedric Bonhomme"
 __license__ = "AGPLv3"
 
 #
 # This file provides functions used for:
-# - the database management;
+# - detection of duplicate articles;
+# - import from a JSON file;
 # - generation of tags cloud;
 # - HTML processing.
 #
@@ -41,30 +42,34 @@ import logging
 import datetime
 import operator
 import urllib
+import itertools
 import subprocess
+import sqlalchemy
 
 try:
     from urlparse import urlparse, parse_qs, urlunparse
 except:
     from urllib.parse import urlparse, parse_qs, urlunparse
 
 from bs4 import BeautifulSoup
+from datetime import timedelta
 from collections import Counter
 from contextlib import contextmanager
 
 import conf
 from flask import g
+from bootstrap import application as app, db
+from pyaggr3g470r import controllers
 from pyaggr3g470r.models import User, Feed, Article
-
-# regular expression to check URL
-url_finders = [
-    re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \
-    re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \
-    re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \
-    re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+") \
-]
-
 logger = logging.getLogger(__name__)
 
+ALLOWED_EXTENSIONS = set(['xml', 'opml', 'json'])
+
+def allowed_file(filename):
+    """
+    Check if the uploaded file is allowed.
+    """
+    return '.' in filename and \
+           filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS
 
 @contextmanager
 def opened_w_error(filename, mode="r"):
@@ -79,9 +84,31 @@ def opened_w_error(filename, mode="r"):
         f.close()
 
 def fetch(id, feed_id=None):
-    cmd = [conf.PYTHON, conf.basedir+'/manager.py', 'fetch_asyncio', str(id), str(feed_id)]
+    """
+    Fetch the feeds in a new processus.
+    The "asyncio" crawler is launched with the manager.
+    """
+    cmd = [conf.PYTHON, conf.basedir+'/manager.py', 'fetch_asyncio', str(id),
+           str(feed_id)]
     p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
 
+def history(user_id, year=None, month=None):
+    """
+    Sort articles by year and month.
+    """
+    articles_counter = Counter()
+    articles = controllers.ArticleController(user_id).read()
+    if None != year:
+        articles = articles.filter(sqlalchemy.extract('year', Article.date) == year)
+    if None != month:
+        articles = articles.filter(sqlalchemy.extract('month', Article.date) == month)
+    for article in articles.all():
+        if None != year:
+            articles_counter[article.date.month] += 1
+        else:
+            articles_counter[article.date.year] += 1
+    return articles_counter, articles
+
 def import_opml(email, opml_content):
     """
     Import new feeds from an OPML file.
@@ -98,45 +125,35 @@ def import_opml(email, opml_content):
         Parse recursively through the categories and sub-categories.
         """
         for subscription in subsubscription:
-
             if len(subscription) != 0:
                 nb = read(subscription, nb)
             else:
-
                 try:
                     title = subscription.text
-
                 except:
                     title = ""
-
                 try:
                     description = subscription.description
                 except:
                     description = ""
-
                 try:
                     link = subscription.xmlUrl
                 except:
                     continue
-
                 if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == link).first():
                     continue
-
                 try:
                     site_link = subscription.htmlUrl
                 except:
                     site_link = ""
-
                 new_feed = Feed(title=title, description=description, link=link, site_link=site_link, enabled=True)
-
                 user.feeds.append(new_feed)
                 nb += 1
         return nb
-
     nb = read(subscriptions)
-    g.db.session.commit()
+    db.session.commit()
     return nb
 
 def import_json(email, json_content):
@@ -146,44 +163,46 @@ def import_json(email, json_content):
     user = User.query.filter(User.email == email).first()
     json_account = json.loads(json_content)
     nb_feeds, nb_articles = 0, 0
-
-    # Create feeds
+    # Create feeds:
     for feed in json_account["result"]:
-
-        if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first():
+        if None != Feed.query.filter(Feed.user_id == user.id,
+                                     Feed.link == feed["link"]).first():
             continue
-
-        new_feed = Feed(title=feed["title"], description="", link=feed["link"], \
-                        site_link=feed["site_link"], \
-                        created_date=datetime.datetime.fromtimestamp(int(feed["created_date"])),
-                        enabled=feed["enabled"])
+        new_feed = Feed(title=feed["title"],
+                        description="",
+                        link=feed["link"],
+                        site_link=feed["site_link"],
+                        created_date=datetime.datetime.\
+                                fromtimestamp(int(feed["created_date"])),
+                        enabled=feed["enabled"])
         user.feeds.append(new_feed)
         nb_feeds += 1
-    g.db.session.commit()
-
-    # Create articles
+    db.session.commit()
+    # Create articles:
     for feed in json_account["result"]:
-        user_feed = Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first()
+        user_feed = Feed.query.filter(Feed.user_id == user.id,
+                                      Feed.link == feed["link"]).first()
         if None != user_feed:
             for article in feed["articles"]:
                 if None == Article.query.filter(Article.user_id == user.id,
-                                Article.feed_id == user_feed.id,
-                                Article.link == article["link"]).first():
-
-                    new_article = Article(link=article["link"], title=article["title"], \
-                        content=article["content"], readed=article["readed"], like=article["like"], \
-                        retrieved_date=datetime.datetime.fromtimestamp(int(article["retrieved_date"])),
-                        date=datetime.datetime.fromtimestamp(int(article["date"])),
-                        user_id=user.id, feed_id=user_feed.id)
-
+                                                Article.feed_id == user_feed.id,
+                                                Article.link == article["link"]).first():
+                    new_article = Article(link=article["link"],
+                                          title=article["title"],
+                                          content=article["content"],
+                                          readed=article["readed"],
+                                          like=article["like"], \
+                                          retrieved_date=datetime.datetime.\
+                                                  fromtimestamp(int(article["retrieved_date"])),
+                                          date=datetime.datetime.\
+                                                  fromtimestamp(int(article["date"])),
+                                          user_id=user.id,
+                                          feed_id=user_feed.id)
                     user_feed.articles.append(new_article)
                     nb_articles += 1
-    g.db.session.commit()
-
+    db.session.commit()
     return nb_feeds, nb_articles
 
-
 def clean_url(url):
     """
     Remove utm_* parameters
@@ -201,7 +220,6 @@ def clean_url(url):
         parsed_url.fragment
     ]).rstrip('=')
 
-
 def open_url(url):
     """
     Open an URL with the proxy and the user-agent
@@ -220,7 +238,6 @@
         # server couldn't fulfill the request
         error = (url, e.code, \
                 http.server.BaseHTTPRequestHandler.responses[e.code][1])
-        #pyaggr3g470r_log.error(url + " " + str(e.code) + " " + http.server.BaseHTTPRequestHandler.responses[e.code][1])
         return (False, error)
     except urllib.error.URLError as e:
         # failed to reach the server
@@ -229,10 +246,8 @@
             #pyaggr3g470r_log.error(url + " " + e.reason)
         else:
            error = (url, e.reason.errno, e.reason.strerror)
-            #pyaggr3g470r_log.error(url + " " + str(e.reason.errno) + " " + e.reason.strerror)
         return (False, error)
 
-
 def clear_string(data):
     """
     Clear a string by removing HTML tags, HTML special caracters
@@ -242,7 +257,6 @@
     q = re.compile('\s')  # consecutive white spaces
     return p.sub('', q.sub(' ', data))
 
-
 def load_stop_words():
     """
     Load the stop words and return them in a list.
@@ -258,7 +272,6 @@
             stop_words += stop_wods_file.read().split(";")
     return stop_words
 
-
 def top_words(articles, n=10, size=5):
     """
     Return the n most frequent words in a list.
@@ -273,16 +286,30 @@
                 words[word] += 1
     return words.most_common(n)
 
-
 def tag_cloud(tags):
     """
     Generates a tags cloud.
     """
     tags.sort(key=operator.itemgetter(0))
-    return '\n'.join([('<font size=%d><a href="/search/?query=%s" title="Count: %s">%s</a></font>' % \
+    return '\n'.join([('<font size=%d><a href="/search?query=%s" title="Count: %s">%s</a></font>' % \
                         (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, format(count, ',d'), word)) \
                         for (word, count) in tags])
 
+def compare_documents(feed):
+    """
+    Compare a list of documents by pair.
+    Pairs of duplicates are sorted by "retrieved date".
+    """
+    duplicates = []
+    for pair in itertools.combinations(feed.articles, 2):
+        date1, date2 = pair[0].date, pair[1].date
+        if clear_string(pair[0].title) == clear_string(pair[1].title) and \
+                (date1 - date2) < timedelta(days = 1):
+            if pair[0].retrieved_date < pair[1].retrieved_date:
+                duplicates.append((pair[0], pair[1]))
+            else:
+                duplicates.append((pair[1], pair[0]))
+    return duplicates
 
 def search_feed(url):
     """
@@ -306,7 +333,6 @@
             return feed_link['href']
     return None
 
-
 if __name__ == "__main__":
     import_opml("root@pyAggr3g470r.localhost", "./var/feeds_test.opml")
     #import_opml("root@pyAggr3g470r.localhost", "./var/pyAggr3g470r.opml")
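
A few notes on the helpers this commit adds, with small runnable sketches; all sample values below are illustrative, not from the repository. The new allowed_file() gates uploaded files by extension against ALLOWED_EXTENSIONS; note that, as written, the check is case-sensitive:

```python
ALLOWED_EXTENSIONS = set(['xml', 'opml', 'json'])

def allowed_file(filename):
    # Accept only filenames with a dot and a whitelisted extension.
    return '.' in filename and \
           filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS

print(allowed_file('feeds.opml'))  # True
print(allowed_file('feeds.OPML'))  # False: the comparison is case-sensitive
print(allowed_file('feeds'))       # False: no extension at all
```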
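The new history() buckets a user's articles in a Counter: per year when no year is given, per month within a given year. A minimal sketch of that bucketing over plain datetimes, without the SQLAlchemy query (the bucket_dates name and the sample dates are illustrative):

```python
from collections import Counter
from datetime import datetime

def bucket_dates(dates, year=None, month=None):
    # Mirror history(): filter first, then bucket by month when a
    # year was requested, otherwise by year.
    if year is not None:
        dates = [d for d in dates if d.year == year]
    if month is not None:
        dates = [d for d in dates if d.month == month]
    counter = Counter()
    for d in dates:
        counter[d.month if year is not None else d.year] += 1
    return counter

dates = [datetime(2015, 3, 28), datetime(2015, 3, 1), datetime(2014, 12, 5)]
print(bucket_dates(dates))        # Counter({2015: 2, 2014: 1})
print(bucket_dates(dates, 2015))  # Counter({3: 2})
```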
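In tag_cloud(), the font size grows linearly with a word's count and is capped at 7. A quick check of the formula with made-up counts (Python 3 float division assumed; under Python 2 the integer division makes the steps coarser):

```python
counts = [1, 5, 10]
max_count = max(counts)
for count in counts:
    # Same expression as in tag_cloud(): linear scale, capped at 7.
    size = min(1 + count * 7 / max_count, 7)
    print(count, '->', int(size))  # 1 -> 1, 5 -> 4, 10 -> 7
```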
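The new compare_documents() flags two articles as duplicates when their cleaned titles match and their dates are less than a day apart, ordering each pair by retrieved_date. One caveat: as committed, (date1 - date2) < timedelta(days=1) is always true whenever date1 is the older of the two, because a negative timedelta compares below one day; an abs() would make the window symmetric. Below is a self-contained sketch of the pairing logic with that fix; the namedtuple stands in for the Article model, the title comparison is exact rather than going through clear_string(), and all names and dates are illustrative:

```python
import itertools
from collections import namedtuple
from datetime import datetime, timedelta

Article = namedtuple('Article', 'title date retrieved_date')

def find_duplicates(articles):
    # Compare every pair once; a pair is a duplicate when the titles
    # match and the publication dates are less than one day apart.
    duplicates = []
    for first, second in itertools.combinations(articles, 2):
        if first.title == second.title and \
                abs(first.date - second.date) < timedelta(days=1):
            # Keep the earlier retrieved article first, as compare_documents does.
            duplicates.append(tuple(sorted((first, second),
                                           key=lambda a: a.retrieved_date)))
    return duplicates

articles = [
    Article('Same story', datetime(2015, 3, 28, 9), datetime(2015, 3, 28, 11)),
    Article('Same story', datetime(2015, 3, 28, 12), datetime(2015, 3, 28, 10)),
    Article('Other story', datetime(2015, 3, 27, 8), datetime(2015, 3, 27, 8)),
]
print(find_duplicates(articles))  # one pair, the later-published article first
```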