From 2378de49ba37116c5bf93054fd6aed65fa44022a Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Sun, 8 Mar 2015 12:07:36 +0100
Subject: Moved duplicate() function into utils.py. Some minor cosmetic changes.

---
 pyaggr3g470r/utils.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 3ed89f55..ba440c78 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -41,12 +41,14 @@ import logging
 import datetime
 import operator
 import urllib
+import itertools
 import subprocess
 try:
     from urlparse import urlparse, parse_qs, urlunparse
 except:
     from urllib.parse import urlparse, parse_qs, urlunparse
 from bs4 import BeautifulSoup
+from datetime import timedelta
 from collections import Counter
 from contextlib import contextmanager
 
@@ -283,6 +285,17 @@ def tag_cloud(tags):
                     (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, format(count, ',d'), word)) \
                         for (word, count) in tags])
 
+def compare_documents(feed):
+    """
+    Compare a list of documents by pair.
+    """
+    duplicates = []
+    for pair in itertools.combinations(feed.articles, 2):
+        date1, date2 = pair[0].date, pair[1].date
+        if clear_string(pair[0].title) == clear_string(pair[1].title) and \
+                                        (date1 - date2) < timedelta(days = 1):
+            duplicates.append(pair)
+    return duplicates
 
 def search_feed(url):
     """
-- 
cgit 


From fb5df6041fc7bf97429bfe689e26fdc08e7e307f Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Sun, 15 Mar 2015 23:59:14 +0100
Subject: A new test for the history page.

---
 pyaggr3g470r/utils.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index ba440c78..ae140327 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -54,6 +54,7 @@ from contextlib import contextmanager
 
 import conf
 from flask import g
+from pyaggr3g470r import controllers
 from pyaggr3g470r.models import User, Feed, Article
 
 
@@ -84,6 +85,31 @@ def fetch(id, feed_id=None):
     cmd = [conf.PYTHON, conf.basedir+'/manager.py', 'fetch_asyncio', str(id), str(feed_id)]
     p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
 
+def history(year=None, month=None):
+    """
+    """
+    import datetime, time, sqlalchemy
+    articles_counter = Counter()
+    if None != month and None != year:
+        articles = controllers.ArticleController(1).read(). \
+                    filter(sqlalchemy.extract('year', Article.date) == year). \
+                    filter(sqlalchemy.extract('month', Article.date) == month)
+    elif None != year:
+        articles = controllers.ArticleController(1).read(). \
+                    filter(sqlalchemy.extract('year', Article.date) == year)
+    else:
+        articles = controllers.ArticleController(1).read()
+
+    for article in articles:
+        if None != month:
+            articles_counter[article.date.day] += 1
+        elif None != year:
+            articles_counter[article.date.month] += 1
+        else:
+            articles_counter[article.date.year] += 1
+
+    return articles_counter, articles
+
 def import_opml(email, opml_content):
     """
     Import new feeds from an OPML file.
-- 
cgit 


From 8d0fea82761f2fdc1ea93687429990eefa851fc8 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Mon, 16 Mar 2015 07:22:52 +0100
Subject: Improvements and fixes for the history() function.

---
 pyaggr3g470r/utils.py | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index ae140327..1ad2896a 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -43,6 +43,7 @@ import operator
 import urllib
 import itertools
 import subprocess
+import sqlalchemy
 try:
     from urlparse import urlparse, parse_qs, urlunparse
 except:
@@ -85,25 +86,18 @@ def fetch(id, feed_id=None):
     cmd = [conf.PYTHON, conf.basedir+'/manager.py', 'fetch_asyncio', str(id), str(feed_id)]
     p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
 
-def history(year=None, month=None):
+def history(user_id, year=None, month=None):
     """
     """
-    import datetime, time, sqlalchemy
     articles_counter = Counter()
-    if None != month and None != year:
-        articles = controllers.ArticleController(1).read(). \
-                    filter(sqlalchemy.extract('year', Article.date) == year). \
-                    filter(sqlalchemy.extract('month', Article.date) == month)
-    elif None != year:
-        articles = controllers.ArticleController(1).read(). \
-                    filter(sqlalchemy.extract('year', Article.date) == year)
-    else:
-        articles = controllers.ArticleController(1).read()
-
-    for article in articles:
+    articles = controllers.ArticleController(user_id).read()
+    if None != year:
+        articles = articles.filter(sqlalchemy.extract('year', Article.date) == year)
         if None != month:
-            articles_counter[article.date.day] += 1
-        elif None != year:
+            articles = articles.filter(sqlalchemy.extract('month', Article.date) == month)
+    print(articles.count())
+    for article in articles.all():
+        if None != year:
             articles_counter[article.date.month] += 1
         else:
             articles_counter[article.date.year] += 1
-- 
cgit 


From 052dd984ee98adf75c9ad022f6f2d6cc9cc88b36 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Mon, 16 Mar 2015 07:24:24 +0100
Subject: Removed debug print.

---
 pyaggr3g470r/utils.py | 1 -
 1 file changed, 1 deletion(-)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 1ad2896a..b1c9ff50 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -95,7 +95,6 @@ def history(user_id, year=None, month=None):
         articles = articles.filter(sqlalchemy.extract('year', Article.date) == year)
         if None != month:
             articles = articles.filter(sqlalchemy.extract('month', Article.date) == month)
-    print(articles.count())
     for article in articles.all():
         if None != year:
             articles_counter[article.date.month] += 1
-- 
cgit 


From aaa6013c0e8a477378fb5de9f3362ae17d933788 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Mon, 16 Mar 2015 07:25:01 +0100
Subject: Removed blank line.

---
 pyaggr3g470r/utils.py | 1 -
 1 file changed, 1 deletion(-)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index b1c9ff50..e88f5261 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -100,7 +100,6 @@ def history(user_id, year=None, month=None):
             articles_counter[article.date.month] += 1
         else:
             articles_counter[article.date.year] += 1
-
     return articles_counter, articles
 
 def import_opml(email, opml_content):
-- 
cgit 


From 687d985298957c3689547268928313a104bf6f89 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Sat, 28 Mar 2015 11:23:47 +0100
Subject: Bugfix: unnecessary trailing slash.

---
 pyaggr3g470r/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index e88f5261..a7f8ad64 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -299,7 +299,7 @@ def tag_cloud(tags):
     Generates a tags cloud.
     """
     tags.sort(key=operator.itemgetter(0))
-    return '\n'.join([('<font size=%d><a href="/search/?query=%s" title="Count: %s">%s</a></font>' % \
+    return '\n'.join([('<font size=%d><a href="/search?query=%s" title="Count: %s">%s</a></font>' % \
                     (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, format(count, ',d'), word)) \
                         for (word, count) in tags])
 
-- 
cgit 


From bb5a7e24baf072af366b013240cc9d63f997153c Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Sat, 28 Mar 2015 14:16:45 +0100
Subject: Pairs of duplicates are sorted by retrieved date.

---
 pyaggr3g470r/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index a7f8ad64..d1e4759b 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -312,7 +312,10 @@ def compare_documents(feed):
         date1, date2 = pair[0].date, pair[1].date
         if clear_string(pair[0].title) == clear_string(pair[1].title) and \
                                         (date1 - date2) < timedelta(days = 1):
-            duplicates.append(pair)
+            if pair[0].retrieved_date < pair[1].retrieved_date:
+                duplicates.append((pair[0], pair[1]))
+            else:
+                duplicates.append(((pair[1], pair[0]))
     return duplicates
 
 def search_feed(url):
-- 
cgit 


From 1781835efa43eaf9bfc2cc135a8f693ee57135e7 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Sat, 28 Mar 2015 14:18:52 +0100
Subject: Typo...

---
 pyaggr3g470r/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index d1e4759b..513913a1 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -315,7 +315,7 @@ def compare_documents(feed):
             if pair[0].retrieved_date < pair[1].retrieved_date:
                 duplicates.append((pair[0], pair[1]))
             else:
-                duplicates.append(((pair[1], pair[0]))
+                duplicates.append((pair[1], pair[0]))
     return duplicates
 
 def search_feed(url):
-- 
cgit 


From 341008ac376479468d68bb2fb1c9ce0cd82b10e8 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Sat, 28 Mar 2015 14:37:23 +0100
Subject: Clean the file utils.py

---
 pyaggr3g470r/utils.py | 96 +++++++++++++++++++++------------------------------
 1 file changed, 39 insertions(+), 57 deletions(-)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 513913a1..5f2d8707 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -4,7 +4,7 @@
 # pyAggr3g470r - A Web based news aggregator.
 # Copyright (C) 2010-2015  Cédric Bonhomme - https://www.cedricbonhomme.org
 #
-# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/
+# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -20,15 +20,16 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 __author__ = "Cedric Bonhomme"
-__version__ = "$Revision: 1.6 $"
+__version__ = "$Revision: 1.7 $"
 __date__ = "$Date: 2010/12/07 $"
-__revision__ = "$Date: 2013/11/17 $"
+__revision__ = "$Date: 2015/03/28 $"
 __copyright__ = "Copyright (c) Cedric Bonhomme"
 __license__ = "AGPLv3"
 
 #
 # This file provides functions used for:
-# - the database management;
+# - detection of duplicate articles;
+# - import from a JSON file;
 # - generation of tags cloud;
 # - HTML processing.
 #
@@ -58,18 +59,8 @@ from flask import g
 from pyaggr3g470r import controllers
 from pyaggr3g470r.models import User, Feed, Article
 
-
-# regular expression to check URL
-url_finders = [
-    re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \
-    re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \
-    re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \
-    re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+") \
-]
-
 logger = logging.getLogger(__name__)
 
-
 @contextmanager
 def opened_w_error(filename, mode="r"):
     try:
@@ -83,11 +74,17 @@ def opened_w_error(filename, mode="r"):
             f.close()
 
 def fetch(id, feed_id=None):
-    cmd = [conf.PYTHON, conf.basedir+'/manager.py', 'fetch_asyncio', str(id), str(feed_id)]
+    """
+    Fetch the feeds in a new processus.
+    The "asyncio" crawler is launched with the manager.
+    """
+    cmd = [conf.PYTHON, conf.basedir+'/manager.py', 'fetch_asyncio', str(id),
+            str(feed_id)]
     p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
 
 def history(user_id, year=None, month=None):
     """
+    Sort articles by year and month.
     """
     articles_counter = Counter()
     articles = controllers.ArticleController(user_id).read()
@@ -118,43 +115,33 @@ def import_opml(email, opml_content):
         Parse recursively through the categories and sub-categories.
         """
         for subscription in subsubscription:
-
             if len(subscription) != 0:
                 nb = read(subscription, nb)
             else:
-
                 try:
                     title = subscription.text
-
                 except:
                     title = ""
-
                 try:
                     description = subscription.description
                 except:
                     description = ""
-
                 try:
                     link = subscription.xmlUrl
                 except:
                     continue
-
                 if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == link).first():
                     continue
-
                 try:
                     site_link = subscription.htmlUrl
                 except:
                     site_link = ""
-
                 new_feed = Feed(title=title, description=description,
                                 link=link, site_link=site_link,
                                 enabled=True)
-
                 user.feeds.append(new_feed)
                 nb += 1
         return nb
-
     nb = read(subscriptions)
     g.db.session.commit()
     return nb
@@ -166,44 +153,46 @@ def import_json(email, json_content):
     user = User.query.filter(User.email == email).first()
     json_account = json.loads(json_content)
     nb_feeds, nb_articles = 0, 0
-
-    # Create feeds
+    # Create feeds:
     for feed in json_account["result"]:
-
-        if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first():
+        if None != Feed.query.filter(Feed.user_id == user.id,
+                                    Feed.link == feed["link"]).first():
             continue
-
-        new_feed = Feed(title=feed["title"], description="", link=feed["link"], \
-                                    site_link=feed["site_link"], \
-                                    created_date=datetime.datetime.fromtimestamp(int(feed["created_date"])),
-                                    enabled=feed["enabled"])
+        new_feed = Feed(title=feed["title"],
+                        description="",
+                        link=feed["link"],
+                        site_link=feed["site_link"],
+                        created_date=datetime.datetime.\
+                            fromtimestamp(int(feed["created_date"])),
+                        enabled=feed["enabled"])
         user.feeds.append(new_feed)
         nb_feeds += 1
     g.db.session.commit()
-
-    # Create articles
+    # Create articles:
     for feed in json_account["result"]:
-        user_feed = Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first()
+        user_feed = Feed.query.filter(Feed.user_id == user.id,
+                                        Feed.link == feed["link"]).first()
         if None != user_feed:
             for article in feed["articles"]:
-
                 if None == Article.query.filter(Article.user_id == user.id,
-                                        Article.feed_id == user_feed.id,
-                                        Article.link == article["link"]).first():
-
-                    new_article = Article(link=article["link"], title=article["title"], \
-                                            content=article["content"], readed=article["readed"], like=article["like"], \
-                                            retrieved_date=datetime.datetime.fromtimestamp(int(article["retrieved_date"])),
-                                            date=datetime.datetime.fromtimestamp(int(article["date"])),
-                                            user_id=user.id, feed_id=user_feed.id)
-
+                                    Article.feed_id == user_feed.id,
+                                    Article.link == article["link"]).first():
+                    new_article = Article(link=article["link"],
+                                title=article["title"],
+                                content=article["content"],
+                                readed=article["readed"],
+                                like=article["like"], \
+                                retrieved_date=datetime.datetime.\
+                                    fromtimestamp(int(article["retrieved_date"])),
+                                date=datetime.datetime.\
+                                    fromtimestamp(int(article["date"])),
+                                user_id=user.id,
+                                feed_id=user_feed.id)
                     user_feed.articles.append(new_article)
                     nb_articles += 1
     g.db.session.commit()
-
     return nb_feeds, nb_articles
 
-
 def clean_url(url):
     """
     Remove utm_* parameters
@@ -221,7 +210,6 @@ def clean_url(url):
         parsed_url.fragment
     ]).rstrip('=')
 
-
 def open_url(url):
     """
     Open an URL with the proxy and the user-agent
@@ -240,7 +228,6 @@ def open_url(url):
         # server couldn't fulfill the request
         error = (url, e.code, \
                         http.server.BaseHTTPRequestHandler.responses[e.code][1])
-        #pyaggr3g470r_log.error(url + " " + str(e.code) + " " + http.server.BaseHTTPRequestHandler.responses[e.code][1])
         return (False, error)
     except urllib.error.URLError as e:
         # failed to reach the server
@@ -249,10 +236,8 @@ def open_url(url):
             #pyaggr3g470r_log.error(url + " " + e.reason)
         else:
             error = (url, e.reason.errno, e.reason.strerror)
-            #pyaggr3g470r_log.error(url + " " + str(e.reason.errno) + " " + e.reason.strerror)
         return (False, error)
 
-
 def clear_string(data):
     """
     Clear a string by removing HTML tags, HTML special caracters
@@ -262,7 +247,6 @@ def clear_string(data):
     q = re.compile('\s') # consecutive white spaces
     return p.sub('', q.sub(' ', data))
 
-
 def load_stop_words():
     """
     Load the stop words and return them in a list.
@@ -278,7 +262,6 @@ def load_stop_words():
                 stop_words += stop_wods_file.read().split(";")
     return stop_words
 
-
 def top_words(articles, n=10, size=5):
     """
     Return the n most frequent words in a list.
@@ -293,7 +276,6 @@ def top_words(articles, n=10, size=5):
             words[word] += 1
     return words.most_common(n)
 
-
 def tag_cloud(tags):
     """
     Generates a tags cloud.
@@ -306,6 +288,7 @@ def tag_cloud(tags):
 def compare_documents(feed):
     """
     Compare a list of documents by pair.
+    Pairs of duplicates are sorted by "retrieved date".
     """
     duplicates = []
     for pair in itertools.combinations(feed.articles, 2):
@@ -340,7 +323,6 @@ def search_feed(url):
         return feed_link['href']
     return None
 
-
 if __name__ == "__main__":
     import_opml("root@pyAggr3g470r.localhost", "./var/feeds_test.opml")
     #import_opml("root@pyAggr3g470r.localhost", "./var/pyAggr3g470r.opml")
-- 
cgit 


From 854862e1c73bf0c521e08b8973721e8be7aeb82e Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Wed, 1 Apr 2015 22:57:15 +0200
Subject: Bugfix.

---
 pyaggr3g470r/utils.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 5f2d8707..ea8a87bf 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -56,11 +56,21 @@ from contextlib import contextmanager
 
 import conf
 from flask import g
+from bootstrap import application as app, db
 from pyaggr3g470r import controllers
 from pyaggr3g470r.models import User, Feed, Article
 
 logger = logging.getLogger(__name__)
 
+ALLOWED_EXTENSIONS = set(['xml', 'opml', 'json'])
+
+def allowed_file(filename):
+    """
+    Check if the uploaded file has an allowed extension (case-insensitive).
+    """
+    return '.' in filename and \
+            filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
 @contextmanager
 def opened_w_error(filename, mode="r"):
     try:
@@ -143,7 +153,7 @@ def import_opml(email, opml_content):
                 nb += 1
         return nb
     nb = read(subscriptions)
-    g.db.session.commit()
+    db.session.commit()
     return nb
 
 def import_json(email, json_content):
@@ -167,7 +177,7 @@ def import_json(email, json_content):
                         enabled=feed["enabled"])
         user.feeds.append(new_feed)
         nb_feeds += 1
-    g.db.session.commit()
+    db.session.commit()
     # Create articles:
     for feed in json_account["result"]:
         user_feed = Feed.query.filter(Feed.user_id == user.id,
@@ -190,7 +200,7 @@ def import_json(email, json_content):
                                 feed_id=user_feed.id)
                     user_feed.articles.append(new_article)
                     nb_articles += 1
-    g.db.session.commit()
+    db.session.commit()
     return nb_feeds, nb_articles
 
 def clean_url(url):
-- 
cgit