From 341008ac376479468d68bb2fb1c9ce0cd82b10e8 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Sat, 28 Mar 2015 14:37:23 +0100
Subject: Clean the file utils.py

---
 pyaggr3g470r/utils.py | 96 +++++++++++++++++++++------------------------------
 1 file changed, 39 insertions(+), 57 deletions(-)

(limited to 'pyaggr3g470r/utils.py')

diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 513913a1..5f2d8707 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -4,7 +4,7 @@
 # pyAggr3g470r - A Web based news aggregator.
 # Copyright (C) 2010-2015  Cédric Bonhomme - https://www.cedricbonhomme.org
 #
-# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/
+# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -20,15 +20,16 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 __author__ = "Cedric Bonhomme"
-__version__ = "$Revision: 1.6 $"
+__version__ = "$Revision: 1.7 $"
 __date__ = "$Date: 2010/12/07 $"
-__revision__ = "$Date: 2013/11/17 $"
+__revision__ = "$Date: 2015/03/28 $"
 __copyright__ = "Copyright (c) Cedric Bonhomme"
 __license__ = "AGPLv3"
 
 #
 # This file provides functions used for:
-# - the database management;
+# - detection of duplicate articles;
+# - import from a JSON file;
 # - generation of tags cloud;
 # - HTML processing.
 #
@@ -58,18 +59,8 @@ from flask import g
 from pyaggr3g470r import controllers
 from pyaggr3g470r.models import User, Feed, Article
 
-
-# regular expression to check URL
-url_finders = [
-    re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \
-    re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \
-    re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \
-    re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+") \
-]
-
 logger = logging.getLogger(__name__)
 
-
 @contextmanager
 def opened_w_error(filename, mode="r"):
     try:
@@ -83,11 +74,17 @@ def opened_w_error(filename, mode="r"):
             f.close()
 
 def fetch(id, feed_id=None):
-    cmd = [conf.PYTHON, conf.basedir+'/manager.py', 'fetch_asyncio', str(id), str(feed_id)]
+    """
+    Fetch the feeds in a new processus.
+    The "asyncio" crawler is launched with the manager.
+    """
+    cmd = [conf.PYTHON, conf.basedir+'/manager.py', 'fetch_asyncio', str(id),
+            str(feed_id)]
     p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
 
 def history(user_id, year=None, month=None):
     """
+    Sort articles by year and month.
     """
     articles_counter = Counter()
     articles = controllers.ArticleController(user_id).read()
@@ -118,43 +115,33 @@ def import_opml(email, opml_content):
         Parse recursively through the categories and sub-categories.
         """
         for subscription in subsubscription:
-
             if len(subscription) != 0:
                 nb = read(subscription, nb)
             else:
-
                 try:
                     title = subscription.text
-
                 except:
                     title = ""
-
                 try:
                     description = subscription.description
                 except:
                     description = ""
-
                 try:
                     link = subscription.xmlUrl
                 except:
                     continue
-
                 if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == link).first():
                     continue
-
                 try:
                     site_link = subscription.htmlUrl
                 except:
                     site_link = ""
-
                 new_feed = Feed(title=title, description=description,
                                 link=link, site_link=site_link,
                                 enabled=True)
-
                 user.feeds.append(new_feed)
                 nb += 1
         return nb
-
     nb = read(subscriptions)
     g.db.session.commit()
     return nb
@@ -166,44 +153,46 @@ def import_json(email, json_content):
     user = User.query.filter(User.email == email).first()
     json_account = json.loads(json_content)
     nb_feeds, nb_articles = 0, 0
-
-    # Create feeds
+    # Create feeds:
     for feed in json_account["result"]:
-
-        if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first():
+        if None != Feed.query.filter(Feed.user_id == user.id,
+                                    Feed.link == feed["link"]).first():
             continue
-
-        new_feed = Feed(title=feed["title"], description="", link=feed["link"], \
-                                    site_link=feed["site_link"], \
-                                    created_date=datetime.datetime.fromtimestamp(int(feed["created_date"])),
-                                    enabled=feed["enabled"])
+        new_feed = Feed(title=feed["title"],
+                        description="",
+                        link=feed["link"],
+                        site_link=feed["site_link"],
+                        created_date=datetime.datetime.\
+                            fromtimestamp(int(feed["created_date"])),
+                        enabled=feed["enabled"])
         user.feeds.append(new_feed)
         nb_feeds += 1
     g.db.session.commit()
-
-    # Create articles
+    # Create articles:
     for feed in json_account["result"]:
-        user_feed = Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first()
+        user_feed = Feed.query.filter(Feed.user_id == user.id,
+                                        Feed.link == feed["link"]).first()
         if None != user_feed:
             for article in feed["articles"]:
-
                 if None == Article.query.filter(Article.user_id == user.id,
-                                        Article.feed_id == user_feed.id,
-                                        Article.link == article["link"]).first():
-
-                    new_article = Article(link=article["link"], title=article["title"], \
-                                            content=article["content"], readed=article["readed"], like=article["like"], \
-                                            retrieved_date=datetime.datetime.fromtimestamp(int(article["retrieved_date"])),
-                                            date=datetime.datetime.fromtimestamp(int(article["date"])),
-                                            user_id=user.id, feed_id=user_feed.id)
-
+                                    Article.feed_id == user_feed.id,
+                                    Article.link == article["link"]).first():
+                    new_article = Article(link=article["link"],
+                                title=article["title"],
+                                content=article["content"],
+                                readed=article["readed"],
+                                like=article["like"], \
+                                retrieved_date=datetime.datetime.\
+                                    fromtimestamp(int(article["retrieved_date"])),
+                                date=datetime.datetime.\
+                                    fromtimestamp(int(article["date"])),
+                                user_id=user.id,
+                                feed_id=user_feed.id)
                     user_feed.articles.append(new_article)
                     nb_articles += 1
     g.db.session.commit()
-
     return nb_feeds, nb_articles
 
-
 def clean_url(url):
     """
     Remove utm_* parameters
@@ -221,7 +210,6 @@ def clean_url(url):
         parsed_url.fragment
     ]).rstrip('=')
 
-
 def open_url(url):
     """
     Open an URL with the proxy and the user-agent
@@ -240,7 +228,6 @@ def open_url(url):
         # server couldn't fulfill the request
         error = (url, e.code, \
                         http.server.BaseHTTPRequestHandler.responses[e.code][1])
-        #pyaggr3g470r_log.error(url + " " + str(e.code) + " " + http.server.BaseHTTPRequestHandler.responses[e.code][1])
         return (False, error)
     except urllib.error.URLError as e:
         # failed to reach the server
@@ -249,10 +236,8 @@ def open_url(url):
             #pyaggr3g470r_log.error(url + " " + e.reason)
         else:
             error = (url, e.reason.errno, e.reason.strerror)
-            #pyaggr3g470r_log.error(url + " " + str(e.reason.errno) + " " + e.reason.strerror)
         return (False, error)
 
-
 def clear_string(data):
     """
     Clear a string by removing HTML tags, HTML special caracters
@@ -262,7 +247,6 @@ def clear_string(data):
     q = re.compile('\s') # consecutive white spaces
     return p.sub('', q.sub(' ', data))
 
-
 def load_stop_words():
     """
     Load the stop words and return them in a list.
@@ -278,7 +262,6 @@ def load_stop_words():
                 stop_words += stop_wods_file.read().split(";")
     return stop_words
 
-
 def top_words(articles, n=10, size=5):
     """
     Return the n most frequent words in a list.
@@ -293,7 +276,6 @@ def top_words(articles, n=10, size=5):
             words[word] += 1
     return words.most_common(n)
 
-
 def tag_cloud(tags):
     """
     Generates a tags cloud.
@@ -306,6 +288,7 @@ def tag_cloud(tags):
 def compare_documents(feed):
     """
     Compare a list of documents by pair.
+    Pairs of duplicates are sorted by "retrieved date".
     """
     duplicates = []
     for pair in itertools.combinations(feed.articles, 2):
@@ -340,7 +323,6 @@ def search_feed(url):
         return feed_link['href']
     return None
 
-
 if __name__ == "__main__":
     import_opml("root@pyAggr3g470r.localhost", "./var/feeds_test.opml")
     #import_opml("root@pyAggr3g470r.localhost", "./var/pyAggr3g470r.opml")
-- 
cgit