From 222c0d994dd8b27a3b0be509fa8958e28208f28b Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Sun, 8 Jun 2014 12:56:26 +0200
Subject: supporting feed without date or with ill formated date

---
 pyaggr3g470r/crawler.py | 72 +++++++++++++++++++++++++++++--------------------
 pyaggr3g470r/utils.py   | 25 +++++++++--------
 requirements.txt        |  1 +
 3 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index aa6fdbc0..8f88e2d5 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -26,9 +26,11 @@ __revision__ = "$Date: 2014/04/13 $"
 __copyright__ = "Copyright (c) Cedric Bonhomme"
 __license__ = "AGPLv3"
 
+import re
 import feedparser
 import urllib2
 import requests
+import dateutil.parser
 from bs4 import BeautifulSoup
 from datetime import datetime
 from sqlalchemy.exc import IntegrityError
@@ -106,7 +108,7 @@ class FeedGetter(object):
         # 4 - Indexation
         if not conf.ON_HEROKU:
             self.index(new_articles)
-        
+
         # 5 - Mail notification
         if not conf.ON_HEROKU and conf.MAIL_ENABLED:
             self.mail_notification(new_articles)
@@ -151,51 +153,62 @@ class FeedGetter(object):
                                      proxies=self.proxies)
                     nice_url = r.url.encode("utf-8")
                 except Timeout:
-                    pyaggr3g470r_log.\
-                        warning("Timeout when getting the real URL of %s." %
-                                (article.link,))
+                    pyaggr3g470r_log.warning(
+                        "Timeout when getting the real URL of %s.",
+                        article.link)
                     continue
-                except Exception as e:
-                    pyaggr3g470r_log.\
-                        warning("Unable to get the real URL of %s. Error: %s" %
-                                (article.link, str(e)))
+                except Exception as error:
+                    pyaggr3g470r_log.warning(
+                        "Unable to get the real URL of %s. Error: %s",
+                        article.link, error)
                     continue
                 # remove utm_* parameters
                 nice_url = utils.clean_url(nice_url)
 
                 description = ""
-                article_title = ""
+                article_title = article.get('title', '')
                 try:
                     # article content
                     description = article.content[0].value
                 except AttributeError:
-                    try:
-                        # article description
-                        description = article.description
-                    except Exception:
-                        description = ""
+                    # article description
+                    description = article.get('description', '')
+
                 try:
                     description = BeautifulSoup(description, "lxml").decode()
                 except:
-                    pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" %
-                                           (article_title, nice_url))
-                article_title = article.title
+                    pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)",
+                                           article_title, nice_url)
 
-                try:
-                    post_date = datetime(*article.published_parsed[:6])
-                except:
-                    post_date = datetime(*article.updated_parsed[:6])
+                post_date = None
+                for date_key in ('published_parsed', 'published',
+                                 'updated_parsed', 'updated'):
+                    if not date_key in article:
+                        continue
+
+                    try:
+                        post_date = dateutil.parser.parse(article[date_key],
+                                                          dayfirst=True)
+                        break
+                    except:
+                        try:  # trying to clean date field from letters
+                            post_date = dateutil.parser.parse(
+                                    re.sub('[A-z]', '', article[date_key]),
+                                    dayfirst=True)
+                            break
+                        except:
+                            pass
 
                 # create the models.Article object and append it to the list of articles
-                article = Article(link=nice_url, title=article_title, \
-                    content=description, readed=False, like=False, date=post_date, \
-                    user_id=self.user.id, feed_id=feed.id)
+                article = Article(link=nice_url, title=article_title,
+                                  content=description, readed=False, like=False,
+                                  date=post_date, user_id=self.user.id,
+                                  feed_id=feed.id)
                 articles.append(article)
 
             # return the feed with the list of retrieved articles
             return feed, articles
 
-        jobs = []
         pool = Pool(20)
         jobs = [pool.spawn(fetch, feed) for feed in feeds]
         pool.join()
@@ -211,7 +224,7 @@ class FeedGetter(object):
 
         for feed, articles in elements:
             for article in articles:
-                
+
                 exist = Article.query.filter(Article.user_id == self.user.id,
                                              Article.feed_id == feed.id,
@@ -220,6 +233,9 @@
                     pyaggr3g470r_log.error("Article %s (%s) already in the database." %
                                            (article.title, article.link))
                     continue
+                if article.date is None:
+                    article.date = datetime.now(dateutil.tz.tzlocal())
+
                 new_articles.append(article)
 
                 try:
@@ -253,7 +269,7 @@ class FeedGetter(object):
         except:
             pyaggr3g470r_log.error("Problem during indexation.")
         return True
-    
+
     def mail_notification(self, new_articles):
         """
         Mail notification.
@@ -264,5 +280,3 @@
             emails.new_article_notification(self.user, element.source, element)
 
         return True
-
-
diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 5e8be5f8..320c49ce 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -40,7 +40,7 @@ import opml
 import json
 import datetime
 import operator
-from urllib import urlencode
+import urllib
 from urlparse import urlparse, parse_qs, urlunparse
 
 from bs4 import BeautifulSoup
@@ -145,40 +145,40 @@ def import_json(email, json_file):
 
     # Create feeds
     for feed in json_account["result"]:
-        
+
         if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first():
             continue
-            
+
         new_feed = Feed(title=feed["title"], description="", link=feed["link"], \
                         site_link=feed["site_link"], email_notification=feed["email_notification"], \
                         created_date=datetime.datetime.fromtimestamp(int(feed["created_date"])),
                         enabled=feed["enabled"])
         user.feeds.append(new_feed)
-        nb_feeds += 1 
+        nb_feeds += 1
     db.session.commit()
 
     # Create articles
     for feed in json_account["result"]:
         user_feed = Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first()
-        if None != user_feed: 
+        if None != user_feed:
             for article in feed["articles"]:
-                
+
                 if None == Article.query.filter(Article.user_id == user.id, Article.feed_id == user_feed.id,
                                                 Article.link == article["link"]).first():
-                    
+
                     new_article = Article(link=article["link"], title=article["title"], \
                                           content=article["content"], readed=article["readed"], like=article["like"], \
                                           retrieved_date=datetime.datetime.fromtimestamp(int(article["retrieved_date"])),
                                           date=datetime.datetime.fromtimestamp(int(article["date"])),
                                           user_id=user.id, feed_id=user_feed.id)
-                    
+
                     user_feed.articles.append(new_article)
                     nb_articles += 1
     db.session.commit()
 
     return nb_feeds, nb_articles
-    
+
 
 def clean_url(url):
     """
@@ -188,15 +188,14 @@
     qd = parse_qs(parsed_url.query, keep_blank_values=True)
     filtered = dict((k, v) for k, v in qd.iteritems()
                     if not k.startswith('utm_'))
-    nice_url = urlunparse([
+    return urlunparse([
         parsed_url.scheme,
         parsed_url.netloc,
-        parsed_url.path,
+        urllib.quote(parsed_url.path),
         parsed_url.params,
-        urlencode(filtered, doseq=True),
+        urllib.urlencode(filtered, doseq=True),
         parsed_url.fragment
     ])
-    return nice_url
 
 
 def open_url(url):
diff --git a/requirements.txt b/requirements.txt
index fc00a393..42867a2c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,3 +16,4 @@ WTForms
 python-postmark
 gevent
 whoosh
+python-dateutil
--
cgit
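
The crawler change above is, at heart, a small date-parsing fallback chain. Below is a minimal standalone sketch of that logic, assuming a feedparser-style entry dict; the parse_entry_date helper name and the sample entry are illustrative only and are not part of the patch.

# Sketch only: mirrors the fallback date parsing added to crawler.py above.
# The parse_entry_date helper and the sample entry are illustrative, not part
# of the patch.
import re
from datetime import datetime

import dateutil.parser
import dateutil.tz


def parse_entry_date(entry):
    """Return a datetime for a feedparser-style entry dict, or None."""
    for date_key in ('published_parsed', 'published',
                     'updated_parsed', 'updated'):
        if date_key not in entry:
            continue
        value = entry[date_key]
        try:
            # first attempt: let dateutil parse the raw field
            return dateutil.parser.parse(value, dayfirst=True)
        except (TypeError, ValueError):
            try:
                # second attempt: strip letters (e.g. localized day/month
                # names) and parse the digits that remain
                return dateutil.parser.parse(re.sub('[A-z]', '', value),
                                             dayfirst=True)
            except (TypeError, ValueError):
                pass
    return None


# An ill-formatted date with French day/month names: the first attempt fails,
# the letter-stripping fallback still recovers day, year, time and UTC offset
# (the month falls back to dateutil's default).
entry = {'published': 'Dim, 08 juin 2014 12:56:26 +0200'}
post_date = parse_entry_date(entry) or datetime.now(dateutil.tz.tzlocal())
print(post_date)

In the patch itself, an article whose date still cannot be parsed is given datetime.now(dateutil.tz.tzlocal()) just before it is inserted into the database.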
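
The utils.py change reworks clean_url() to drop utm_* tracking parameters and to percent-encode the path. The Python 2 sketch below (matching the project's urllib/urlparse usage) restates that function with comments and a usage example; the sample URL is illustrative only.

# Sketch only: the reworked clean_url() from utils.py above, restated with
# comments (Python 2, like the project). The sample URL is illustrative.
import urllib
from urlparse import urlparse, parse_qs, urlunparse


def clean_url(url):
    """Remove utm_* tracking parameters and percent-encode the path."""
    parsed_url = urlparse(url)
    qd = parse_qs(parsed_url.query, keep_blank_values=True)
    # drop Google-Analytics-style tracking keys, keep everything else
    filtered = dict((k, v) for k, v in qd.iteritems()
                    if not k.startswith('utm_'))
    return urlunparse([
        parsed_url.scheme,
        parsed_url.netloc,
        urllib.quote(parsed_url.path),           # quote the path so unsafe characters survive
        parsed_url.params,
        urllib.urlencode(filtered, doseq=True),  # rebuild the query without utm_* keys
        parsed_url.fragment
    ])


print(clean_url('http://example.com/some article?id=42&utm_source=feed&utm_medium=rss'))
# -> http://example.com/some%20article?id=42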