From 222c0d994dd8b27a3b0be509fa8958e28208f28b Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Sun, 8 Jun 2014 12:56:26 +0200
Subject: supporting feed without date or with ill formated date

---
 pyaggr3g470r/crawler.py | 72 +++++++++++++++++++++++++++++--------------------
 pyaggr3g470r/utils.py   | 25 +++++++++--------
 requirements.txt        |  1 +
 3 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index aa6fdbc0..8f88e2d5 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -26,9 +26,11 @@ __revision__ = "$Date: 2014/04/13 $"
 __copyright__ = "Copyright (c) Cedric Bonhomme"
 __license__ = "AGPLv3"
 
+import re
 import feedparser
 import urllib2
 import requests
+import dateutil.parser
 from bs4 import BeautifulSoup
 from datetime import datetime
 from sqlalchemy.exc import IntegrityError
@@ -106,7 +108,7 @@ class FeedGetter(object):
         # 4 - Indexation
         if not conf.ON_HEROKU:
             self.index(new_articles)
-        
+
         # 5 - Mail notification
         if not conf.ON_HEROKU and conf.MAIL_ENABLED:
             self.mail_notification(new_articles)
@@ -151,51 +153,62 @@ class FeedGetter(object):
                                      proxies=self.proxies)
                     nice_url = r.url.encode("utf-8")
                 except Timeout:
-                    pyaggr3g470r_log.\
-                        warning("Timeout when getting the real URL of %s." %
-                                (article.link,))
+                    pyaggr3g470r_log.warning(
+                        "Timeout when getting the real URL of %s.",
+                        article.link)
                     continue
-                except Exception as e:
-                    pyaggr3g470r_log.\
-                        warning("Unable to get the real URL of %s. Error: %s" %
-                                (article.link, str(e)))
+                except Exception as error:
+                    pyaggr3g470r_log.warning(
+                        "Unable to get the real URL of %s. Error: %s",
+                        article.link, error)
                     continue
                 # remove utm_* parameters
                 nice_url = utils.clean_url(nice_url)
 
                 description = ""
-                article_title = ""
+                article_title = article.get('title', '')
                 try:
                     # article content
                     description = article.content[0].value
                 except AttributeError:
-                    try:
-                        # article description
-                        description = article.description
-                    except Exception:
-                        description = ""
+                    # article description
+                    description = article.get('description', '')
+
                 try:
                     description = BeautifulSoup(description, "lxml").decode()
                 except:
-                    pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" %
-                                           (article_title, nice_url))
-                article_title = article.title
+                    pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)",
+                                           article_title, nice_url)
 
-                try:
-                    post_date = datetime(*article.published_parsed[:6])
-                except:
-                    post_date = datetime(*article.updated_parsed[:6])
+                post_date = None
+                for date_key in ('published_parsed', 'published',
+                                 'updated_parsed', 'updated'):
+                    if not date_key in article:
+                        continue
+
+                    try:
+                        post_date = dateutil.parser.parse(article[date_key],
+                                                          dayfirst=True)
+                        break
+                    except:
+                        try:  # trying to clean date field from letters
+                            post_date = dateutil.parser.parse(
+                                    re.sub('[A-z]', '', article[date_key]),
+                                    dayfirst=True)
+                            break
+                        except:
+                            pass
 
                 # create the models.Article object and append it to the list of articles
-                article = Article(link=nice_url, title=article_title, \
-                    content=description, readed=False, like=False, date=post_date, \
-                    user_id=self.user.id, feed_id=feed.id)
+                article = Article(link=nice_url, title=article_title,
+                                  content=description, readed=False, like=False,
+                                  date=post_date, user_id=self.user.id,
+                                  feed_id=feed.id)
                 articles.append(article)
 
             # return the feed with the list of retrieved articles
             return feed, articles
 
-        jobs = []
         pool = Pool(20)
         jobs = [pool.spawn(fetch, feed) for feed in feeds]
         pool.join()
@@ -211,7 +224,7 @@ class FeedGetter(object):
 
         for feed, articles in elements:
             for article in articles:
-                
+
                 exist = Article.query.filter(Article.user_id == self.user.id,
                                              Article.feed_id == feed.id,
@@ -220,6 +233,9 @@
                     pyaggr3g470r_log.error("Article %s (%s) already in the database." %
                                            (article.title, article.link))
                     continue
+                if article.date is None:
+                    article.date = datetime.now(dateutil.tz.tzlocal())
+
                 new_articles.append(article)
 
                 try:
@@ -253,7 +269,7 @@ class FeedGetter(object):
         except:
             pyaggr3g470r_log.error("Problem during indexation.")
         return True
-    
+
     def mail_notification(self, new_articles):
         """
         Mail notification.
@@ -264,5 +280,3 @@
             emails.new_article_notification(self.user, element.source, element)
 
         return True
-
-
diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 5e8be5f8..320c49ce 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -40,7 +40,7 @@ import opml
 import json
 import datetime
 import operator
-from urllib import urlencode
+import urllib
 from urlparse import urlparse, parse_qs, urlunparse
 
 from bs4 import BeautifulSoup
@@ -145,40 +145,40 @@ def import_json(email, json_file):
 
     # Create feeds
     for feed in json_account["result"]:
-        
+
         if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first():
             continue
-            
+
         new_feed = Feed(title=feed["title"], description="", link=feed["link"], \
                         site_link=feed["site_link"], email_notification=feed["email_notification"], \
                         created_date=datetime.datetime.fromtimestamp(int(feed["created_date"])),
                         enabled=feed["enabled"])
         user.feeds.append(new_feed)
-        nb_feeds += 1 
+        nb_feeds += 1
     db.session.commit()
 
     # Create articles
     for feed in json_account["result"]:
         user_feed = Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first()
-        if None != user_feed: 
+        if None != user_feed:
             for article in feed["articles"]:
-                
+
                 if None == Article.query.filter(Article.user_id == user.id, Article.feed_id == user_feed.id,
                                                 Article.link == article["link"]).first():
-                    
+
                     new_article = Article(link=article["link"], title=article["title"], \
                                           content=article["content"], readed=article["readed"], like=article["like"], \
                                           retrieved_date=datetime.datetime.fromtimestamp(int(article["retrieved_date"])),
                                           date=datetime.datetime.fromtimestamp(int(article["date"])),
                                           user_id=user.id, feed_id=user_feed.id)
-                    
+
                     user_feed.articles.append(new_article)
                     nb_articles += 1
     db.session.commit()
 
     return nb_feeds, nb_articles
-    
+
 
 def clean_url(url):
     """
@@ -188,15 +188,14 @@
     qd = parse_qs(parsed_url.query, keep_blank_values=True)
     filtered = dict((k, v) for k, v in qd.iteritems()
                     if not k.startswith('utm_'))
-    nice_url = urlunparse([
+    return urlunparse([
         parsed_url.scheme,
         parsed_url.netloc,
-        parsed_url.path,
+        urllib.quote(parsed_url.path),
         parsed_url.params,
-        urlencode(filtered, doseq=True),
+        urllib.urlencode(filtered, doseq=True),
         parsed_url.fragment
     ])
-    return nice_url
 
 
 def open_url(url):
diff --git a/requirements.txt b/requirements.txt
index fc00a393..42867a2c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,3 +16,4 @@ WTForms
 python-postmark
 gevent
 whoosh
+python-dateutil
--
cgit
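
The crawler change above is, at heart, a small date-parsing fallback chain. Below is a minimal standalone sketch of that logic, assuming a feedparser-style entry dict; the parse_entry_date helper name and the sample entry are illustrative only and are not part of the patch.

# Sketch only: mirrors the fallback date parsing added to crawler.py above.
# The parse_entry_date helper and the sample entry are illustrative, not part
# of the patch.
import re
from datetime import datetime

import dateutil.parser
import dateutil.tz


def parse_entry_date(entry):
    """Return a datetime for a feedparser-style entry dict, or None."""
    for date_key in ('published_parsed', 'published',
                     'updated_parsed', 'updated'):
        if date_key not in entry:
            continue
        value = entry[date_key]
        try:
            # first attempt: let dateutil parse the raw field
            return dateutil.parser.parse(value, dayfirst=True)
        except (TypeError, ValueError):
            try:
                # second attempt: strip letters (e.g. localized day/month
                # names) and parse the digits that remain
                return dateutil.parser.parse(re.sub('[A-z]', '', value),
                                             dayfirst=True)
            except (TypeError, ValueError):
                pass
    return None


# An ill-formatted date with French day/month names: the first attempt fails,
# the letter-stripping fallback still recovers day, year, time and UTC offset
# (the month falls back to dateutil's default).
entry = {'published': 'Dim, 08 juin 2014 12:56:26 +0200'}
post_date = parse_entry_date(entry) or datetime.now(dateutil.tz.tzlocal())
print(post_date)

In the patch itself, an article whose date still cannot be parsed is given datetime.now(dateutil.tz.tzlocal()) just before it is inserted into the database.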
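
The utils.py change reworks clean_url() to drop utm_* tracking parameters and to percent-encode the path. The Python 2 sketch below (matching the project's urllib/urlparse usage) restates that function with comments and a usage example; the sample URL is illustrative only.

# Sketch only: the reworked clean_url() from utils.py above, restated with
# comments (Python 2, like the project). The sample URL is illustrative.
import urllib
from urlparse import urlparse, parse_qs, urlunparse


def clean_url(url):
    """Remove utm_* tracking parameters and percent-encode the path."""
    parsed_url = urlparse(url)
    qd = parse_qs(parsed_url.query, keep_blank_values=True)
    # drop Google-Analytics-style tracking keys, keep everything else
    filtered = dict((k, v) for k, v in qd.iteritems()
                    if not k.startswith('utm_'))
    return urlunparse([
        parsed_url.scheme,
        parsed_url.netloc,
        urllib.quote(parsed_url.path),           # quote the path so unsafe characters survive
        parsed_url.params,
        urllib.urlencode(filtered, doseq=True),  # rebuild the query without utm_* keys
        parsed_url.fragment
    ])


print(clean_url('http://example.com/some article?id=42&utm_source=feed&utm_medium=rss'))
# -> http://example.com/some%20article?id=42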