Diffstat (limited to 'pyaggr3g470r/crawler.py')
-rw-r--r--  pyaggr3g470r/crawler.py  |  72
1 file changed, 43 insertions(+), 29 deletions(-)
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index aa6fdbc0..8f88e2d5 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -26,9 +26,11 @@ __revision__ = "$Date: 2014/04/13 $"
 __copyright__ = "Copyright (c) Cedric Bonhomme"
 __license__ = "AGPLv3"
 
+import re
 import feedparser
 import urllib2
 import requests
+import dateutil.parser
 from bs4 import BeautifulSoup
 from datetime import datetime
 from sqlalchemy.exc import IntegrityError
@@ -106,7 +108,7 @@ class FeedGetter(object):
         # 4 - Indexation
         if not conf.ON_HEROKU:
             self.index(new_articles)
-        
+
         # 5 - Mail notification
         if not conf.ON_HEROKU and conf.MAIL_ENABLED:
             self.mail_notification(new_articles)
@@ -151,51 +153,62 @@ class FeedGetter(object):
                                          proxies=self.proxies)
                         nice_url = r.url.encode("utf-8")
                     except Timeout:
-                        pyaggr3g470r_log.\
-                            warning("Timeout when getting the real URL of %s." %
-                                    (article.link,))
+                        pyaggr3g470r_log.warning(
+                            "Timeout when getting the real URL of %s.",
+                            article.link)
                         continue
-                    except Exception as e:
-                        pyaggr3g470r_log.\
-                            warning("Unable to get the real URL of %s. Error: %s" %
-                                    (article.link, str(e)))
+                    except Exception as error:
+                        pyaggr3g470r_log.warning(
+                            "Unable to get the real URL of %s. Error: %s",
+                            article.link, error)
                         continue
                 # remove utm_* parameters
                 nice_url = utils.clean_url(nice_url)
 
                 description = ""
-                article_title = ""
+                article_title = article.get('title', '')
                 try:
                     # article content
                     description = article.content[0].value
                 except AttributeError:
-                    try:
-                        # article description
-                        description = article.description
-                    except Exception:
-                        description = ""
+                    # article description
+                    description = article.get('description', '')
+
                 try:
                     description = BeautifulSoup(description, "lxml").decode()
                 except:
-                    pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" %
-                                           (article_title, nice_url))
-                article_title = article.title
+                    pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)",
+                                           article_title, nice_url)
 
-                try:
-                    post_date = datetime(*article.published_parsed[:6])
-                except:
-                    post_date = datetime(*article.updated_parsed[:6])
+                post_date = None
+                for date_key in ('published_parsed', 'published',
+                                 'updated_parsed', 'updated'):
+                    if not date_key in article:
+                        continue
+
+                    try:
+                        post_date = dateutil.parser.parse(article[date_key],
+                                                          dayfirst=True)
+                        break
+                    except:
+                        try:  # trying to clean date field from letters
+                            post_date = dateutil.parser.parse(
+                                re.sub('[A-z]', '', article[date_key]),
+                                dayfirst=True)
+                            break
+                        except:
+                            pass
 
                 # create the models.Article object and append it to the list of articles
-                article = Article(link=nice_url, title=article_title, \
-                    content=description, readed=False, like=False, date=post_date, \
-                    user_id=self.user.id, feed_id=feed.id)
+                article = Article(link=nice_url, title=article_title,
+                                  content=description, readed=False, like=False,
+                                  date=post_date, user_id=self.user.id,
+                                  feed_id=feed.id)
                 articles.append(article)
 
             # return the feed with the list of retrieved articles
             return feed, articles
 
-        jobs = []
         pool = Pool(20)
         jobs = [pool.spawn(fetch, feed) for feed in feeds]
         pool.join()
@@ -211,7 +224,7 @@ class FeedGetter(object):
 
         for feed, articles in elements:
             for article in articles:
-                
+
                 exist = Article.query.filter(Article.user_id == self.user.id,
                                              Article.feed_id == feed.id,
@@ -220,6 +233,9 @@ class FeedGetter(object):
                     pyaggr3g470r_log.error("Article %s (%s) already in the database."
                                            % (article.title, article.link))
                     continue
+                if article.date is None:
+                    article.date = datetime.now(dateutil.tz.tzlocal())
+
                 new_articles.append(article)
 
                 try:
@@ -253,7 +269,7 @@ class FeedGetter(object):
         except:
             pyaggr3g470r_log.error("Problem during indexation.")
         return True
-    
+
     def mail_notification(self, new_articles):
         """
         Mail notification.
         """
@@ -264,5 +280,3 @@
             emails.new_article_notification(self.user, element.source, element)
 
         return True
-
-
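The heart of this commit is the new date-handling loop: instead of relying only on feedparser's pre-parsed published_parsed/updated_parsed tuples, the crawler now probes several entry keys in order and hands each value to dateutil, retrying once with letters stripped from the field before giving up. Below is a minimal standalone sketch of that approach; the parse_entry_date helper and the sample entry are illustrative only and not part of the codebase, and narrow except clauses are used where the commit itself uses bare ones.

import re
from datetime import datetime

import dateutil.parser
import dateutil.tz

# Entry keys probed in order, mirroring the loop added in this commit.
DATE_KEYS = ('published_parsed', 'published', 'updated_parsed', 'updated')

def parse_entry_date(entry):
    """Illustrative helper: return the first date dateutil can parse, else None."""
    for key in DATE_KEYS:
        if key not in entry:
            continue
        try:
            return dateutil.parser.parse(entry[key], dayfirst=True)
        except Exception:
            # Second chance: strip letters from the field, as the commit
            # does with re.sub('[A-z]', '', ...), then retry.
            try:
                return dateutil.parser.parse(re.sub('[A-z]', '', entry[key]),
                                             dayfirst=True)
            except Exception:
                continue
    return None

post_date = parse_entry_date({'published': '13/04/2014 18:30'})
if post_date is None:
    # Same fallback the commit adds before insertion: a timezone-aware "now".
    post_date = datetime.now(dateutil.tz.tzlocal())

Note that dayfirst=True biases ambiguous dates such as 03/04/2014 toward the day-first reading, and that a non-string value like a published_parsed struct_time fails both attempts, so the loop simply moves on to the next key, matching the behavior of the code added in this commit.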