aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pyaggr3g470r/crawler.py72
-rwxr-xr-xpyaggr3g470r/utils.py25
-rw-r--r--requirements.txt1
3 files changed, 56 insertions, 42 deletions
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index aa6fdbc0..8f88e2d5 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -26,9 +26,11 @@ __revision__ = "$Date: 2014/04/13 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "AGPLv3"
+import re
import feedparser
import urllib2
import requests
+import dateutil.parser
from bs4 import BeautifulSoup
from datetime import datetime
from sqlalchemy.exc import IntegrityError
@@ -106,7 +108,7 @@ class FeedGetter(object):
# 4 - Indexation
if not conf.ON_HEROKU:
self.index(new_articles)
-
+
# 5 - Mail notification
if not conf.ON_HEROKU and conf.MAIL_ENABLED:
self.mail_notification(new_articles)
@@ -151,51 +153,62 @@ class FeedGetter(object):
proxies=self.proxies)
nice_url = r.url.encode("utf-8")
except Timeout:
- pyaggr3g470r_log.\
- warning("Timeout when getting the real URL of %s." %
- (article.link,))
+ pyaggr3g470r_log.warning(
+ "Timeout when getting the real URL of %s.",
+ article.link)
continue
- except Exception as e:
- pyaggr3g470r_log.\
- warning("Unable to get the real URL of %s. Error: %s" %
- (article.link, str(e)))
+ except Exception as error:
+ pyaggr3g470r_log.warning(
+ "Unable to get the real URL of %s. Error: %s",
+ article.link, error)
continue
# remove utm_* parameters
nice_url = utils.clean_url(nice_url)
description = ""
- article_title = ""
+ article_title = article.get('title', '')
try:
# article content
description = article.content[0].value
except AttributeError:
- try:
- # article description
- description = article.description
- except Exception:
- description = ""
+ # article description
+ description = article.get('description', '')
+
try:
description = BeautifulSoup(description, "lxml").decode()
except:
- pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" %
- (article_title, nice_url))
- article_title = article.title
+ pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)",
+ article_title, nice_url)
- try:
- post_date = datetime(*article.published_parsed[:6])
- except:
- post_date = datetime(*article.updated_parsed[:6])
+ post_date = None
+ for date_key in ('published_parsed', 'published',
+ 'updated_parsed', 'updated'):
+ if not date_key in article:
+ continue
+
+ try:
+ post_date = dateutil.parser.parse(article[date_key],
+ dayfirst=True)
+ break
+ except:
+ try: # trying to clean date field from letters
+ post_date = dateutil.parser.parse(
+ re.sub('[A-z]', '', article[date_key]),
+ dayfirst=True)
+ break
+ except:
+ pass
# create the models.Article object and append it to the list of articles
- article = Article(link=nice_url, title=article_title, \
- content=description, readed=False, like=False, date=post_date, \
- user_id=self.user.id, feed_id=feed.id)
+ article = Article(link=nice_url, title=article_title,
+ content=description, readed=False, like=False,
+ date=post_date, user_id=self.user.id,
+ feed_id=feed.id)
articles.append(article)
# return the feed with the list of retrieved articles
return feed, articles
- jobs = []
pool = Pool(20)
jobs = [pool.spawn(fetch, feed) for feed in feeds]
pool.join()
@@ -211,7 +224,7 @@ class FeedGetter(object):
for feed, articles in elements:
for article in articles:
-
+
exist = Article.query.filter(Article.user_id == self.user.id,
Article.feed_id == feed.id,
@@ -220,6 +233,9 @@ class FeedGetter(object):
pyaggr3g470r_log.error("Article %s (%s) already in the database." %
(article.title, article.link))
continue
+ if article.date is None:
+ article.date = datetime.now(dateutil.tz.tzlocal())
+
new_articles.append(article)
try:
@@ -253,7 +269,7 @@ class FeedGetter(object):
except:
pyaggr3g470r_log.error("Problem during indexation.")
return True
-
+
def mail_notification(self, new_articles):
"""
Mail notification.
@@ -264,5 +280,3 @@ class FeedGetter(object):
emails.new_article_notification(self.user, element.source, element)
return True
-
-
diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 5e8be5f8..320c49ce 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -40,7 +40,7 @@ import opml
import json
import datetime
import operator
-from urllib import urlencode
+import urllib
from urlparse import urlparse, parse_qs, urlunparse
from bs4 import BeautifulSoup
@@ -145,40 +145,40 @@ def import_json(email, json_file):
# Create feeds
for feed in json_account["result"]:
-
+
if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first():
continue
-
+
new_feed = Feed(title=feed["title"], description="", link=feed["link"], \
site_link=feed["site_link"], email_notification=feed["email_notification"], \
created_date=datetime.datetime.fromtimestamp(int(feed["created_date"])),
enabled=feed["enabled"])
user.feeds.append(new_feed)
- nb_feeds += 1
+ nb_feeds += 1
db.session.commit()
# Create articles
for feed in json_account["result"]:
user_feed = Feed.query.filter(Feed.user_id == user.id, Feed.link == feed["link"]).first()
- if None != user_feed:
+ if None != user_feed:
for article in feed["articles"]:
-
+
if None == Article.query.filter(Article.user_id == user.id,
Article.feed_id == user_feed.id,
Article.link == article["link"]).first():
-
+
new_article = Article(link=article["link"], title=article["title"], \
content=article["content"], readed=article["readed"], like=article["like"], \
retrieved_date=datetime.datetime.fromtimestamp(int(article["retrieved_date"])),
date=datetime.datetime.fromtimestamp(int(article["date"])),
user_id=user.id, feed_id=user_feed.id)
-
+
user_feed.articles.append(new_article)
nb_articles += 1
db.session.commit()
return nb_feeds, nb_articles
-
+
def clean_url(url):
"""
@@ -188,15 +188,14 @@ def clean_url(url):
qd = parse_qs(parsed_url.query, keep_blank_values=True)
filtered = dict((k, v) for k, v in qd.iteritems()
if not k.startswith('utm_'))
- nice_url = urlunparse([
+ return urlunparse([
parsed_url.scheme,
parsed_url.netloc,
- parsed_url.path,
+ urllib.quote(parsed_url.path),
parsed_url.params,
- urlencode(filtered, doseq=True),
+ urllib.urlencode(filtered, doseq=True),
parsed_url.fragment
])
- return nice_url
def open_url(url):
diff --git a/requirements.txt b/requirements.txt
index fc00a393..42867a2c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,3 +16,4 @@ WTForms
python-postmark
gevent
whoosh
+python-dateutil
bgstack15