Diffstat (limited to 'pyaggr3g470r/crawler.py')
-rw-r--r--  pyaggr3g470r/crawler.py  42
1 file changed, 26 insertions, 16 deletions
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index dade3bea..ea149f5e 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -31,7 +31,6 @@ import urllib2
import requests
from requests.exceptions import *
#from requests.packages.urllib3.exceptions import DecodeError
-from urlparse import urlparse
from datetime import datetime
import gevent.monkey
@@ -51,7 +50,6 @@ requests_log.propagate = True
"""
-import models
import conf
if not conf.ON_HEROKU:
import search as fastsearch
@@ -60,9 +58,9 @@ import utils
if not conf.ON_HEROKU:
from flask.ext.mail import Message
from pyaggr3g470r import mail
-
-from pyaggr3g470r import app, db
-from pyaggr3g470r.models import User, Feed, Article
+
+from pyaggr3g470r import db
+from pyaggr3g470r.models import User, Article
import log
pyaggr3g470r_log = log.Log("feedgetter")
@@ -75,6 +73,7 @@ class TooLong(Exception):
"""
pyaggr3g470r_log.warning("Greenlet took to long")
+
class FeedGetter(object):
"""
This class is in charge of retrieving the feeds.
@@ -88,7 +87,7 @@ class FeedGetter(object):
self.proxy = urllib2.ProxyHandler({})
self.proxies = {}
else:
- self.proxy = urllib2.ProxyHandler({"http" : conf.HTTP_PROXY, \
+ self.proxy = urllib2.ProxyHandler({"http": conf.HTTP_PROXY,
"https": conf.HTTP_PROXY})
self.proxies = {
"http": "http://" + conf.HTTP_PROXY,
@@ -110,7 +109,8 @@ class FeedGetter(object):
feeds = [feed for feed in feeds if feed.id == feed_id]
# 2 - Fetch the feeds.
- # 'responses' contains all the jobs returned by the function retrieve_async()
+ # 'responses' contains all the jobs returned by
+ # the function retrieve_async()
responses = self.retrieve_async(feeds)
elements = [item.value for item in responses if item.value is not None]
@@ -133,7 +133,7 @@ class FeedGetter(object):
Fetch a feed.
"""
pyaggr3g470r_log.info("Fetching the feed:" + feed.title)
- a_feed = feedparser.parse(feed.link, handlers = [self.proxy])
+ a_feed = feedparser.parse(feed.link, handlers=[self.proxy])
if a_feed['entries'] == []:
return
@@ -155,14 +155,20 @@ class FeedGetter(object):
nice_url = article.link.encode("utf-8")
if conf.RESOLVE_ARTICLE_URL:
try:
- # resolves URL behind proxies (like feedproxy.google.com)
- r = requests.get(article.link, timeout=5.0, proxies=self.proxies)
+ # resolves URL behind proxies
+ # (like feedproxy.google.com)
+ r = requests.get(article.link, timeout=5.0,
+ proxies=self.proxies)
nice_url = r.url.encode("utf-8")
except Timeout:
- pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,))
+ pyaggr3g470r_log.\
+ warning("Timeout when getting the real URL of %s." %
+ (article.link,))
continue
except Exception as e:
- pyaggr3g470r_log.warning("Unable to get the real URL of %s. Error: %s" % (article.link, str(e)))
+ pyaggr3g470r_log.\
+ warning("Unable to get the real URL of %s. Error: %s" %
+ (article.link, str(e)))
continue
# remove utm_* parameters
nice_url = utils.clean_url(nice_url)
@@ -181,7 +187,7 @@ class FeedGetter(object):
try:
description = BeautifulSoup(description, "html.parser").decode()
article_title = BeautifulSoup(article.title, "html.parser").decode()
- except Exception as E:
+ except Exception:
pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
article_title = article.title
@@ -215,7 +221,9 @@ class FeedGetter(object):
for article in articles:
- exist = Article.query.filter(Article.user_id == self.user.id, Article.feed_id == feed.id, Article.link == article.link).first()
+ exist = Article.query.filter(Article.user_id == self.user.id,
+ Article.feed_id == feed.id,
+ Article.link == article.link).first()
if exist != None:
pyaggr3g470r_log.error("Article %s (%s) already in the database." % (article.title, article.link))
continue
@@ -242,9 +250,11 @@ class FeedGetter(object):
pyaggr3g470r_log.info("Indexing new articles.")
for feed, articles in elements:
for element in articles:
- article = Article.query.filter(Article.user_id == self.user.id, Article.link == element.link).first()
+ article = Article.query.filter(Article.user_id == self.user.id,
+ Article.link == element.link).first()
try:
- fastsearch.add_to_index(self.user.id, [article], article.source)
+ fastsearch.add_to_index(self.user.id, [article],
+ article.source)
except:
pyaggr3g470r_log.error("Problem during indexation.")
return True
\ No newline at end of file