author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-04-27 02:09:26 +0200 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-04-27 02:09:26 +0200 |
commit | 89cf405ab970c4e289b7b79485b27aed8edb1a41 (patch) | |
tree | 7f6330fb7b9bab82a84aeb2a639377b9c4fe1d80 /pyaggr3g470r/crawler.py | |
parent | This fixes #3. (diff) | |
download | newspipe-89cf405ab970c4e289b7b79485b27aed8edb1a41.tar.gz newspipe-89cf405ab970c4e289b7b79485b27aed8edb1a41.tar.bz2 newspipe-89cf405ab970c4e289b7b79485b27aed8edb1a41.zip |
Cleaned code.
Diffstat (limited to 'pyaggr3g470r/crawler.py')
-rw-r--r-- | pyaggr3g470r/crawler.py | 42 |
1 file changed, 26 insertions, 16 deletions
```diff
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index dade3bea..ea149f5e 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -31,7 +31,6 @@ import urllib2
 import requests
 from requests.exceptions import *
 #from requests.packages.urllib3.exceptions import DecodeError
-from urlparse import urlparse
 from datetime import datetime
 
 import gevent.monkey
@@ -51,7 +50,6 @@ requests_log.propagate = True
 """
 
-import models
 import conf
 if not conf.ON_HEROKU:
     import search as fastsearch
 
@@ -60,9 +58,9 @@ import utils
 if not conf.ON_HEROKU:
     from flask.ext.mail import Message
     from pyaggr3g470r import mail
-
-from pyaggr3g470r import app, db
-from pyaggr3g470r.models import User, Feed, Article
+
+from pyaggr3g470r import db
+from pyaggr3g470r.models import User, Article
 
 import log
 pyaggr3g470r_log = log.Log("feedgetter")
@@ -75,6 +73,7 @@ class TooLong(Exception):
         """
         pyaggr3g470r_log.warning("Greenlet took to long")
 
+
 class FeedGetter(object):
     """
     This class is in charge of retrieving the feeds.
@@ -88,7 +87,7 @@ class FeedGetter(object):
             self.proxy = urllib2.ProxyHandler({})
             self.proxies = {}
         else:
-            self.proxy = urllib2.ProxyHandler({"http" : conf.HTTP_PROXY, \
+            self.proxy = urllib2.ProxyHandler({"http": conf.HTTP_PROXY,
                                                "https": conf.HTTP_PROXY})
             self.proxies = {
                 "http": "http://" + conf.HTTP_PROXY,
@@ -110,7 +109,8 @@ class FeedGetter(object):
         feeds = [feed for feed in feeds if feed.id == feed_id]
 
         # 2 - Fetch the feeds.
-        # 'responses' contains all the jobs returned by the function retrieve_async()
+        # 'responses' contains all the jobs returned by
+        # the function retrieve_async()
         responses = self.retrieve_async(feeds)
         elements = [item.value for item in responses if item.value is not None]
 
@@ -133,7 +133,7 @@ class FeedGetter(object):
         Fetch a feed.
         """
         pyaggr3g470r_log.info("Fetching the feed:" + feed.title)
-        a_feed = feedparser.parse(feed.link, handlers = [self.proxy])
+        a_feed = feedparser.parse(feed.link, handlers=[self.proxy])
         if a_feed['entries'] == []:
             return
 
@@ -155,14 +155,20 @@ class FeedGetter(object):
             nice_url = article.link.encode("utf-8")
             if conf.RESOLVE_ARTICLE_URL:
                 try:
-                    # resolves URL behind proxies (like feedproxy.google.com)
-                    r = requests.get(article.link, timeout=5.0, proxies=self.proxies)
+                    # resolves URL behind proxies
+                    # (like feedproxy.google.com)
+                    r = requests.get(article.link, timeout=5.0,
+                                     proxies=self.proxies)
                     nice_url = r.url.encode("utf-8")
                 except Timeout:
-                    pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,))
+                    pyaggr3g470r_log.\
+                        warning("Timeout when getting the real URL of %s." %
+                                (article.link,))
                     continue
                 except Exception as e:
-                    pyaggr3g470r_log.warning("Unable to get the real URL of %s. Error: %s" % (article.link, str(e)))
+                    pyaggr3g470r_log.\
+                        warning("Unable to get the real URL of %s. Error: %s" %
+                                (article.link, str(e)))
                     continue
             # remove utm_* parameters
             nice_url = utils.clean_url(nice_url)
@@ -181,7 +187,7 @@ class FeedGetter(object):
             try:
                 description = BeautifulSoup(description, "html.parser").decode()
                 article_title = BeautifulSoup(article.title, "html.parser").decode()
-            except Exception as E:
+            except Exception:
                 pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
                 article_title = article.title
 
@@ -215,7 +221,9 @@ class FeedGetter(object):
 
         for article in articles:
 
-            exist = Article.query.filter(Article.user_id == self.user.id, Article.feed_id == feed.id, Article.link == article.link).first()
+            exist = Article.query.filter(Article.user_id == self.user.id,
+                                         Article.feed_id == feed.id,
+                                         Article.link == article.link).first()
             if exist != None:
                 pyaggr3g470r_log.error("Article %s (%s) already in the database." % (article.title, article.link))
                 continue
@@ -242,9 +250,11 @@ class FeedGetter(object):
         pyaggr3g470r_log.info("Indexing new articles.")
         for feed, articles in elements:
             for element in articles:
-                article = Article.query.filter(Article.user_id == self.user.id, Article.link == element.link).first()
+                article = Article.query.filter(Article.user_id == self.user.id,
+                                               Article.link == element.link).first()
                 try:
-                    fastsearch.add_to_index(self.user.id, [article], article.source)
+                    fastsearch.add_to_index(self.user.id, [article],
+                                            article.source)
                 except:
                     pyaggr3g470r_log.error("Problem during indexation.")
         return True
\ No newline at end of file
```
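
Most of this cleanup is PEP 8 line wrapping; the one block with real logic is the URL resolution behind feed proxies (requests.get with a 5-second timeout, then stripping utm_* parameters). Below is a minimal, self-contained sketch of that behaviour, assuming Python 3 and the requests library; the function names and the utm_* stripping helper are illustrative stand-ins, not the project's actual utils.clean_url() or logger.

```python
# Standalone sketch (not part of the commit): approximates what the reworked
# fetch loop does for each entry -- follow redirects to the real article URL,
# then drop utm_* tracking parameters. All names here are hypothetical.
import requests
from requests.exceptions import Timeout
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode


def clean_url(url):
    """Remove utm_* query parameters, similar in spirit to utils.clean_url()."""
    parts = urlparse(url)
    query = [(k, v) for k, v in parse_qsl(parts.query)
             if not k.startswith("utm_")]
    return urlunparse(parts._replace(query=urlencode(query)))


def resolve_article_url(link, proxies=None):
    """Resolve a URL behind a feed proxy (e.g. feedproxy.google.com)."""
    try:
        # Short timeout so one slow redirect does not stall the whole fetch,
        # mirroring the timeout=5.0 used in the patched crawler.
        r = requests.get(link, timeout=5.0, proxies=proxies)
        nice_url = r.url
    except Timeout:
        print("Timeout when getting the real URL of %s." % link)
        return None
    except Exception as e:
        print("Unable to get the real URL of %s. Error: %s" % (link, e))
        return None
    return clean_url(nice_url)


if __name__ == "__main__":
    # Example URL only; any redirecting link will do.
    print(resolve_article_url("https://feedproxy.google.com/~r/example/feed"))
```

As in the patched crawler, failures are logged and the entry is skipped rather than aborting the whole feed fetch.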