From 021f03931231142510864d013ec84492c1843ea6 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Sun, 13 Apr 2014 18:58:32 +0200
Subject: Updated comments and log messages.

---
 pyaggr3g470r/crawler.py | 65 ++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 33 deletions(-)

diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 0393ae31..072a8f25 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -68,28 +68,20 @@ import log
 
 pyaggr3g470r_log = log.Log("feedgetter")
 
-
-
-
-HEADERS = {'User-Agent': conf.USER_AGENT}
-
 class TooLong(Exception):
     def __init__(self):
         """
         Log a when greenlet took to long to fetch a resource.
         """
-        pass #logger.warning("Greenlet took to long")
-
+        pyaggr3g470r_log.warning("Greenlet took to long")
 
 class FeedGetter(object):
     """
-    This class is in charge of retrieving feeds listed in ./var/feed.lst.
-    This class uses feedparser module from Mark Pilgrim.
-    For each feed a new thread is launched.
+    This class is in charge of retrieving the feeds.
     """
     def __init__(self, email):
         """
-        Initializes the database connection.
+        Initialization.
         """
         feedparser.USER_AGENT = conf.USER_AGENT
         if conf.HTTP_PROXY == "":
@@ -105,16 +97,37 @@ class FeedGetter(object):
             feedparser.USER_AGENT = conf.USER_AGENT
         self.user = User.query.filter(User.email == email).first()
 
+    def retrieve_feed(self, feed_id=None):
+        """
+        Launch the processus.
+        """
+        pyaggr3g470r_log.info("Starting to retrieve feeds.")
+
+        # 1 - Get the list of feeds to fetch
+        user = User.query.filter(User.email == self.user.email).first()
+        feeds = [feed for feed in user.feeds if feed.enabled]
+        if feed_id != None:
+            feeds = [feed for feed in feeds if feed.id == feed_id]
+
+        # 2 - Fetch the feeds.
+        # 'responses' contains all the jobs returned by the function retrieve_async()
+        responses = self.retrieve_async(feeds)
+
+        # 3 - Insert articles in the database
+        self.insert_database([item.value for item in responses if item.value is not None])
+
+        pyaggr3g470r_log.info("All articles retrieved. End of the processus.")
+
     def retrieve_async(self, feeds):
         """
-        Spawn different jobs in order to retrieve a list of distant resources.
-        Returns a list of models.Item objects.
+        Spawn different jobs in order to retrieve a list of feeds.
+        Returns a list of jobs.
        """
         def fetch(feed):
             """
-            Fetch the content located at 'wsw_item.url'.
+            Fetch a feed.
             """
-            pyaggr3g470r_log.info("Fetching " + feed.title)
+            pyaggr3g470r_log.info("Fetching the feed:" + feed.title)
             a_feed = feedparser.parse(feed.link, handlers = [self.proxy])
             if a_feed['entries'] == []:
                 return
@@ -164,7 +177,7 @@ class FeedGetter(object):
                     description = BeautifulSoup(description, "html.parser").decode()
                     article_title = BeautifulSoup(article.title, "html.parser").decode()
                 except Exception as E:
-                    #pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
+                    pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
                     article_title = article.title
 
                 try:
@@ -172,14 +185,13 @@ class FeedGetter(object):
                 except:
                     post_date = datetime(*article.updated_parsed[:6])
 
-                # save the article
+                # create the models.Article object and append it to the list of articles
                 article = Article(link=nice_url, title=article_title, \
                                 content=description, readed=False, like=False, date=post_date, \
                                 user_id=self.user.id, feed_id=feed.id)
                 articles.append(article)
 
-
-
+            # return the feed with the list of retrieved articles
             return feed, articles
 
         jobs = []
@@ -217,17 +229,4 @@ class FeedGetter(object):
                 continue
         db.session.close()
         return True
-
-
-    def retrieve_feed(self, feed_id=None):
-        """
-        Launch
-        """
-        user = User.query.filter(User.email == self.user.email).first()
-        feeds = [feed for feed in user.feeds if feed.enabled]
-        if feed_id != None:
-            feeds = [feed for feed in feeds if feed.id == feed_id]
-
-        responses = self.retrieve_async(feeds)
-
-        self.insert_database([item.value for item in responses if item.value is not None])
\ No newline at end of file
+    
\ No newline at end of file
--
cgit