Diffstat (limited to 'pyaggr3g470r/crawler.py')
-rw-r--r--  pyaggr3g470r/crawler.py  65
1 file changed, 32 insertions(+), 33 deletions(-)
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 0393ae31..072a8f25 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -68,28 +68,20 @@ import log
pyaggr3g470r_log = log.Log("feedgetter")
-
-
-
-HEADERS = {'User-Agent': conf.USER_AGENT}
-
class TooLong(Exception):
def __init__(self):
"""
Log when a greenlet took too long to fetch a resource.
"""
- pass #logger.warning("Greenlet took to long")
-
+ pyaggr3g470r_log.warning("Greenlet took too long")
class FeedGetter(object):
"""
- This class is in charge of retrieving feeds listed in ./var/feed.lst.
- This class uses feedparser module from Mark Pilgrim.
- For each feed a new thread is launched.
+ This class is in charge of retrieving the feeds.
"""
def __init__(self, email):
"""
- Initializes the database connection.
+ Initialization.
"""
feedparser.USER_AGENT = conf.USER_AGENT
if conf.HTTP_PROXY == "":
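Note on the hunk above: TooLong now logs its warning from __init__, so the message is emitted when the exception object is created. A minimal sketch of how such an exception is typically wired to a gevent timeout around a feed fetch; the use of gevent and the FETCH_TIMEOUT value are assumptions for illustration, not part of this patch.

import gevent
import feedparser

FETCH_TIMEOUT = 30  # hypothetical value, not taken from pyaggr3g470r's configuration

def guarded_parse(url):
    """Parse a feed, aborting (and logging via TooLong) if it takes too long."""
    try:
        # passing the class means TooLong() is instantiated, and therefore
        # logs its warning, only when the timeout actually fires
        with gevent.Timeout(FETCH_TIMEOUT, TooLong):
            return feedparser.parse(url)
    except TooLong:
        return None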
@@ -105,16 +97,37 @@ class FeedGetter(object):
feedparser.USER_AGENT = conf.USER_AGENT
self.user = User.query.filter(User.email == email).first()
+ def retrieve_feed(self, feed_id=None):
+ """
+ Launch the retrieval process.
+ """
+ pyaggr3g470r_log.info("Starting to retrieve feeds.")
+
+ # 1 - Get the list of feeds to fetch
+ user = User.query.filter(User.email == self.user.email).first()
+ feeds = [feed for feed in user.feeds if feed.enabled]
+ if feed_id is not None:
+ feeds = [feed for feed in feeds if feed.id == feed_id]
+
+ # 2 - Fetch the feeds.
+ # 'responses' contains all the jobs returned by the function retrieve_async()
+ responses = self.retrieve_async(feeds)
+
+ # 3 - Insert articles in the database
+ self.insert_database([item.value for item in responses if item.value is not None])
+
+ pyaggr3g470r_log.info("All articles retrieved. End of the processus.")
+
def retrieve_async(self, feeds):
"""
- Spawn different jobs in order to retrieve a list of distant resources.
- Returns a list of models.Item objects.
+ Spawn different jobs in order to retrieve a list of feeds.
+ Returns a list of jobs.
"""
def fetch(feed):
"""
- Fetch the content located at 'wsw_item.url'.
+ Fetch a feed.
"""
- pyaggr3g470r_log.info("Fetching " + feed.title)
+ pyaggr3g470r_log.info("Fetching the feed:" + feed.title)
a_feed = feedparser.parse(feed.link, handlers = [self.proxy])
if a_feed['entries'] == []:
return
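Note on the hunk above: the new retrieve_feed() keeps only the jobs whose item.value is not None, which matches gevent's Greenlet interface. A minimal sketch of the spawn/join pattern retrieve_async() appears to rely on, with a hypothetical fetch callable and timeout standing in for the parts this hunk does not show.

import gevent

def retrieve_async_sketch(feeds, fetch, timeout=10):
    # one greenlet per feed; each greenlet's .value holds fetch()'s return value
    jobs = [gevent.spawn(fetch, feed) for feed in feeds]
    # wait for all fetches, or give up after the (hypothetical) timeout
    gevent.joinall(jobs, timeout=timeout)
    # greenlets whose fetch() returned nothing keep value None, which is why
    # retrieve_feed() filters on `item.value is not None`
    return jobs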
@@ -164,7 +177,7 @@ class FeedGetter(object):
description = BeautifulSoup(description, "html.parser").decode()
article_title = BeautifulSoup(article.title, "html.parser").decode()
except Exception as E:
- #pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
+ pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
article_title = article.title
try:
@@ -172,14 +185,13 @@ class FeedGetter(object):
except:
post_date = datetime(*article.updated_parsed[:6])
- # save the article
+ # create the models.Article object and append it to the list of articles
article = Article(link=nice_url, title=article_title, \
content=description, readed=False, like=False, date=post_date, \
user_id=self.user.id, feed_id=feed.id)
articles.append(article)
-
-
+ # return the feed with the list of retrieved articles
return feed, articles
jobs = []
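Note on the hunk above: only the fallback branch of the date handling is visible (updated_parsed), but feedparser conventionally exposes a published_parsed struct as well. A self-contained sketch of that conversion, using a hypothetical entry object rather than names from the patch.

from datetime import datetime

def entry_date(entry):
    # feedparser exposes parsed dates as time.struct_time tuples,
    # which datetime() can unpack directly
    try:
        return datetime(*entry.published_parsed[:6])
    except (AttributeError, TypeError):
        # some feeds only carry an update date, as in the fallback above
        return datetime(*entry.updated_parsed[:6])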
@@ -217,17 +229,4 @@ class FeedGetter(object):
continue
db.session.close()
return True
-
-
- def retrieve_feed(self, feed_id=None):
- """
- Launch
- """
- user = User.query.filter(User.email == self.user.email).first()
- feeds = [feed for feed in user.feeds if feed.enabled]
- if feed_id != None:
- feeds = [feed for feed in feeds if feed.id == feed_id]
-
- responses = self.retrieve_async(feeds)
-
- self.insert_database([item.value for item in responses if item.value is not None]) \ No newline at end of file
+ \ No newline at end of file
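Note on the final hunk: it only shows the tail of insert_database() (skipping a problematic article, closing the session, returning True) while the old retrieve_feed() is removed. A hedged sketch of the session handling such a loop typically needs, assuming Flask-SQLAlchemy's db object and the (feed, articles) tuples produced by retrieve_async(); the function name and the IntegrityError handling are illustrative, not taken from the patch.

from sqlalchemy.exc import IntegrityError

def insert_articles_sketch(db, elements):
    for feed, articles in elements:
        for article in articles:
            try:
                db.session.add(article)
                db.session.commit()
            except IntegrityError:
                # e.g. the article already exists for this user and feed
                db.session.rollback()
                continue
    db.session.close()
    return True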