From 955b126903eb46065a1e90a986e6af197481437b Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Thu, 3 Nov 2016 13:39:27 +0100 Subject: Some improvements for the manager and the asyncio crawler. --- src/crawler/classic_crawler.py | 19 ++++++++++--------- src/manager.py | 9 ++++++++- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/crawler/classic_crawler.py b/src/crawler/classic_crawler.py index 4f21a29f..dac34e8c 100644 --- a/src/crawler/classic_crawler.py +++ b/src/crawler/classic_crawler.py @@ -118,7 +118,6 @@ async def insert_database(user, feed): logger.info('Inserting articles for {}'.format(feed.title)) - logger.info('Database insertion for {}'.format(feed.title)) new_articles = [] art_contr = ArticleController(user.id) for article in articles: @@ -133,8 +132,8 @@ async def insert_database(user, feed): if exist: # if the article has been already retrieved, we only update # the content or the title - logger.debug("Article %r (%r) already in the database.", - article['title'], article['link']) + logger.debug('Article already in the database: '. \ + format(article['title'])) existing_article = existing_article_req.first() new_updated_date = None try: @@ -187,11 +186,13 @@ def retrieve_feed(loop, user, feed_id=None): logger.info('Starting to retrieve feeds for {}'.format(user.nickname)) # Get the list of feeds to fetch - user = User.query.filter(User.email == user.email).first() - feeds = [feed for feed in user.feeds if - feed.error_count <= conf.DEFAULT_MAX_ERROR and feed.enabled] + filters = {} + filters['user_id'] = user.id if feed_id is not None: - feeds = [feed for feed in feeds if feed.id == feed_id] + filters['id'] = feed_id + filters['enabled'] = True + filters['error_count__lt'] = conf.DEFAULT_MAX_ERROR + feeds = FeedController().read(**filters).all() if feeds == []: return @@ -203,5 +204,5 @@ def retrieve_feed(loop, user, feed_id=None): loop.run_until_complete(asyncio.wait(tasks)) except Exception: logger.exception('an error occured') - - logger.info("All articles retrieved. End of the processus.") + finally: + logger.info('Articles retrieved for {}'.format(user.nickname)) diff --git a/src/manager.py b/src/manager.py index 47a88339..46f8fe10 100755 --- a/src/manager.py +++ b/src/manager.py @@ -3,6 +3,7 @@ import os import logging +from datetime import datetime from werkzeug import generate_password_hash from bootstrap import application, db, conf, set_logging from flask_script import Manager @@ -67,11 +68,17 @@ def fetch_asyncio(user_id=None, feed_id=None): except: feed_id = None + logger.info('Starting crawler.') + + start = datetime.now() loop = asyncio.get_event_loop() for user in users: - logger.info("Fetching articles for " + user.nickname) classic_crawler.retrieve_feed(loop, user, feed_id) loop.close() + end = datetime.now() + + logger.info('Crawler finished in {} seconds.' \ + .format((end - start).seconds)) if __name__ == '__main__': -- cgit