From 2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Tue, 8 Nov 2016 14:39:47 +0100 Subject: various improvements to the crawler (better use of coroutines, test if an article should be updated). tags are now retrieved for the k-means clustering (previously achieved with the content of articles) --- src/crawler/classic_crawler.py | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'src/crawler') diff --git a/src/crawler/classic_crawler.py b/src/crawler/classic_crawler.py index dac34e8c..7d29d462 100644 --- a/src/crawler/classic_crawler.py +++ b/src/crawler/classic_crawler.py @@ -30,7 +30,7 @@ import asyncio import logging import feedparser import dateutil.parser -from datetime import datetime +from datetime import datetime, timezone from sqlalchemy import or_ import conf @@ -111,7 +111,6 @@ async def parse_feed(user, feed): async def insert_database(user, feed): - articles = await parse_feed(user, feed) if None is articles: return [] @@ -121,48 +120,41 @@ async def insert_database(user, feed): new_articles = [] art_contr = ArticleController(user.id) for article in articles: + new_article = await construct_article(article, feed) + try: existing_article_req = art_contr.read(feed_id=feed.id, - **extract_id(article)) + entry_id=extract_id(article)) except Exception as e: logger.exception("existing_article_req: " + str(e)) continue - exist = existing_article_req.count() != 0 if exist: # if the article has been already retrieved, we only update # the content or the title - logger.debug('Article already in the database: '. \ - format(article['title'])) + logger.info('Article already in the database: {}'. 
\ + format(article['link'])) existing_article = existing_article_req.first() - new_updated_date = None - try: - new_updated_date = dateutil.parser.parse(article['updated']) - except Exception as e: - new_updated_date = existing_article.date - logger.exception('new_updated_date failed: {}'.format(e)) - - if None is existing_article.updated_date: - existing_article.updated_date = new_updated_date.replace(tzinfo=None) - if existing_article.updated_date.strftime('%Y-%m-%dT%H:%M:%S') != \ - new_updated_date.strftime('%Y-%m-%dT%H:%M:%S'): - logger.info('article updated') - existing_article.updated_date = \ - new_updated_date.replace(tzinfo=None) - if existing_article.title != article['title']: - existing_article.title = article['title'] + + if new_article['date'].replace(tzinfo=None) != \ + existing_article.date: + existing_article.date = new_article['date'] + existing_article.updated_date = new_article['date'] + if existing_article.title != new_article['title']: + existing_article.title = new_article['title'] content = get_article_content(article) if existing_article.content != content: existing_article.content = content existing_article.readed = False art_contr.update({'entry_id': existing_article.entry_id}, existing_article.dump()) + logger.info('Article updated: {}'.format(article['link'])) continue + # insertion of the new article - article = construct_article(article, feed) try: - new_articles.append(art_contr.create(**article)) - logger.info('New article added: {}'.format(article['link'])) + new_articles.append(art_contr.create(**new_article)) + logger.info('New article added: {}'.format(new_article['link'])) except Exception: logger.exception('Error when inserting article in database:') continue -- cgit