From 47159b1e3818d1a5f1f4eeca15356e99a12c9922 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Tue, 9 Feb 2016 23:05:50 +0100 Subject: The classic crawler is now able to update an article. Only the content and the title are updatable, as discussed in #27. The notification system still needs to be implemented, using the boolean that is set when the content of an article has been updated. --- src/crawler/classic_crawler.py | 20 +++++++++++++++++--- src/web/lib/article_utils.py | 14 +++++++++----- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/crawler/classic_crawler.py b/src/crawler/classic_crawler.py index 1bc37a47..b21a7859 100644 --- a/src/crawler/classic_crawler.py +++ b/src/crawler/classic_crawler.py @@ -38,7 +38,8 @@ from bootstrap import db from web.models import User from web.controllers import FeedController, ArticleController from web.lib.feed_utils import construct_feed_from, is_parsing_ok -from web.lib.article_utils import construct_article, extract_id +from web.lib.article_utils import construct_article, extract_id, \ + get_article_content logger = logging.getLogger(__name__) @@ -117,11 +118,24 @@ async def insert_database(user, feed): new_articles = [] art_contr = ArticleController(user.id) for article in articles: - exist = art_contr.read(feed_id=feed.id, - **extract_id(article)).count() != 0 + existing_article_req = art_contr.read(feed_id=feed.id, + **extract_id(article)) + exist = existing_article_req.count() != 0 if exist: + existing_article = existing_article_req.first() + is_updated = False logger.debug("Article %r (%r) already in the database.", article['title'], article['link']) + content = get_article_content(article) + if existing_article.title != article['title']: + existing_article.title = article['title'] + is_updated = True + if existing_article.content != content: + existing_article.content = content + existing_article.readed = False + is_updated = True + if is_updated: + art_contr.update({'entry_id': existing_article.entry_id}, 
existing_article.dump()) continue article = construct_article(article, feed) try: diff --git a/src/web/lib/article_utils.py b/src/web/lib/article_utils.py index 02ca2cd1..328c27ab 100644 --- a/src/web/lib/article_utils.py +++ b/src/web/lib/article_utils.py @@ -46,11 +46,7 @@ def construct_article(entry, feed): pass else: break - content = '' - if entry.get('content'): - content = entry['content'][0]['value'] - elif entry.get('summary'): - content = entry['summary'] + content = get_article_content(entry) article_link = entry.get('link') if conf.RESOLVE_ARTICLE_URL and article_link: @@ -72,3 +68,11 @@ def construct_article(entry, feed): 'content': content, 'retrieved_date': now.isoformat(), 'date': (date or now).isoformat()} + +def get_article_content(entry): + content = '' + if entry.get('content'): + content = entry['content'][0]['value'] + elif entry.get('summary'): + content = entry['summary'] + return content -- cgit