aboutsummaryrefslogtreecommitdiff
path: root/src/crawler/default_crawler.py
diff options
context:
space:
mode:
author: Cédric Bonhomme <cedric@cedricbonhomme.org> 2019-05-21 23:50:21 +0200
committer: Cédric Bonhomme <cedric@cedricbonhomme.org> 2019-05-21 23:50:21 +0200
commit: 31577e696b24516697b2a15cb123e34b41098d3e (patch)
tree: 18dffb4b0028d8c0c9dbec4440c091c13ac2db5f /src/crawler/default_crawler.py
parent: Improved crawler. (diff)
download: newspipe-31577e696b24516697b2a15cb123e34b41098d3e.tar.gz
newspipe-31577e696b24516697b2a15cb123e34b41098d3e.tar.bz2
newspipe-31577e696b24516697b2a15cb123e34b41098d3e.zip
Do not update existing articles.
Diffstat (limited to 'src/crawler/default_crawler.py')
-rw-r--r-- src/crawler/default_crawler.py | 20
1 file changed, 0 insertions, 20 deletions
diff --git a/src/crawler/default_crawler.py b/src/crawler/default_crawler.py
index bba8431a..e511eb9d 100644
--- a/src/crawler/default_crawler.py
+++ b/src/crawler/default_crawler.py
@@ -141,26 +141,6 @@ async def insert_articles(queue, nḅ_producers=1):
exist = existing_article_req.count() != 0
if exist:
continue
- # if the article has been already retrieved, we only update
- # the content or the title
- logger.info('Article already in the database: {}'. \
- format(article['link']))
- existing_article = existing_article_req.first()
-
- if new_article['date'].replace(tzinfo=None) != \
- existing_article.date:
- existing_article.date = new_article['date']
- existing_article.updated_date = new_article['date']
- if existing_article.title != new_article['title']:
- existing_article.title = new_article['title']
- content = get_article_content(article)
- if existing_article.content != content:
- existing_article.content = content
- existing_article.readed = False
- art_contr.update({'entry_id': existing_article.entry_id},
- existing_article.dump())
- logger.info('Article updated: {}'.format(article['link']))
- continue
# insertion of the new article
try:
bgstack15