diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2019-05-21 23:50:21 +0200 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2019-05-21 23:50:21 +0200 |
commit | 31577e696b24516697b2a15cb123e34b41098d3e (patch) | |
tree | 18dffb4b0028d8c0c9dbec4440c091c13ac2db5f | |
parent | Improved crawler. (diff) | |
download | newspipe-31577e696b24516697b2a15cb123e34b41098d3e.tar.gz newspipe-31577e696b24516697b2a15cb123e34b41098d3e.tar.bz2 newspipe-31577e696b24516697b2a15cb123e34b41098d3e.zip |
Do not update existing articles.
-rw-r--r-- | src/crawler/default_crawler.py | 20 |
1 file changed, 0 insertions, 20 deletions
diff --git a/src/crawler/default_crawler.py b/src/crawler/default_crawler.py index bba8431a..e511eb9d 100644 --- a/src/crawler/default_crawler.py +++ b/src/crawler/default_crawler.py @@ -141,26 +141,6 @@ async def insert_articles(queue, nḅ_producers=1): exist = existing_article_req.count() != 0 if exist: continue - # if the article has been already retrieved, we only update - # the content or the title - logger.info('Article already in the database: {}'. \ - format(article['link'])) - existing_article = existing_article_req.first() - - if new_article['date'].replace(tzinfo=None) != \ - existing_article.date: - existing_article.date = new_article['date'] - existing_article.updated_date = new_article['date'] - if existing_article.title != new_article['title']: - existing_article.title = new_article['title'] - content = get_article_content(article) - if existing_article.content != content: - existing_article.content = content - existing_article.readed = False - art_contr.update({'entry_id': existing_article.entry_id}, - existing_article.dump()) - logger.info('Article updated: {}'.format(article['link'])) - continue # insertion of the new article try: |