diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/crawler/classic_crawler.py | 32 |
-rw-r--r-- | src/web/lib/article_utils.py   |  9 |
-rw-r--r-- | src/web/models/article.py      |  2 |
3 files changed, 29 insertions, 14 deletions
diff --git a/src/crawler/classic_crawler.py b/src/crawler/classic_crawler.py index b21a7859..1628852f 100644 --- a/src/crawler/classic_crawler.py +++ b/src/crawler/classic_crawler.py @@ -122,20 +122,28 @@ async def insert_database(user, feed): **extract_id(article)) exist = existing_article_req.count() != 0 if exist: - existing_article = existing_article_req.first() - is_updated = False + # if the article has been already retrieved, we only update + # the content or the title logger.debug("Article %r (%r) already in the database.", article['title'], article['link']) - content = get_article_content(article) - if existing_article.title != article['title']: - existing_article.title = article['title'] - is_updated = True - if existing_article.content != content: - existing_article.content = content - existing_article.readed = False - is_updated = True - if is_updated: - art_contr.update({'entry_id': existing_article.entry_id}, existing_article.dump()) + existing_article = existing_article_req.first() + new_updated_date = None + try: + new_updated_date = dateutil.parser.parse(article['updated']) + except Exception as e: + print(e)#new_updated_date = existing_article.date + if existing_article.updated_date.strftime('%Y-%m-%dT%H:%M:%S') != \ + new_updated_date.strftime('%Y-%m-%dT%H:%M:%S'): + existing_article.updated_date = \ + new_updated_date.replace(tzinfo=None) + if existing_article.title != article['title']: + existing_article.title = article['title'] + content = get_article_content(article) + if existing_article.content != content: + existing_article.content = content + existing_article.readed = False + art_contr.update({'entry_id': existing_article.entry_id}, + existing_article.dump()) continue article = construct_article(article, feed) try: diff --git a/src/web/lib/article_utils.py b/src/web/lib/article_utils.py index 328c27ab..176f6a98 100644 --- a/src/web/lib/article_utils.py +++ b/src/web/lib/article_utils.py @@ -37,7 +37,7 @@ def construct_article(entry, 
feed): feed = feed.dump() "Safe method to transorm a feedparser entry into an article" now = datetime.now() - date = None + date, updated_date = None, None for date_key in ('published', 'updated'): if entry.get(date_key): try: @@ -46,6 +46,10 @@ def construct_article(entry, feed): pass else: break + try: + updated_date = dateutil.parser.parse(entry['updated']) + except Exception: + pass content = get_article_content(entry) article_link = entry.get('link') @@ -67,7 +71,8 @@ def construct_article(entry, feed): 'readed': False, 'like': False, 'content': content, 'retrieved_date': now.isoformat(), - 'date': (date or now).isoformat()} + 'date': (date or now).isoformat(), + 'updated_date': (updated_date or date or now).isoformat()} def get_article_content(entry): content = '' diff --git a/src/web/models/article.py b/src/web/models/article.py index 6c2df462..3f1db731 100644 --- a/src/web/models/article.py +++ b/src/web/models/article.py @@ -41,6 +41,7 @@ class Article(db.Model): readed = db.Column(db.Boolean(), default=False) like = db.Column(db.Boolean(), default=False) date = db.Column(db.DateTime(), default=datetime.now) + updated_date = db.Column(db.DateTime(), default=datetime.now) retrieved_date = db.Column(db.DateTime(), default=datetime.now) user_id = db.Column(db.Integer(), db.ForeignKey('user.id')) @@ -78,6 +79,7 @@ class Article(db.Model): "readed": self.readed, "like": self.like, "date": self.date, + "updated_date": self.updated_date, "retrieved_date": self.retrieved_date, "feed_id": self.feed_id, "category_id": self.category_id} |