From 646640da5cec286ed59a99af8039f5f5feabec47 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Sat, 13 Feb 2016 10:59:08 +0100
Subject: The date of modification of articles is now used to detect changes.

---
 ...686_add_updated_date_column_to_article_table.py | 24 +++++++++++++
 src/crawler/classic_crawler.py                     | 40 +++++++++++++++-------
 src/web/lib/article_utils.py                       |  9 +++--
 src/web/models/article.py                          |  2 +
 4 files changed, 61 insertions(+), 14 deletions(-)
 create mode 100644 migrations/versions/7f973d010686_add_updated_date_column_to_article_table.py

diff --git a/migrations/versions/7f973d010686_add_updated_date_column_to_article_table.py b/migrations/versions/7f973d010686_add_updated_date_column_to_article_table.py
new file mode 100644
index 00000000..6e12964f
--- /dev/null
+++ b/migrations/versions/7f973d010686_add_updated_date_column_to_article_table.py
@@ -0,0 +1,24 @@
+"""add updated_date column to article table
+
+Revision ID: 7f973d010686
+Revises: 25ca960a207
+Create Date: 2016-02-12 21:51:40.868539
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = '7f973d010686'
+down_revision = '25ca960a207'
+branch_labels = None
+depends_on = None
+
+from alembic import op
+import sqlalchemy as sa
+
+
+def upgrade():
+    op.add_column('article', sa.Column('updated_date', sa.DateTime(), nullable=True))
+
+
+def downgrade():
+    op.drop_column('article', 'updated_date')
diff --git a/src/crawler/classic_crawler.py b/src/crawler/classic_crawler.py
index b21a7859..1628852f 100644
--- a/src/crawler/classic_crawler.py
+++ b/src/crawler/classic_crawler.py
@@ -122,20 +122,36 @@ async def insert_database(user, feed):
                         **extract_id(article))
         exist = existing_article_req.count() != 0
         if exist:
-            existing_article = existing_article_req.first()
-            is_updated = False
+            # if the article has been already retrieved, we only update
+            # the content or the title
             logger.debug("Article %r (%r) already in the database.",
                          article['title'], article['link'])
-            content = get_article_content(article)
-            if existing_article.title != article['title']:
-                existing_article.title = article['title']
-                is_updated = True
-            if existing_article.content != content:
-                existing_article.content = content
-                existing_article.readed = False
-                is_updated = True
-            if is_updated:
-                art_contr.update({'entry_id': existing_article.entry_id}, existing_article.dump())
+            existing_article = existing_article_req.first()
+            try:
+                new_updated_date = dateutil.parser.parse(article['updated'])
+            except Exception as e:
+                # no usable modification date in the entry: fall back to
+                # the publication date already stored for the article
+                logger.debug("No usable 'updated' date for %r: %r",
+                             article['link'], e)
+                new_updated_date = existing_article.date
+            date_format = '%Y-%m-%dT%H:%M:%S'
+            stored_date = existing_article.updated_date
+            # articles saved before the updated_date column existed have
+            # no value yet: always refresh them
+            if stored_date is None or \
+                    stored_date.strftime(date_format) != \
+                    new_updated_date.strftime(date_format):
+                existing_article.updated_date = \
+                    new_updated_date.replace(tzinfo=None)
+                if existing_article.title != article['title']:
+                    existing_article.title = article['title']
+                content = get_article_content(article)
+                if existing_article.content != content:
+                    existing_article.content = content
+                    existing_article.readed = False
+                art_contr.update({'entry_id': existing_article.entry_id},
+                                 existing_article.dump())
             continue
         article = construct_article(article, feed)
         try:
diff --git a/src/web/lib/article_utils.py b/src/web/lib/article_utils.py
index 328c27ab..176f6a98 100644
--- a/src/web/lib/article_utils.py
+++ b/src/web/lib/article_utils.py
@@ -37,7 +37,7 @@ def construct_article(entry, feed):
         feed = feed.dump()
     "Safe method to transorm a feedparser entry into an article"
     now = datetime.now()
-    date = None
+    date, updated_date = None, None
     for date_key in ('published', 'updated'):
         if entry.get(date_key):
             try:
@@ -46,6 +46,10 @@ def construct_article(entry, feed):
                 pass
             else:
                 break
+    try:
+        updated_date = dateutil.parser.parse(entry['updated'])
+    except Exception:
+        pass
     content = get_article_content(entry)
     article_link = entry.get('link')
 
@@ -67,7 +71,8 @@ def construct_article(entry, feed):
             'readed': False, 'like': False,
             'content': content,
             'retrieved_date': now.isoformat(),
-            'date': (date or now).isoformat()}
+            'date': (date or now).isoformat(),
+            'updated_date': (updated_date or date or now).isoformat()}
 
 def get_article_content(entry):
     content = ''
diff --git a/src/web/models/article.py b/src/web/models/article.py
index 6c2df462..3f1db731 100644
--- a/src/web/models/article.py
+++ b/src/web/models/article.py
@@ -41,6 +41,7 @@ class Article(db.Model):
     readed = db.Column(db.Boolean(), default=False)
     like = db.Column(db.Boolean(), default=False)
     date = db.Column(db.DateTime(), default=datetime.now)
+    updated_date = db.Column(db.DateTime(), default=datetime.now)
     retrieved_date = db.Column(db.DateTime(), default=datetime.now)
 
     user_id = db.Column(db.Integer(), db.ForeignKey('user.id'))
@@ -78,6 +79,7 @@ class Article(db.Model):
                 "readed": self.readed,
                 "like": self.like,
                 "date": self.date,
+                "updated_date": self.updated_date,
                 "retrieved_date": self.retrieved_date,
                 "feed_id": self.feed_id,
                 "category_id": self.category_id}
-- 
cgit