aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--migrations/versions/7f973d010686_add_updated_date_column_to_article_table.py24
-rw-r--r--src/crawler/classic_crawler.py32
-rw-r--r--src/web/lib/article_utils.py9
-rw-r--r--src/web/models/article.py2
4 files changed, 53 insertions, 14 deletions
diff --git a/migrations/versions/7f973d010686_add_updated_date_column_to_article_table.py b/migrations/versions/7f973d010686_add_updated_date_column_to_article_table.py
new file mode 100644
index 00000000..6e12964f
--- /dev/null
+++ b/migrations/versions/7f973d010686_add_updated_date_column_to_article_table.py
@@ -0,0 +1,24 @@
+"""add updated_date column to article table
+
+Revision ID: 7f973d010686
+Revises: 25ca960a207
+Create Date: 2016-02-12 21:51:40.868539
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = '7f973d010686'
+down_revision = '25ca960a207'
+branch_labels = None
+depends_on = None
+
+from alembic import op
+import sqlalchemy as sa
+
+
+def upgrade():  # apply this revision: add the 'updated_date' column to the 'article' table
+ op.add_column('article', sa.Column('updated_date', sa.DateTime(), nullable=True))  # nullable, no default is supplied here
+
+
+def downgrade():  # revert this revision: undo the column addition made by upgrade()
+ op.drop_column('article', 'updated_date')  # removes the 'updated_date' column from 'article'
diff --git a/src/crawler/classic_crawler.py b/src/crawler/classic_crawler.py
index b21a7859..1628852f 100644
--- a/src/crawler/classic_crawler.py
+++ b/src/crawler/classic_crawler.py
@@ -122,20 +122,28 @@ async def insert_database(user, feed):
**extract_id(article))
exist = existing_article_req.count() != 0
if exist:
- existing_article = existing_article_req.first()
- is_updated = False
+ # if the article has been already retrieved, we only update
+ # the content or the title
logger.debug("Article %r (%r) already in the database.",
article['title'], article['link'])
- content = get_article_content(article)
- if existing_article.title != article['title']:
- existing_article.title = article['title']
- is_updated = True
- if existing_article.content != content:
- existing_article.content = content
- existing_article.readed = False
- is_updated = True
- if is_updated:
- art_contr.update({'entry_id': existing_article.entry_id}, existing_article.dump())
+ existing_article = existing_article_req.first()
+ new_updated_date = None
+ try:
+ new_updated_date = dateutil.parser.parse(article['updated'])
+ except Exception as e:
+ print(e)#new_updated_date = existing_article.date
+ if existing_article.updated_date.strftime('%Y-%m-%dT%H:%M:%S') != \
+ new_updated_date.strftime('%Y-%m-%dT%H:%M:%S'):
+ existing_article.updated_date = \
+ new_updated_date.replace(tzinfo=None)
+ if existing_article.title != article['title']:
+ existing_article.title = article['title']
+ content = get_article_content(article)
+ if existing_article.content != content:
+ existing_article.content = content
+ existing_article.readed = False
+ art_contr.update({'entry_id': existing_article.entry_id},
+ existing_article.dump())
continue
article = construct_article(article, feed)
try:
diff --git a/src/web/lib/article_utils.py b/src/web/lib/article_utils.py
index 328c27ab..176f6a98 100644
--- a/src/web/lib/article_utils.py
+++ b/src/web/lib/article_utils.py
@@ -37,7 +37,7 @@ def construct_article(entry, feed):
feed = feed.dump()
"Safe method to transorm a feedparser entry into an article"
now = datetime.now()
- date = None
+ date, updated_date = None, None
for date_key in ('published', 'updated'):
if entry.get(date_key):
try:
@@ -46,6 +46,10 @@ def construct_article(entry, feed):
pass
else:
break
+ try:
+ updated_date = dateutil.parser.parse(entry['updated'])
+ except Exception:
+ pass
content = get_article_content(entry)
article_link = entry.get('link')
@@ -67,7 +71,8 @@ def construct_article(entry, feed):
'readed': False, 'like': False,
'content': content,
'retrieved_date': now.isoformat(),
- 'date': (date or now).isoformat()}
+ 'date': (date or now).isoformat(),
+ 'updated_date': (updated_date or date or now).isoformat()}
def get_article_content(entry):
content = ''
diff --git a/src/web/models/article.py b/src/web/models/article.py
index 6c2df462..3f1db731 100644
--- a/src/web/models/article.py
+++ b/src/web/models/article.py
@@ -41,6 +41,7 @@ class Article(db.Model):
readed = db.Column(db.Boolean(), default=False)
like = db.Column(db.Boolean(), default=False)
date = db.Column(db.DateTime(), default=datetime.now)
+ updated_date = db.Column(db.DateTime(), default=datetime.now)
retrieved_date = db.Column(db.DateTime(), default=datetime.now)
user_id = db.Column(db.Integer(), db.ForeignKey('user.id'))
@@ -78,6 +79,7 @@ class Article(db.Model):
"readed": self.readed,
"like": self.like,
"date": self.date,
+ "updated_date": self.updated_date,
"retrieved_date": self.retrieved_date,
"feed_id": self.feed_id,
"category_id": self.category_id}
bgstack15