about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/crawler/classic_crawler.py | 20
-rw-r--r-- src/web/lib/article_utils.py | 14
2 files changed, 26 insertions, 8 deletions
diff --git a/src/crawler/classic_crawler.py b/src/crawler/classic_crawler.py
index 1bc37a47..b21a7859 100644
--- a/src/crawler/classic_crawler.py
+++ b/src/crawler/classic_crawler.py
@@ -38,7 +38,8 @@ from bootstrap import db
from web.models import User
from web.controllers import FeedController, ArticleController
from web.lib.feed_utils import construct_feed_from, is_parsing_ok
-from web.lib.article_utils import construct_article, extract_id
+from web.lib.article_utils import construct_article, extract_id, \
+ get_article_content
logger = logging.getLogger(__name__)
@@ -117,11 +118,24 @@ async def insert_database(user, feed):
new_articles = []
art_contr = ArticleController(user.id)
for article in articles:
- exist = art_contr.read(feed_id=feed.id,
- **extract_id(article)).count() != 0
+ existing_article_req = art_contr.read(feed_id=feed.id,
+ **extract_id(article))
+ exist = existing_article_req.count() != 0
if exist:
+ existing_article = existing_article_req.first()
+ is_updated = False
logger.debug("Article %r (%r) already in the database.",
article['title'], article['link'])
+ content = get_article_content(article)
+ if existing_article.title != article['title']:
+ existing_article.title = article['title']
+ is_updated = True
+ if existing_article.content != content:
+ existing_article.content = content
+ existing_article.readed = False
+ is_updated = True
+ if is_updated:
+ art_contr.update({'entry_id': existing_article.entry_id}, existing_article.dump())
continue
article = construct_article(article, feed)
try:
diff --git a/src/web/lib/article_utils.py b/src/web/lib/article_utils.py
index 02ca2cd1..328c27ab 100644
--- a/src/web/lib/article_utils.py
+++ b/src/web/lib/article_utils.py
@@ -46,11 +46,7 @@ def construct_article(entry, feed):
pass
else:
break
- content = ''
- if entry.get('content'):
- content = entry['content'][0]['value']
- elif entry.get('summary'):
- content = entry['summary']
+ content = get_article_content(entry)
article_link = entry.get('link')
if conf.RESOLVE_ARTICLE_URL and article_link:
@@ -72,3 +68,11 @@ def construct_article(entry, feed):
'content': content,
'retrieved_date': now.isoformat(),
'date': (date or now).isoformat()}
+
+def get_article_content(entry):
+ content = ''
+ if entry.get('content'):
+ content = entry['content'][0]['value']
+ elif entry.get('summary'):
+ content = entry['summary']
+ return content
bgstack15