diff options
author | Cédric Bonhomme <kimble.mandel+bitbucket@gmail.com> | 2015-07-03 15:27:36 +0200 |
---|---|---|
committer | Cédric Bonhomme <kimble.mandel+bitbucket@gmail.com> | 2015-07-03 15:27:36 +0200 |
commit | 7f3c9790fcbe513436cb515b31b4ad81f2d10336 (patch) | |
tree | b621c1e4cbd1698bfcf52695e3cc8b547cfd5e9e /pyaggr3g470r/lib/crawler.py | |
parent | Minor improvements to the edit feed forms. (diff) | |
parent | fixing bug on reset link for feeds and hiding feed title for eXtraSmall device (diff) | |
download | newspipe-7f3c9790fcbe513436cb515b31b4ad81f2d10336.tar.gz newspipe-7f3c9790fcbe513436cb515b31b4ad81f2d10336.tar.bz2 newspipe-7f3c9790fcbe513436cb515b31b4ad81f2d10336.zip |
Merged in jaesivsm/pyaggr3g470r (pull request #15)
the icon feature
Diffstat (limited to 'pyaggr3g470r/lib/crawler.py')
-rw-r--r-- | pyaggr3g470r/lib/crawler.py | 33 |
1 files changed, 20 insertions, 13 deletions
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 324f0d8e..2ba5403a 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -24,7 +24,7 @@ from datetime import datetime from time import strftime, gmtime from concurrent.futures import ThreadPoolExecutor from requests_futures.sessions import FuturesSession -from pyaggr3g470r.lib.utils import default_handler +from pyaggr3g470r.lib.utils import default_handler, construct_feed_from logger = logging.getLogger(__name__) logging.captureWarnings(True) @@ -136,7 +136,7 @@ class PyAggUpdater(AbstractCrawler): self.feed = feed self.entries = entries self.headers = headers - self.parsed_feed = parsed_feed.get('feed', {}) + self.parsed_feed = parsed_feed super(PyAggUpdater, self).__init__(auth) def to_article(self, entry): @@ -188,19 +188,26 @@ class PyAggUpdater(AbstractCrawler): self.headers.get('etag', ''), self.headers.get('last-modified', '')) - dico = {'error_count': 0, 'last_error': None, - 'etag': self.headers.get('etag', ''), - 'last_modified': self.headers.get('last-modified', - strftime('%a, %d %b %Y %X %Z', gmtime())), - 'site_link': self.parsed_feed.get('link')} + up_feed = {'error_count': 0, 'last_error': None, + 'etag': self.headers.get('etag', ''), + 'last_modified': self.headers.get('last-modified', + strftime('%a, %d %b %Y %X %Z', gmtime()))} + fresh_feed = construct_feed_from(url=self.feed['link'], + fp_parsed=self.parsed_feed, + feed=self.feed) + for key in ('description', 'site_link', 'icon'): + if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key): + up_feed[key] = fresh_feed[key] if not self.feed.get('title'): - dico['title'] = self.parsed_feed.get('title', '') + up_feed['title'] = fresh_feed.get('title', '') + logger.info('%r %r - pushing feed attrs %r', self.feed['id'], self.feed['title'], - {key: "%s -> %s" % (dico[key], self.feed.get(key)) - for key in dico if dico[key] != self.feed.get(key)}) - if any([dico[key] != self.feed.get(key) for key 
in dico]): - future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico) + {key: "%s -> %s" % (up_feed[key], self.feed.get(key)) + for key in up_feed if up_feed[key] != self.feed.get(key)}) + if any([up_feed[key] != self.feed.get(key) for key in up_feed]): + future = self.query_pyagg('put', + 'feed/%d' % self.feed['id'], up_feed) future.add_done_callback(self.get_counter_callback()) @@ -265,7 +272,7 @@ class FeedCrawler(AbstractCrawler): self.feed['id'], self.feed['title']) ids, entries = [], {} - parsed_response = feedparser.parse(response.text) + parsed_response = feedparser.parse(response.content) for entry in parsed_response['entries']: entry_ids = extract_id(entry) entry_ids['feed_id'] = self.feed['id'] |