author    | François Schmidts <francois.schmidts@gmail.com> | 2015-04-06 10:19:58 +0200
committer | François Schmidts <francois.schmidts@gmail.com> | 2015-04-06 10:19:58 +0200
commit    | f2463bc333cc207ffa9ab935b7edf59a9894720d (patch)
tree      | ba7e4a0cfce8a910e5cd53f06d118390f9d12981 /pyaggr3g470r/lib
parent    | implementing cache construction on crawler side (limiting useless pushes) (diff)
download  | newspipe-f2463bc333cc207ffa9ab935b7edf59a9894720d.tar.gz
          | newspipe-f2463bc333cc207ffa9ab935b7edf59a9894720d.tar.bz2
          | newspipe-f2463bc333cc207ffa9ab935b7edf59a9894720d.zip
misc update
updating the way we keep feeds up to date in the database
fixing the counter
bumping the minimum error count
Diffstat (limited to 'pyaggr3g470r/lib')
-rw-r--r-- | pyaggr3g470r/lib/crawler.py | 49
1 file changed, 30 insertions(+), 19 deletions(-)
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 99967671..1ac6029a 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -85,9 +85,12 @@ class AbstractCrawler:
         @wraps(func)
         def wrapper(*args, **kwargs):
             cls.__counter__ += 1
-            result = func(*args, **kwargs)
-            cls.__counter__ -= 1
-            return result
+            try:
+                return func(*args, **kwargs)
+            except:
+                logger.exception('an error occured while %r', func)
+            finally:
+                cls.__counter__ -= 1
         return wrapper

     @classmethod
@@ -172,21 +175,27 @@ class PyAggUpdater(AbstractCrawler):
         for id_to_create in results:
             entry = self.to_article(
                     self.entries[tuple(sorted(id_to_create.items()))])
-            logger.info('creating %r - %r', entry['title'], id_to_create)
+            logger.warn('%r %r - creating %r - %r', self.feed['id'],
+                        self.feed['title'], entry['title'], id_to_create)
             self.query_pyagg('post', 'article', entry)

         now = datetime.now()
         logger.debug('%r %r - updating feed etag %r last_mod %r',
                      self.feed['id'], self.feed['title'],
-                     self.headers.get('etag'), now)
+                     self.headers.get('etag', ''),
+                     self.headers.get('last-modified', ''))

-        dico = {'error_count': 0, 'last_error': '',
+        dico = {'error_count': 0, 'last_error': None,
                 'etag': self.headers.get('etag', ''),
                 'last_modified': self.headers.get('last-modified', ''),
                 'site_link': self.parsed_feed.get('link')}
         if not self.feed.get('title'):
             dico['title'] = self.parsed_feed.get('title', '')
-        if any([dico[key] == self.feed.get(key) for key in dico]):
+        logger.info('%r %r - pushing feed attrs %r',
+                    self.feed['id'], self.feed['title'],
+                    {key: "%s -> %s" % (dico[key], self.feed.get(key))
+                     for key in dico if dico[key] != self.feed.get(key)})
+        if any([dico[key] != self.feed.get(key) for key in dico]):
             future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico)
             future.add_done_callback(self.get_counter_callback())

@@ -223,19 +232,18 @@ class FeedCrawler(AbstractCrawler):
             future.add_done_callback(self.get_counter_callback())
             return
-        etag_generated = False
         if response.status_code == 304:
             logger.info("%r %r - feed responded with 304",
                         self.feed['id'], self.feed['title'])
             self.clean_feed()
             return

-        if not response.headers.get('etag'):
-            etag_generated = True
+        if 'etag' not in response.headers:
             logger.debug('%r %r - manually generating etag',
                          self.feed['id'], self.feed['title'])
             response.headers['etag'] = 'pyagg/"%s"' % to_hash(response.text)
-        if self.feed['etag'] and response.headers['etag'] == self.feed['etag']:
-            if etag_generated:
+        if response.headers['etag'] and self.feed['etag'] \
+                and response.headers['etag'] == self.feed['etag']:
+            if 'pyagg' in self.feed['etag']:
                 logger.info("%r %r - calculated hash matches (%d)",
                             self.feed['id'], self.feed['title'],
                             response.status_code)
@@ -246,9 +254,12 @@ class FeedCrawler(AbstractCrawler):
             self.clean_feed()
             return
         else:
-            logger.info('%r %r - etag mismatch %r != %r',
-                        self.feed['id'], self.feed['title'],
-                        response.headers['etag'], self.feed['etag'])
+            logger.debug('%r %r - etag mismatch %r != %r',
+                         self.feed['id'], self.feed['title'],
+                         response.headers['etag'], self.feed['etag'])
+        logger.info('%r %r - cache validation failed, challenging entries',
+                    self.feed['id'], self.feed['title'])
+
         ids, entries = [], {}
         parsed_response = feedparser.parse(response.text)
         for entry in parsed_response['entries']:
@@ -272,10 +283,10 @@ class CrawlerScheduler(AbstractCrawler):
     def prepare_headers(self, feed):
         """For a known feed, will construct some header dictionnary"""
         headers = {'User-Agent': 'pyaggr3g470r/crawler'}
-        if feed.get('etag') and 'pyagg' not in feed.get('etag', ''):
-            headers['If-None-Match'] = feed['etag']
         if feed.get('last_modified'):
             headers['If-Modified-Since'] = feed['last_modified']
+        if feed.get('etag') and 'pyagg' not in feed['etag']:
+            headers['If-None-Match'] = feed['etag']
         logger.debug('%r %r - calculated headers %r',
                      feed['id'], feed['title'], headers)
         return headers
@@ -289,8 +300,8 @@ class CrawlerScheduler(AbstractCrawler):
         feeds = response.json()
         logger.debug('%d to fetch %r', len(feeds), feeds)
         for feed in feeds:
-            logger.info('%r %r - fetching resources',
-                        feed['id'], feed['title'])
+            logger.debug('%r %r - fetching resources',
+                         feed['id'], feed['title'])
             future = self.session.get(feed['link'],
                                       headers=self.prepare_headers(feed))
             future.add_done_callback(FeedCrawler(feed, self.auth).callback)
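
The first hunk fixes the counter mentioned in the commit message: the old wrapper only decremented `cls.__counter__` after a successful call, so any exception in the wrapped function left the count of in-flight requests permanently inflated. A minimal sketch of the fixed pattern follows; the decorator's own name is not visible in the hunk, so `count_on_me` is a placeholder, and `except Exception` is used where the patch has a bare `except:`.

```python
from functools import wraps
import logging

logger = logging.getLogger(__name__)


class AbstractCrawler:
    __counter__ = 0  # decorated calls currently in flight

    @classmethod
    def count_on_me(cls, func):  # placeholder name, not shown in the hunk
        @wraps(func)
        def wrapper(*args, **kwargs):
            cls.__counter__ += 1
            try:
                return func(*args, **kwargs)
            except Exception:
                # the patch logs and swallows the error instead of
                # letting it escape the callback
                logger.exception('an error occurred while running %r', func)
            finally:
                # runs on success and on exception alike, so the
                # counter can no longer leak the way it did before
                cls.__counter__ -= 1
        return wrapper
```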
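The PyAggUpdater hunk repairs an inverted comparison: with `==`, the feed record was pushed back whenever any attribute was unchanged and skipped only when all of them differed. Here is the corrected logic isolated into a standalone sketch, with `feed`, `headers`, and `parsed_feed` as plain dicts standing in for the updater's attributes, and `build_feed_update` a hypothetical name:

```python
def build_feed_update(feed, headers, parsed_feed):
    """Return the attributes to PUT back, or None when nothing changed."""
    dico = {'error_count': 0, 'last_error': None,
            'etag': headers.get('etag', ''),
            'last_modified': headers.get('last-modified', ''),
            'site_link': parsed_feed.get('link')}
    if not feed.get('title'):
        dico['title'] = parsed_feed.get('title', '')
    # the patch logs this mapping of differing values before pushing
    changed = {key: '%s -> %s' % (dico[key], feed.get(key))
               for key in dico if dico[key] != feed.get(key)}
    return dico if changed else None
```

A caller would only issue the `PUT feed/<id>` request when something is returned, which is exactly what the `!=` fix achieves and what the parent commit means by "limiting useless pushes".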
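The FeedCrawler hunks rework cache validation for servers that send no ETag: instead of carrying an `etag_generated` flag around, the crawler hashes the body into a synthetic `pyagg/"<hash>"` etag and later recognizes it by its prefix. A condensed sketch, with `to_hash` standing in for the library's hashing helper and `response` shaped like a `requests` response:

```python
import hashlib


def to_hash(text):
    # stand-in for the lib's to_hash helper; any stable digest works
    return hashlib.md5(text.encode('utf-8')).hexdigest()


def is_cache_hit(feed, response):
    """True when the fetched body is known to be unchanged."""
    if response.status_code == 304:
        # the server honoured our conditional headers
        return True
    if 'etag' not in response.headers:
        # no server etag: derive one from the body so the comparison
        # below still works; the 'pyagg/' prefix marks it as synthetic
        response.headers['etag'] = 'pyagg/"%s"' % to_hash(response.text)
    return bool(response.headers['etag'] and feed.get('etag')
                and response.headers['etag'] == feed['etag'])
```

On a hit, the patched code merely picks its log message by checking `'pyagg' in self.feed['etag']`: a synthetic etag means the calculated hash matched, a real one means the stored server etag did.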
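The `prepare_headers` hunk is the other half of that scheme: a synthetic etag must never be echoed back as `If-None-Match`, since the server never issued it and could not match it, hence the `'pyagg' not in feed['etag']` guard. A sketch of the patched method without its logging:

```python
def prepare_headers(feed):
    """Build conditional-request headers for a known feed."""
    headers = {'User-Agent': 'pyaggr3g470r/crawler'}
    if feed.get('last_modified'):
        # lets the server short-circuit with a 304 Not Modified
        headers['If-Modified-Since'] = feed['last_modified']
    if feed.get('etag') and 'pyagg' not in feed['etag']:
        # only echo etags the server actually issued; synthetic
        # pyagg hashes would never match on the server side
        headers['If-None-Match'] = feed['etag']
    return headers
```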