diff options
author | François Schmidts <francois.schmidts@gmail.com> | 2015-04-06 10:19:58 +0200 |
---|---|---|
committer | François Schmidts <francois.schmidts@gmail.com> | 2015-04-06 10:19:58 +0200 |
commit | f2463bc333cc207ffa9ab935b7edf59a9894720d (patch) | |
tree | ba7e4a0cfce8a910e5cd53f06d118390f9d12981 | |
parent | implementing cache construction on crawler side (limiting useless pushes) (diff) | |
download | newspipe-f2463bc333cc207ffa9ab935b7edf59a9894720d.tar.gz newspipe-f2463bc333cc207ffa9ab935b7edf59a9894720d.tar.bz2 newspipe-f2463bc333cc207ffa9ab935b7edf59a9894720d.zip |
misc update
updating the way we keep feeds up to date in the database
fixing the counter
bumping the minimum error count
-rw-r--r-- | pyaggr3g470r/controllers/feed.py | 5 | ||||
-rw-r--r-- | pyaggr3g470r/lib/crawler.py | 49 | ||||
-rw-r--r-- | pyaggr3g470r/templates/home.html | 4 | ||||
-rw-r--r-- | pyaggr3g470r/views/views.py | 2 |
4 files changed, 36 insertions, 24 deletions
diff --git a/pyaggr3g470r/controllers/feed.py b/pyaggr3g470r/controllers/feed.py index ff496efc..b99a3a7f 100644 --- a/pyaggr3g470r/controllers/feed.py +++ b/pyaggr3g470r/controllers/feed.py @@ -5,14 +5,15 @@ from .abstract import AbstractController from pyaggr3g470r.models import Feed logger = logging.getLogger(__name__) -DEFAULT_MAX_ERROR = 3 +DEFAULT_MAX_ERROR = 6 DEFAULT_LIMIT = 5 class FeedController(AbstractController): _db_cls = Feed - def list_late(self, max_last, max_error=DEFAULT_MAX_ERROR, limit=DEFAULT_LIMIT): + def list_late(self, max_last, max_error=DEFAULT_MAX_ERROR, + limit=DEFAULT_LIMIT): return [feed for feed in self.read( error_count__lt=max_error, enabled=True, last_retrieved__lt=max_last) diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 99967671..1ac6029a 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -85,9 +85,12 @@ class AbstractCrawler: @wraps(func) def wrapper(*args, **kwargs): cls.__counter__ += 1 - result = func(*args, **kwargs) - cls.__counter__ -= 1 - return result + try: + return func(*args, **kwargs) + except: + logger.exception('an error occured while %r', func) + finally: + cls.__counter__ -= 1 return wrapper @classmethod @@ -172,21 +175,27 @@ class PyAggUpdater(AbstractCrawler): for id_to_create in results: entry = self.to_article( self.entries[tuple(sorted(id_to_create.items()))]) - logger.info('creating %r - %r', entry['title'], id_to_create) + logger.warn('%r %r - creating %r - %r', self.feed['id'], + self.feed['title'], entry['title'], id_to_create) self.query_pyagg('post', 'article', entry) now = datetime.now() logger.debug('%r %r - updating feed etag %r last_mod %r', self.feed['id'], self.feed['title'], - self.headers.get('etag'), now) + self.headers.get('etag', ''), + self.headers.get('last-modified', '')) - dico = {'error_count': 0, 'last_error': '', + dico = {'error_count': 0, 'last_error': None, 'etag': self.headers.get('etag', ''), 'last_modified': 
self.headers.get('last-modified', ''), 'site_link': self.parsed_feed.get('link')} if not self.feed.get('title'): dico['title'] = self.parsed_feed.get('title', '') - if any([dico[key] == self.feed.get(key) for key in dico]): + logger.info('%r %r - pushing feed attrs %r', + self.feed['id'], self.feed['title'], + {key: "%s -> %s" % (dico[key], self.feed.get(key)) + for key in dico if dico[key] != self.feed.get(key)}) + if any([dico[key] != self.feed.get(key) for key in dico]): future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico) future.add_done_callback(self.get_counter_callback()) @@ -223,19 +232,18 @@ class FeedCrawler(AbstractCrawler): future.add_done_callback(self.get_counter_callback()) return - etag_generated = False if response.status_code == 304: logger.info("%r %r - feed responded with 304", self.feed['id'], self.feed['title']) self.clean_feed() return - if not response.headers.get('etag'): - etag_generated = True + if 'etag' not in response.headers: logger.debug('%r %r - manually generating etag', self.feed['id'], self.feed['title']) response.headers['etag'] = 'pyagg/"%s"' % to_hash(response.text) - if self.feed['etag'] and response.headers['etag'] == self.feed['etag']: - if etag_generated: + if response.headers['etag'] and self.feed['etag'] \ + and response.headers['etag'] == self.feed['etag']: + if 'pyagg' in self.feed['etag']: logger.info("%r %r - calculated hash matches (%d)", self.feed['id'], self.feed['title'], response.status_code) @@ -246,9 +254,12 @@ class FeedCrawler(AbstractCrawler): self.clean_feed() return else: - logger.info('%r %r - etag mismatch %r != %r', - self.feed['id'], self.feed['title'], - response.headers['etag'], self.feed['etag']) + logger.debug('%r %r - etag mismatch %r != %r', + self.feed['id'], self.feed['title'], + response.headers['etag'], self.feed['etag']) + logger.info('%r %r - cache validation failed, challenging entries', + self.feed['id'], self.feed['title']) + ids, entries = [], {} parsed_response = 
feedparser.parse(response.text) for entry in parsed_response['entries']: @@ -272,10 +283,10 @@ class CrawlerScheduler(AbstractCrawler): def prepare_headers(self, feed): """For a known feed, will construct some header dictionnary""" headers = {'User-Agent': 'pyaggr3g470r/crawler'} - if feed.get('etag') and 'pyagg' not in feed.get('etag', ''): - headers['If-None-Match'] = feed['etag'] if feed.get('last_modified'): headers['If-Modified-Since'] = feed['last_modified'] + if feed.get('etag') and 'pyagg' not in feed['etag']: + headers['If-None-Match'] = feed['etag'] logger.debug('%r %r - calculated headers %r', feed['id'], feed['title'], headers) return headers @@ -289,8 +300,8 @@ class CrawlerScheduler(AbstractCrawler): feeds = response.json() logger.debug('%d to fetch %r', len(feeds), feeds) for feed in feeds: - logger.info('%r %r - fetching resources', - feed['id'], feed['title']) + logger.debug('%r %r - fetching resources', + feed['id'], feed['title']) future = self.session.get(feed['link'], headers=self.prepare_headers(feed)) future.add_done_callback(FeedCrawler(feed, self.auth).callback) diff --git a/pyaggr3g470r/templates/home.html b/pyaggr3g470r/templates/home.html index 3a9608d5..d2a961ab 100644 --- a/pyaggr3g470r/templates/home.html +++ b/pyaggr3g470r/templates/home.html @@ -22,7 +22,7 @@ <li class="feed-menu"><a href="{{ gen_url(feed=fid) }}"> {% if feed_id == fid %}<b>{% endif %} {% if in_error.get(fid, 0) > 0 %} - <span style="background-color: {{ "red" if in_error[fid] > 2 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }}</span> + <span style="background-color: {{ "red" if in_error[fid] > 5 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }}</span> {% endif %} <span id="unread-{{ fid }}" class="badge pull-right">{{ nbunread }}</span> {{ feeds[fid]|safe }} @@ -40,7 +40,7 @@ {% for fid, ftitle in 
feeds|dictsort(case_sensitive=False, by='value') if not fid in unread %} <li class="feed-menu"><a href="{{ gen_url(feed=fid) }}"> {% if in_error.get(fid, 0) > 0 %} - <span style="background-color: {{ "red" if in_error[fid] > 2 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }}</span> + <span style="background-color: {{ "red" if in_error[fid] > 5 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }}</span> {% endif %} {% if feed_id == fid %}<b>{% endif %} {{ ftitle|safe }} diff --git a/pyaggr3g470r/views/views.py b/pyaggr3g470r/views/views.py index 0f1f8765..7934eef8 100644 --- a/pyaggr3g470r/views/views.py +++ b/pyaggr3g470r/views/views.py @@ -248,7 +248,7 @@ def home(): .filter(Article.readed == False, Article.user_id == g.user.id)\ .group_by(Article.feed_id).all() in_error = {feed.id: feed.error_count for feed in - FeedController(g.user.id).read(error_count__gt=0).all()} + FeedController(g.user.id).read(error_count__gt=2).all()} def gen_url(filter_=filter_, limit=limit, feed=feed_id): return '?filter_=%s&limit=%s&feed=%d' % (filter_, limit, feed) return render_template('home.html', gen_url=gen_url, feed_id=feed_id, |