aboutsummaryrefslogtreecommitdiff
path: root/pyaggr3g470r
diff options
context:
space:
mode:
authorFrançois Schmidts <francois.schmidts@gmail.com>2015-04-06 10:19:58 +0200
committerFrançois Schmidts <francois.schmidts@gmail.com>2015-04-06 10:19:58 +0200
commitf2463bc333cc207ffa9ab935b7edf59a9894720d (patch)
treeba7e4a0cfce8a910e5cd53f06d118390f9d12981 /pyaggr3g470r
parentimplementing cache construction on crawler side (limiting useless pushes) (diff)
downloadnewspipe-f2463bc333cc207ffa9ab935b7edf59a9894720d.tar.gz
newspipe-f2463bc333cc207ffa9ab935b7edf59a9894720d.tar.bz2
newspipe-f2463bc333cc207ffa9ab935b7edf59a9894720d.zip
misc update
updating the way we maintain feeds up to date in the database; fixing the counter; bumping the minimum error count
Diffstat (limited to 'pyaggr3g470r')
-rw-r--r--pyaggr3g470r/controllers/feed.py5
-rw-r--r--pyaggr3g470r/lib/crawler.py49
-rw-r--r--pyaggr3g470r/templates/home.html4
-rw-r--r--pyaggr3g470r/views/views.py2
4 files changed, 36 insertions, 24 deletions
diff --git a/pyaggr3g470r/controllers/feed.py b/pyaggr3g470r/controllers/feed.py
index ff496efc..b99a3a7f 100644
--- a/pyaggr3g470r/controllers/feed.py
+++ b/pyaggr3g470r/controllers/feed.py
@@ -5,14 +5,15 @@ from .abstract import AbstractController
from pyaggr3g470r.models import Feed
logger = logging.getLogger(__name__)
-DEFAULT_MAX_ERROR = 3
+DEFAULT_MAX_ERROR = 6
DEFAULT_LIMIT = 5
class FeedController(AbstractController):
_db_cls = Feed
- def list_late(self, max_last, max_error=DEFAULT_MAX_ERROR, limit=DEFAULT_LIMIT):
+ def list_late(self, max_last, max_error=DEFAULT_MAX_ERROR,
+ limit=DEFAULT_LIMIT):
return [feed for feed in self.read(
error_count__lt=max_error, enabled=True,
last_retrieved__lt=max_last)
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 99967671..1ac6029a 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -85,9 +85,12 @@ class AbstractCrawler:
@wraps(func)
def wrapper(*args, **kwargs):
cls.__counter__ += 1
- result = func(*args, **kwargs)
- cls.__counter__ -= 1
- return result
+ try:
+ return func(*args, **kwargs)
+ except:
+ logger.exception('an error occured while %r', func)
+ finally:
+ cls.__counter__ -= 1
return wrapper
@classmethod
@@ -172,21 +175,27 @@ class PyAggUpdater(AbstractCrawler):
for id_to_create in results:
entry = self.to_article(
self.entries[tuple(sorted(id_to_create.items()))])
- logger.info('creating %r - %r', entry['title'], id_to_create)
+ logger.warn('%r %r - creating %r - %r', self.feed['id'],
+ self.feed['title'], entry['title'], id_to_create)
self.query_pyagg('post', 'article', entry)
now = datetime.now()
logger.debug('%r %r - updating feed etag %r last_mod %r',
self.feed['id'], self.feed['title'],
- self.headers.get('etag'), now)
+ self.headers.get('etag', ''),
+ self.headers.get('last-modified', ''))
- dico = {'error_count': 0, 'last_error': '',
+ dico = {'error_count': 0, 'last_error': None,
'etag': self.headers.get('etag', ''),
'last_modified': self.headers.get('last-modified', ''),
'site_link': self.parsed_feed.get('link')}
if not self.feed.get('title'):
dico['title'] = self.parsed_feed.get('title', '')
- if any([dico[key] == self.feed.get(key) for key in dico]):
+ logger.info('%r %r - pushing feed attrs %r',
+ self.feed['id'], self.feed['title'],
+ {key: "%s -> %s" % (dico[key], self.feed.get(key))
+ for key in dico if dico[key] != self.feed.get(key)})
+ if any([dico[key] != self.feed.get(key) for key in dico]):
future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico)
future.add_done_callback(self.get_counter_callback())
@@ -223,19 +232,18 @@ class FeedCrawler(AbstractCrawler):
future.add_done_callback(self.get_counter_callback())
return
- etag_generated = False
if response.status_code == 304:
logger.info("%r %r - feed responded with 304",
self.feed['id'], self.feed['title'])
self.clean_feed()
return
- if not response.headers.get('etag'):
- etag_generated = True
+ if 'etag' not in response.headers:
logger.debug('%r %r - manually generating etag',
self.feed['id'], self.feed['title'])
response.headers['etag'] = 'pyagg/"%s"' % to_hash(response.text)
- if self.feed['etag'] and response.headers['etag'] == self.feed['etag']:
- if etag_generated:
+ if response.headers['etag'] and self.feed['etag'] \
+ and response.headers['etag'] == self.feed['etag']:
+ if 'pyagg' in self.feed['etag']:
logger.info("%r %r - calculated hash matches (%d)",
self.feed['id'], self.feed['title'],
response.status_code)
@@ -246,9 +254,12 @@ class FeedCrawler(AbstractCrawler):
self.clean_feed()
return
else:
- logger.info('%r %r - etag mismatch %r != %r',
- self.feed['id'], self.feed['title'],
- response.headers['etag'], self.feed['etag'])
+ logger.debug('%r %r - etag mismatch %r != %r',
+ self.feed['id'], self.feed['title'],
+ response.headers['etag'], self.feed['etag'])
+ logger.info('%r %r - cache validation failed, challenging entries',
+ self.feed['id'], self.feed['title'])
+
ids, entries = [], {}
parsed_response = feedparser.parse(response.text)
for entry in parsed_response['entries']:
@@ -272,10 +283,10 @@ class CrawlerScheduler(AbstractCrawler):
def prepare_headers(self, feed):
"""For a known feed, will construct some header dictionnary"""
headers = {'User-Agent': 'pyaggr3g470r/crawler'}
- if feed.get('etag') and 'pyagg' not in feed.get('etag', ''):
- headers['If-None-Match'] = feed['etag']
if feed.get('last_modified'):
headers['If-Modified-Since'] = feed['last_modified']
+ if feed.get('etag') and 'pyagg' not in feed['etag']:
+ headers['If-None-Match'] = feed['etag']
logger.debug('%r %r - calculated headers %r',
feed['id'], feed['title'], headers)
return headers
@@ -289,8 +300,8 @@ class CrawlerScheduler(AbstractCrawler):
feeds = response.json()
logger.debug('%d to fetch %r', len(feeds), feeds)
for feed in feeds:
- logger.info('%r %r - fetching resources',
- feed['id'], feed['title'])
+ logger.debug('%r %r - fetching resources',
+ feed['id'], feed['title'])
future = self.session.get(feed['link'],
headers=self.prepare_headers(feed))
future.add_done_callback(FeedCrawler(feed, self.auth).callback)
diff --git a/pyaggr3g470r/templates/home.html b/pyaggr3g470r/templates/home.html
index 3a9608d5..d2a961ab 100644
--- a/pyaggr3g470r/templates/home.html
+++ b/pyaggr3g470r/templates/home.html
@@ -22,7 +22,7 @@
<li class="feed-menu"><a href="{{ gen_url(feed=fid) }}">
{% if feed_id == fid %}<b>{% endif %}
{% if in_error.get(fid, 0) > 0 %}
- <span style="background-color: {{ "red" if in_error[fid] > 2 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }}</span>
+ <span style="background-color: {{ "red" if in_error[fid] > 5 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }}</span>
{% endif %}
<span id="unread-{{ fid }}" class="badge pull-right">{{ nbunread }}</span>
{{ feeds[fid]|safe }}
@@ -40,7 +40,7 @@
{% for fid, ftitle in feeds|dictsort(case_sensitive=False, by='value') if not fid in unread %}
<li class="feed-menu"><a href="{{ gen_url(feed=fid) }}">
{% if in_error.get(fid, 0) > 0 %}
- <span style="background-color: {{ "red" if in_error[fid] > 2 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }}</span>
+ <span style="background-color: {{ "red" if in_error[fid] > 5 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }}</span>
{% endif %}
{% if feed_id == fid %}<b>{% endif %}
{{ ftitle|safe }}
diff --git a/pyaggr3g470r/views/views.py b/pyaggr3g470r/views/views.py
index 0f1f8765..7934eef8 100644
--- a/pyaggr3g470r/views/views.py
+++ b/pyaggr3g470r/views/views.py
@@ -248,7 +248,7 @@ def home():
.filter(Article.readed == False, Article.user_id == g.user.id)\
.group_by(Article.feed_id).all()
in_error = {feed.id: feed.error_count for feed in
- FeedController(g.user.id).read(error_count__gt=0).all()}
+ FeedController(g.user.id).read(error_count__gt=2).all()}
def gen_url(filter_=filter_, limit=limit, feed=feed_id):
return '?filter_=%s&limit=%s&feed=%d' % (filter_, limit, feed)
return render_template('home.html', gen_url=gen_url, feed_id=feed_id,
bgstack15