aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pyaggr3g470r/controllers/feed.py21
-rw-r--r--pyaggr3g470r/lib/crawler.py123
-rw-r--r--pyaggr3g470r/models/article.py4
-rw-r--r--pyaggr3g470r/models/feed.py4
-rw-r--r--pyaggr3g470r/models/role.py4
-rw-r--r--pyaggr3g470r/models/user.py3
-rw-r--r--pyaggr3g470r/templates/home.html9
-rw-r--r--pyaggr3g470r/views/views.py2
8 files changed, 118 insertions, 52 deletions
diff --git a/pyaggr3g470r/controllers/feed.py b/pyaggr3g470r/controllers/feed.py
index 342ab88e..b99a3a7f 100644
--- a/pyaggr3g470r/controllers/feed.py
+++ b/pyaggr3g470r/controllers/feed.py
@@ -1,22 +1,31 @@
-from datetime import datetime
+import logging
+from datetime import datetime, timedelta
+
from .abstract import AbstractController
from pyaggr3g470r.models import Feed
-DEFAULT_MAX_ERROR = 3
+logger = logging.getLogger(__name__)
+DEFAULT_MAX_ERROR = 6
DEFAULT_LIMIT = 5
class FeedController(AbstractController):
_db_cls = Feed
+ def list_late(self, max_last, max_error=DEFAULT_MAX_ERROR,
+ limit=DEFAULT_LIMIT):
+ return [feed for feed in self.read(
+ error_count__lt=max_error, enabled=True,
+ last_retrieved__lt=max_last)
+ .order_by('Feed.last_retrieved')
+ .limit(limit)]
+
def list_fetchable(self, max_error=DEFAULT_MAX_ERROR, limit=DEFAULT_LIMIT):
from pyaggr3g470r.controllers import UserController
now = datetime.now()
user = UserController(self.user_id).get(id=self.user_id)
- #max_last = now - timedelta(minutes=user.refresh_rate or 60)
- feeds = [feed for feed in self.read(user_id=self.user_id,
- error_count__lt=max_error, enabled=True).limit(limit)]
- #last_retrieved__lt=max_last).limit(limit)]
+ max_last = now - timedelta(minutes=user.refresh_rate or 60)
+ feeds = self.list_late(max_last, max_error, limit)
if feeds:
self.update({'id__in': [feed.id for feed in feeds]},
{'last_retrieved': now})
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 68a7efd0..1cb61973 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -19,6 +19,7 @@ import logging
import requests
import feedparser
import dateutil.parser
+from hashlib import md5
from functools import wraps
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
@@ -30,6 +31,10 @@ logging.captureWarnings(True)
API_ROOT = "api/v2.0/"
+def to_hash(text):
+ return md5(text.encode('utf8')).hexdigest()
+
+
def extract_id(entry, keys=[('link', 'link'),
('published', 'retrieved_date'),
('updated', 'retrieved_date')], force_id=False):
@@ -41,8 +46,8 @@ def extract_id(entry, keys=[('link', 'link'),
if entry_id:
return {'entry_id': entry_id}
if not entry_id and force_id:
- entry_id = hash("".join(entry[entry_key] for _, entry_key in keys
- if entry_key in entry))
+ entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys
+ if entry_key in entry).encode('utf8'))
else:
ids = {}
for entry_key, pyagg_key in keys:
@@ -59,6 +64,7 @@ class AbstractCrawler:
__counter__ = 0
def __init__(self, auth):
+ AbstractCrawler.__counter__ += 1
self.auth = auth
self.session = self.get_session()
self.url = conf.PLATFORM_URL
@@ -80,11 +86,21 @@ class AbstractCrawler:
@wraps(func)
def wrapper(*args, **kwargs):
cls.__counter__ += 1
- result = func(*args, **kwargs)
- cls.__counter__ -= 1
- return result
+ try:
+ return func(*args, **kwargs)
+ except:
+ logger.exception('an error occured while %r', func)
+ finally:
+ cls.__counter__ -= 1
return wrapper
+ @classmethod
+ def get_counter_callback(cls):
+ cls.__counter__ += 1
+ def debump(*args, **kwargs):
+ cls.__counter__ -= 1
+ return debump
+
def query_pyagg(self, method, urn, data=None):
"""A wrapper for internal call, method should be ones you can find
on requests (header, post, get, options, ...), urn the distant
@@ -96,22 +112,30 @@ class AbstractCrawler:
return method("%s%s%s" % (self.url, API_ROOT, urn),
auth=self.auth, data=json.dumps(data,
default=default_handler),
- headers={'Content-Type': 'application/json'})
+ headers={'Content-Type': 'application/json',
+ 'User-Agent': 'pyaggr3g470r'})
@classmethod
- def wait(cls):
+ def wait(cls, max_wait=600):
"See count_on_me, that method will just wait for the counter to be 0"
time.sleep(1)
+ second_waited = 1
while cls.__counter__:
+ if second_waited > max_wait:
+ logger.warn('Exiting after %d seconds, counter at %d',
+ max_wait, cls.__counter__)
+ break
time.sleep(1)
+ second_waited += 1
class PyAggUpdater(AbstractCrawler):
- def __init__(self, feed, entries, headers, auth):
+ def __init__(self, feed, entries, headers, parsed_feed, auth):
self.feed = feed
self.entries = entries
self.headers = headers
+ self.parsed_feed = parsed_feed.get('feed', {})
super(PyAggUpdater, self).__init__(auth)
def to_article(self, entry):
@@ -145,24 +169,36 @@ class PyAggUpdater(AbstractCrawler):
def callback(self, response):
"""Will process the result from the challenge, creating missing article
and updating the feed"""
+ AbstractCrawler.__counter__ -= 1
results = response.result().json()
logger.debug('%r %r - %d entries were not matched and will be created',
self.feed['id'], self.feed['title'], len(results))
for id_to_create in results:
entry = self.to_article(
self.entries[tuple(sorted(id_to_create.items()))])
- logger.info('creating %r - %r', entry['title'], id_to_create)
+ logger.warn('%r %r - creating %r - %r', self.feed['id'],
+ self.feed['title'], entry['title'], id_to_create)
self.query_pyagg('post', 'article', entry)
now = datetime.now()
logger.debug('%r %r - updating feed etag %r last_mod %r',
self.feed['id'], self.feed['title'],
- self.headers.get('etag'), now)
-
- self.query_pyagg('put', 'feed/%d' % self.feed['id'], {'error_count': 0,
- 'etag': self.headers.get('etag', ''),
- 'last_error': '',
- 'last_modified': self.headers.get('last-modified', '')})
+ self.headers.get('etag', ''),
+ self.headers.get('last-modified', ''))
+
+ dico = {'error_count': 0, 'last_error': None,
+ 'etag': self.headers.get('etag', ''),
+ 'last_modified': self.headers.get('last-modified', ''),
+ 'site_link': self.parsed_feed.get('link')}
+ if not self.feed.get('title'):
+ dico['title'] = self.parsed_feed.get('title', '')
+ logger.info('%r %r - pushing feed attrs %r',
+ self.feed['id'], self.feed['title'],
+ {key: "%s -> %s" % (dico[key], self.feed.get(key))
+ for key in dico if dico[key] != self.feed.get(key)})
+ if any([dico[key] != self.feed.get(key) for key in dico]):
+ future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico)
+ future.add_done_callback(self.get_counter_callback())
class FeedCrawler(AbstractCrawler):
@@ -174,13 +210,15 @@ class FeedCrawler(AbstractCrawler):
def clean_feed(self):
"""Will reset the errors counters on a feed that have known errors"""
if self.feed.get('error_count') or self.feed.get('last_error'):
- self.query_pyagg('put', 'feed/%d' % self.feed['id'],
- {'error_count': 0, 'last_error': ''})
+ future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
+ {'error_count': 0, 'last_error': ''})
+ future.add_done_callback(self.get_counter_callback())
@AbstractCrawler.count_on_me
def callback(self, response):
"""will fetch the feed and interprete results (304, etag) or will
challenge pyagg to compare gotten entries with existing ones"""
+ AbstractCrawler.__counter__ -= 1
try:
response = response.result()
response.raise_for_status()
@@ -189,23 +227,40 @@ class FeedCrawler(AbstractCrawler):
logger.warn('%r %r - an error occured while fetching feed; bumping'
' error count to %r', self.feed['id'],
self.feed['title'], error_count)
- self.query_pyagg('put', 'feed/%d' % self.feed['id'],
- {'error_count': error_count,
- 'last_error': str(error)})
+ future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
+ {'error_count': error_count,
+ 'last_error': str(error)})
+ future.add_done_callback(self.get_counter_callback())
return
if response.status_code == 304:
logger.info("%r %r - feed responded with 304",
- self.feed['id'], self.feed['title'])
+ self.feed['id'], self.feed['title'])
self.clean_feed()
return
- if self.feed['etag'] and response.headers.get('etag') \
- and response.headers.get('etag') == self.feed['etag']:
- logger.info("%r %r - feed responded with same etag (%d)",
- self.feed['id'], self.feed['title'],
- response.status_code)
+ if 'etag' not in response.headers:
+ logger.debug('%r %r - manually generating etag',
+ self.feed['id'], self.feed['title'])
+ response.headers['etag'] = 'pyagg/"%s"' % to_hash(response.text)
+ if response.headers['etag'] and self.feed['etag'] \
+ and response.headers['etag'] == self.feed['etag']:
+ if 'pyagg' in self.feed['etag']:
+ logger.info("%r %r - calculated hash matches (%d)",
+ self.feed['id'], self.feed['title'],
+ response.status_code)
+ else:
+ logger.info("%r %r - feed responded with same etag (%d)",
+ self.feed['id'], self.feed['title'],
+ response.status_code)
self.clean_feed()
return
+ else:
+ logger.debug('%r %r - etag mismatch %r != %r',
+ self.feed['id'], self.feed['title'],
+ response.headers['etag'], self.feed['etag'])
+ logger.info('%r %r - cache validation failed, challenging entries',
+ self.feed['id'], self.feed['title'])
+
ids, entries = [], {}
parsed_response = feedparser.parse(response.text)
for entry in parsed_response['entries']:
@@ -214,7 +269,8 @@ class FeedCrawler(AbstractCrawler):
logger.debug('%r %r - found %d entries %r',
self.feed['id'], self.feed['title'], len(ids), ids)
future = self.query_pyagg('get', 'articles/challenge', {'ids': ids})
- updater = PyAggUpdater(self.feed, entries, response.headers, self.auth)
+ updater = PyAggUpdater(self.feed, entries, response.headers,
+ parsed_response, self.auth)
future.add_done_callback(updater.callback)
@@ -223,14 +279,15 @@ class CrawlerScheduler(AbstractCrawler):
def __init__(self, username, password):
self.auth = (username, password)
super(CrawlerScheduler, self).__init__(self.auth)
+ AbstractCrawler.__counter__ = 0
def prepare_headers(self, feed):
"""For a known feed, will construct some header dictionnary"""
- headers = {}
- if feed.get('etag', None):
- headers['If-None-Match'] = feed['etag']
+ headers = {'User-Agent': 'pyaggr3g470r/crawler'}
if feed.get('last_modified'):
headers['If-Modified-Since'] = feed['last_modified']
+ if feed.get('etag') and 'pyagg' not in feed['etag']:
+ headers['If-None-Match'] = feed['etag']
logger.debug('%r %r - calculated headers %r',
feed['id'], feed['title'], headers)
return headers
@@ -238,13 +295,14 @@ class CrawlerScheduler(AbstractCrawler):
@AbstractCrawler.count_on_me
def callback(self, response):
"""processes feeds that need to be fetched"""
+ AbstractCrawler.__counter__ -= 1
response = response.result()
response.raise_for_status()
feeds = response.json()
logger.debug('%d to fetch %r', len(feeds), feeds)
for feed in feeds:
- logger.info('%r %r - fetching resources',
- feed['id'], feed['title'])
+ logger.debug('%r %r - fetching resources',
+ feed['id'], feed['title'])
future = self.session.get(feed['link'],
headers=self.prepare_headers(feed))
future.add_done_callback(FeedCrawler(feed, self.auth).callback)
@@ -255,4 +313,5 @@ class CrawlerScheduler(AbstractCrawler):
and launch the whole thing"""
logger.debug('retreving fetchable feed')
future = self.query_pyagg('get', 'feeds/fetchable', kwargs)
+ AbstractCrawler.__counter__ += 1
future.add_done_callback(self.callback)
diff --git a/pyaggr3g470r/models/article.py b/pyaggr3g470r/models/article.py
index f026984a..58cd0384 100644
--- a/pyaggr3g470r/models/article.py
+++ b/pyaggr3g470r/models/article.py
@@ -27,12 +27,10 @@ __copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"
import json
+from bootstrap import db
from datetime import datetime
-from flask import g
from sqlalchemy import asc, desc
-db = g.db
-
class Article(db.Model):
"""
diff --git a/pyaggr3g470r/models/feed.py b/pyaggr3g470r/models/feed.py
index ccc8094f..a36d9573 100644
--- a/pyaggr3g470r/models/feed.py
+++ b/pyaggr3g470r/models/feed.py
@@ -26,12 +26,10 @@ __revision__ = "$Date: 2014/04/12 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"
+from bootstrap import db
from datetime import datetime
-from flask import g
from sqlalchemy import desc
-db = g.db
-
class Feed(db.Model):
"""
diff --git a/pyaggr3g470r/models/role.py b/pyaggr3g470r/models/role.py
index 71497caf..f5a18fdc 100644
--- a/pyaggr3g470r/models/role.py
+++ b/pyaggr3g470r/models/role.py
@@ -26,9 +26,7 @@ __revision__ = "$Date: 2014/04/12 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"
-from flask import g
-
-db = g.db
+from bootstrap import db
class Role(db.Model):
diff --git a/pyaggr3g470r/models/user.py b/pyaggr3g470r/models/user.py
index f2a268db..0bf9fe04 100644
--- a/pyaggr3g470r/models/user.py
+++ b/pyaggr3g470r/models/user.py
@@ -30,11 +30,10 @@ import re
import random
import hashlib
from datetime import datetime
-from flask import g
from werkzeug import generate_password_hash, check_password_hash
from flask.ext.login import UserMixin
-db = g.db
+from bootstrap import db
class User(db.Model, UserMixin):
diff --git a/pyaggr3g470r/templates/home.html b/pyaggr3g470r/templates/home.html
index e6cc2e7c..69ca582b 100644
--- a/pyaggr3g470r/templates/home.html
+++ b/pyaggr3g470r/templates/home.html
@@ -1,5 +1,10 @@
{% extends "layout.html" %}
{% block content %}
+<style>
+ li.feed-commands {display: none; text-align: right;}
+ li.feed-commands > span > a {margin-right: 10px;}
+ li.feed-menu:hover + li.feed-commands, li.feed-commands:hover {display: block;}
+</style>
{% if feeds|count == 0 %}
<div class="col-md-4 col-md-offset-4">
<h1>{{ _("You don't have any feeds.") }}</h1>
@@ -17,7 +22,7 @@
<li class="feed-menu"><a href="{{ gen_url(feed=fid) }}">
{% if feed_id == fid %}<b>{% endif %}
{% if in_error.get(fid, 0) > 0 %}
- <span style="background-color: {{ "red" if in_error[fid] > 2 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }} {{ _("error") }}{% if in_error[fid] > 1 %}s{% endif %}</span>
+ <span style="background-color: {{ "red" if in_error[fid] > 5 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }} {{ _("error") }}{% if in_error[fid] > 1 %}s{% endif %}</span>
{% endif %}
<span id="unread-{{ fid }}" class="badge pull-right">{{ nbunread }}</span>
{{ feeds[fid]|safe }}
@@ -35,7 +40,7 @@
{% for fid, ftitle in feeds|dictsort(case_sensitive=False, by='value') if not fid in unread %}
<li class="feed-menu"><a href="{{ gen_url(feed=fid) }}">
{% if in_error.get(fid, 0) > 0 %}
- <span style="background-color: {{ "red" if in_error[fid] > 2 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }} {{ _("error") }}{% if in_error[fid] > 1 %}s{% endif %}</span>
+ <span style="background-color: {{ "red" if in_error[fid] > 5 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }} {{ _("error") }}{% if in_error[fid] > 1 %}s{% endif %}</span>
{% endif %}
{% if feed_id == fid %}<b>{% endif %}
{{ ftitle|safe }}
diff --git a/pyaggr3g470r/views/views.py b/pyaggr3g470r/views/views.py
index 3b39e9b0..e202ad4d 100644
--- a/pyaggr3g470r/views/views.py
+++ b/pyaggr3g470r/views/views.py
@@ -248,7 +248,7 @@ def home():
.filter(Article.readed == False, Article.user_id == g.user.id)\
.group_by(Article.feed_id).all()
in_error = {feed.id: feed.error_count for feed in
- FeedController(g.user.id).read(error_count__gt=0).all()}
+ FeedController(g.user.id).read(error_count__gt=2).all()}
def gen_url(filter_=filter_, limit=limit, feed=feed_id):
return '?filter_=%s&limit=%s&feed=%d' % (filter_, limit, feed)
return render_template('home.html', gen_url=gen_url, feed_id=feed_id,
bgstack15