about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- pyaggr3g470r/controllers/abstract.py 26
-rw-r--r-- pyaggr3g470r/controllers/feed.py 21
-rw-r--r-- pyaggr3g470r/controllers/user.py 2
-rw-r--r-- pyaggr3g470r/lib/crawler.py 123
-rw-r--r-- pyaggr3g470r/models/article.py 4
-rw-r--r-- pyaggr3g470r/models/feed.py 4
-rw-r--r-- pyaggr3g470r/models/role.py 4
-rw-r--r-- pyaggr3g470r/models/user.py 3
-rw-r--r-- pyaggr3g470r/templates/home.html 9
-rw-r--r-- pyaggr3g470r/views/views.py 22
10 files changed, 152 insertions, 66 deletions
diff --git a/pyaggr3g470r/controllers/abstract.py b/pyaggr3g470r/controllers/abstract.py
index c084deb9..f1173817 100644
--- a/pyaggr3g470r/controllers/abstract.py
+++ b/pyaggr3g470r/controllers/abstract.py
@@ -9,11 +9,25 @@ class AbstractController(object):
_db_cls = None # reference to the database class
_user_id_key = 'user_id'
- def __init__(self, user_id):
+ def __init__(self, user_id=None):
+ """User id is a right management mechanism that should be used to
+ filter objects in database on their denormalized "user_id" field
+ (or "id" field for users).
+ Should no user_id be provided, the Controller won't apply any filter
+ allowing for a kind of "super user" mode.
+ """
self.user_id = user_id
def _to_filters(self, **filters):
- if self.user_id:
+ """
+ Will translate filters to sqlalchemy filter.
+ This method will also apply user_id restriction if available.
+
+ each parameters of the function is treated as an equality unless the
+ name of the parameter ends with either "__gt", "__lt", "__ge", "__le",
+ "__ne" or "__in".
+ """
+ if self.user_id is not None:
filters[self._user_id_key] = self.user_id
db_filters = set()
for key, value in filters.items():
@@ -37,17 +51,21 @@ class AbstractController(object):
return self._db_cls.query.filter(*self._to_filters(**filters))
def get(self, **filters):
+ """Will return one single objects corresponding to filters"""
obj = self._get(**filters).first()
if not obj:
raise NotFound({'message': 'No %r (%r)'
% (self._db_cls.__class__.__name__, filters)})
- if getattr(obj, self._user_id_key) != self.user_id:
+ if self.user_id is not None \
+ and getattr(obj, self._user_id_key) != self.user_id:
raise Forbidden({'message': 'No authorized to access %r (%r)'
% (self._db_cls.__class__.__name__, filters)})
return obj
def create(self, **attrs):
- attrs[self._user_id_key] = self.user_id
+ assert self._user_id_key in attrs or self.user_id is not None, \
+ "You must provide user_id one way or another"
+ attrs[self._user_id_key] = self.user_id or attrs.get(self._user_id_key)
obj = self._db_cls(**attrs)
db.session.add(obj)
db.session.commit()
diff --git a/pyaggr3g470r/controllers/feed.py b/pyaggr3g470r/controllers/feed.py
index 342ab88e..b99a3a7f 100644
--- a/pyaggr3g470r/controllers/feed.py
+++ b/pyaggr3g470r/controllers/feed.py
@@ -1,22 +1,31 @@
-from datetime import datetime
+import logging
+from datetime import datetime, timedelta
+
from .abstract import AbstractController
from pyaggr3g470r.models import Feed
-DEFAULT_MAX_ERROR = 3
+logger = logging.getLogger(__name__)
+DEFAULT_MAX_ERROR = 6
DEFAULT_LIMIT = 5
class FeedController(AbstractController):
_db_cls = Feed
+ def list_late(self, max_last, max_error=DEFAULT_MAX_ERROR,
+ limit=DEFAULT_LIMIT):
+ return [feed for feed in self.read(
+ error_count__lt=max_error, enabled=True,
+ last_retrieved__lt=max_last)
+ .order_by('Feed.last_retrieved')
+ .limit(limit)]
+
def list_fetchable(self, max_error=DEFAULT_MAX_ERROR, limit=DEFAULT_LIMIT):
from pyaggr3g470r.controllers import UserController
now = datetime.now()
user = UserController(self.user_id).get(id=self.user_id)
- #max_last = now - timedelta(minutes=user.refresh_rate or 60)
- feeds = [feed for feed in self.read(user_id=self.user_id,
- error_count__lt=max_error, enabled=True).limit(limit)]
- #last_retrieved__lt=max_last).limit(limit)]
+ max_last = now - timedelta(minutes=user.refresh_rate or 60)
+ feeds = self.list_late(max_last, max_error, limit)
if feeds:
self.update({'id__in': [feed.id for feed in feeds]},
{'last_retrieved': now})
diff --git a/pyaggr3g470r/controllers/user.py b/pyaggr3g470r/controllers/user.py
index ed46e1e7..c6c1d545 100644
--- a/pyaggr3g470r/controllers/user.py
+++ b/pyaggr3g470r/controllers/user.py
@@ -4,4 +4,4 @@ from pyaggr3g470r.models import User
class UserController(AbstractController):
_db_cls = User
- _user_id_key = 'email'
+ _user_id_key = 'id'
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 68a7efd0..1cb61973 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -19,6 +19,7 @@ import logging
import requests
import feedparser
import dateutil.parser
+from hashlib import md5
from functools import wraps
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
@@ -30,6 +31,10 @@ logging.captureWarnings(True)
API_ROOT = "api/v2.0/"
+def to_hash(text):
+ return md5(text.encode('utf8')).hexdigest()
+
+
def extract_id(entry, keys=[('link', 'link'),
('published', 'retrieved_date'),
('updated', 'retrieved_date')], force_id=False):
@@ -41,8 +46,8 @@ def extract_id(entry, keys=[('link', 'link'),
if entry_id:
return {'entry_id': entry_id}
if not entry_id and force_id:
- entry_id = hash("".join(entry[entry_key] for _, entry_key in keys
- if entry_key in entry))
+ entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys
+ if entry_key in entry).encode('utf8'))
else:
ids = {}
for entry_key, pyagg_key in keys:
@@ -59,6 +64,7 @@ class AbstractCrawler:
__counter__ = 0
def __init__(self, auth):
+ AbstractCrawler.__counter__ += 1
self.auth = auth
self.session = self.get_session()
self.url = conf.PLATFORM_URL
@@ -80,11 +86,21 @@ class AbstractCrawler:
@wraps(func)
def wrapper(*args, **kwargs):
cls.__counter__ += 1
- result = func(*args, **kwargs)
- cls.__counter__ -= 1
- return result
+ try:
+ return func(*args, **kwargs)
+ except:
+ logger.exception('an error occured while %r', func)
+ finally:
+ cls.__counter__ -= 1
return wrapper
+ @classmethod
+ def get_counter_callback(cls):
+ cls.__counter__ += 1
+ def debump(*args, **kwargs):
+ cls.__counter__ -= 1
+ return debump
+
def query_pyagg(self, method, urn, data=None):
"""A wrapper for internal call, method should be ones you can find
on requests (header, post, get, options, ...), urn the distant
@@ -96,22 +112,30 @@ class AbstractCrawler:
return method("%s%s%s" % (self.url, API_ROOT, urn),
auth=self.auth, data=json.dumps(data,
default=default_handler),
- headers={'Content-Type': 'application/json'})
+ headers={'Content-Type': 'application/json',
+ 'User-Agent': 'pyaggr3g470r'})
@classmethod
- def wait(cls):
+ def wait(cls, max_wait=600):
"See count_on_me, that method will just wait for the counter to be 0"
time.sleep(1)
+ second_waited = 1
while cls.__counter__:
+ if second_waited > max_wait:
+ logger.warn('Exiting after %d seconds, counter at %d',
+ max_wait, cls.__counter__)
+ break
time.sleep(1)
+ second_waited += 1
class PyAggUpdater(AbstractCrawler):
- def __init__(self, feed, entries, headers, auth):
+ def __init__(self, feed, entries, headers, parsed_feed, auth):
self.feed = feed
self.entries = entries
self.headers = headers
+ self.parsed_feed = parsed_feed.get('feed', {})
super(PyAggUpdater, self).__init__(auth)
def to_article(self, entry):
@@ -145,24 +169,36 @@ class PyAggUpdater(AbstractCrawler):
def callback(self, response):
"""Will process the result from the challenge, creating missing article
and updating the feed"""
+ AbstractCrawler.__counter__ -= 1
results = response.result().json()
logger.debug('%r %r - %d entries were not matched and will be created',
self.feed['id'], self.feed['title'], len(results))
for id_to_create in results:
entry = self.to_article(
self.entries[tuple(sorted(id_to_create.items()))])
- logger.info('creating %r - %r', entry['title'], id_to_create)
+ logger.warn('%r %r - creating %r - %r', self.feed['id'],
+ self.feed['title'], entry['title'], id_to_create)
self.query_pyagg('post', 'article', entry)
now = datetime.now()
logger.debug('%r %r - updating feed etag %r last_mod %r',
self.feed['id'], self.feed['title'],
- self.headers.get('etag'), now)
-
- self.query_pyagg('put', 'feed/%d' % self.feed['id'], {'error_count': 0,
- 'etag': self.headers.get('etag', ''),
- 'last_error': '',
- 'last_modified': self.headers.get('last-modified', '')})
+ self.headers.get('etag', ''),
+ self.headers.get('last-modified', ''))
+
+ dico = {'error_count': 0, 'last_error': None,
+ 'etag': self.headers.get('etag', ''),
+ 'last_modified': self.headers.get('last-modified', ''),
+ 'site_link': self.parsed_feed.get('link')}
+ if not self.feed.get('title'):
+ dico['title'] = self.parsed_feed.get('title', '')
+ logger.info('%r %r - pushing feed attrs %r',
+ self.feed['id'], self.feed['title'],
+ {key: "%s -> %s" % (dico[key], self.feed.get(key))
+ for key in dico if dico[key] != self.feed.get(key)})
+ if any([dico[key] != self.feed.get(key) for key in dico]):
+ future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico)
+ future.add_done_callback(self.get_counter_callback())
class FeedCrawler(AbstractCrawler):
@@ -174,13 +210,15 @@ class FeedCrawler(AbstractCrawler):
def clean_feed(self):
"""Will reset the errors counters on a feed that have known errors"""
if self.feed.get('error_count') or self.feed.get('last_error'):
- self.query_pyagg('put', 'feed/%d' % self.feed['id'],
- {'error_count': 0, 'last_error': ''})
+ future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
+ {'error_count': 0, 'last_error': ''})
+ future.add_done_callback(self.get_counter_callback())
@AbstractCrawler.count_on_me
def callback(self, response):
"""will fetch the feed and interprete results (304, etag) or will
challenge pyagg to compare gotten entries with existing ones"""
+ AbstractCrawler.__counter__ -= 1
try:
response = response.result()
response.raise_for_status()
@@ -189,23 +227,40 @@ class FeedCrawler(AbstractCrawler):
logger.warn('%r %r - an error occured while fetching feed; bumping'
' error count to %r', self.feed['id'],
self.feed['title'], error_count)
- self.query_pyagg('put', 'feed/%d' % self.feed['id'],
- {'error_count': error_count,
- 'last_error': str(error)})
+ future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
+ {'error_count': error_count,
+ 'last_error': str(error)})
+ future.add_done_callback(self.get_counter_callback())
return
if response.status_code == 304:
logger.info("%r %r - feed responded with 304",
- self.feed['id'], self.feed['title'])
+ self.feed['id'], self.feed['title'])
self.clean_feed()
return
- if self.feed['etag'] and response.headers.get('etag') \
- and response.headers.get('etag') == self.feed['etag']:
- logger.info("%r %r - feed responded with same etag (%d)",
- self.feed['id'], self.feed['title'],
- response.status_code)
+ if 'etag' not in response.headers:
+ logger.debug('%r %r - manually generating etag',
+ self.feed['id'], self.feed['title'])
+ response.headers['etag'] = 'pyagg/"%s"' % to_hash(response.text)
+ if response.headers['etag'] and self.feed['etag'] \
+ and response.headers['etag'] == self.feed['etag']:
+ if 'pyagg' in self.feed['etag']:
+ logger.info("%r %r - calculated hash matches (%d)",
+ self.feed['id'], self.feed['title'],
+ response.status_code)
+ else:
+ logger.info("%r %r - feed responded with same etag (%d)",
+ self.feed['id'], self.feed['title'],
+ response.status_code)
self.clean_feed()
return
+ else:
+ logger.debug('%r %r - etag mismatch %r != %r',
+ self.feed['id'], self.feed['title'],
+ response.headers['etag'], self.feed['etag'])
+ logger.info('%r %r - cache validation failed, challenging entries',
+ self.feed['id'], self.feed['title'])
+
ids, entries = [], {}
parsed_response = feedparser.parse(response.text)
for entry in parsed_response['entries']:
@@ -214,7 +269,8 @@ class FeedCrawler(AbstractCrawler):
logger.debug('%r %r - found %d entries %r',
self.feed['id'], self.feed['title'], len(ids), ids)
future = self.query_pyagg('get', 'articles/challenge', {'ids': ids})
- updater = PyAggUpdater(self.feed, entries, response.headers, self.auth)
+ updater = PyAggUpdater(self.feed, entries, response.headers,
+ parsed_response, self.auth)
future.add_done_callback(updater.callback)
@@ -223,14 +279,15 @@ class CrawlerScheduler(AbstractCrawler):
def __init__(self, username, password):
self.auth = (username, password)
super(CrawlerScheduler, self).__init__(self.auth)
+ AbstractCrawler.__counter__ = 0
def prepare_headers(self, feed):
"""For a known feed, will construct some header dictionnary"""
- headers = {}
- if feed.get('etag', None):
- headers['If-None-Match'] = feed['etag']
+ headers = {'User-Agent': 'pyaggr3g470r/crawler'}
if feed.get('last_modified'):
headers['If-Modified-Since'] = feed['last_modified']
+ if feed.get('etag') and 'pyagg' not in feed['etag']:
+ headers['If-None-Match'] = feed['etag']
logger.debug('%r %r - calculated headers %r',
feed['id'], feed['title'], headers)
return headers
@@ -238,13 +295,14 @@ class CrawlerScheduler(AbstractCrawler):
@AbstractCrawler.count_on_me
def callback(self, response):
"""processes feeds that need to be fetched"""
+ AbstractCrawler.__counter__ -= 1
response = response.result()
response.raise_for_status()
feeds = response.json()
logger.debug('%d to fetch %r', len(feeds), feeds)
for feed in feeds:
- logger.info('%r %r - fetching resources',
- feed['id'], feed['title'])
+ logger.debug('%r %r - fetching resources',
+ feed['id'], feed['title'])
future = self.session.get(feed['link'],
headers=self.prepare_headers(feed))
future.add_done_callback(FeedCrawler(feed, self.auth).callback)
@@ -255,4 +313,5 @@ class CrawlerScheduler(AbstractCrawler):
and launch the whole thing"""
logger.debug('retreving fetchable feed')
future = self.query_pyagg('get', 'feeds/fetchable', kwargs)
+ AbstractCrawler.__counter__ += 1
future.add_done_callback(self.callback)
diff --git a/pyaggr3g470r/models/article.py b/pyaggr3g470r/models/article.py
index f026984a..58cd0384 100644
--- a/pyaggr3g470r/models/article.py
+++ b/pyaggr3g470r/models/article.py
@@ -27,12 +27,10 @@ __copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"
import json
+from bootstrap import db
from datetime import datetime
-from flask import g
from sqlalchemy import asc, desc
-db = g.db
-
class Article(db.Model):
"""
diff --git a/pyaggr3g470r/models/feed.py b/pyaggr3g470r/models/feed.py
index ccc8094f..a36d9573 100644
--- a/pyaggr3g470r/models/feed.py
+++ b/pyaggr3g470r/models/feed.py
@@ -26,12 +26,10 @@ __revision__ = "$Date: 2014/04/12 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"
+from bootstrap import db
from datetime import datetime
-from flask import g
from sqlalchemy import desc
-db = g.db
-
class Feed(db.Model):
"""
diff --git a/pyaggr3g470r/models/role.py b/pyaggr3g470r/models/role.py
index 71497caf..f5a18fdc 100644
--- a/pyaggr3g470r/models/role.py
+++ b/pyaggr3g470r/models/role.py
@@ -26,9 +26,7 @@ __revision__ = "$Date: 2014/04/12 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"
-from flask import g
-
-db = g.db
+from bootstrap import db
class Role(db.Model):
diff --git a/pyaggr3g470r/models/user.py b/pyaggr3g470r/models/user.py
index f2a268db..0bf9fe04 100644
--- a/pyaggr3g470r/models/user.py
+++ b/pyaggr3g470r/models/user.py
@@ -30,11 +30,10 @@ import re
import random
import hashlib
from datetime import datetime
-from flask import g
from werkzeug import generate_password_hash, check_password_hash
from flask.ext.login import UserMixin
-db = g.db
+from bootstrap import db
class User(db.Model, UserMixin):
diff --git a/pyaggr3g470r/templates/home.html b/pyaggr3g470r/templates/home.html
index e6cc2e7c..69ca582b 100644
--- a/pyaggr3g470r/templates/home.html
+++ b/pyaggr3g470r/templates/home.html
@@ -1,5 +1,10 @@
{% extends "layout.html" %}
{% block content %}
+<style>
+ li.feed-commands {display: none; text-align: right;}
+ li.feed-commands > span > a {margin-right: 10px;}
+ li.feed-menu:hover + li.feed-commands, li.feed-commands:hover {display: block;}
+</style>
{% if feeds|count == 0 %}
<div class="col-md-4 col-md-offset-4">
<h1>{{ _("You don't have any feeds.") }}</h1>
@@ -17,7 +22,7 @@
<li class="feed-menu"><a href="{{ gen_url(feed=fid) }}">
{% if feed_id == fid %}<b>{% endif %}
{% if in_error.get(fid, 0) > 0 %}
- <span style="background-color: {{ "red" if in_error[fid] > 2 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }} {{ _("error") }}{% if in_error[fid] > 1 %}s{% endif %}</span>
+ <span style="background-color: {{ "red" if in_error[fid] > 5 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }} {{ _("error") }}{% if in_error[fid] > 1 %}s{% endif %}</span>
{% endif %}
<span id="unread-{{ fid }}" class="badge pull-right">{{ nbunread }}</span>
{{ feeds[fid]|safe }}
@@ -35,7 +40,7 @@
{% for fid, ftitle in feeds|dictsort(case_sensitive=False, by='value') if not fid in unread %}
<li class="feed-menu"><a href="{{ gen_url(feed=fid) }}">
{% if in_error.get(fid, 0) > 0 %}
- <span style="background-color: {{ "red" if in_error[fid] > 2 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }} {{ _("error") }}{% if in_error[fid] > 1 %}s{% endif %}</span>
+ <span style="background-color: {{ "red" if in_error[fid] > 5 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }} {{ _("error") }}{% if in_error[fid] > 1 %}s{% endif %}</span>
{% endif %}
{% if feed_id == fid %}<b>{% endif %}
{{ ftitle|safe }}
diff --git a/pyaggr3g470r/views/views.py b/pyaggr3g470r/views/views.py
index 3b39e9b0..fd970cba 100644
--- a/pyaggr3g470r/views/views.py
+++ b/pyaggr3g470r/views/views.py
@@ -93,7 +93,7 @@ def before_request():
@login_manager.user_loader
def load_user(email):
# Return an instance of the User model
- return controllers.UserController(email).get(email=email)
+ return controllers.UserController().get(email=email)
#
@@ -153,7 +153,7 @@ def login():
form = SigninForm()
if form.validate_on_submit():
- user = controllers.UserController(form.email.data).get(email=form.email.data)
+ user = controllers.UserController().get(email=form.email.data)
login_user(user)
g.user = user
session['email'] = form.email.data
@@ -248,7 +248,7 @@ def home():
.filter(Article.readed == False, Article.user_id == g.user.id)\
.group_by(Article.feed_id).all()
in_error = {feed.id: feed.error_count for feed in
- FeedController(g.user.id).read(error_count__gt=0).all()}
+ FeedController(g.user.id).read(error_count__gt=2).all()}
def gen_url(filter_=filter_, limit=limit, feed=feed_id):
return '?filter_=%s&limit=%s&feed=%d' % (filter_, limit, feed)
return render_template('home.html', gen_url=gen_url, feed_id=feed_id,
@@ -382,7 +382,7 @@ def inactives():
List of inactive feeds.
"""
nb_days = int(request.args.get('nb_days', 365))
- user = controllers.UserController(g.user.email).get(email=g.user.email)
+ user = controllers.UserController(g.user.id).get(email=g.user.email)
today = datetime.datetime.now()
inactives = []
for feed in user.feeds:
@@ -429,7 +429,7 @@ def export_articles():
"""
Export all articles to HTML or JSON.
"""
- user = controllers.UserController(g.user.email).get(id=g.user.id)
+ user = controllers.UserController(g.user.id).get(id=g.user.id)
if request.args.get('format') == "HTML":
# Export to HTML
try:
@@ -439,7 +439,8 @@ def export_articles():
return redirect(redirect_url())
response = make_response(archive_file)
response.headers['Content-Type'] = 'application/x-compressed'
- response.headers['Content-Disposition'] = 'attachment; filename='+archive_file_name
+ response.headers['Content-Disposition'] = 'attachment; filename=%s' \
+ % archive_file_name
elif request.args.get('format') == "JSON":
# Export to JSON
try:
@@ -461,8 +462,9 @@ def export_opml():
"""
Export all feeds to OPML.
"""
- user = controllers.UserController(g.user.email).get(id=g.user.id)
- response = make_response(render_template('opml.xml', user=user, now=datetime.datetime.now()))
+ user = controllers.UserController(g.user.id).get(id=g.user.id)
+ response = make_response(render_template('opml.xml', user=user,
+ now=datetime.datetime.now()))
response.headers['Content-Type'] = 'application/xml'
response.headers['Content-Disposition'] = 'attachment; filename=feeds.opml'
return response
@@ -637,7 +639,7 @@ def profile():
"""
Edit the profile of the currently logged user.
"""
- user = controllers.UserController(g.user.email).get(id=g.user.id)
+ user = controllers.UserController(g.user.id).get(id=g.user.id)
form = ProfileForm()
if request.method == 'POST':
@@ -663,7 +665,7 @@ def delete_account():
"""
Delete the account of the user (with all its data).
"""
- user = controllers.UserController(g.user.email).get(id=g.user.id)
+ user = controllers.UserController(g.user.id).get(id=g.user.id)
if user is not None:
db.session.delete(user)
db.session.commit()
bgstack15