about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- pyaggr3g470r/controllers/feed.py 13
-rw-r--r-- pyaggr3g470r/lib/crawler.py 41
-rw-r--r-- pyaggr3g470r/models/article.py 4
-rw-r--r-- pyaggr3g470r/models/feed.py 4
-rw-r--r-- pyaggr3g470r/models/role.py 4
-rw-r--r-- pyaggr3g470r/models/user.py 3
6 files changed, 42 insertions, 27 deletions
diff --git a/pyaggr3g470r/controllers/feed.py b/pyaggr3g470r/controllers/feed.py
index a3cdcddd..b8e28ee6 100644
--- a/pyaggr3g470r/controllers/feed.py
+++ b/pyaggr3g470r/controllers/feed.py
@@ -2,6 +2,9 @@ from datetime import datetime, timedelta
from .abstract import AbstractController
from pyaggr3g470r.models import Feed
+import logging
+logger = logging.getLogger(__name__)
+
DEFAULT_MAX_ERROR = 3
DEFAULT_LIMIT = 5
@@ -13,11 +16,11 @@ class FeedController(AbstractController):
from pyaggr3g470r.controllers import UserController
now = datetime.now()
user = UserController(self.user_id).get(id=self.user_id)
- #max_last = now - timedelta(minutes=user.refresh_rate or 60)
+ max_last = now - timedelta(minutes=user.refresh_rate or 60)
feeds = [feed for feed in self.read(user_id=self.user_id,
- error_count__lt=max_error, enabled=True).limit(limit)]
- #last_retrieved__lt=max_last).limit(limit)]
- """if feeds:
+ error_count__lt=max_error, enabled=True,
+ last_retrieved__lt=max_last).limit(limit)]
+ if feeds:
self.update({'id__in': [feed.id for feed in feeds]},
- {'last_retrieved': now})"""
+ {'last_retrieved': now})
return feeds
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 64ef8b6d..97f14363 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -58,6 +58,7 @@ class AbstractCrawler:
__counter__ = 0
def __init__(self, auth):
+ AbstractCrawler.__counter__ += 1
self.auth = auth
self.session = self.get_session()
self.url = conf.PLATFORM_URL
@@ -84,6 +85,13 @@ class AbstractCrawler:
return result
return wrapper
+ @classmethod
+ def get_counter_callback(cls):
+ cls.__counter__ += 1
+ def debump(*args, **kwargs):
+ cls.__counter__ -= 1
+ return debump
+
def query_pyagg(self, method, urn, data=None):
"""A wrapper for internal call, method should be ones you can find
on requests (header, post, get, options, ...), urn the distant
@@ -95,13 +103,15 @@ class AbstractCrawler:
return method("%s%s%s" % (self.url, API_ROOT, urn),
auth=self.auth, data=json.dumps(data,
default=default_handler),
- headers={'Content-Type': 'application/json'})
+ headers={'Content-Type': 'application/json',
+ 'User-Agent': 'pyaggr3g470r'})
@classmethod
def wait(cls):
"See count_on_me, that method will just wait for the counter to be 0"
time.sleep(1)
while cls.__counter__:
+ print('running %d' % cls.__counter__)
time.sleep(1)
@@ -144,6 +154,7 @@ class PyAggUpdater(AbstractCrawler):
def callback(self, response):
"""Will process the result from the challenge, creating missing article
and updating the feed"""
+ AbstractCrawler.__counter__ -= 1
results = response.result().json()
logger.debug('%r %r - %d entries were not matched and will be created',
self.feed['id'], self.feed['title'], len(results))
@@ -158,10 +169,12 @@ class PyAggUpdater(AbstractCrawler):
self.feed['id'], self.feed['title'],
self.headers.get('etag'), now)
- self.query_pyagg('put', 'feed/%d' % self.feed['id'], {'error_count': 0,
- 'etag': self.headers.get('etag', ''),
- 'last_error': '',
- 'last_modified': self.headers.get('last-modified', '')})
+ dico = {'error_count': 0, 'last_error': '',
+ 'etag': self.headers.get('etag', ''),
+ 'last_modified': self.headers.get('last-modified', '')}
+ if any([dico[key] == self.feed.get(key) for key in dico]):
+ future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico)
+ future.add_done_callback(self.get_counter_callback())
class FeedCrawler(AbstractCrawler):
@@ -173,13 +186,15 @@ class FeedCrawler(AbstractCrawler):
def clean_feed(self):
"""Will reset the errors counters on a feed that have known errors"""
if self.feed.get('error_count') or self.feed.get('last_error'):
- self.query_pyagg('put', 'feed/%d' % self.feed['id'],
- {'error_count': 0, 'last_error': ''})
+ future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
+ {'error_count': 0, 'last_error': ''})
+ future.add_done_callback(self.get_counter_callback())
@AbstractCrawler.count_on_me
def callback(self, response):
"""will fetch the feed and interprete results (304, etag) or will
challenge pyagg to compare gotten entries with existing ones"""
+ AbstractCrawler.__counter__ -= 1
try:
response = response.result()
response.raise_for_status()
@@ -188,9 +203,10 @@ class FeedCrawler(AbstractCrawler):
logger.warn('%r %r - an error occured while fetching feed; bumping'
' error count to %r', self.feed['id'],
self.feed['title'], error_count)
- self.query_pyagg('put', 'feed/%d' % self.feed['id'],
- {'error_count': error_count,
- 'last_error': str(error)})
+ future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
+ {'error_count': error_count,
+ 'last_error': str(error)})
+ future.add_done_callback(self.get_counter_callback())
return
if response.status_code == 304:
@@ -222,10 +238,11 @@ class CrawlerScheduler(AbstractCrawler):
def __init__(self, username, password):
self.auth = (username, password)
super(CrawlerScheduler, self).__init__(self.auth)
+ AbstractCrawler.__counter__ = 0
def prepare_headers(self, feed):
"""For a known feed, will construct some header dictionnary"""
- headers = {}
+ headers = {'User-Agent': 'pyaggr3g470r/crawler'}
if feed.get('etag', None):
headers['If-None-Match'] = feed['etag']
if feed.get('last_modified'):
@@ -237,6 +254,7 @@ class CrawlerScheduler(AbstractCrawler):
@AbstractCrawler.count_on_me
def callback(self, response):
"""processes feeds that need to be fetched"""
+ AbstractCrawler.__counter__ -= 1
response = response.result()
response.raise_for_status()
feeds = response.json()
@@ -254,4 +272,5 @@ class CrawlerScheduler(AbstractCrawler):
and launch the whole thing"""
logger.debug('retreving fetchable feed')
future = self.query_pyagg('get', 'feeds/fetchable', kwargs)
+ AbstractCrawler.__counter__ += 1
future.add_done_callback(self.callback)
diff --git a/pyaggr3g470r/models/article.py b/pyaggr3g470r/models/article.py
index 0466bc35..f8f9d2d8 100644
--- a/pyaggr3g470r/models/article.py
+++ b/pyaggr3g470r/models/article.py
@@ -27,12 +27,10 @@ __copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"
import json
+from bootstrap import db
from datetime import datetime
-from flask import g
from sqlalchemy import asc, desc
-db = g.db
-
class Article(db.Model):
"""
diff --git a/pyaggr3g470r/models/feed.py b/pyaggr3g470r/models/feed.py
index 24542c28..a37744d6 100644
--- a/pyaggr3g470r/models/feed.py
+++ b/pyaggr3g470r/models/feed.py
@@ -26,12 +26,10 @@ __revision__ = "$Date: 2014/04/12 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"
+from bootstrap import db
from datetime import datetime
-from flask import g
from sqlalchemy import desc
-db = g.db
-
class Feed(db.Model):
"""
diff --git a/pyaggr3g470r/models/role.py b/pyaggr3g470r/models/role.py
index 71497caf..f5a18fdc 100644
--- a/pyaggr3g470r/models/role.py
+++ b/pyaggr3g470r/models/role.py
@@ -26,9 +26,7 @@ __revision__ = "$Date: 2014/04/12 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"
-from flask import g
-
-db = g.db
+from bootstrap import db
class Role(db.Model):
diff --git a/pyaggr3g470r/models/user.py b/pyaggr3g470r/models/user.py
index f2a268db..0bf9fe04 100644
--- a/pyaggr3g470r/models/user.py
+++ b/pyaggr3g470r/models/user.py
@@ -30,11 +30,10 @@ import re
import random
import hashlib
from datetime import datetime
-from flask import g
from werkzeug import generate_password_hash, check_password_hash
from flask.ext.login import UserMixin
-db = g.db
+from bootstrap import db
class User(db.Model, UserMixin):
bgstack15