From 8c0c605ea0d59355ece2e4f2755acb535ab7b90d Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Thu, 5 Mar 2015 23:36:52 +0100
Subject: correcting wait counter and reactivating last_retrieved

---
 pyaggr3g470r/controllers/feed.py | 13 ++++++++-----
 pyaggr3g470r/lib/crawler.py      | 41 +++++++++++++++++++++++++++++-----------
 pyaggr3g470r/models/article.py   |  4 +---
 pyaggr3g470r/models/feed.py      |  4 +---
 pyaggr3g470r/models/role.py      |  4 +---
 pyaggr3g470r/models/user.py      |  3 +--
 6 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/pyaggr3g470r/controllers/feed.py b/pyaggr3g470r/controllers/feed.py
index a3cdcddd..b8e28ee6 100644
--- a/pyaggr3g470r/controllers/feed.py
+++ b/pyaggr3g470r/controllers/feed.py
@@ -2,6 +2,9 @@ from datetime import datetime, timedelta
 from .abstract import AbstractController
 from pyaggr3g470r.models import Feed
 
+import logging
+logger = logging.getLogger(__name__)
+
 DEFAULT_MAX_ERROR = 3
 DEFAULT_LIMIT = 5
 
@@ -13,11 +16,11 @@ class FeedController(AbstractController):
         from pyaggr3g470r.controllers import UserController
         now = datetime.now()
         user = UserController(self.user_id).get(id=self.user_id)
-        #max_last = now - timedelta(minutes=user.refresh_rate or 60)
+        max_last = now - timedelta(minutes=user.refresh_rate or 60)
         feeds = [feed for feed in self.read(user_id=self.user_id,
-                error_count__lt=max_error, enabled=True).limit(limit)]
-                #last_retrieved__lt=max_last).limit(limit)]
-        """if feeds:
+                error_count__lt=max_error, enabled=True,
+                last_retrieved__lt=max_last).limit(limit)]
+        if feeds:
             self.update({'id__in': [feed.id for feed in feeds]},
-                        {'last_retrieved': now})"""
+                        {'last_retrieved': now})
         return feeds
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 64ef8b6d..97f14363 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -58,6 +58,7 @@ class AbstractCrawler:
     __counter__ = 0
 
     def __init__(self, auth):
+        AbstractCrawler.__counter__ += 1
        self.auth = auth
        self.session = self.get_session()
        self.url = conf.PLATFORM_URL
@@ -84,6 +85,13 @@ class AbstractCrawler:
             return result
         return wrapper
 
+    @classmethod
+    def get_counter_callback(cls):
+        cls.__counter__ += 1
+        def debump(*args, **kwargs):
+            cls.__counter__ -= 1
+        return debump
+
     def query_pyagg(self, method, urn, data=None):
         """A wrapper for internal calls; method should be one you can find
         on requests (header, post, get, options, ...), urn the distant
@@ -95,13 +103,15 @@ class AbstractCrawler:
         return method("%s%s%s" % (self.url, API_ROOT, urn),
                       auth=self.auth,
                       data=json.dumps(data, default=default_handler),
-                      headers={'Content-Type': 'application/json'})
+                      headers={'Content-Type': 'application/json',
+                               'User-Agent': 'pyaggr3g470r'})
 
     @classmethod
     def wait(cls):
         "See count_on_me; that method will just wait for the counter to be 0"
         time.sleep(1)
         while cls.__counter__:
+            print('running %d' % cls.__counter__)
             time.sleep(1)
 
 
@@ -144,6 +154,7 @@ class PyAggUpdater(AbstractCrawler):
     def callback(self, response):
         """Will process the result from the challenge, creating missing
         articles and updating the feed"""
+        AbstractCrawler.__counter__ -= 1
         results = response.result().json()
         logger.debug('%r %r - %d entries were not matched and will be created',
                      self.feed['id'], self.feed['title'], len(results))
@@ -158,10 +169,12 @@ class PyAggUpdater(AbstractCrawler):
                      self.feed['id'], self.feed['title'],
                      self.headers.get('etag'), now)
 
-        self.query_pyagg('put', 'feed/%d' % self.feed['id'], {'error_count': 0,
-                'etag': self.headers.get('etag', ''),
-                'last_error': '',
-                'last_modified': self.headers.get('last-modified', '')})
+        dico = {'error_count': 0, 'last_error': '',
+                'etag': self.headers.get('etag', ''),
+                'last_modified': self.headers.get('last-modified', '')}
+        if any([dico[key] != self.feed.get(key) for key in dico]):
+            future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico)
+            future.add_done_callback(self.get_counter_callback())
 
 
 class FeedCrawler(AbstractCrawler):
@@ -173,13 +186,15 @@ class FeedCrawler(AbstractCrawler):
     def clean_feed(self):
         """Will reset the error counters on a feed that has known errors"""
         if self.feed.get('error_count') or self.feed.get('last_error'):
-            self.query_pyagg('put', 'feed/%d' % self.feed['id'],
-                             {'error_count': 0, 'last_error': ''})
+            future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
+                                      {'error_count': 0, 'last_error': ''})
+            future.add_done_callback(self.get_counter_callback())
 
     @AbstractCrawler.count_on_me
     def callback(self, response):
         """will fetch the feed and interpret results (304, etag) or will
         challenge pyagg to compare gotten entries with existing ones"""
+        AbstractCrawler.__counter__ -= 1
         try:
             response = response.result()
             response.raise_for_status()
@@ -188,9 +203,10 @@ class FeedCrawler(AbstractCrawler):
             logger.warn('%r %r - an error occurred while fetching feed; bumping'
                         ' error count to %r', self.feed['id'],
                         self.feed['title'], error_count)
-            self.query_pyagg('put', 'feed/%d' % self.feed['id'],
-                             {'error_count': error_count,
-                              'last_error': str(error)})
+            future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
+                                      {'error_count': error_count,
+                                       'last_error': str(error)})
+            future.add_done_callback(self.get_counter_callback())
             return
 
         if response.status_code == 304:
@@ -222,10 +238,11 @@ class CrawlerScheduler(AbstractCrawler):
     def __init__(self, username, password):
         self.auth = (username, password)
         super(CrawlerScheduler, self).__init__(self.auth)
+        AbstractCrawler.__counter__ = 0
 
     def prepare_headers(self, feed):
         """For a known feed, will construct a headers dictionary"""
-        headers = {}
+        headers = {'User-Agent': 'pyaggr3g470r/crawler'}
         if feed.get('etag', None):
             headers['If-None-Match'] = feed['etag']
         if feed.get('last_modified'):
@@ -237,6 +254,7 @@ class CrawlerScheduler(AbstractCrawler):
     @AbstractCrawler.count_on_me
     def callback(self, response):
         """processes feeds that need to be fetched"""
+        AbstractCrawler.__counter__ -= 1
         response = response.result()
         response.raise_for_status()
         feeds = response.json()
@@ -254,4 +272,5 @@ class CrawlerScheduler(AbstractCrawler):
         and launch the whole thing"""
         logger.debug('retrieving fetchable feed')
         future = self.query_pyagg('get', 'feeds/fetchable', kwargs)
+        AbstractCrawler.__counter__ += 1
         future.add_done_callback(self.callback)
diff --git a/pyaggr3g470r/models/article.py b/pyaggr3g470r/models/article.py
index 0466bc35..f8f9d2d8 100644
--- a/pyaggr3g470r/models/article.py
+++ b/pyaggr3g470r/models/article.py
@@ -27,12 +27,10 @@ __copyright__ = "Copyright (c) Cedric Bonhomme"
 __license__ = "GPLv3"
 
 import json
+from bootstrap import db
 from datetime import datetime
-from flask import g
 from sqlalchemy import asc, desc
 
-db = g.db
-
 
 class Article(db.Model):
     """
diff --git a/pyaggr3g470r/models/feed.py b/pyaggr3g470r/models/feed.py
index 24542c28..a37744d6 100644
--- a/pyaggr3g470r/models/feed.py
+++ b/pyaggr3g470r/models/feed.py
@@ -26,12 +26,10 @@ __revision__ = "$Date: 2014/04/12 $"
 __copyright__ = "Copyright (c) Cedric Bonhomme"
 __license__ = "GPLv3"
 
+from bootstrap import db
 from datetime import datetime
-from flask import g
 from sqlalchemy import desc
 
-db = g.db
-
 
 class Feed(db.Model):
     """
diff --git a/pyaggr3g470r/models/role.py b/pyaggr3g470r/models/role.py
index 71497caf..f5a18fdc 100644
--- a/pyaggr3g470r/models/role.py
+++ b/pyaggr3g470r/models/role.py
@@ -26,9 +26,7 @@ __revision__ = "$Date: 2014/04/12 $"
 __copyright__ = "Copyright (c) Cedric Bonhomme"
 __license__ = "GPLv3"
 
-from flask import g
-
-db = g.db
+from bootstrap import db
 
 
 class Role(db.Model):
diff --git a/pyaggr3g470r/models/user.py b/pyaggr3g470r/models/user.py
index f2a268db..0bf9fe04 100644
--- a/pyaggr3g470r/models/user.py
+++ b/pyaggr3g470r/models/user.py
@@ -30,11 +30,10 @@ import re
 import random
 import hashlib
 from datetime import datetime
-from flask import g
 from werkzeug import generate_password_hash, check_password_hash
 from flask.ext.login import UserMixin
 
-db = g.db
+from bootstrap import db
 
 
 class User(db.Model, UserMixin):
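For readers tracing the accounting, the scheme this patch converges on is reference counting around futures: scheduling a request bumps a class-level counter, a done-callback debumps it, and wait() polls until the counter drains to zero so the crawler process does not exit with requests still in flight. Below is a minimal, self-contained sketch of that pattern, not repository code: FakeCrawler, fetch and the ThreadPoolExecutor setup are hypothetical stand-ins for AbstractCrawler, query_pyagg and the requests-futures session used by the crawler.

import time
from concurrent.futures import ThreadPoolExecutor


class FakeCrawler:
    # hypothetical stand-in for AbstractCrawler's class-level counter
    __counter__ = 0

    @classmethod
    def get_counter_callback(cls):
        # bump at scheduling time, before the future can possibly complete
        cls.__counter__ += 1

        def debump(*args, **kwargs):
            # runs as a done-callback once the future has finished
            cls.__counter__ -= 1
        return debump

    @classmethod
    def wait(cls):
        # block until every scheduled future has debumped the counter
        time.sleep(1)
        while cls.__counter__:
            time.sleep(1)


def fetch(url):
    # hypothetical unit of work standing in for an HTTP request
    time.sleep(0.1)
    return url


if __name__ == '__main__':
    executor = ThreadPoolExecutor(max_workers=2)
    for url in ('feed/1', 'feed/2', 'feed/3'):
        callback = FakeCrawler.get_counter_callback()  # counter goes up here
        future = executor.submit(fetch, url)
        future.add_done_callback(callback)  # counter comes back down on completion
    FakeCrawler.wait()  # returns only after all three callbacks have run
    executor.shutdown()

In the patch itself the same bookkeeping is split between the explicit AbstractCrawler.__counter__ adjustments, the count_on_me decorator, and get_counter_callback; the initial sleep(1) in wait() appears to give the first bump time to land before the counter is inspected.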