From 4c14f36ab73f227bdd7ed636667d8035c33986e0 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Wed, 29 Oct 2014 14:32:24 +0100 Subject: global esthetics tweak --- pyaggr3g470r/lib/__init__.py | 0 pyaggr3g470r/lib/client.py | 16 ++++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 pyaggr3g470r/lib/__init__.py create mode 100755 pyaggr3g470r/lib/client.py (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/__init__.py b/pyaggr3g470r/lib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pyaggr3g470r/lib/client.py b/pyaggr3g470r/lib/client.py new file mode 100755 index 00000000..da6b1727 --- /dev/null +++ b/pyaggr3g470r/lib/client.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python +import json +import requests +URL = 'domain.net' + + +def get_client(email, password): + client = requests.session() + client.get(URL + 'api/csrf', verify=False, + data=json.dumps({'email': email, + 'password': password})) + return client + + +def get_articles(client): + return client.get(URL + 'api/v1.0/articles/').json -- cgit From 2849c82255b4b889c7342a0a8fa8a4aecfbe599d Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Sat, 17 Jan 2015 16:50:38 +0100 Subject: a first big refacto of the existing arch --- pyaggr3g470r/lib/exceptions.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 pyaggr3g470r/lib/exceptions.py (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/exceptions.py b/pyaggr3g470r/lib/exceptions.py new file mode 100644 index 00000000..30c71a5c --- /dev/null +++ b/pyaggr3g470r/lib/exceptions.py @@ -0,0 +1,13 @@ +class PyAggError(Exception): + status_code = None + default_message = '' + + +class Forbidden(PyAggError): + status_code = 403 + default_message = 'You do not have the rights to access that resource' + + +class NotFound(PyAggError): + status_code = 404 + default_message = 'Resource was not found' -- cgit From 4f0ad9e442e64f69d420dea4d737805eefaaf981 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Wed, 21 Jan 2015 14:07:00 +0100 Subject: continuing refacto --- pyaggr3g470r/lib/client.py | 6 +++--- pyaggr3g470r/lib/crawler.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 pyaggr3g470r/lib/crawler.py (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/client.py b/pyaggr3g470r/lib/client.py index da6b1727..6b2fc9ae 100755 --- a/pyaggr3g470r/lib/client.py +++ b/pyaggr3g470r/lib/client.py @@ -1,16 +1,16 @@ #!/usr/bin/env python import json import requests -URL = 'domain.net' +import conf def get_client(email, password): client = requests.session() - client.get(URL + 'api/csrf', verify=False, + client.get(conf.PLATFORM_URL + 'api/csrf', verify=False, data=json.dumps({'email': email, 'password': password})) return client def get_articles(client): - return client.get(URL + 'api/v1.0/articles/').json + return client.get(conf.PLATFORM_URL + 'api/v1.0/articles/').json diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py new file mode 100644 index 00000000..1d7fca71 --- /dev/null +++ b/pyaggr3g470r/lib/crawler.py @@ -0,0 +1,11 @@ +import feedparser +import dateutil.parser.parse + + +def get_feed_content(feed): + etag = feed.get('etag', None) + last_modified = None + if feed.get('last_modified'): + last_modified = dateutil.parser.parse(feed['last_modified'])\ + .strftime('%a, %d %b %Y %H:%M:%S %Z') + return feedparser.parse(feed['link'], etag=etag, modified=last_modified) -- cgit From 5572851eca3b2f1bc56aed7232284acc436d2f49 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Sun, 1 Mar 2015 03:20:12 +0100 Subject: new crawler with cache control and error handling --- pyaggr3g470r/lib/crawler.py | 204 ++++++++++++++++++++++++++++++++++++++++++-- pyaggr3g470r/lib/utils.py | 14 +++ 2 files changed, 210 insertions(+), 8 deletions(-) create mode 100644 pyaggr3g470r/lib/utils.py (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 1d7fca71..6697e4c3 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -1,11 +1,199 @@ +import conf +import json +import logging +import requests import feedparser -import dateutil.parser.parse +import dateutil.parser +from datetime import datetime +from concurrent.futures import ThreadPoolExecutor +from requests_futures.sessions import FuturesSession +from pyaggr3g470r.lib.utils import default_handler +logger = logging.getLogger(__name__) -def get_feed_content(feed): - etag = feed.get('etag', None) - last_modified = None - if feed.get('last_modified'): - last_modified = dateutil.parser.parse(feed['last_modified'])\ - .strftime('%a, %d %b %Y %H:%M:%S %Z') - return feedparser.parse(feed['link'], etag=etag, modified=last_modified) + +def extract_id(entry, keys=[('link', 'link'), + ('published', 'retrieved_date'), + ('updated', 'retrieved_date')], force_id=False): + entry_id = entry.get('entry_id') or entry.get('id') + if entry_id: + return {'entry_id': entry_id} + if not entry_id and force_id: + entry_id = hash("".join(entry[entry_key] for _, entry_key in keys + if entry_key in entry)) + else: + ids = {} + for entry_key, pyagg_key in keys: + if entry_key in entry and pyagg_key not in ids: + ids[pyagg_key] = entry[entry_key] + if 'date' in pyagg_key: + ids[pyagg_key] = dateutil.parser.parse(ids[pyagg_key])\ + .isoformat() + return ids + + +class AbstractCrawler: + __session__ = None + + def __init__(self, auth): + self.auth = auth + self.session = self.get_session() + self.url = conf.PLATFORM_URL + + @classmethod + def get_session(cls): + if cls.__session__ is None: + cls.__session__ = FuturesSession( + executor=ThreadPoolExecutor(max_workers=conf.NB_WORKER)) + cls.__session__.verify = False + return cls.__session__ + + def query_pyagg(self, method, urn, data=None): + if data is None: + data = {} + method = getattr(self.session, method) + return method("%sapi/v1.0/%s" % (self.url, urn), + auth=self.auth, data=json.dumps(data, + default=default_handler), + headers={'Content-Type': 'application/json'}) + + +class PyAggUpdater(AbstractCrawler): + + def __init__(self, feed, entries, headers, auth): + self.feed = feed + self.entries = entries + self.headers = headers + super(PyAggUpdater, self).__init__(auth) + + def to_article(self, entry): + date = datetime.now() + + for date_key in ('published', 'updated'): + if entry.get(date_key): + try: + date = dateutil.parser.parse(entry[date_key]) + except Exception: + pass + else: + break + content = '' + if entry.get('content'): + content = entry['content'][0]['value'] + elif entry.get('summary'): + content = entry['summary'] + + return {'feed_id': self.feed['id'], + 'entry_id': extract_id(entry).get('entry_id', None), + 'link': entry.get('link', self.feed['site_link']), + 'title': entry.get('title', 'No title'), + 'readed': False, 'like': False, + 'content': content, + 'retrieved_date': date.isoformat(), + 'date': date.isoformat()} + + def callback(self, response): + try: + results = response.result().json() + except Exception: + logger.exception('something went wront with feed %r %r %r %r', + self.feed, self.headers, response.result(), + getattr(response.result(), 'data', None)) + return + logger.debug('%r %r - %d entries were not matched', + self.feed['id'], self.feed['title'], len(results)) + for id_to_create in results: + entry = self.entries[tuple(sorted(id_to_create.items()))] + try: + logger.debug('creating %r - %r', entry['title'], id_to_create) + self.to_article(entry) + except: + logger.exception('%r %r %r something failed when parsing %r', + self.feed['title'], self.feed['id'], + self.feed['link'], entry) + self.query_pyagg('post', 'article', self.to_article(entry)) + + now = datetime.now() + logger.debug('%r %r - updating feed etag %r last_mod %r', + self.feed['id'], self.feed['title'], + self.headers.get('etag'), now) + + self.query_pyagg('put', 'feed/%d' % self.feed['id'], {'error_count': 0, + 'etag': self.headers.get('etag', ''), + 'last_modified': self.headers.get('last-modified', '')}) + + +class FeedCrawler(AbstractCrawler): + + def __init__(self, feed, auth): + self.feed = feed + super(FeedCrawler, self).__init__(auth) + + def callback(self, response): + try: + response = response.result() + response.raise_for_status() + except Exception as error: + error_count = self.feed['error_count'] + 1 + logger.warn('%r %r - an error occured while fetching feed; bumping' + ' error count to %r', self.feed['title'], + self.feed['id'], error_count) + self.query_pyagg('put', 'feed/%d' % self.feed['id'], + {'error_count': error_count, + 'last_error': str(error)}) + return + + if response.status_code == 304: + logger.debug("%r %r - feed responded with 304", + self.feed['id'], self.feed['title']) + return + if self.feed['etag'] and response.headers.get('etag') \ + and response.headers.get('etag') == self.feed['etag']: + logger.debug("%r %r - feed responded with same etag (%d) %r", + self.feed['id'], self.feed['title'], + response.status_code, self.feed['link']) + return + ids, entries = [], {} + parsed_response = feedparser.parse(response.text) + for entry in parsed_response['entries']: + entries[tuple(sorted(extract_id(entry).items()))] = entry + ids.append(extract_id(entry)) + logger.debug('%r %r - found %d entries %r', + self.feed['id'], self.feed['title'], len(ids), ids) + future = self.query_pyagg('get', 'articles/challenge', {'ids': ids}) + updater = PyAggUpdater(self.feed, entries, response.headers, self.auth) + future.add_done_callback(updater.callback) + + +class CrawlerScheduler(AbstractCrawler): + + def __init__(self, username, password): + self.auth = (username, password) + super(CrawlerScheduler, self).__init__(self.auth) + + def prepare_headers(self, feed): + headers = {} + if feed.get('etag', None): + headers['If-None-Match'] = feed['etag'] + elif feed.get('last_modified'): + headers['If-Modified-Since'] = feed['last_modified'] + logger.debug('%r %r - calculated headers %r', + feed['id'], feed['title'], headers) + return headers + + def callback(self, response): + response = response.result() + response.raise_for_status() + feeds = response.json() + logger.debug('%d to fetch %r', len(feeds), feeds) + for feed in feeds: + logger.info('%r %r - fetching resources', + feed['id'], feed['title']) + future = self.session.get(feed['link'], + headers=self.prepare_headers(feed)) + future.add_done_callback(FeedCrawler(feed, self.auth).callback) + + def run(self): + logger.debug('retreving fetchable feed') + future = self.query_pyagg('get', 'feeds/fetchable') + future.add_done_callback(self.callback) diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py new file mode 100644 index 00000000..a4f4b3ec --- /dev/null +++ b/pyaggr3g470r/lib/utils.py @@ -0,0 +1,14 @@ +import types + +def default_handler(obj): + """JSON handler for default query formatting""" + if hasattr(obj, 'isoformat'): + return obj.isoformat() + if hasattr(obj, 'dump'): + return obj.dump() + if isinstance(obj, (set, frozenset, types.GeneratorType)): + return list(obj) + if isinstance(obj, BaseException): + return str(obj) + raise TypeError("Object of type %s with value of %r " + "is not JSON serializable" % (type(obj), obj)) -- cgit From a4fb151ea53d8054cc8e3fb309395c8fa0e23aaf Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Sun, 1 Mar 2015 14:08:02 +0100 Subject: fixing/restoring logging level --- pyaggr3g470r/lib/client.py | 16 ---------------- pyaggr3g470r/lib/crawler.py | 7 ++++--- pyaggr3g470r/lib/exceptions.py | 13 ------------- 3 files changed, 4 insertions(+), 32 deletions(-) delete mode 100755 pyaggr3g470r/lib/client.py delete mode 100644 pyaggr3g470r/lib/exceptions.py (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/client.py b/pyaggr3g470r/lib/client.py deleted file mode 100755 index 6b2fc9ae..00000000 --- a/pyaggr3g470r/lib/client.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python -import json -import requests -import conf - - -def get_client(email, password): - client = requests.session() - client.get(conf.PLATFORM_URL + 'api/csrf', verify=False, - data=json.dumps({'email': email, - 'password': password})) - return client - - -def get_articles(client): - return client.get(conf.PLATFORM_URL + 'api/v1.0/articles/').json diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 6697e4c3..de770934 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -10,6 +10,7 @@ from requests_futures.sessions import FuturesSession from pyaggr3g470r.lib.utils import default_handler logger = logging.getLogger(__name__) +API_ROOT = "api/v2.0/" def extract_id(entry, keys=[('link', 'link'), @@ -52,7 +53,7 @@ class AbstractCrawler: if data is None: data = {} method = getattr(self.session, method) - return method("%sapi/v1.0/%s" % (self.url, urn), + return method("%s%s%s" % (self.url, API_ROOT, urn), auth=self.auth, data=json.dumps(data, default=default_handler), headers={'Content-Type': 'application/json'}) @@ -193,7 +194,7 @@ class CrawlerScheduler(AbstractCrawler): headers=self.prepare_headers(feed)) future.add_done_callback(FeedCrawler(feed, self.auth).callback) - def run(self): + def run(self, **kwargs): logger.debug('retreving fetchable feed') - future = self.query_pyagg('get', 'feeds/fetchable') + future = self.query_pyagg('get', 'feeds/fetchable', kwargs) future.add_done_callback(self.callback) diff --git a/pyaggr3g470r/lib/exceptions.py b/pyaggr3g470r/lib/exceptions.py deleted file mode 100644 index 30c71a5c..00000000 --- a/pyaggr3g470r/lib/exceptions.py +++ /dev/null @@ -1,13 +0,0 @@ -class PyAggError(Exception): - status_code = None - default_message = '' - - -class Forbidden(PyAggError): - status_code = 403 - default_message = 'You do not have the rights to access that resource' - - -class NotFound(PyAggError): - status_code = 404 - default_message = 'Resource was not found' -- cgit From ed305ec39e35c8e66b4554df13e6342a611d8125 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Mon, 2 Mar 2015 23:08:16 +0100 Subject: multi crawler update * adding action in the manager * removing debug code and adapting logging level * adding a wait method so it can be ran through cli --- pyaggr3g470r/lib/crawler.py | 53 +++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index de770934..5525c7de 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -1,9 +1,11 @@ +import time import conf import json import logging import requests import feedparser import dateutil.parser +from functools import wraps from datetime import datetime from concurrent.futures import ThreadPoolExecutor from requests_futures.sessions import FuturesSession @@ -35,6 +37,7 @@ def extract_id(entry, keys=[('link', 'link'), class AbstractCrawler: __session__ = None + __counter__ = 0 def __init__(self, auth): self.auth = auth @@ -49,6 +52,16 @@ class AbstractCrawler: cls.__session__.verify = False return cls.__session__ + @classmethod + def count_on_me(cls, func): + @wraps(func) + def wrapper(*args, **kwargs): + cls.__counter__ += 1 + result = func(*args, **kwargs) + cls.__counter__ -= 1 + return result + return wrapper + def query_pyagg(self, method, urn, data=None): if data is None: data = {} @@ -58,6 +71,12 @@ class AbstractCrawler: default=default_handler), headers={'Content-Type': 'application/json'}) + @classmethod + def wait(self): + time.sleep(1) + while self.__counter__: + time.sleep(1) + class PyAggUpdater(AbstractCrawler): @@ -93,25 +112,14 @@ class PyAggUpdater(AbstractCrawler): 'retrieved_date': date.isoformat(), 'date': date.isoformat()} + @AbstractCrawler.count_on_me def callback(self, response): - try: - results = response.result().json() - except Exception: - logger.exception('something went wront with feed %r %r %r %r', - self.feed, self.headers, response.result(), - getattr(response.result(), 'data', None)) - return - logger.debug('%r %r - %d entries were not matched', + results = response.result().json() + logger.debug('%r %r - %d entries were not matched and will be created', self.feed['id'], self.feed['title'], len(results)) for id_to_create in results: entry = self.entries[tuple(sorted(id_to_create.items()))] - try: - logger.debug('creating %r - %r', entry['title'], id_to_create) - self.to_article(entry) - except: - logger.exception('%r %r %r something failed when parsing %r', - self.feed['title'], self.feed['id'], - self.feed['link'], entry) + logger.info('creating %r - %r', entry['title'], id_to_create) self.query_pyagg('post', 'article', self.to_article(entry)) now = datetime.now() @@ -130,6 +138,7 @@ class FeedCrawler(AbstractCrawler): self.feed = feed super(FeedCrawler, self).__init__(auth) + @AbstractCrawler.count_on_me def callback(self, response): try: response = response.result() @@ -137,22 +146,22 @@ class FeedCrawler(AbstractCrawler): except Exception as error: error_count = self.feed['error_count'] + 1 logger.warn('%r %r - an error occured while fetching feed; bumping' - ' error count to %r', self.feed['title'], - self.feed['id'], error_count) + ' error count to %r', self.feed['id'], + self.feed['title'], error_count) self.query_pyagg('put', 'feed/%d' % self.feed['id'], {'error_count': error_count, 'last_error': str(error)}) return if response.status_code == 304: - logger.debug("%r %r - feed responded with 304", + logger.info("%r %r - feed responded with 304", self.feed['id'], self.feed['title']) return if self.feed['etag'] and response.headers.get('etag') \ and response.headers.get('etag') == self.feed['etag']: - logger.debug("%r %r - feed responded with same etag (%d) %r", + logger.info("%r %r - feed responded with same etag (%d)", self.feed['id'], self.feed['title'], - response.status_code, self.feed['link']) + response.status_code) return ids, entries = [], {} parsed_response = feedparser.parse(response.text) @@ -176,12 +185,13 @@ class CrawlerScheduler(AbstractCrawler): headers = {} if feed.get('etag', None): headers['If-None-Match'] = feed['etag'] - elif feed.get('last_modified'): + if feed.get('last_modified'): headers['If-Modified-Since'] = feed['last_modified'] logger.debug('%r %r - calculated headers %r', feed['id'], feed['title'], headers) return headers + @AbstractCrawler.count_on_me def callback(self, response): response = response.result() response.raise_for_status() @@ -194,6 +204,7 @@ class CrawlerScheduler(AbstractCrawler): headers=self.prepare_headers(feed)) future.add_done_callback(FeedCrawler(feed, self.auth).callback) + @AbstractCrawler.count_on_me def run(self, **kwargs): logger.debug('retreving fetchable feed') future = self.query_pyagg('get', 'feeds/fetchable', kwargs) -- cgit From 643f4590445928b7ac568b922f1edb6f52765b68 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Tue, 3 Mar 2015 00:02:09 +0100 Subject: displaying feed errors in ui --- pyaggr3g470r/lib/crawler.py | 1 + 1 file changed, 1 insertion(+) (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 5525c7de..5e828dbf 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -129,6 +129,7 @@ class PyAggUpdater(AbstractCrawler): self.query_pyagg('put', 'feed/%d' % self.feed['id'], {'error_count': 0, 'etag': self.headers.get('etag', ''), + 'last_error': '', 'last_modified': self.headers.get('last-modified', '')}) -- cgit From cafcd09ff278aa775adafce936a54ce4d2aec5fd Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Tue, 3 Mar 2015 00:23:34 +0100 Subject: cleaning feed on okay download resulting in 304 --- pyaggr3g470r/lib/crawler.py | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 5e828dbf..1b9f5d60 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -139,6 +139,11 @@ class FeedCrawler(AbstractCrawler): self.feed = feed super(FeedCrawler, self).__init__(auth) + def clean_feed(self): + if self.feed.get('error_count') or self.feed.get('last_error'): + self.query_pyagg('put', 'feed/%d' % self.feed['id'], + {'error_count': 0, 'last_error': ''}) + @AbstractCrawler.count_on_me def callback(self, response): try: @@ -157,12 +162,14 @@ class FeedCrawler(AbstractCrawler): if response.status_code == 304: logger.info("%r %r - feed responded with 304", self.feed['id'], self.feed['title']) + self.clean_feed() return if self.feed['etag'] and response.headers.get('etag') \ and response.headers.get('etag') == self.feed['etag']: logger.info("%r %r - feed responded with same etag (%d)", self.feed['id'], self.feed['title'], response.status_code) + self.clean_feed() return ids, entries = [], {} parsed_response = feedparser.parse(response.text) -- cgit From 631fc8a3ebaf74dc609a445dc0b11b73eb0eab02 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Tue, 3 Mar 2015 18:12:11 +0100 Subject: adding some docstring --- pyaggr3g470r/lib/crawler.py | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 1b9f5d60..c00b0dbf 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -1,3 +1,17 @@ +""" +Here's a sum up on how it works : + +CrawlerScheduler.run + will retreive a list of feeds to be refreshed and pass result to +CrawlerScheduler.callback + which will retreive each feed and treat result with +FeedCrawler.callback + which will interprete the result (status_code, etag) collect ids + and match them agaisnt pyagg which will cause +PyAggUpdater.callback + to create the missing entries +""" + import time import conf import json @@ -18,6 +32,10 @@ API_ROOT = "api/v2.0/" def extract_id(entry, keys=[('link', 'link'), ('published', 'retrieved_date'), ('updated', 'retrieved_date')], force_id=False): + """For a given entry will return a dict that allows to identify it. The + dict will be constructed on the uid of the entry. if that identifier is + absent, the dict will be constructed upon the values of "keys". + """ entry_id = entry.get('entry_id') or entry.get('id') if entry_id: return {'entry_id': entry_id} @@ -46,6 +64,7 @@ class AbstractCrawler: @classmethod def get_session(cls): + """methods that allows us to treat session as a singleton""" if cls.__session__ is None: cls.__session__ = FuturesSession( executor=ThreadPoolExecutor(max_workers=conf.NB_WORKER)) @@ -54,6 +73,9 @@ class AbstractCrawler: @classmethod def count_on_me(cls, func): + """A basic decorator which will count +1 at the begining of a call + and -1 at the end. It kinda allows us to wait for the __counter__ value + to be 0, meaning nothing is done anymore.""" @wraps(func) def wrapper(*args, **kwargs): cls.__counter__ += 1 @@ -63,6 +85,10 @@ class AbstractCrawler: return wrapper def query_pyagg(self, method, urn, data=None): + """A wrapper for internal call, method should be ones you can find + on requests (header, post, get, options, ...), urn the distant + resources you want to access on pyagg, and data, the data you wanna + transmit.""" if data is None: data = {} method = getattr(self.session, method) @@ -72,9 +98,10 @@ class AbstractCrawler: headers={'Content-Type': 'application/json'}) @classmethod - def wait(self): + def wait(cls): + "See count_on_me, that method will just wait for the counter to be 0" time.sleep(1) - while self.__counter__: + while cls.__counter__: time.sleep(1) @@ -87,6 +114,7 @@ class PyAggUpdater(AbstractCrawler): super(PyAggUpdater, self).__init__(auth) def to_article(self, entry): + "Safe method to transorm a feedparser entry into an article" date = datetime.now() for date_key in ('published', 'updated'): @@ -114,6 +142,8 @@ class PyAggUpdater(AbstractCrawler): @AbstractCrawler.count_on_me def callback(self, response): + """Will process the result from the challenge, creating missing article + and updating the feed""" results = response.result().json() logger.debug('%r %r - %d entries were not matched and will be created', self.feed['id'], self.feed['title'], len(results)) @@ -140,12 +170,15 @@ class FeedCrawler(AbstractCrawler): super(FeedCrawler, self).__init__(auth) def clean_feed(self): + """Will reset the errors counters on a feed that have known errors""" if self.feed.get('error_count') or self.feed.get('last_error'): self.query_pyagg('put', 'feed/%d' % self.feed['id'], {'error_count': 0, 'last_error': ''}) @AbstractCrawler.count_on_me def callback(self, response): + """will fetch the feed and interprete results (304, etag) or will + challenge pyagg to compare gotten entries with existing ones""" try: response = response.result() response.raise_for_status() @@ -190,6 +223,7 @@ class CrawlerScheduler(AbstractCrawler): super(CrawlerScheduler, self).__init__(self.auth) def prepare_headers(self, feed): + """For a known feed, will construct some header dictionnary""" headers = {} if feed.get('etag', None): headers['If-None-Match'] = feed['etag'] @@ -201,6 +235,7 @@ class CrawlerScheduler(AbstractCrawler): @AbstractCrawler.count_on_me def callback(self, response): + """processes feeds that need to be fetched""" response = response.result() response.raise_for_status() feeds = response.json() @@ -214,6 +249,8 @@ class CrawlerScheduler(AbstractCrawler): @AbstractCrawler.count_on_me def run(self, **kwargs): + """entry point, will retreive feeds to be fetch + and launch the whole thing""" logger.debug('retreving fetchable feed') future = self.query_pyagg('get', 'feeds/fetchable', kwargs) future.add_done_callback(self.callback) -- cgit From 8e515cbf172f1aa7da37882fc3973f5a2dd70dd0 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Tue, 3 Mar 2015 23:05:21 +0100 Subject: last fixes --- pyaggr3g470r/lib/crawler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index c00b0dbf..64ef8b6d 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -148,9 +148,10 @@ class PyAggUpdater(AbstractCrawler): logger.debug('%r %r - %d entries were not matched and will be created', self.feed['id'], self.feed['title'], len(results)) for id_to_create in results: - entry = self.entries[tuple(sorted(id_to_create.items()))] + entry = self.to_article( + self.entries[tuple(sorted(id_to_create.items()))]) logger.info('creating %r - %r', entry['title'], id_to_create) - self.query_pyagg('post', 'article', self.to_article(entry)) + self.query_pyagg('post', 'article', entry) now = datetime.now() logger.debug('%r %r - updating feed etag %r last_mod %r', -- cgit