diff options
author | François Schmidts <francois.schmidts@gmail.com> | 2015-03-06 11:07:43 +0100 |
---|---|---|
committer | François Schmidts <francois.schmidts@gmail.com> | 2015-03-06 11:07:43 +0100 |
commit | 822e59f043ba7b12962c5e65f59f2fd33a339f54 (patch) | |
tree | e92c92daa8e81d7b52640d301dc615d9911ce902 /pyaggr3g470r | |
parent | correcting wait counter and reactivating last_retrieved (diff) | |
download | newspipe-822e59f043ba7b12962c5e65f59f2fd33a339f54.tar.gz newspipe-822e59f043ba7b12962c5e65f59f2fd33a339f54.tar.bz2 newspipe-822e59f043ba7b12962c5e65f59f2fd33a339f54.zip |
better crawling control
Diffstat (limited to 'pyaggr3g470r')
-rw-r--r-- | pyaggr3g470r/controllers/feed.py | 15 | ||||
-rw-r--r-- | pyaggr3g470r/lib/crawler.py | 12 |
2 files changed, 18 insertions, 9 deletions
diff --git a/pyaggr3g470r/controllers/feed.py b/pyaggr3g470r/controllers/feed.py index b8e28ee6..ff496efc 100644 --- a/pyaggr3g470r/controllers/feed.py +++ b/pyaggr3g470r/controllers/feed.py @@ -1,10 +1,10 @@ +import logging from datetime import datetime, timedelta + from .abstract import AbstractController from pyaggr3g470r.models import Feed -import logging logger = logging.getLogger(__name__) - DEFAULT_MAX_ERROR = 3 DEFAULT_LIMIT = 5 @@ -12,14 +12,19 @@ DEFAULT_LIMIT = 5 class FeedController(AbstractController): _db_cls = Feed + def list_late(self, max_last, max_error=DEFAULT_MAX_ERROR, limit=DEFAULT_LIMIT): + return [feed for feed in self.read( + error_count__lt=max_error, enabled=True, + last_retrieved__lt=max_last) + .order_by('Feed.last_retrieved') + .limit(limit)] + def list_fetchable(self, max_error=DEFAULT_MAX_ERROR, limit=DEFAULT_LIMIT): from pyaggr3g470r.controllers import UserController now = datetime.now() user = UserController(self.user_id).get(id=self.user_id) max_last = now - timedelta(minutes=user.refresh_rate or 60) - feeds = [feed for feed in self.read(user_id=self.user_id, - error_count__lt=max_error, enabled=True, - last_retrieved__lt=max_last).limit(limit)] + feeds = self.list_late(max_last, max_error, limit) if feeds: self.update({'id__in': [feed.id for feed in feeds]}, {'last_retrieved': now}) diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 97f14363..8e61b7cf 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -111,16 +111,16 @@ class AbstractCrawler: "See count_on_me, that method will just wait for the counter to be 0" time.sleep(1) while cls.__counter__: - print('running %d' % cls.__counter__) time.sleep(1) class PyAggUpdater(AbstractCrawler): - def __init__(self, feed, entries, headers, auth): + def __init__(self, feed, entries, headers, parsed_feed, auth): self.feed = feed self.entries = entries self.headers = headers + self.parsed_feed = parsed_feed.get('feed', {}) 
super(PyAggUpdater, self).__init__(auth) def to_article(self, entry): @@ -171,7 +171,10 @@ class PyAggUpdater(AbstractCrawler): dico = {'error_count': 0, 'last_error': '', 'etag': self.headers.get('etag', ''), - 'last_modified': self.headers.get('last-modified', '')} + 'last_modified': self.headers.get('last-modified', ''), + 'site_link': self.parsed_feed.get('link')} + if not self.feed.get('title'): + dico['title'] = self.parsed_feed.get('title', '') if any([dico[key] == self.feed.get(key) for key in dico]): future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico) future.add_done_callback(self.get_counter_callback()) @@ -229,7 +232,8 @@ class FeedCrawler(AbstractCrawler): logger.debug('%r %r - found %d entries %r', self.feed['id'], self.feed['title'], len(ids), ids) future = self.query_pyagg('get', 'articles/challenge', {'ids': ids}) - updater = PyAggUpdater(self.feed, entries, response.headers, self.auth) + updater = PyAggUpdater(self.feed, entries, response.headers, + parsed_response, self.auth) future.add_done_callback(updater.callback) |