aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author François Schmidts <francois.schmidts@gmail.com> 2015-03-06 11:07:43 +0100
committer François Schmidts <francois.schmidts@gmail.com> 2015-03-06 11:07:43 +0100
commit 822e59f043ba7b12962c5e65f59f2fd33a339f54 (patch)
tree e92c92daa8e81d7b52640d301dc615d9911ce902
parent correcting wait counter and reactivating last_retrieved (diff)
download newspipe-822e59f043ba7b12962c5e65f59f2fd33a339f54.tar.gz
newspipe-822e59f043ba7b12962c5e65f59f2fd33a339f54.tar.bz2
newspipe-822e59f043ba7b12962c5e65f59f2fd33a339f54.zip
better crawling control
-rw-r--r-- pyaggr3g470r/controllers/feed.py 15
-rw-r--r-- pyaggr3g470r/lib/crawler.py 12
2 files changed, 18 insertions, 9 deletions
diff --git a/pyaggr3g470r/controllers/feed.py b/pyaggr3g470r/controllers/feed.py
index b8e28ee6..ff496efc 100644
--- a/pyaggr3g470r/controllers/feed.py
+++ b/pyaggr3g470r/controllers/feed.py
@@ -1,10 +1,10 @@
+import logging
from datetime import datetime, timedelta
+
from .abstract import AbstractController
from pyaggr3g470r.models import Feed
-import logging
logger = logging.getLogger(__name__)
-
DEFAULT_MAX_ERROR = 3
DEFAULT_LIMIT = 5
@@ -12,14 +12,19 @@ DEFAULT_LIMIT = 5
class FeedController(AbstractController):
_db_cls = Feed
+ def list_late(self, max_last, max_error=DEFAULT_MAX_ERROR, limit=DEFAULT_LIMIT):
+ return [feed for feed in self.read(
+ error_count__lt=max_error, enabled=True,
+ last_retrieved__lt=max_last)
+ .order_by('Feed.last_retrieved')
+ .limit(limit)]
+
def list_fetchable(self, max_error=DEFAULT_MAX_ERROR, limit=DEFAULT_LIMIT):
from pyaggr3g470r.controllers import UserController
now = datetime.now()
user = UserController(self.user_id).get(id=self.user_id)
max_last = now - timedelta(minutes=user.refresh_rate or 60)
- feeds = [feed for feed in self.read(user_id=self.user_id,
- error_count__lt=max_error, enabled=True,
- last_retrieved__lt=max_last).limit(limit)]
+ feeds = self.list_late(max_last, max_error, limit)
if feeds:
self.update({'id__in': [feed.id for feed in feeds]},
{'last_retrieved': now})
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 97f14363..8e61b7cf 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -111,16 +111,16 @@ class AbstractCrawler:
"See count_on_me, that method will just wait for the counter to be 0"
time.sleep(1)
while cls.__counter__:
- print('running %d' % cls.__counter__)
time.sleep(1)
class PyAggUpdater(AbstractCrawler):
- def __init__(self, feed, entries, headers, auth):
+ def __init__(self, feed, entries, headers, parsed_feed, auth):
self.feed = feed
self.entries = entries
self.headers = headers
+ self.parsed_feed = parsed_feed.get('feed', {})
super(PyAggUpdater, self).__init__(auth)
def to_article(self, entry):
@@ -171,7 +171,10 @@ class PyAggUpdater(AbstractCrawler):
dico = {'error_count': 0, 'last_error': '',
'etag': self.headers.get('etag', ''),
- 'last_modified': self.headers.get('last-modified', '')}
+ 'last_modified': self.headers.get('last-modified', ''),
+ 'site_link': self.parsed_feed.get('link')}
+ if not self.feed.get('title'):
+ dico['title'] = self.parsed_feed.get('title', '')
if any([dico[key] == self.feed.get(key) for key in dico]):
future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico)
future.add_done_callback(self.get_counter_callback())
@@ -229,7 +232,8 @@ class FeedCrawler(AbstractCrawler):
logger.debug('%r %r - found %d entries %r',
self.feed['id'], self.feed['title'], len(ids), ids)
future = self.query_pyagg('get', 'articles/challenge', {'ids': ids})
- updater = PyAggUpdater(self.feed, entries, response.headers, self.auth)
+ updater = PyAggUpdater(self.feed, entries, response.headers,
+ parsed_response, self.auth)
future.add_done_callback(updater.callback)
bgstack15