diff options
author | François Schmidts <francois.schmidts@gmail.com> | 2015-03-06 11:07:43 +0100 |
---|---|---|
committer | François Schmidts <francois.schmidts@gmail.com> | 2015-03-06 11:07:43 +0100 |
commit | 822e59f043ba7b12962c5e65f59f2fd33a339f54 (patch) | |
tree | e92c92daa8e81d7b52640d301dc615d9911ce902 /pyaggr3g470r/lib | |
parent | correcting wait counter and reactivating last_retrieved (diff) | |
download | newspipe-822e59f043ba7b12962c5e65f59f2fd33a339f54.tar.gz newspipe-822e59f043ba7b12962c5e65f59f2fd33a339f54.tar.bz2 newspipe-822e59f043ba7b12962c5e65f59f2fd33a339f54.zip |
better crawling control
Diffstat (limited to 'pyaggr3g470r/lib')
-rw-r--r-- | pyaggr3g470r/lib/crawler.py | 12 |
1 files changed, 8 insertions, 4 deletions
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 97f14363..8e61b7cf 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -111,16 +111,16 @@ class AbstractCrawler: "See count_on_me, that method will just wait for the counter to be 0" time.sleep(1) while cls.__counter__: - print('running %d' % cls.__counter__) time.sleep(1) class PyAggUpdater(AbstractCrawler): - def __init__(self, feed, entries, headers, auth): + def __init__(self, feed, entries, headers, parsed_feed, auth): self.feed = feed self.entries = entries self.headers = headers + self.parsed_feed = parsed_feed.get('feed', {}) super(PyAggUpdater, self).__init__(auth) def to_article(self, entry): @@ -171,7 +171,10 @@ class PyAggUpdater(AbstractCrawler): dico = {'error_count': 0, 'last_error': '', 'etag': self.headers.get('etag', ''), - 'last_modified': self.headers.get('last-modified', '')} + 'last_modified': self.headers.get('last-modified', ''), + 'site_link': self.parsed_feed.get('link')} + if not self.feed.get('title'): + dico['title'] = self.parsed_feed.get('title', '') if any([dico[key] == self.feed.get(key) for key in dico]): future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico) future.add_done_callback(self.get_counter_callback()) @@ -229,7 +232,8 @@ class FeedCrawler(AbstractCrawler): logger.debug('%r %r - found %d entries %r', self.feed['id'], self.feed['title'], len(ids), ids) future = self.query_pyagg('get', 'articles/challenge', {'ids': ids}) - updater = PyAggUpdater(self.feed, entries, response.headers, self.auth) + updater = PyAggUpdater(self.feed, entries, response.headers, + parsed_response, self.auth) future.add_done_callback(updater.callback) |