diff options
author | Cédric Bonhomme <kimble.mandel+bitbucket@gmail.com> | 2015-04-22 11:06:27 +0200 |
---|---|---|
committer | Cédric Bonhomme <kimble.mandel+bitbucket@gmail.com> | 2015-04-22 11:06:27 +0200 |
commit | 4fa09afdb7465db6730cb69a9f99279afdb0cf87 (patch) | |
tree | ec221bc4fc68389fb58672cd01b34bf1740c43b0 /pyaggr3g470r/lib | |
parent | Updated NEWS.rst (diff) | |
parent | impacting wosh suppression to controller (diff) | |
download | newspipe-4fa09afdb7465db6730cb69a9f99279afdb0cf87.tar.gz newspipe-4fa09afdb7465db6730cb69a9f99279afdb0cf87.tar.bz2 newspipe-4fa09afdb7465db6730cb69a9f99279afdb0cf87.zip |
Merged in jaesivsm/pyaggr3g470r (pull request #11)
misc improvement in http crawler
Diffstat (limited to 'pyaggr3g470r/lib')
-rw-r--r-- | pyaggr3g470r/lib/crawler.py | 15 |
1 files changed, 9 insertions, 6 deletions
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 1cb61973..339c4b12 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -16,7 +16,6 @@ import time import conf import json import logging -import requests import feedparser import dateutil.parser from hashlib import md5 @@ -97,6 +96,7 @@ class AbstractCrawler: @classmethod def get_counter_callback(cls): cls.__counter__ += 1 + def debump(*args, **kwargs): cls.__counter__ -= 1 return debump @@ -157,6 +157,7 @@ class PyAggUpdater(AbstractCrawler): content = entry['summary'] return {'feed_id': self.feed['id'], + 'user_id': self.feed['user_id'], 'entry_id': extract_id(entry).get('entry_id', None), 'link': entry.get('link', self.feed['site_link']), 'title': entry.get('title', 'No title'), @@ -176,11 +177,11 @@ class PyAggUpdater(AbstractCrawler): for id_to_create in results: entry = self.to_article( self.entries[tuple(sorted(id_to_create.items()))]) - logger.warn('%r %r - creating %r - %r', self.feed['id'], - self.feed['title'], entry['title'], id_to_create) + logger.warn('%r %r - creating %r for %r - %r', self.feed['id'], + self.feed['title'], entry['title'], entry['user_id'], + id_to_create) self.query_pyagg('post', 'article', entry) - now = datetime.now() logger.debug('%r %r - updating feed etag %r last_mod %r', self.feed['id'], self.feed['title'], self.headers.get('etag', ''), @@ -264,8 +265,10 @@ class FeedCrawler(AbstractCrawler): ids, entries = [], {} parsed_response = feedparser.parse(response.text) for entry in parsed_response['entries']: - entries[tuple(sorted(extract_id(entry).items()))] = entry - ids.append(extract_id(entry)) + entry_ids = extract_id(entry) + entry_ids['feed_id'] = self.feed['id'] + entries[tuple(sorted(entry_ids.items()))] = entry + ids.append(entry_ids) logger.debug('%r %r - found %d entries %r', self.feed['id'], self.feed['title'], len(ids), ids) future = self.query_pyagg('get', 'articles/challenge', {'ids': ids}) |