aboutsummaryrefslogtreecommitdiff
path: root/pyaggr3g470r/lib/crawler.py
diff options
context:
space:
mode:
authorFrançois Schmidts <francois.schmidts@gmail.com>2015-08-04 14:45:42 +0200
committerFrançois Schmidts <francois.schmidts@gmail.com>2015-08-05 16:11:29 +0200
commite624d47ec2f3fad98f4312769408b36b6d2213d7 (patch)
tree5486b4df2fbc22eb5699ea362e3b61a18a654c8c /pyaggr3g470r/lib/crawler.py
parentmaking feeds/fetchable and articles/challenge returns 204 on empty (diff)
downloadnewspipe-e624d47ec2f3fad98f4312769408b36b6d2213d7.tar.gz
newspipe-e624d47ec2f3fad98f4312769408b36b6d2213d7.tar.bz2
newspipe-e624d47ec2f3fad98f4312769408b36b6d2213d7.zip
redoing the way the crawler wait on itself
Diffstat (limited to 'pyaggr3g470r/lib/crawler.py')
-rw-r--r--pyaggr3g470r/lib/crawler.py62
1 files changed, 15 insertions, 47 deletions
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index cca3245e..b599e2d4 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -33,10 +33,8 @@ API_ROOT = "api/v2.0/"
class AbstractCrawler:
__session__ = None
- __counter__ = 0
def __init__(self, auth):
- AbstractCrawler.__counter__ += 1
self.auth = auth
self.session = self.get_session()
self.url = conf.PLATFORM_URL
@@ -50,30 +48,6 @@ class AbstractCrawler:
cls.__session__.verify = False
return cls.__session__
- @classmethod
- def count_on_me(cls, func):
- """A basic decorator which will count +1 at the begining of a call
- and -1 at the end. It kinda allows us to wait for the __counter__ value
- to be 0, meaning nothing is done anymore."""
- @wraps(func)
- def wrapper(*args, **kwargs):
- cls.__counter__ += 1
- try:
- return func(*args, **kwargs)
- except:
- logger.exception('an error occured while %r', func)
- finally:
- cls.__counter__ -= 1
- return wrapper
-
- @classmethod
- def get_counter_callback(cls):
- cls.__counter__ += 1
-
- def debump(*args, **kwargs):
- cls.__counter__ -= 1
- return debump
-
def query_pyagg(self, method, urn, data=None):
"""A wrapper for internal call, method should be ones you can find
on requests (header, post, get, options, ...), urn the distant
@@ -89,17 +63,23 @@ class AbstractCrawler:
'User-Agent': 'pyaggr3g470r'})
@classmethod
- def wait(cls, max_wait=600):
+ def wait(cls, max_wait=300, checks=5, wait_for=2):
"See count_on_me, that method will just wait for the counter to be 0"
- time.sleep(1)
- second_waited = 1
- while cls.__counter__:
+ checked, second_waited = 0, 0
+ checked = 0
+ while True:
+ time.sleep(wait_for)
+ second_waited += wait_for
if second_waited > max_wait:
logger.warn('Exiting after %d seconds, counter at %d',
- max_wait, cls.__counter__)
+ max_wait, len(cls.__counter__))
break
- time.sleep(1)
- second_waited += 1
+ if cls.get_session().executor._work_queue.queue:
+ checked = 0
+ continue
+ checked += 1
+ if checked == checks:
+ break
class PyAggUpdater(AbstractCrawler):
@@ -109,13 +89,11 @@ class PyAggUpdater(AbstractCrawler):
self.entries = entries
self.headers = headers
self.parsed_feed = parsed_feed
- super(PyAggUpdater, self).__init__(auth)
+ super().__init__(auth)
- @AbstractCrawler.count_on_me
def callback(self, response):
"""Will process the result from the challenge, creating missing article
and updating the feed"""
- AbstractCrawler.__counter__ -= 1
article_created = False
if response.result().status_code != 204:
results = response.result().json()
@@ -162,27 +140,23 @@ class PyAggUpdater(AbstractCrawler):
future = self.query_pyagg('put',
'feed/%d' % self.feed['id'], up_feed)
- future.add_done_callback(self.get_counter_callback())
class FeedCrawler(AbstractCrawler):
def __init__(self, feed, auth):
self.feed = feed
- super(FeedCrawler, self).__init__(auth)
+ super().__init__(auth)
def clean_feed(self):
"""Will reset the errors counters on a feed that have known errors"""
if self.feed.get('error_count') or self.feed.get('last_error'):
future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
{'error_count': 0, 'last_error': ''})
- future.add_done_callback(self.get_counter_callback())
- @AbstractCrawler.count_on_me
def callback(self, response):
"""will fetch the feed and interprete results (304, etag) or will
challenge pyagg to compare gotten entries with existing ones"""
- AbstractCrawler.__counter__ -= 1
try:
response = response.result()
response.raise_for_status()
@@ -194,7 +168,6 @@ class FeedCrawler(AbstractCrawler):
future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
{'error_count': error_count,
'last_error': str(error)})
- future.add_done_callback(self.get_counter_callback())
return
if response.status_code == 304:
@@ -246,7 +219,6 @@ class CrawlerScheduler(AbstractCrawler):
def __init__(self, username, password):
self.auth = (username, password)
super(CrawlerScheduler, self).__init__(self.auth)
- AbstractCrawler.__counter__ = 0
def prepare_headers(self, feed):
"""For a known feed, will construct some header dictionnary"""
@@ -259,10 +231,8 @@ class CrawlerScheduler(AbstractCrawler):
feed['id'], feed['title'], headers)
return headers
- @AbstractCrawler.count_on_me
def callback(self, response):
"""processes feeds that need to be fetched"""
- AbstractCrawler.__counter__ -= 1
response = response.result()
response.raise_for_status()
if response.status_code == 204:
@@ -277,11 +247,9 @@ class CrawlerScheduler(AbstractCrawler):
headers=self.prepare_headers(feed))
future.add_done_callback(FeedCrawler(feed, self.auth).callback)
- @AbstractCrawler.count_on_me
def run(self, **kwargs):
"""entry point, will retreive feeds to be fetch
and launch the whole thing"""
logger.debug('retreving fetchable feed')
future = self.query_pyagg('get', 'feeds/fetchable', kwargs)
- AbstractCrawler.__counter__ += 1
future.add_done_callback(self.callback)
bgstack15