diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2016-11-08 14:39:47 +0100 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2016-11-08 14:39:47 +0100 |
commit | 2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96 (patch) | |
tree | 39895c10f68cf0b13d957073268769d04aa924a0 /src/web/lib/utils.py | |
parent | Closes section HTML tag. (diff) | |
download | newspipe-2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96.tar.gz newspipe-2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96.tar.bz2 newspipe-2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96.zip |
Various improvements to the crawler (better use of coroutines, test if an article should be updated). Tags are now retrieved for the k-means clustering (previously achieved with the content of articles).
Diffstat (limited to 'src/web/lib/utils.py')
-rw-r--r-- | src/web/lib/utils.py | 24 |
1 files changed, 20 insertions, 4 deletions
```diff
diff --git a/src/web/lib/utils.py b/src/web/lib/utils.py
index f2bed3ff..d206b769 100644
--- a/src/web/lib/utils.py
+++ b/src/web/lib/utils.py
@@ -6,6 +6,8 @@ import requests
 from hashlib import md5
 from flask import request, url_for
 
+import conf
+
 logger = logging.getLogger(__name__)
 
 
@@ -46,11 +48,17 @@ def try_get_icon_url(url, *splits):
         if split is None:
             continue
         rb_url = rebuild_url(url, split)
-        response = requests.get(rb_url, verify=False, timeout=10)
+        response = None
         # if html in content-type, we assume it's a fancy 404 page
-        content_type = response.headers.get('content-type', '')
-        if response.ok and 'html' not in content_type and response.content:
-            return response.url
+        try:
+            response = jarr_get(rb_url)
+            content_type = response.headers.get('content-type', '')
+        except Exception:
+            pass
+        else:
+            if response is not None and response.ok \
+                    and 'html' not in content_type and response.content:
+                return response.url
     return None
 
 
@@ -71,3 +79,11 @@ def clear_string(data):
 
 def redirect_url(default='home'):
     return request.args.get('next') or request.referrer or url_for(default)
+
+
+async def jarr_get(url, **kwargs):
+    request_kwargs = {'verify': False, 'allow_redirects': True,
+                      'timeout': conf.CRAWLER_TIMEOUT,
+                      'headers': {'User-Agent': conf.CRAWLER_USER_AGENT}}
+    request_kwargs.update(kwargs)
+    return requests.get(url, **request_kwargs)
```