Various improvements to the crawler (better use of coroutines, test if an article should be updated). Tags are now retrieved for the k-means clustering (previously achieved with the content of articles).

author: Cédric Bonhomme <cedric@cedricbonhomme.org> 2016-11-08 14:39:47 +0100
committer: Cédric Bonhomme <cedric@cedricbonhomme.org> 2016-11-08 14:39:47 +0100
commit: 2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96 (patch)
tree: 39895c10f68cf0b13d957073268769d04aa924a0 /src/web/lib/utils.py
parent: Closes section HTML tag. (diff)
download: newspipe-2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96.tar.gz
newspipe-2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96.tar.bz2
newspipe-2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96.zip
1 files changed, 20 insertions, 4 deletions
diff --git a/src/web/lib/utils.py b/src/web/lib/utils.py
index f2bed3ff..d206b769 100644
--- a/src/web/lib/utils.py
+++ b/src/web/lib/utils.py
@@ -6,6 +6,8 @@ import requests
 from hashlib import md5
 from flask import request, url_for
 
+import conf
+
 logger = logging.getLogger(__name__)
 
 
@@ -46,11 +48,17 @@ def try_get_icon_url(url, *splits):
         if split is None:
             continue
         rb_url = rebuild_url(url, split)
-        response = requests.get(rb_url, verify=False, timeout=10)
+        response = None
         # if html in content-type, we assume it's a fancy 404 page
-        content_type = response.headers.get('content-type', '')
-        if response.ok and 'html' not in content_type and response.content:
-            return response.url
+        try:
+            response = jarr_get(rb_url)
+            content_type = response.headers.get('content-type', '')
+        except Exception:
+            pass
+        else:
+            if response is not None and response.ok \
+                    and 'html' not in content_type and response.content:
+                return response.url
     return None
 
 
@@ -71,3 +79,11 @@ def clear_string(data):
 
 def redirect_url(default='home'):
     return request.args.get('next') or request.referrer or url_for(default)
+
+
+async def jarr_get(url, **kwargs):
+    request_kwargs = {'verify': False, 'allow_redirects': True,
+                      'timeout': conf.CRAWLER_TIMEOUT,
+                      'headers': {'User-Agent': conf.CRAWLER_USER_AGENT}}
+    request_kwargs.update(kwargs)
+    return requests.get(url, **request_kwargs)
author	Cédric Bonhomme <cedric@cedricbonhomme.org>	2016-11-08 14:39:47 +0100
committer	Cédric Bonhomme <cedric@cedricbonhomme.org>	2016-11-08 14:39:47 +0100
commit	2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96 (patch)
tree	39895c10f68cf0b13d957073268769d04aa924a0 /src/web/lib/utils.py
parent	Closes section HTML tag. (diff)
download	newspipe-2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96.tar.gz newspipe-2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96.tar.bz2 newspipe-2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96.zip