aboutsummaryrefslogtreecommitdiff
path: root/src/lib/utils.py
diff options
context:
space:
mode:
authorCédric Bonhomme <cedric@cedricbonhomme.org>2016-11-17 08:30:06 +0100
committerCédric Bonhomme <cedric@cedricbonhomme.org>2016-11-17 08:30:06 +0100
commitb0e987fbafaa28226c54157fb11993079c5341e2 (patch)
tree1f0cd04a505dce4680155f8bb4c7bb757984c030 /src/lib/utils.py
parentBugfix: should import Article in order to resolve the 'date' column for the o... (diff)
downloadnewspipe-b0e987fbafaa28226c54157fb11993079c5341e2.tar.gz
newspipe-b0e987fbafaa28226c54157fb11993079c5341e2.tar.bz2
newspipe-b0e987fbafaa28226c54157fb11993079c5341e2.zip
cleaning the mess in the libs directories
Diffstat (limited to 'src/lib/utils.py')
-rw-r--r--src/lib/utils.py89
1 files changed, 89 insertions, 0 deletions
diff --git a/src/lib/utils.py b/src/lib/utils.py
new file mode 100644
index 00000000..d206b769
--- /dev/null
+++ b/src/lib/utils.py
@@ -0,0 +1,89 @@
+import re
+import types
+import urllib
+import logging
+import requests
+from hashlib import md5
+from flask import request, url_for
+
+import conf
+
+logger = logging.getLogger(__name__)
+
+
+def default_handler(obj, role='admin'):
+ """JSON handler for default query formatting"""
+ if hasattr(obj, 'isoformat'):
+ return obj.isoformat()
+ if hasattr(obj, 'dump'):
+ return obj.dump(role=role)
+ if isinstance(obj, (set, frozenset, types.GeneratorType)):
+ return list(obj)
+ if isinstance(obj, BaseException):
+ return str(obj)
+ raise TypeError("Object of type %s with value of %r "
+ "is not JSON serializable" % (type(obj), obj))
+
+
+def try_keys(dico, *keys):
+ for key in keys:
+ if key in dico:
+ return dico[key]
+ return
+
+
+def rebuild_url(url, base_split):
+ split = urllib.parse.urlsplit(url)
+ if split.scheme and split.netloc:
+ return url # url is fine
+ new_split = urllib.parse.SplitResult(
+ scheme=split.scheme or base_split.scheme,
+ netloc=split.netloc or base_split.netloc,
+ path=split.path, query='', fragment='')
+ return urllib.parse.urlunsplit(new_split)
+
+
+def try_get_icon_url(url, *splits):
+ for split in splits:
+ if split is None:
+ continue
+ rb_url = rebuild_url(url, split)
+ response = None
+ # if html in content-type, we assume it's a fancy 404 page
+ try:
+ response = jarr_get(rb_url)
+ content_type = response.headers.get('content-type', '')
+ except Exception:
+ pass
+ else:
+ if response is not None and response.ok \
+ and 'html' not in content_type and response.content:
+ return response.url
+ return None
+
+
+def to_hash(text):
+ return md5(text.encode('utf8') if hasattr(text, 'encode') else text)\
+ .hexdigest()
+
+
+def clear_string(data):
+ """
+ Clear a string by removing HTML tags, HTML special caracters
+ and consecutive white spaces (more that one).
+ """
+ p = re.compile('<[^>]+>') # HTML tags
+ q = re.compile('\s') # consecutive white spaces
+ return p.sub('', q.sub(' ', data))
+
+
+def redirect_url(default='home'):
+ return request.args.get('next') or request.referrer or url_for(default)
+
+
+async def jarr_get(url, **kwargs):
+ request_kwargs = {'verify': False, 'allow_redirects': True,
+ 'timeout': conf.CRAWLER_TIMEOUT,
+ 'headers': {'User-Agent': conf.CRAWLER_USER_AGENT}}
+ request_kwargs.update(kwargs)
+ return requests.get(url, **request_kwargs)
bgstack15