From b0e987fbafaa28226c54157fb11993079c5341e2 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <cedric@cedricbonhomme.org>
Date: Thu, 17 Nov 2016 08:30:06 +0100
Subject: cleaning the mess in the libs directories

---
 src/lib/utils.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 src/lib/utils.py

(limited to 'src/lib/utils.py')

diff --git a/src/lib/utils.py b/src/lib/utils.py
new file mode 100644
index 00000000..d206b769
--- /dev/null
+++ b/src/lib/utils.py
@@ -0,0 +1,89 @@
+import re
+import types
+import urllib
+import logging
+import requests
+from hashlib import md5
+from flask import request, url_for
+
+import conf
+
+logger = logging.getLogger(__name__)
+
+
+def default_handler(obj, role='admin'):
+    """JSON handler for default query formatting"""
+    if hasattr(obj, 'isoformat'):
+        return obj.isoformat()
+    if hasattr(obj, 'dump'):
+        return obj.dump(role=role)
+    if isinstance(obj, (set, frozenset, types.GeneratorType)):
+        return list(obj)
+    if isinstance(obj, BaseException):
+        return str(obj)
+    raise TypeError("Object of type %s with value of %r "
+                    "is not JSON serializable" % (type(obj), obj))
+
+
+def try_keys(dico, *keys):
+    for key in keys:
+        if key in dico:
+            return dico[key]
+    return
+
+
+def rebuild_url(url, base_split):
+    split = urllib.parse.urlsplit(url)
+    if split.scheme and split.netloc:
+        return url  # url is fine
+    new_split = urllib.parse.SplitResult(
+            scheme=split.scheme or base_split.scheme,
+            netloc=split.netloc or base_split.netloc,
+            path=split.path, query='', fragment='')
+    return urllib.parse.urlunsplit(new_split)
+
+
+def try_get_icon_url(url, *splits):
+    for split in splits:
+        if split is None:
+            continue
+        rb_url = rebuild_url(url, split)
+        response = None
+        # if html in content-type, we assume it's a fancy 404 page
+        try:
+            response = jarr_get(rb_url)
+            content_type = response.headers.get('content-type', '')
+        except Exception:
+            pass
+        else:
+            if response is not None and response.ok \
+                    and 'html' not in content_type and response.content:
+                return response.url
+    return None
+
+
+def to_hash(text):
+    return md5(text.encode('utf8') if hasattr(text, 'encode') else text)\
+            .hexdigest()
+
+
+def clear_string(data):
+    """
+    Clear a string by removing HTML tags, HTML special caracters
+    and consecutive white spaces (more that one).
+    """
+    p = re.compile('<[^>]+>')  # HTML tags
+    q = re.compile('\s')  # consecutive white spaces
+    return p.sub('', q.sub(' ', data))
+
+
+def redirect_url(default='home'):
+    return request.args.get('next') or request.referrer or url_for(default)
+
+
+async def jarr_get(url, **kwargs):
+    request_kwargs = {'verify': False, 'allow_redirects': True,
+                      'timeout': conf.CRAWLER_TIMEOUT,
+                      'headers': {'User-Agent': conf.CRAWLER_USER_AGENT}}
+    request_kwargs.update(kwargs)
+    return requests.get(url, **request_kwargs)
-- 
cgit