author     François Schmidts <francois.schmidts@gmail.com>  2015-07-03 11:49:28 +0200
committer  François Schmidts <francois.schmidts@gmail.com>  2015-07-03 15:01:23 +0200
commit     1513cd97911fdf4500ae17f7e8ee6d90ac4bac84 (patch)
tree       b66771229ecbffdf3e784859b33dcf65aaf0d603 /pyaggr3g470r/lib/utils.py
parent     Minor improvements to the edit feed forms. (diff)
the icon feature
* the icon of a feed is now a URL retrieved from the feed itself or from the site link
* the icon is displayed on the home page, making it visually easier to read
* the HTTP crawler is in charge of keeping it up to date
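
As a sketch of the intended use, assuming the module import path matches the file touched by this commit (the feed URL below is a placeholder):

    from pyaggr3g470r.lib.utils import construct_feed_from

    # Fetch and parse the feed, then complete any missing metadata
    # (title, icon, feed link) from the site's HTML <head>.
    feed = construct_feed_from(url='https://example.com/rss.xml')
    print(feed.get('icon'))       # absolute URL of the feed's icon, if found
    print(feed.get('site_link'))  # link of the site the feed belongs to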
Diffstat (limited to 'pyaggr3g470r/lib/utils.py')
-rw-r--r--  pyaggr3g470r/lib/utils.py  95
1 file changed, 95 insertions, 0 deletions
diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index a4f4b3ec..a0154b7f 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -1,4 +1,9 @@
import types
+import urllib.parse
+import requests
+import feedparser
+from bs4 import BeautifulSoup, SoupStrainer
+
def default_handler(obj):
"""JSON handler for default query formatting"""
@@ -12,3 +17,93 @@ def default_handler(obj):
return str(obj)
raise TypeError("Object of type %s with value of %r "
"is not JSON serializable" % (type(obj), obj))
+
+
+def try_keys(dico, *keys):
+    """Return the value of the first of *keys* present in *dico*, else None."""
+    for key in keys:
+        if key in dico:
+            return dico[key]
+    return None
+
+
+def rebuild_url(url, base_split):
+    """Make *url* absolute, borrowing scheme and netloc from *base_split*."""
+    split = urllib.parse.urlsplit(url)
+    if split.scheme and split.netloc:
+        return url  # url is already absolute
+    new_split = urllib.parse.SplitResult(
+        scheme=split.scheme or base_split.scheme,
+        netloc=split.netloc or base_split.netloc,
+        path=split.path, query='', fragment='')
+    return urllib.parse.urlunsplit(new_split)
+
+
+def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
+    """Gather feed metadata (link, title, description, icon) from the feed
+    itself, then from the HTML head of its site for anything missing."""
+    if url is None and fp_parsed is not None:
+        url = fp_parsed.get('url')
+    if url is not None and fp_parsed is None:
+        response = requests.get(url, verify=False)
+        fp_parsed = feedparser.parse(response.content)
+    assert url is not None and fp_parsed is not None
+ feed = feed or {}
+ split = urllib.parse.urlsplit(url)
+    if not fp_parsed['bozo']:  # feedparser parsed the feed without error
+ feed['link'] = url
+ feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
+ feed['title'] = fp_parsed['feed'].get('title')
+ feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title')
+ feed['icon'] = try_keys(fp_parsed['feed'], 'icon')
+ else:
+ feed['site_link'] = url
+
+ if feed.get('site_link'):
+ feed['site_link'] = rebuild_url(feed['site_link'], split)
+ split = urllib.parse.urlsplit(feed['site_link'])
+
+ if feed.get('icon'):
+ feed['icon'] = rebuild_url(feed['icon'], split)
+
+    # stop here if there is no site to query, querying is disabled, or
+    # everything the site could provide is already filled in
+    if not feed.get('site_link') or not query_site \
+            or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
+        return feed
+
+ response = requests.get(feed['site_link'], verify=False)
+ bs_parsed = BeautifulSoup(response.content, 'html.parser',
+ parse_only=SoupStrainer('head'))
+
+    if not feed.get('title'):
+        try:
+            feed['title'] = bs_parsed.find_all('title')[0].text
+        except IndexError:  # no <title> tag in the page head
+            pass
+
+    def check_keys(**kwargs):
+        """Return a filter for find_all() keeping elements whose attributes
+        contain every requested value (e.g. rel=['icon'])."""
+        def wrapper(elem):
+            for key, vals in kwargs.items():
+                if not elem.has_attr(key):
+                    return False
+                if not all(val in elem.attrs[key] for val in vals):
+                    return False
+            return True
+        return wrapper
+
+    if not feed.get('icon'):
+        icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut']))
+        if not icons:
+            icons = bs_parsed.find_all(check_keys(rel=['icon']))
+        if icons:
+            feed['icon'] = rebuild_url(icons[0].attrs['href'], split)
+        else:  # fall back to the conventional favicon location
+            icon = rebuild_url('/favicon.ico', split)
+            if requests.get(icon, verify=False).ok:
+                feed['icon'] = icon
+
+    if not feed.get('link'):
+        alternates = bs_parsed.find_all(check_keys(rel=['alternate'],
+                                        type=['application/rss+xml']))
+        if alternates:
+            feed['link'] = rebuild_url(alternates[0].attrs['href'], split)
+            if len(alternates) > 1:  # keep the remaining feeds as candidates
+                feed['other_link'] = [rebuild_url(al.attrs['href'], split)
+                                      for al in alternates[1:]]
+ return feed
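
For illustration, a doctest-style sketch of the two small helpers added above; the example.com URLs are placeholders:

    >>> import urllib.parse
    >>> base = urllib.parse.urlsplit('https://example.com/blog/feed')
    >>> rebuild_url('/favicon.ico', base)  # relative path made absolute
    'https://example.com/favicon.ico'
    >>> rebuild_url('https://cdn.example.com/icon.png', base)  # already absolute
    'https://cdn.example.com/icon.png'
    >>> try_keys({'subtitle': 'a blog'}, 'icon', 'subtitle')  # first present key wins
    'a blog'
    >>> try_keys({}, 'icon') is None
    True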