|           |                                                     |                           |
|-----------|-----------------------------------------------------|---------------------------|
| author    | Cédric Bonhomme <kimble.mandel+bitbucket@gmail.com> | 2015-07-06 15:21:16 +0200 |
| committer | Cédric Bonhomme <kimble.mandel+bitbucket@gmail.com> | 2015-07-06 15:21:16 +0200 |
| commit    | 921a8f71655e87a84f74ebe869671a3ae59b2a45 (patch)    |                           |
| tree      | 2e1054449ec4707c3d6fec80f19be739b59b8daf /pyaggr3g470r/lib/utils.py | |
| parent    | Improved the way we get the URL of the feed's icon. (diff) |                    |
| parent    | adding cache control on icons (diff)                |                           |
| download  | newspipe-921a8f71655e87a84f74ebe869671a3ae59b2a45.tar.gz newspipe-921a8f71655e87a84f74ebe869671a3ae59b2a45.tar.bz2 newspipe-921a8f71655e87a84f74ebe869671a3ae59b2a45.zip | |
Merged in jaesivsm/pyaggr3g470r (pull request #16)
Master
Diffstat (limited to 'pyaggr3g470r/lib/utils.py')
-rw-r--r-- | pyaggr3g470r/lib/utils.py | 84

1 file changed, 14 insertions(+), 70 deletions(-)
```diff
diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index a0154b7f..280256f6 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -1,8 +1,11 @@
 import types
 import urllib
+import base64
+import logging
 
 import requests
-import feedparser
-from bs4 import BeautifulSoup, SoupStrainer
+from hashlib import md5
+
+logger = logging.getLogger(__name__)
 
 def default_handler(obj):
@@ -37,73 +40,14 @@ def rebuild_url(url, base_split):
     return urllib.parse.urlunsplit(new_split)
 
 
-def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
-    if url is None and fp_parsed is not None:
-        url = fp_parsed.get('url')
-    if url is not None and fp_parsed is None:
-        response = requests.get(url, verify=False)
-        fp_parsed = feedparser.parse(response.content)
-    assert url is not None and fp_parsed is not None
-    feed = feed or {}
-    split = urllib.parse.urlsplit(url)
-    if not fp_parsed['bozo']:
-        feed['link'] = url
-        feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
-        feed['title'] = fp_parsed['feed'].get('title')
-        feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title')
-        feed['icon'] = try_keys(fp_parsed['feed'], 'icon')
-    else:
-        feed['site_link'] = url
-
-    if feed.get('site_link'):
-        feed['site_link'] = rebuild_url(feed['site_link'], split)
-        split = urllib.parse.urlsplit(feed['site_link'])
-
-    if feed.get('icon'):
-        feed['icon'] = rebuild_url(feed['icon'], split)
-
-    if not feed.get('site_link') or not query_site \
-            or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
-        return feed
-
-    response = requests.get(feed['site_link'], verify=False)
-    bs_parsed = BeautifulSoup(response.content, 'html.parser',
-                              parse_only=SoupStrainer('head'))
-
-    if not feed.get('title'):
-        try:
-            feed['title'] = bs_parsed.find_all('title')[0].text
-        except Exception:
-            pass
-
-    def check_keys(**kwargs):
-        def wrapper(elem):
-            for key, vals in kwargs.items():
-                if not elem.has_attr(key):
-                    return False
-                if not all(val in elem.attrs[key] for val in vals):
-                    return False
-            return True
-        return wrapper
+def try_splits(url, *splits):
+    for split in splits:
+        rb_url = rebuild_url(url, split)
+        response = requests.get(rb_url, verify=False, timeout=10)
+        if response.ok and 'html' not in response.headers['content-type']:
+            return base64.b64encode(response.content).decode('utf8')
+    return None
 
-    if not feed.get('icon'):
-        icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut']))
-        if not len(icons):
-            icons = bs_parsed.find_all(check_keys(rel=['icon']))
-        if len(icons) >= 1:
-            feed['icon'] = rebuild_url(icons[0].attrs['href'], split)
-        else:  # trying the default one
-            icon = rebuild_url('/favicon.ico', split)
-            if requests.get(icon, verify=False).ok:
-                feed['icon'] = icon
 
-    if not feed.get('link'):
-        alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
-                                                  type=['application/rss+xml']))
-        if len(alternate) == 1:
-            feed['link'] = rebuild_url(alternate[0].attrs['href'], split)
-        elif len(alternate) > 1:
-            feed['link'] = rebuild_url(alternate[0].attrs['href'], split)
-            feed['other_link'] = [rebuild_url(al.attrs['href'], split)
-                                  for al in alternate[1:]]
-    return feed
+def to_hash(text):
+    return md5(text.encode('utf8')).hexdigest()
```
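For readers skimming the diff, here is a minimal usage sketch of the two helpers this merge introduces. Only `try_splits()` and `to_hash()` come from this commit; the calling code, example URLs, and variable names below are hypothetical.

```python
# Hypothetical caller for the new helpers -- a sketch, not code from this
# commit. Assumes the package layout at this revision (pyaggr3g470r.lib.utils).
import urllib.parse

from pyaggr3g470r.lib.utils import to_hash, try_splits

feed_link = 'https://example.org/blog/feed.atom'  # assumed feed URL
site_link = 'https://example.org/'                # assumed site URL

# try_splits() resolves a (possibly relative) icon URL against each candidate
# base in turn and returns the first non-HTML body base64-encoded, or None
# when every candidate fails or only HTML comes back.
icon = try_splits('/favicon.ico',
                  urllib.parse.urlsplit(site_link),
                  urllib.parse.urlsplit(feed_link))

if icon is not None:
    # to_hash() gives a stable md5 hex digest, usable as a lookup key for
    # the cached icon.
    print(to_hash(feed_link), len(icon))
```

Returning the icon body base64-encoded lets it be stored alongside the feed and served with HTTP cache headers, which is what the merged "adding cache control on icons" branch is about; the `timeout=10` bounds each outbound request, though `verify=False` still skips TLS certificate checks.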