From 334737969cad57a916163b974cd6c1a412c70dd1 Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Fri, 3 Jul 2015 15:00:33 +0200
Subject: fixing bug on reset link for feeds and hiding feed title for eXtraSmall device

---
 pyaggr3g470r/lib/utils.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

(limited to 'pyaggr3g470r/lib/utils.py')

diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index a0154b7f..041a2d29 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -37,6 +37,13 @@ def rebuild_url(url, base_split):
     return urllib.parse.urlunsplit(new_split)
 
 
+def try_splits(url, *splits):
+    for split in splits:
+        if requests.get(rebuild_url(url, split), verify=False).ok:
+            return rebuild_url(url, split)
+    return None
+
+
 def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
     if url is None and fp_parsed is not None:
         url = fp_parsed.get('url')
@@ -45,7 +52,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
         fp_parsed = feedparser.parse(response.content)
     assert url is not None and fp_parsed is not None
     feed = feed or {}
-    split = urllib.parse.urlsplit(url)
+    feed_split = urllib.parse.urlsplit(url)
     if not fp_parsed['bozo']:
         feed['link'] = url
         feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
@@ -56,11 +63,13 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
         feed['site_link'] = url
 
     if feed.get('site_link'):
-        feed['site_link'] = rebuild_url(feed['site_link'], split)
-        split = urllib.parse.urlsplit(feed['site_link'])
+        feed['site_link'] = rebuild_url(feed['site_link'], feed_split)
+        site_split = urllib.parse.urlsplit(feed['site_link'])
 
     if feed.get('icon'):
-        feed['icon'] = rebuild_url(feed['icon'], split)
+        feed['icon'] = try_splits(feed['icon'], site_split, feed_split)
+        if feed['icon'] is None:
+            del feed['icon']
 
     if not feed.get('site_link') or not query_site \
             or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
@@ -91,11 +100,16 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
         if not len(icons):
             icons = bs_parsed.find_all(check_keys(rel=['icon']))
         if len(icons) >= 1:
-            feed['icon'] = rebuild_url(icons[0].attrs['href'], split)
-        else:  # trying the default one
-            icon = rebuild_url('/favicon.ico', split)
-            if requests.get(icon, verify=False).ok:
-                feed['icon'] = icon
+            for icon in icons:
+                feed['icon'] = try_splits(icon.attrs['href'],
+                                          site_split, feed_split)
+                if feed['icon'] is not None:
+                    break
+
+        if feed['icon'] is None:
+            feed['icon'] = try_splits('/favicon.ico', site_split, feed_split)
+        if feed['icon'] is None:
+            del feed['icon']
 
     if not feed.get('link'):
         alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
-- cgit
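The try_splits() helper introduced above probes one relative URL against several base URLs and keeps the first rebuild that actually answers with a 2xx status. Below is a minimal, self-contained sketch of that probing idea; it uses the standard urllib.parse.urljoin in place of the module's rebuild_url, and the function name and URLs are illustrative, not taken from the repository.

    import urllib.parse

    import requests

    def first_reachable(path, *bases):
        # Try `path` against every base URL and return the first absolute
        # URL that answers with a 2xx status, or None if none does.
        for base in bases:
            candidate = urllib.parse.urljoin(base, path)
            try:
                if requests.get(candidate, timeout=5).ok:
                    return candidate
            except requests.RequestException:
                continue
        return None

    # e.g. resolve a favicon against the site link first, then the feed link
    print(first_reachable('/favicon.ico',
                          'http://blog.example.com/',
                          'http://example.com/rss/feed.xml'))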
From d08cc46087d3349aff7b06908c70d97fecbdec8f Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Sun, 5 Jul 2015 16:38:26 +0200
Subject: constructing feed from normal url also

---
 pyaggr3g470r/lib/utils.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'pyaggr3g470r/lib/utils.py')

diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index 041a2d29..6d6725c8 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -77,7 +77,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
 
     response = requests.get(feed['site_link'], verify=False)
     bs_parsed = BeautifulSoup(response.content, 'html.parser',
-                             parse_only=SoupStrainer('head'))
+                              parse_only=SoupStrainer('head'))
 
     if not feed.get('title'):
         try:
@@ -115,9 +115,8 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
         alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
                                                   type=['application/rss+xml']))
         if len(alternate) == 1:
-            feed['link'] = rebuild_url(alternate[0].attrs['href'], split)
+            feed['link'] = alternate[0].attrs['href']
         elif len(alternate) > 1:
-            feed['link'] = rebuild_url(alternate[0].attrs['href'], split)
-            feed['other_link'] = [rebuild_url(al.attrs['href'], split)
-                                  for al in alternate[1:]]
+            feed['link'] = alternate[0].attrs['href']
+            feed['other_link'] = [al.attrs['href'] for al in alternate[1:]]
     return feed
-- cgit

From c1551acb30513f96d0053b96e240da7ab68833d2 Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Sun, 5 Jul 2015 17:41:06 +0200
Subject: making bookmarklet work for any url

---
 pyaggr3g470r/lib/utils.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'pyaggr3g470r/lib/utils.py')

diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index 6d6725c8..a4f2e043 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -1,9 +1,12 @@
 import types
 import urllib
+import logging
 import requests
 import feedparser
 from bs4 import BeautifulSoup, SoupStrainer
 
+logger = logging.getLogger(__name__)
+
 
 def default_handler(obj):
     """JSON handler for default query formatting"""
@@ -48,8 +51,13 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
     if url is None and fp_parsed is not None:
         url = fp_parsed.get('url')
     if url is not None and fp_parsed is None:
-        response = requests.get(url, verify=False)
-        fp_parsed = feedparser.parse(response.content)
+        try:
+            response = requests.get(url, verify=False)
+            fp_parsed = feedparser.parse(response.content,
+                                         request_headers=response.headers)
+        except Exception:
+            logger.exception('failed to retreive that url')
+            fp_parsed = {'bozo': True}
     assert url is not None and fp_parsed is not None
     feed = feed or {}
     feed_split = urllib.parse.urlsplit(url)
@@ -106,17 +114,14 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
                 if feed['icon'] is not None:
                     break
 
-        if feed['icon'] is None:
+        if feed.get('icon') is None:
             feed['icon'] = try_splits('/favicon.ico', site_split, feed_split)
-        if feed['icon'] is None:
+        if 'icon' in feed and feed['icon'] is None:
             del feed['icon']
 
     if not feed.get('link'):
         alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
                                                   type=['application/rss+xml']))
-        if len(alternate) == 1:
-            feed['link'] = alternate[0].attrs['href']
-        elif len(alternate) > 1:
+        if len(alternate) >= 1:
             feed['link'] = alternate[0].attrs['href']
-            feed['other_link'] = [al.attrs['href'] for al in alternate[1:]]
     return feed
-- cgit
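The patch above wraps the feed download in a try/except so that a network failure or an unparsable URL degrades to a 'bozo' result instead of raising, which is what lets construct_feed_from fall back to scraping the page itself. A rough, self-contained sketch of that fallback pattern follows; the URL and the parse_or_bozo name are illustrative, and forwarding the HTTP headers to feedparser mirrors what the patch does.

    import logging

    import feedparser
    import requests

    logger = logging.getLogger(__name__)

    def parse_or_bozo(url):
        try:
            response = requests.get(url, verify=False, timeout=10)
            # hand the body and its HTTP headers to feedparser, as the patch does
            return feedparser.parse(response.content,
                                    request_headers=response.headers)
        except Exception:
            logger.exception('failed to retrieve %r', url)
            # a bozo result tells the caller to fall back to HTML scraping
            return {'bozo': True}

    fp_parsed = parse_or_bozo('http://example.com/maybe-a-feed')
    print(fp_parsed['bozo'])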
From 75df52051b167425adcfb68797f77fcbcad33c4e Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Mon, 6 Jul 2015 10:28:58 +0200
Subject: light refactor

* both crawlers now use the same utils methods
* the original crawler now uses more of the controllers, enabling the filters feature

---
 pyaggr3g470r/lib/utils.py | 83 ++---------------------------------------------
 1 file changed, 3 insertions(+), 80 deletions(-)

(limited to 'pyaggr3g470r/lib/utils.py')

diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index a4f2e043..b7e5cafc 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -2,8 +2,7 @@ import types
 import urllib
 import logging
 import requests
-import feedparser
-from bs4 import BeautifulSoup, SoupStrainer
+from hashlib import md5
 
 logger = logging.getLogger(__name__)
 
@@ -47,81 +46,5 @@ def try_splits(url, *splits):
     return None
 
 
-def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
-    if url is None and fp_parsed is not None:
-        url = fp_parsed.get('url')
-    if url is not None and fp_parsed is None:
-        try:
-            response = requests.get(url, verify=False)
-            fp_parsed = feedparser.parse(response.content,
-                                         request_headers=response.headers)
-        except Exception:
-            logger.exception('failed to retreive that url')
-            fp_parsed = {'bozo': True}
-    assert url is not None and fp_parsed is not None
-    feed = feed or {}
-    feed_split = urllib.parse.urlsplit(url)
-    if not fp_parsed['bozo']:
-        feed['link'] = url
-        feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
-        feed['title'] = fp_parsed['feed'].get('title')
-        feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title')
-        feed['icon'] = try_keys(fp_parsed['feed'], 'icon')
-    else:
-        feed['site_link'] = url
-
-    if feed.get('site_link'):
-        feed['site_link'] = rebuild_url(feed['site_link'], feed_split)
-        site_split = urllib.parse.urlsplit(feed['site_link'])
-
-    if feed.get('icon'):
-        feed['icon'] = try_splits(feed['icon'], site_split, feed_split)
-        if feed['icon'] is None:
-            del feed['icon']
-
-    if not feed.get('site_link') or not query_site \
-            or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
-        return feed
-
-    response = requests.get(feed['site_link'], verify=False)
-    bs_parsed = BeautifulSoup(response.content, 'html.parser',
-                              parse_only=SoupStrainer('head'))
-
-    if not feed.get('title'):
-        try:
-            feed['title'] = bs_parsed.find_all('title')[0].text
-        except Exception:
-            pass
-
-    def check_keys(**kwargs):
-        def wrapper(elem):
-            for key, vals in kwargs.items():
-                if not elem.has_attr(key):
-                    return False
-                if not all(val in elem.attrs[key] for val in vals):
-                    return False
-            return True
-        return wrapper
-
-    if not feed.get('icon'):
-        icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut']))
-        if not len(icons):
-            icons = bs_parsed.find_all(check_keys(rel=['icon']))
-        if len(icons) >= 1:
-            for icon in icons:
-                feed['icon'] = try_splits(icon.attrs['href'],
-                                          site_split, feed_split)
-                if feed['icon'] is not None:
-                    break
-
-        if feed.get('icon') is None:
-            feed['icon'] = try_splits('/favicon.ico', site_split, feed_split)
-        if 'icon' in feed and feed['icon'] is None:
-            del feed['icon']
-
-    if not feed.get('link'):
-        alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
-                                                  type=['application/rss+xml']))
-        if len(alternate) >= 1:
-            feed['link'] = alternate[0].attrs['href']
-    return feed
+def to_hash(text):
+    return md5(text.encode('utf8')).hexdigest()
-- cgit
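After this refactor construct_feed_from has moved out of the module, leaving only the URL helpers and a tiny to_hash() fingerprint function. A quick illustration of to_hash() with made-up input; its actual callers live outside this file, so the use shown here is only an assumption:

    from hashlib import md5

    def to_hash(text):
        # same helper as above: a stable md5 hex digest of the utf8 bytes
        return md5(text.encode('utf8')).hexdigest()

    # e.g. fingerprinting an article link or title so two crawler runs
    # can cheaply check whether the content changed
    print(to_hash('http://example.com/article'))        # 32-char hex string
    print(to_hash('title A') == to_hash('title A'))     # True
    print(to_hash('title A') == to_hash('title B'))     # False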
From c47e8fbd86eebd4c61888b1cba6cf39670679fd0 Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Mon, 6 Jul 2015 11:47:45 +0200
Subject: the icon isn't a url but a b64 dump

---
 pyaggr3g470r/lib/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'pyaggr3g470r/lib/utils.py')

diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index b7e5cafc..280256f6 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -1,5 +1,6 @@
 import types
 import urllib
+import base64
 import logging
 import requests
 from hashlib import md5
@@ -41,8 +42,10 @@ def rebuild_url(url, base_split):
 
 
 def try_splits(url, *splits):
     for split in splits:
-        if requests.get(rebuild_url(url, split), verify=False).ok:
-            return rebuild_url(url, split)
+        rb_url = rebuild_url(url, split)
+        response = requests.get(rb_url, verify=False, timeout=10)
+        if response.ok and 'html' not in response.headers['content-type']:
+            return base64.b64encode(response.content).decode('utf8')
     return None
-- cgit
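With this change try_splits() no longer returns a usable URL: it returns the base64-encoded body of the first non-HTML response it gets, so feed['icon'] now holds the image data itself. A small sketch of what such a value is good for, e.g. rendering the icon inline without another request; the content type and the sample bytes are assumptions, not data from the repository:

    import base64

    icon_bytes = b'\x00\x00\x01\x00'  # stand-in for a real favicon body
    icon_b64 = base64.b64encode(icon_bytes).decode('utf8')  # what try_splits now returns
    data_uri = 'data:image/x-icon;base64,' + icon_b64
    print(data_uri)  # can be dropped straight into an <img src="..."> attribute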