Diffstat (limited to 'pyaggr3g470r/lib')
-rw-r--r--  pyaggr3g470r/lib/crawler.py |  3 +--
-rw-r--r--  pyaggr3g470r/lib/utils.py   | 32 +++++++++++++++++++++++---------
2 files changed, 24 insertions(+), 11 deletions(-)
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 2ba5403a..45b1acde 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -193,8 +193,7 @@ class PyAggUpdater(AbstractCrawler):
                    'last_modified': self.headers.get('last-modified',
                        strftime('%a, %d %b %Y %X %Z', gmtime()))}
         fresh_feed = construct_feed_from(url=self.feed['link'],
-                                         fp_parsed=self.parsed_feed,
-                                         feed=self.feed)
+                                         fp_parsed=self.parsed_feed)
         for key in ('description', 'site_link', 'icon'):
             if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key):
                 up_feed[key] = fresh_feed[key]
diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index a0154b7f..041a2d29 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -37,6 +37,13 @@ def rebuild_url(url, base_split):
     return urllib.parse.urlunsplit(new_split)
 
 
+def try_splits(url, *splits):
+    for split in splits:
+        if requests.get(rebuild_url(url, split), verify=False).ok:
+            return rebuild_url(url, split)
+    return None
+
+
 def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
     if url is None and fp_parsed is not None:
         url = fp_parsed.get('url')
@@ -45,7 +52,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
         fp_parsed = feedparser.parse(response.content)
     assert url is not None and fp_parsed is not None
     feed = feed or {}
-    split = urllib.parse.urlsplit(url)
+    feed_split = urllib.parse.urlsplit(url)
     if not fp_parsed['bozo']:
         feed['link'] = url
         feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
@@ -56,11 +63,13 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
         feed['site_link'] = url
 
     if feed.get('site_link'):
-        feed['site_link'] = rebuild_url(feed['site_link'], split)
-        split = urllib.parse.urlsplit(feed['site_link'])
+        feed['site_link'] = rebuild_url(feed['site_link'], feed_split)
+        site_split = urllib.parse.urlsplit(feed['site_link'])
 
     if feed.get('icon'):
-        feed['icon'] = rebuild_url(feed['icon'], split)
+        feed['icon'] = try_splits(feed['icon'], site_split, feed_split)
+        if feed['icon'] is None:
+            del feed['icon']
 
     if not feed.get('site_link') or not query_site \
             or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
@@ -91,11 +100,16 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
     if not len(icons):
         icons = bs_parsed.find_all(check_keys(rel=['icon']))
     if len(icons) >= 1:
-        feed['icon'] = rebuild_url(icons[0].attrs['href'], split)
-    else:  # trying the default one
-        icon = rebuild_url('/favicon.ico', split)
-        if requests.get(icon, verify=False).ok:
-            feed['icon'] = icon
+        for icon in icons:
+            feed['icon'] = try_splits(icon.attrs['href'],
+                                      site_split, feed_split)
+            if feed['icon'] is not None:
+                break
+
+    if feed['icon'] is None:
+        feed['icon'] = try_splits('/favicon.ico', site_split, feed_split)
+        if feed['icon'] is None:
+            del feed['icon']
 
     if not feed.get('link'):
         alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
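
The utils.py change above introduces try_splits(), which rebuilds a candidate URL against each given base split in turn and keeps the first result that answers an HTTP request, instead of rebuilding against a single split. Below is a minimal usage sketch, not part of the commit, assuming the package is importable as pyaggr3g470r.lib.utils; the example URLs are hypothetical.

# Sketch: resolve a relative icon path against the site base first,
# then the feed base, the way construct_feed_from() now does.
# Assumptions: pyaggr3g470r.lib.utils is importable and the example
# URLs are reachable; both are placeholders for illustration.
import urllib.parse

from pyaggr3g470r.lib.utils import try_splits

feed_split = urllib.parse.urlsplit('https://example.org/blog/feed.xml')
site_split = urllib.parse.urlsplit('https://example.org/')

# try_splits() rebuilds '/favicon.ico' against each split in turn and
# returns the first candidate whose GET request succeeds, or None when
# no candidate is reachable.
icon_url = try_splits('/favicon.ico', site_split, feed_split)
if icon_url is None:
    print('no reachable favicon')
else:
    print('favicon found at', icon_url)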