author | François Schmidts <francois.schmidts@gmail.com> | 2015-07-07 14:42:26 +0200
---|---|---
committer | François Schmidts <francois.schmidts@gmail.com> | 2015-07-07 14:42:26 +0200
commit | 9c9e1058c588a3e2f80e35c5dd95bac234e597f4 |
tree | 12b8cc24e0ef68c95c6950e306c4494ed3c1f273 /pyaggr3g470r/lib |
parent | making the crawler getting the feed with high traffic earlier |
rebuilding feed url as well
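The href of a `<link rel="alternate" type="application/rss+xml">` tag found on a site page can be relative, so `construct_feed_from` now passes it through `rebuild_url` against the split of the queried URL before storing it as the feed link. Since `site_split` is now explicitly initialised to `None`, `try_splits` also learns to skip splits that are `None`.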
Diffstat (limited to 'pyaggr3g470r/lib')
-rw-r--r-- | pyaggr3g470r/lib/feed_utils.py | 7
-rw-r--r-- | pyaggr3g470r/lib/utils.py | 2
2 files changed, 6 insertions, 3 deletions
```diff
diff --git a/pyaggr3g470r/lib/feed_utils.py b/pyaggr3g470r/lib/feed_utils.py
index a7149d79..367fd4b5 100644
--- a/pyaggr3g470r/lib/feed_utils.py
+++ b/pyaggr3g470r/lib/feed_utils.py
@@ -23,6 +23,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
         assert url is not None and fp_parsed is not None
     feed = feed or {}
     feed_split = urllib.parse.urlsplit(url)
+    site_split = None
     if not fp_parsed['bozo']:
         feed['link'] = url
         feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
@@ -82,8 +83,8 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
             del feed['icon']
 
     if not feed.get('link'):
-        alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
+        alternates = bs_parsed.find_all(check_keys(rel=['alternate'],
                                                   type=['application/rss+xml']))
-        if len(alternate) >= 1:
-            feed['link'] = alternate[0].attrs['href']
+        if len(alternates) >= 1:
+            feed['link'] = rebuild_url(alternates[0].attrs['href'], feed_split)
     return feed
diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index 280256f6..62284de1 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -42,6 +42,8 @@ def rebuild_url(url, base_split):
 
 def try_splits(url, *splits):
     for split in splits:
+        if split is None:
+            continue
         rb_url = rebuild_url(url, split)
         response = requests.get(rb_url, verify=False, timeout=10)
         if response.ok and 'html' not in response.headers['content-type']:
```
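For context, here is a minimal, self-contained sketch of the two helpers this diff touches. Only `try_splits` appears in the patch; the body of `rebuild_url` below is a hypothetical reconstruction inferred from its call sites (`rebuild_url(url, base_split)`), not the project's actual implementation, and the return value of `try_splits` is likewise an assumption.

```python
# Sketch of the helpers around this commit. rebuild_url is a hypothetical
# reimplementation for illustration; the real pyaggr3g470r/lib/utils.py
# version may differ in details.
import urllib.parse

import requests


def rebuild_url(url, base_split):
    """Make a possibly relative URL absolute using a base urlsplit result."""
    split = urllib.parse.urlsplit(url)
    if split.scheme and split.netloc:
        return url  # already absolute, nothing to rebuild
    rebuilt = urllib.parse.SplitResult(
        scheme=split.scheme or base_split.scheme,
        netloc=split.netloc or base_split.netloc,
        path=split.path, query=split.query, fragment=split.fragment)
    return urllib.parse.urlunsplit(rebuilt)


def try_splits(url, *splits):
    """Return the first rebuilt URL answering with a non-HTML response."""
    for split in splits:
        if split is None:  # the guard added by this commit: site_split may
            continue       # still be None when the feed parsed cleanly
        rb_url = rebuild_url(url, split)
        response = requests.get(rb_url, verify=False, timeout=10)
        if response.ok and 'html' not in response.headers['content-type']:
            return rb_url  # assumption: first non-HTML hit wins
    return None


# Example: an alternate-link href is often relative ("/rss.xml"); rebuilding
# it against the split of the queried URL yields something fetchable.
feed_split = urllib.parse.urlsplit('https://example.com/blog/')
print(rebuild_url('/rss.xml', feed_split))  # -> https://example.com/rss.xml
```

With the `None` guard in place, `construct_feed_from` can hand `site_split` to `try_splits` unconditionally, even on the code path where the feed parsed cleanly and `site_split` was never assigned a real value.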