aboutsummaryrefslogtreecommitdiff
path: root/pyaggr3g470r/feedgetter.py
diff options
context:
space:
mode:
author Cédric Bonhomme <cedric@cedricbonhomme.org> 2014-01-06 22:35:34 +0100
committer Cédric Bonhomme <cedric@cedricbonhomme.org> 2014-01-06 22:35:34 +0100
commit 601f3a868e56cf8328c7f7ae63b3247e52217172 (patch)
tree a9477cb1e42aca2c9cd7c22d5f5601e5e4f59ae9 /pyaggr3g470r/feedgetter.py
parent Bugfix: variable name. (diff)
download newspipe-601f3a868e56cf8328c7f7ae63b3247e52217172.tar.gz
newspipe-601f3a868e56cf8328c7f7ae63b3247e52217172.tar.bz2
newspipe-601f3a868e56cf8328c7f7ae63b3247e52217172.zip
utm_* parameters are now systematically removed.
Diffstat (limited to 'pyaggr3g470r/feedgetter.py')
-rw-r--r-- pyaggr3g470r/feedgetter.py 28
1 files changed, 15 insertions, 13 deletions
diff --git a/pyaggr3g470r/feedgetter.py b/pyaggr3g470r/feedgetter.py
index fd187fcd..f6b2208f 100644
--- a/pyaggr3g470r/feedgetter.py
+++ b/pyaggr3g470r/feedgetter.py
@@ -98,25 +98,27 @@ class FeedGetter(object):
articles = []
for article in a_feed['entries']:
- real_url = article.link
+ real_url = article.link.encode("utf-8")
try:
- # remove utm_* parameters
+ # resolves URL behind proxies (like feedproxy.google.com)
r = requests.get(article.link, timeout=2.0)
- parsed_url = urlparse(r.url)
- qd = parse_qs(parsed_url.query, keep_blank_values=True)
- filtered = dict( (k, v) for k, v in qd.iteritems() if not k.startswith('utm_'))
- real_url = urlunparse([
- parsed_url.scheme,
- parsed_url.netloc,
- parsed_url.path,
- parsed_url.params,
- urlencode(filtered, doseq=True), # query string
- parsed_url.fragment
- ])
+ real_url = r.url.encode("utf-8")
except Timeout:
pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,))
except Exception as e:
pyaggr3g470r_log.warning("Unable to get the real URL of %s. Error: %s" % (article.link, str(e)))
+ # remove utm_* parameters
+ parsed_url = urlparse(real_url)
+ qd = parse_qs(parsed_url.query, keep_blank_values=True)
+ filtered = dict((k, v) for k, v in qd.iteritems() if not k.startswith('utm_'))
+ real_url = urlunparse([
+ parsed_url.scheme,
+ parsed_url.netloc,
+ parsed_url.path,
+ parsed_url.params,
+ urlencode(filtered, doseq=True),
+ parsed_url.fragment
+ ])
description = ""
article_title = ""
bgstack15