diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-01-06 22:35:34 +0100 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-01-06 22:35:34 +0100 |
commit | 601f3a868e56cf8328c7f7ae63b3247e52217172 (patch) | |
tree | a9477cb1e42aca2c9cd7c22d5f5601e5e4f59ae9 /pyaggr3g470r | |
parent | Bugfix: variable name. (diff) | |
download | newspipe-601f3a868e56cf8328c7f7ae63b3247e52217172.tar.gz newspipe-601f3a868e56cf8328c7f7ae63b3247e52217172.tar.bz2 newspipe-601f3a868e56cf8328c7f7ae63b3247e52217172.zip |
utm_* parameters are now systematically removed.
Diffstat (limited to 'pyaggr3g470r')
-rw-r--r-- | pyaggr3g470r/feedgetter.py | 28 |
1 files changed, 15 insertions, 13 deletions
```diff
diff --git a/pyaggr3g470r/feedgetter.py b/pyaggr3g470r/feedgetter.py
index fd187fcd..f6b2208f 100644
--- a/pyaggr3g470r/feedgetter.py
+++ b/pyaggr3g470r/feedgetter.py
@@ -98,25 +98,27 @@ class FeedGetter(object):
         articles = []
         for article in a_feed['entries']:
-            real_url = article.link
+            real_url = article.link.encode("utf-8")
             try:
-                # remove utm_* parameters
+                # resolves URL behind proxies (like feedproxy.google.com)
                 r = requests.get(article.link, timeout=2.0)
-                parsed_url = urlparse(r.url)
-                qd = parse_qs(parsed_url.query, keep_blank_values=True)
-                filtered = dict( (k, v) for k, v in qd.iteritems() if not k.startswith('utm_'))
-                real_url = urlunparse([
-                    parsed_url.scheme,
-                    parsed_url.netloc,
-                    parsed_url.path,
-                    parsed_url.params,
-                    urlencode(filtered, doseq=True), # query string
-                    parsed_url.fragment
-                ])
+                real_url = r.url.encode("utf-8")
             except Timeout:
                 pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,))
             except Exception as e:
                 pyaggr3g470r_log.warning("Unable to get the real URL of %s. Error: %s" % (article.link, str(e)))
+            # remove utm_* parameters
+            parsed_url = urlparse(real_url)
+            qd = parse_qs(parsed_url.query, keep_blank_values=True)
+            filtered = dict((k, v) for k, v in qd.iteritems() if not k.startswith('utm_'))
+            real_url = urlunparse([
+                parsed_url.scheme,
+                parsed_url.netloc,
+                parsed_url.path,
+                parsed_url.params,
+                urlencode(filtered, doseq=True),
+                parsed_url.fragment
+            ])
             description = ""
             article_title = ""
```