diff options
author | Cédric Bonhomme <kimble.mandel@gmail.com> | 2014-01-06 15:28:32 +0100 |
---|---|---|
committer | Cédric Bonhomme <kimble.mandel@gmail.com> | 2014-01-06 15:28:32 +0100 |
commit | cb978e1bdd966710e23a2817e8d3b6124995e4e8 (patch) | |
tree | d32de17118738a8b599bd8cbcc124754175dd9bd /pyaggr3g470r/feedgetter.py | |
parent | Bug fix in the feedgetter. (diff) | |
download | newspipe-cb978e1bdd966710e23a2817e8d3b6124995e4e8.tar.gz newspipe-cb978e1bdd966710e23a2817e8d3b6124995e4e8.tar.bz2 newspipe-cb978e1bdd966710e23a2817e8d3b6124995e4e8.zip |
utm_* parameters are automatically removed from articles' URLs.
Diffstat (limited to 'pyaggr3g470r/feedgetter.py')
-rw-r--r-- | pyaggr3g470r/feedgetter.py | 18 |
1 file changed, 14 insertions, 4 deletions
diff --git a/pyaggr3g470r/feedgetter.py b/pyaggr3g470r/feedgetter.py index 55564840..9c6c240e 100644 --- a/pyaggr3g470r/feedgetter.py +++ b/pyaggr3g470r/feedgetter.py @@ -31,7 +31,8 @@ import requests import threading import feedparser from datetime import datetime -from urlparse import urlparse +from urllib import urlencode +from urlparse import urlparse, parse_qs, urlunparse from BeautifulSoup import BeautifulSoup from mongoengine.queryset import NotUniqueError from requests.exceptions import Timeout @@ -99,10 +100,19 @@ class FeedGetter(object): real_url = article.link try: + # remove utm_* parameters r = requests.get(article.link, timeout=2.0) - #parsed_url = urlparse(r.url) - #real_url = parsed_url.scheme + '://' + parsed_url.netloc + parsed_url.path - real_url = r.url + parsed_url = urlparse(r.url) + qd = parse_qs(parsed.query, keep_blank_values=True) + filtered = dict( (k, v) for k, v in qd.iteritems() if not k.startswith('utm_')) + real_url = urlunparse([ + parsed.scheme, + parsed.netloc, + parsed.path, + parsed.params, + urlencode(filtered, doseq=True), # query string + parsed.fragment + ]) except Timeout: pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,)) except Exception as e: |