diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2015-02-22 00:49:38 +0100 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2015-02-22 00:49:38 +0100 |
commit | c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856 (patch) | |
tree | 68b6cc12e3c47b36604402c5debd8f62f5108f00 /pyaggr3g470r/crawler.py | |
parent | Fix layout template. (diff) | |
download | newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.tar.gz newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.tar.bz2 newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.zip |
Prevents BeautifulSoup4 from adding extra <html><body> tags to the soup with the lxml parser.
Diffstat (limited to 'pyaggr3g470r/crawler.py')
-rw-r--r-- | pyaggr3g470r/crawler.py | 11 |
1 file changed, 10 insertions, 1 deletion
```diff
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 5d7261ff..41ba120e 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -129,7 +129,16 @@ def parse_feed(user, feed):
         description = article.get('description', '')
 
         try:
-            description = BeautifulSoup(description, "lxml").decode()
+            soup = BeautifulSoup(description, "lxml")
+
+            # Prevents BeautifulSoup4 from adding extra <html><body> tags
+            # to the soup with the lxml parser.
+            if soup.body:
+                description = soup.body.next.decode()
+            elif soup.html:
+                description = soup.html.next.decode()
+            else:
+                description = soup.decode()
         except:
             logger.error("Problem when sanitizing the content of the article %s (%s)",
                          article_title, nice_url)
```