aboutsummaryrefslogtreecommitdiff
path: root/pyaggr3g470r/crawler.py
diff options
context:
space:
mode:
author: Cédric Bonhomme <cedric@cedricbonhomme.org>, 2015-02-22 00:49:38 +0100
committer: Cédric Bonhomme <cedric@cedricbonhomme.org>, 2015-02-22 00:49:38 +0100
commit: c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856 (patch)
tree: 68b6cc12e3c47b36604402c5debd8f62f5108f00 /pyaggr3g470r/crawler.py
parent: Fix layout template. (diff)
download: newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.tar.gz
newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.tar.bz2
newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.zip
Prevents BeautifulSoup4 from adding extra <html><body> tags to the soup with the lxml parser.
Diffstat (limited to 'pyaggr3g470r/crawler.py')
-rw-r--r-- pyaggr3g470r/crawler.py | 11
1 files changed, 10 insertions, 1 deletions
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 5d7261ff..41ba120e 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -129,7 +129,16 @@ def parse_feed(user, feed):
description = article.get('description', '')
try:
- description = BeautifulSoup(description, "lxml").decode()
+ soup = BeautifulSoup(description, "lxml")
+
+ # Prevents BeautifulSoup4 from adding extra <html><body> tags
+ # to the soup with the lxml parser.
+ if soup.body:
+ description = soup.body.next.decode()
+ elif soup.html:
+ description = soup.html.next.decode()
+ else:
+ description = soup.decode()
except:
logger.error("Problem when sanitizing the content of the article %s (%s)",
article_title, nice_url)
bgstack15