Prevents BeautifulSoup4 from adding extra <html><body> tags to the soup with the lxml parser.

author: Cédric Bonhomme <cedric@cedricbonhomme.org> 2015-02-22 00:49:38 +0100
committer: Cédric Bonhomme <cedric@cedricbonhomme.org> 2015-02-22 00:49:38 +0100
commit: c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856 (patch)
tree: 68b6cc12e3c47b36604402c5debd8f62f5108f00 /pyaggr3g470r
parent: Fix layout template. (diff)
download: newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.tar.gz
newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.tar.bz2
newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.zip
1 file changed, 10 insertions, 1 deletion
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 5d7261ff..41ba120e 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -129,7 +129,16 @@ def parse_feed(user, feed):
             description = article.get('description', '')
 
         try:
-            description = BeautifulSoup(description, "lxml").decode()
+            soup = BeautifulSoup(description, "lxml")
+
+            # Prevents BeautifulSoup4 from adding extra <html><body> tags
+            # to the soup with the lxml parser.
+            if soup.body:
+                description = soup.body.next.decode()
+            elif soup.html:
+                description = soup.html.next.decode()
+            else:
+                description = soup.decode()
         except:
             logger.error("Problem when sanitizing the content of the article %s (%s)",
                                 article_title, nice_url)
author	Cédric Bonhomme <cedric@cedricbonhomme.org>	2015-02-22 00:49:38 +0100
committer	Cédric Bonhomme <cedric@cedricbonhomme.org>	2015-02-22 00:49:38 +0100
commit	c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856 (patch)
tree	68b6cc12e3c47b36604402c5debd8f62f5108f00 /pyaggr3g470r
parent	Fix layout template. (diff)
download	newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.tar.gz newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.tar.bz2 newspipe-c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856.zip