From c6ebbb93ac01d92c0e49bf7c553c6d1befb9a856 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Sun, 22 Feb 2015 00:49:38 +0100
Subject: Prevents BeautifulSoup4 from adding extra tags to the soup with the lxml parser.

---
 pyaggr3g470r/crawler.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'pyaggr3g470r')

diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 5d7261ff..41ba120e 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -129,7 +129,16 @@ def parse_feed(user, feed):
         description = article.get('description', '')
         try:
-            description = BeautifulSoup(description, "lxml").decode()
+            soup = BeautifulSoup(description, "lxml")
+
+            # Prevents BeautifulSoup4 from adding extra tags
+            # to the soup with the lxml parser.
+            if soup.body:
+                description = soup.body.next.decode()
+            elif soup.html:
+                description = soup.html.next.decode()
+            else:
+                description = soup.decode()
         except:
             logger.error("Problem when sanitizing the content of the article %s (%s)", article_title, nice_url)
-- 
cgit