diff options
-rw-r--r-- | pyaggr3g470r/crawler.py | 11 |
1 file changed, 10 insertions, 1 deletion
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py index 5d7261ff..41ba120e 100644 --- a/pyaggr3g470r/crawler.py +++ b/pyaggr3g470r/crawler.py @@ -129,7 +129,16 @@ def parse_feed(user, feed): description = article.get('description', '') try: - description = BeautifulSoup(description, "lxml").decode() + soup = BeautifulSoup(description, "lxml") + + # Prevents BeautifulSoup4 from adding extra <html><body> tags + # to the soup with the lxml parser. + if soup.body: + description = soup.body.next.decode() + elif soup.html: + description = soup.html.next.decode() + else: + description = soup.decode() except: logger.error("Problem when sanitizing the content of the article %s (%s)", article_title, nice_url) |