From 67952f5b33583380be14b9cb420e8999a132e35d Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Sat, 3 May 2014 08:19:29 +0200 Subject: Using lxml parser instead of html.parser, fixes #4. --- pyaggr3g470r/crawler.py | 10 ++++++---- requirements.txt | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py index 34ce9d74..ec3dcef4 100644 --- a/pyaggr3g470r/crawler.py +++ b/pyaggr3g470r/crawler.py @@ -29,7 +29,9 @@ __license__ = "AGPLv3" import feedparser import urllib2 import requests +from bs4 import BeautifulSoup from datetime import datetime +from sqlalchemy.exc import IntegrityError from requests.exceptions import * import gevent.monkey @@ -173,9 +175,9 @@ class FeedGetter(object): except Exception: description = "" try: - description = BeautifulSoup(description, "html.parser").decode() - article_title = BeautifulSoup(article.title, "html.parser").decode() - except Exception: + description = BeautifulSoup(description, "lxml").decode() + article_title = BeautifulSoup(article.title, "lxml").decode() + except Exception as e: pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url)) article_title = article.title @@ -264,4 +266,4 @@ class FeedGetter(object): return True - \ No newline at end of file + diff --git a/requirements.txt b/requirements.txt index 65b5cd41..f843318c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ feedparser opml requests BeautifulSoup +lxml SQLAlchemy psycopg2 Flask -- cgit