diff options
-rw-r--r-- | pyaggr3g470r/crawler.py | 10 |
-rw-r--r-- | requirements.txt | 1 |
2 files changed, 7 insertions, 4 deletions
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py index 34ce9d74..ec3dcef4 100644 --- a/pyaggr3g470r/crawler.py +++ b/pyaggr3g470r/crawler.py @@ -29,7 +29,9 @@ __license__ = "AGPLv3" import feedparser import urllib2 import requests +from bs4 import BeautifulSoup from datetime import datetime +from sqlalchemy.exc import IntegrityError from requests.exceptions import * import gevent.monkey @@ -173,9 +175,9 @@ class FeedGetter(object): except Exception: description = "" try: - description = BeautifulSoup(description, "html.parser").decode() - article_title = BeautifulSoup(article.title, "html.parser").decode() - except Exception: + description = BeautifulSoup(description, "lxml").decode() + article_title = BeautifulSoup(article.title, "lxml").decode() + except Exception as e: pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url)) article_title = article.title @@ -264,4 +266,4 @@ class FeedGetter(object): return True -
\ No newline at end of file + diff --git a/requirements.txt b/requirements.txt index 65b5cd41..f843318c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ feedparser opml requests BeautifulSoup +lxml SQLAlchemy psycopg2 Flask |