diff options
Diffstat (limited to 'pyaggr3g470r/crawler.py')
-rw-r--r-- | pyaggr3g470r/crawler.py | 10 |
1 file changed, 6 insertions, 4 deletions
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py index 34ce9d74..ec3dcef4 100644 --- a/pyaggr3g470r/crawler.py +++ b/pyaggr3g470r/crawler.py @@ -29,7 +29,9 @@ __license__ = "AGPLv3" import feedparser import urllib2 import requests +from bs4 import BeautifulSoup from datetime import datetime +from sqlalchemy.exc import IntegrityError from requests.exceptions import * import gevent.monkey @@ -173,9 +175,9 @@ class FeedGetter(object): except Exception: description = "" try: - description = BeautifulSoup(description, "html.parser").decode() - article_title = BeautifulSoup(article.title, "html.parser").decode() - except Exception: + description = BeautifulSoup(description, "lxml").decode() + article_title = BeautifulSoup(article.title, "lxml").decode() + except Exception as e: pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url)) article_title = article.title @@ -264,4 +266,4 @@ class FeedGetter(object): return True -
\ No newline at end of file + |