diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-05-03 08:19:29 +0200 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-05-03 08:19:29 +0200 |
commit | 67952f5b33583380be14b9cb420e8999a132e35d (patch) | |
tree | b81db438ee837ff30f6a59edbd311c8760a74646 | |
parent | search.path fixes #5. (diff) | |
download | newspipe-67952f5b33583380be14b9cb420e8999a132e35d.tar.gz newspipe-67952f5b33583380be14b9cb420e8999a132e35d.tar.bz2 newspipe-67952f5b33583380be14b9cb420e8999a132e35d.zip |
Using lxml parser instead of html.parser, fixes #4.
-rw-r--r-- | pyaggr3g470r/crawler.py | 10 |
-rw-r--r-- | requirements.txt | 1 |
2 files changed, 7 insertions, 4 deletions
```diff
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 34ce9d74..ec3dcef4 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -29,7 +29,9 @@ __license__ = "AGPLv3"
 import feedparser
 import urllib2
 import requests
+from bs4 import BeautifulSoup
 from datetime import datetime
+from sqlalchemy.exc import IntegrityError
 from requests.exceptions import *
 import gevent.monkey
@@ -173,9 +175,9 @@ class FeedGetter(object):
 except Exception:
     description = ""
 try:
-    description = BeautifulSoup(description, "html.parser").decode()
-    article_title = BeautifulSoup(article.title, "html.parser").decode()
-except Exception:
+    description = BeautifulSoup(description, "lxml").decode()
+    article_title = BeautifulSoup(article.title, "lxml").decode()
+except Exception as e:
     pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
     article_title = article.title
@@ -264,4 +266,4 @@ class FeedGetter(object):
 return True
-
\ No newline at end of file
+
diff --git a/requirements.txt b/requirements.txt
index 65b5cd41..f843318c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ feedparser
 opml
 requests
 BeautifulSoup
+lxml
 SQLAlchemy
 psycopg2
 Flask
```