about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- pyaggr3g470r/crawler.py 10
-rw-r--r-- requirements.txt 1
2 files changed, 7 insertions, 4 deletions
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 34ce9d74..ec3dcef4 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -29,7 +29,9 @@ __license__ = "AGPLv3"
import feedparser
import urllib2
import requests
+from bs4 import BeautifulSoup
from datetime import datetime
+from sqlalchemy.exc import IntegrityError
from requests.exceptions import *
import gevent.monkey
@@ -173,9 +175,9 @@ class FeedGetter(object):
except Exception:
description = ""
try:
- description = BeautifulSoup(description, "html.parser").decode()
- article_title = BeautifulSoup(article.title, "html.parser").decode()
- except Exception:
+ description = BeautifulSoup(description, "lxml").decode()
+ article_title = BeautifulSoup(article.title, "lxml").decode()
+ except Exception as e:
pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" %
(article_title, nice_url))
article_title = article.title
@@ -264,4 +266,4 @@ class FeedGetter(object):
return True
- \ No newline at end of file
+
diff --git a/requirements.txt b/requirements.txt
index 65b5cd41..f843318c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ feedparser
opml
requests
BeautifulSoup
+lxml
SQLAlchemy
psycopg2
Flask
bgstack15