aboutsummaryrefslogtreecommitdiff
path: root/pyaggr3g470r/crawler.py
diff options
context:
space:
mode:
Diffstat (limited to 'pyaggr3g470r/crawler.py')
-rw-r--r-- pyaggr3g470r/crawler.py | 10
1 file changed, 6 insertions, 4 deletions
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 34ce9d74..ec3dcef4 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -29,7 +29,9 @@ __license__ = "AGPLv3"
import feedparser
import urllib2
import requests
+from bs4 import BeautifulSoup
from datetime import datetime
+from sqlalchemy.exc import IntegrityError
from requests.exceptions import *
import gevent.monkey
@@ -173,9 +175,9 @@ class FeedGetter(object):
except Exception:
description = ""
try:
- description = BeautifulSoup(description, "html.parser").decode()
- article_title = BeautifulSoup(article.title, "html.parser").decode()
- except Exception:
+ description = BeautifulSoup(description, "lxml").decode()
+ article_title = BeautifulSoup(article.title, "lxml").decode()
+ except Exception as e:
pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" %
(article_title, nice_url))
article_title = article.title
@@ -264,4 +266,4 @@ class FeedGetter(object):
return True
- \ No newline at end of file
+
bgstack15