about | summary | refs | log | tree | commit | diff
path: root/pyaggr3g470r/crawler.py
diff options
context:
space:
mode:
author Cédric Bonhomme <cedric@cedricbonhomme.org> 2014-05-03 08:19:29 +0200
committer Cédric Bonhomme <cedric@cedricbonhomme.org> 2014-05-03 08:19:29 +0200
commit 67952f5b33583380be14b9cb420e8999a132e35d (patch)
tree b81db438ee837ff30f6a59edbd311c8760a74646 /pyaggr3g470r/crawler.py
parent search.path fixes #5. (diff)
download newspipe-67952f5b33583380be14b9cb420e8999a132e35d.tar.gz
newspipe-67952f5b33583380be14b9cb420e8999a132e35d.tar.bz2
newspipe-67952f5b33583380be14b9cb420e8999a132e35d.zip
Using lxml parser instead of html.parser, fixes #4.
Diffstat (limited to 'pyaggr3g470r/crawler.py')
-rw-r--r-- pyaggr3g470r/crawler.py 10
1 files changed, 6 insertions, 4 deletions
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 34ce9d74..ec3dcef4 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -29,7 +29,9 @@ __license__ = "AGPLv3"
import feedparser
import urllib2
import requests
+from bs4 import BeautifulSoup
from datetime import datetime
+from sqlalchemy.exc import IntegrityError
from requests.exceptions import *
import gevent.monkey
@@ -173,9 +175,9 @@ class FeedGetter(object):
except Exception:
description = ""
try:
- description = BeautifulSoup(description, "html.parser").decode()
- article_title = BeautifulSoup(article.title, "html.parser").decode()
- except Exception:
+ description = BeautifulSoup(description, "lxml").decode()
+ article_title = BeautifulSoup(article.title, "lxml").decode()
+ except Exception as e:
pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" %
(article_title, nice_url))
article_title = article.title
@@ -264,4 +266,4 @@ class FeedGetter(object):
return True
- \ No newline at end of file
+
bgstack15