about | summary | refs | log | tree | commit | diff
path: root/source
diff options
context:
space:
mode:
author: Cédric Bonhomme <kimble.mandel@gmail.com> 2013-04-02 10:06:55 +0200
committer: Cédric Bonhomme <kimble.mandel@gmail.com> 2013-04-02 10:06:55 +0200
commit: 3da3ae456ab543aa00cd193fbbce53c7198a82d7 (patch)
tree: b8996b8f0d8dfe2b5ecbc6e299c0466483c78791 /source
parent: Index are generated in background at database initialization. (diff)
download: newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.tar.gz
newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.tar.bz2
newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.zip
Test if BeautifulSoup failed to sanitize the HTML content.
Diffstat (limited to 'source')
-rwxr-xr-x source/feedgetter.py 13
1 files changed, 10 insertions, 3 deletions
diff --git a/source/feedgetter.py b/source/feedgetter.py
index f3fa5c07..3e3b7aef 100755
--- a/source/feedgetter.py
+++ b/source/feedgetter.py
@@ -123,6 +123,7 @@ class FeedGetter(object):
articles = []
for article in a_feed['entries']:
description = ""
+ article_title = ""
try:
# article content
description = article.content[0].value
@@ -132,8 +133,14 @@ class FeedGetter(object):
description = article.description
except Exception:
description = ""
- description = BeautifulSoup(description, "html.parser").decode()
- article_title = BeautifulSoup(article.title, "html.parser").decode()
+ try:
+ description = BeautifulSoup(description, "html.parser").decode()
+ article_title = BeautifulSoup(article.title, "html.parser").decode()
+ except Exception as E:
+ print("Problem when retrieving " + feed_link)
+ print(E)
+ article_title = article.title
+
try:
post_date = datetime(*article.published_parsed[:6])
except:
@@ -187,4 +194,4 @@ if __name__ == "__main__":
# For a blogspot blog:
#feed_getter.retrieve_feed("http://www.blogger.com/feeds/4195135246107166251/posts/default", "http://neopythonic.blogspot.com/feeds/posts/default")
- #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default")
\ No newline at end of file
+ #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default")
bgstack15