Handle the case where BeautifulSoup fails to sanitize the HTML content.

author: Cédric Bonhomme <kimble.mandel@gmail.com> 2013-04-02 10:06:55 +0200
committer: Cédric Bonhomme <kimble.mandel@gmail.com> 2013-04-02 10:06:55 +0200
commit: 3da3ae456ab543aa00cd193fbbce53c7198a82d7 (patch)
tree: b8996b8f0d8dfe2b5ecbc6e299c0466483c78791 /source
parent: Indexes are generated in the background at database initialization. (diff)
download: newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.tar.gz
newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.tar.bz2
newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.zip
1 files changed, 10 insertions, 3 deletions
diff --git a/source/feedgetter.py b/source/feedgetter.py
index f3fa5c07..3e3b7aef 100755
--- a/source/feedgetter.py
+++ b/source/feedgetter.py
@@ -123,6 +123,7 @@ class FeedGetter(object):
         articles = []
         for article in a_feed['entries']:
             description = ""
+            article_title = ""
             try:
                 # article content
                 description = article.content[0].value
@@ -132,8 +133,14 @@ class FeedGetter(object):
                     description = article.description
                 except Exception:
                     description = ""
-            description = BeautifulSoup(description, "html.parser").decode()
-            article_title = BeautifulSoup(article.title, "html.parser").decode()
+            try:
+                description = BeautifulSoup(description, "html.parser").decode()
+                article_title = BeautifulSoup(article.title, "html.parser").decode()
+            except Exception as E:
+                print("Problem when retrieving " + feed_link)
+                print(E)
+                article_title = article.title
+
             try:
                 post_date = datetime(*article.published_parsed[:6])
             except:
@@ -187,4 +194,4 @@ if __name__ == "__main__":
 
     # For a blogspot blog:
     #feed_getter.retrieve_feed("http://www.blogger.com/feeds/4195135246107166251/posts/default", "http://neopythonic.blogspot.com/feeds/posts/default")
-    #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default")
-\ No newline at end of file
+    #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default")
author	Cédric Bonhomme <kimble.mandel@gmail.com>	2013-04-02 10:06:55 +0200
committer	Cédric Bonhomme <kimble.mandel@gmail.com>	2013-04-02 10:06:55 +0200
commit	3da3ae456ab543aa00cd193fbbce53c7198a82d7 (patch)
tree	b8996b8f0d8dfe2b5ecbc6e299c0466483c78791 /source
parent	Indexes are generated in the background at database initialization. (diff)
download	newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.tar.gz newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.tar.bz2 newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.zip