diff options
author | Cédric Bonhomme <kimble.mandel@gmail.com> | 2013-04-02 10:06:55 +0200 |
---|---|---|
committer | Cédric Bonhomme <kimble.mandel@gmail.com> | 2013-04-02 10:06:55 +0200 |
commit | 3da3ae456ab543aa00cd193fbbce53c7198a82d7 (patch) | |
tree | b8996b8f0d8dfe2b5ecbc6e299c0466483c78791 /source | |
parent | Index are generated in background at database initialization. (diff) | |
download | newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.tar.gz newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.tar.bz2 newspipe-3da3ae456ab543aa00cd193fbbce53c7198a82d7.zip |
Test if BeautifulSoup failed to sanitize the HTML content.
Diffstat (limited to 'source')
-rwxr-xr-x | source/feedgetter.py | 13 |
1 file changed, 10 insertions, 3 deletions
```diff
diff --git a/source/feedgetter.py b/source/feedgetter.py
index f3fa5c07..3e3b7aef 100755
--- a/source/feedgetter.py
+++ b/source/feedgetter.py
@@ -123,6 +123,7 @@ class FeedGetter(object):
         articles = []
         for article in a_feed['entries']:
             description = ""
+            article_title = ""
             try:
                 # article content
                 description = article.content[0].value
@@ -132,8 +133,14 @@ class FeedGetter(object):
                     description = article.description
                 except Exception:
                     description = ""
-            description = BeautifulSoup(description, "html.parser").decode()
-            article_title = BeautifulSoup(article.title, "html.parser").decode()
+            try:
+                description = BeautifulSoup(description, "html.parser").decode()
+                article_title = BeautifulSoup(article.title, "html.parser").decode()
+            except Exception as E:
+                print("Problem when retrieving " + feed_link)
+                print(E)
+                article_title = article.title
+
             try:
                 post_date = datetime(*article.published_parsed[:6])
             except:
@@ -187,4 +194,4 @@ if __name__ == "__main__":
 
     # For a blogspot blog:
     #feed_getter.retrieve_feed("http://www.blogger.com/feeds/4195135246107166251/posts/default", "http://neopythonic.blogspot.com/feeds/posts/default")
-    #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default")
\ No newline at end of file
+    #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default")
```