From 3da3ae456ab543aa00cd193fbbce53c7198a82d7 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Tue, 2 Apr 2013 10:06:55 +0200 Subject: Test if BeautifulSoup failed to sanitize the HTML content. --- source/feedgetter.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'source') diff --git a/source/feedgetter.py b/source/feedgetter.py index f3fa5c07..3e3b7aef 100755 --- a/source/feedgetter.py +++ b/source/feedgetter.py @@ -123,6 +123,7 @@ class FeedGetter(object): articles = [] for article in a_feed['entries']: description = "" + article_title = "" try: # article content description = article.content[0].value @@ -132,8 +133,14 @@ class FeedGetter(object): description = article.description except Exception: description = "" - description = BeautifulSoup(description, "html.parser").decode() - article_title = BeautifulSoup(article.title, "html.parser").decode() + try: + description = BeautifulSoup(description, "html.parser").decode() + article_title = BeautifulSoup(article.title, "html.parser").decode() + except Exception as E: + print("Problem when retrieving " + feed_link) + print(E) + article_title = article.title + try: post_date = datetime(*article.published_parsed[:6]) except: @@ -187,4 +194,4 @@ if __name__ == "__main__": # For a blogspot blog: #feed_getter.retrieve_feed("http://www.blogger.com/feeds/4195135246107166251/posts/default", "http://neopythonic.blogspot.com/feeds/posts/default") - #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default") \ No newline at end of file + #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default") -- cgit