From 498967b610b7c29d6c87a64ced8ac5033f02bc31 Mon Sep 17 00:00:00 2001 From: cedricbonhomme Date: Wed, 1 Jun 2011 09:20:39 +0200 Subject: Minor improvement: HTML purification of articles description with BeautifulSoup. The purification is done before the insertion in the database. --- feedgetter.py | 5 ++++- utils.py | 2 +- var/feed.lst | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/feedgetter.py b/feedgetter.py index aa65df1f..f2d3f290 100755 --- a/feedgetter.py +++ b/feedgetter.py @@ -28,6 +28,7 @@ __license__ = "GPLv3" import sqlite3 import threading import feedparser +from BeautifulSoup import BeautifulSoup from datetime import datetime @@ -128,12 +129,14 @@ class FeedGetter(object): description = article.description.encode('utf-8') except Exception, e: description = "" + description = str(BeautifulSoup(description)) + title = str(BeautifulSoup(article.title.encode('utf-8'))) try: # try. Will only success if the article is not already in the data base self.c.execute('insert into articles values (?, ?, ?, ?, ?, ?, ?)', (\ datetime(*article.updated_parsed[:6]), \ - utils.clear_string(article.title.encode('utf-8')), \ + title, \ article.link.encode('utf-8'), \ description, \ "0", \ diff --git a/utils.py b/utils.py index 5587ce0e..e5fc455d 100755 --- a/utils.py +++ b/utils.py @@ -103,7 +103,7 @@ def clear_string(data): """ p = re.compile(r'<[^<]*?/?>') # HTML tags q = re.compile(r'\s') # consecutive white spaces - return p.sub('', q.sub(' ', data)) + return p.sub('', q.sub(' ', data.replace('', ''))) def unescape(text): """ diff --git a/var/feed.lst b/var/feed.lst index d40a8a0f..5678f144 100755 --- a/var/feed.lst +++ b/var/feed.lst @@ -32,3 +32,4 @@ http://feeds.feedburner.com/quuxlabs http://python-history.blogspot.com/feeds/posts/default http://www.haypocalc.com/wordpress/feed http://www.crypto.com/blog/rss10.xml +http://spaf.wordpress.com/feed/ -- cgit