From 498967b610b7c29d6c87a64ced8ac5033f02bc31 Mon Sep 17 00:00:00 2001
From: cedricbonhomme <devnull@localhost>
Date: Wed, 1 Jun 2011 09:20:39 +0200
Subject: Minor improvement: HTML purification of article descriptions with
 BeautifulSoup. The purification is done before insertion into the database.

---
 feedgetter.py | 5 ++++-
 utils.py      | 2 +-
 var/feed.lst  | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/feedgetter.py b/feedgetter.py
index aa65df1f..f2d3f290 100755
--- a/feedgetter.py
+++ b/feedgetter.py
@@ -28,6 +28,7 @@ __license__ = "GPLv3"
 import sqlite3
 import threading
 import feedparser
+from BeautifulSoup import BeautifulSoup
 
 from datetime import datetime
 
@@ -128,12 +129,14 @@ class FeedGetter(object):
                     description = article.description.encode('utf-8')
                 except Exception, e:
                     description = ""
+            description = str(BeautifulSoup(description))
+            title = str(BeautifulSoup(article.title.encode('utf-8')))
 
             try:
                 # try. Will only success if the article is not already in the data base
                 self.c.execute('insert into articles values (?, ?, ?, ?, ?, ?, ?)', (\
                         datetime(*article.updated_parsed[:6]), \
-                        utils.clear_string(article.title.encode('utf-8')), \
+                        title, \
                         article.link.encode('utf-8'), \
                         description, \
                         "0", \
diff --git a/utils.py b/utils.py
index 5587ce0e..e5fc455d 100755
--- a/utils.py
+++ b/utils.py
@@ -103,7 +103,7 @@ def clear_string(data):
     """
     p = re.compile(r'<[^<]*?/?>') # HTML tags
     q = re.compile(r'\s') # consecutive white spaces
-    return p.sub('', q.sub(' ', data))
+    return p.sub('', q.sub(' ', data.replace('', '')))
 
 def unescape(text):
     """
diff --git a/var/feed.lst b/var/feed.lst
index d40a8a0f..5678f144 100755
--- a/var/feed.lst
+++ b/var/feed.lst
@@ -32,3 +32,4 @@ http://feeds.feedburner.com/quuxlabs
 http://python-history.blogspot.com/feeds/posts/default
 http://www.haypocalc.com/wordpress/feed
 http://www.crypto.com/blog/rss10.xml
+http://spaf.wordpress.com/feed/
-- 
cgit