aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author cedricbonhomme <devnull@localhost> 2011-06-01 09:20:39 +0200
committer cedricbonhomme <devnull@localhost> 2011-06-01 09:20:39 +0200
commit 498967b610b7c29d6c87a64ced8ac5033f02bc31 (patch)
tree c2f9643313c2a2b317352e789fffd85674369102
parent Renamed the string 'feeds' to 'articles' in the management page. (diff)
download newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.tar.gz
newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.tar.bz2
newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.zip
Minor improvement: HTML purification of article descriptions with BeautifulSoup. The purification is done before insertion into the database.
-rwxr-xr-x feedgetter.py 5
-rwxr-xr-x utils.py 2
-rwxr-xr-x var/feed.lst 1
3 files changed, 6 insertions, 2 deletions
diff --git a/feedgetter.py b/feedgetter.py
index aa65df1f..f2d3f290 100755
--- a/feedgetter.py
+++ b/feedgetter.py
@@ -28,6 +28,7 @@ __license__ = "GPLv3"
import sqlite3
import threading
import feedparser
+from BeautifulSoup import BeautifulSoup
from datetime import datetime
@@ -128,12 +129,14 @@ class FeedGetter(object):
description = article.description.encode('utf-8')
except Exception, e:
description = ""
+ description = str(BeautifulSoup(description))
+ title = str(BeautifulSoup(article.title.encode('utf-8')))
try:
# try. Will only success if the article is not already in the data base
self.c.execute('insert into articles values (?, ?, ?, ?, ?, ?, ?)', (\
datetime(*article.updated_parsed[:6]), \
- utils.clear_string(article.title.encode('utf-8')), \
+ title, \
article.link.encode('utf-8'), \
description, \
"0", \
diff --git a/utils.py b/utils.py
index 5587ce0e..e5fc455d 100755
--- a/utils.py
+++ b/utils.py
@@ -103,7 +103,7 @@ def clear_string(data):
"""
p = re.compile(r'<[^<]*?/?>') # HTML tags
q = re.compile(r'\s') # consecutive white spaces
- return p.sub('', q.sub(' ', data))
+ return p.sub('', q.sub(' ', data.replace('', '')))
def unescape(text):
"""
diff --git a/var/feed.lst b/var/feed.lst
index d40a8a0f..5678f144 100755
--- a/var/feed.lst
+++ b/var/feed.lst
@@ -32,3 +32,4 @@ http://feeds.feedburner.com/quuxlabs
http://python-history.blogspot.com/feeds/posts/default
http://www.haypocalc.com/wordpress/feed
http://www.crypto.com/blog/rss10.xml
+http://spaf.wordpress.com/feed/
bgstack15