diff options
author | cedricbonhomme <devnull@localhost> | 2011-06-01 09:20:39 +0200 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2011-06-01 09:20:39 +0200 |
commit | 498967b610b7c29d6c87a64ced8ac5033f02bc31 (patch) | |
tree | c2f9643313c2a2b317352e789fffd85674369102 | |
parent | Renamed the string 'feeds' to 'articles' in the management page. (diff) | |
download | newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.tar.gz newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.tar.bz2 newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.zip |
Minor improvement: HTML purification of article descriptions with BeautifulSoup. The purification is done before insertion into the database.
-rwxr-xr-x | feedgetter.py | 5 | ||||
-rwxr-xr-x | utils.py | 2 | ||||
-rwxr-xr-x | var/feed.lst | 1 |
3 files changed, 6 insertions, 2 deletions
diff --git a/feedgetter.py b/feedgetter.py index aa65df1f..f2d3f290 100755 --- a/feedgetter.py +++ b/feedgetter.py @@ -28,6 +28,7 @@ __license__ = "GPLv3" import sqlite3 import threading import feedparser +from BeautifulSoup import BeautifulSoup from datetime import datetime @@ -128,12 +129,14 @@ class FeedGetter(object): description = article.description.encode('utf-8') except Exception, e: description = "" + description = str(BeautifulSoup(description)) + title = str(BeautifulSoup(article.title.encode('utf-8'))) try: # try. Will only success if the article is not already in the data base self.c.execute('insert into articles values (?, ?, ?, ?, ?, ?, ?)', (\ datetime(*article.updated_parsed[:6]), \ - utils.clear_string(article.title.encode('utf-8')), \ + title, \ article.link.encode('utf-8'), \ description, \ "0", \ @@ -103,7 +103,7 @@ def clear_string(data): """ p = re.compile(r'<[^<]*?/?>') # HTML tags q = re.compile(r'\s') # consecutive white spaces - return p.sub('', q.sub(' ', data)) + return p.sub('', q.sub(' ', data.replace('', ''))) def unescape(text): """ diff --git a/var/feed.lst b/var/feed.lst index d40a8a0f..5678f144 100755 --- a/var/feed.lst +++ b/var/feed.lst @@ -32,3 +32,4 @@ http://feeds.feedburner.com/quuxlabs http://python-history.blogspot.com/feeds/posts/default http://www.haypocalc.com/wordpress/feed http://www.crypto.com/blog/rss10.xml +http://spaf.wordpress.com/feed/ |