diff options
author | cedricbonhomme <devnull@localhost> | 2011-06-01 09:20:39 +0200 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2011-06-01 09:20:39 +0200 |
commit | 498967b610b7c29d6c87a64ced8ac5033f02bc31 (patch) | |
tree | c2f9643313c2a2b317352e789fffd85674369102 /feedgetter.py | |
parent | Renamed the string 'feeds' to 'articles' in the management page. (diff) | |
download | newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.tar.gz newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.tar.bz2 newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.zip |
Minor improvement: HTML purification of article descriptions with BeautifulSoup. The purification is done before insertion into the database.
Diffstat (limited to 'feedgetter.py')
-rwxr-xr-x | feedgetter.py | 5 |
1 files changed, 4 insertions, 1 deletions
diff --git a/feedgetter.py b/feedgetter.py index aa65df1f..f2d3f290 100755 --- a/feedgetter.py +++ b/feedgetter.py @@ -28,6 +28,7 @@ __license__ = "GPLv3" import sqlite3 import threading import feedparser +from BeautifulSoup import BeautifulSoup from datetime import datetime @@ -128,12 +129,14 @@ class FeedGetter(object): description = article.description.encode('utf-8') except Exception, e: description = "" + description = str(BeautifulSoup(description)) + title = str(BeautifulSoup(article.title.encode('utf-8'))) try: # try. Will only success if the article is not already in the data base self.c.execute('insert into articles values (?, ?, ?, ?, ?, ?, ?)', (\ datetime(*article.updated_parsed[:6]), \ - utils.clear_string(article.title.encode('utf-8')), \ + title, \ article.link.encode('utf-8'), \ description, \ "0", \ |