aboutsummaryrefslogtreecommitdiff
path: root/feedgetter.py
diff options
context:
space:
mode:
author cedricbonhomme <devnull@localhost> 2011-06-01 09:20:39 +0200
committer cedricbonhomme <devnull@localhost> 2011-06-01 09:20:39 +0200
commit 498967b610b7c29d6c87a64ced8ac5033f02bc31 (patch)
tree c2f9643313c2a2b317352e789fffd85674369102 /feedgetter.py
parent Renamed the string 'feeds' to 'articles' in the management page. (diff)
download newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.tar.gz
newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.tar.bz2
newspipe-498967b610b7c29d6c87a64ced8ac5033f02bc31.zip
Minor improvement: HTML purification of article descriptions with BeautifulSoup. The purification is done before insertion into the database.
Diffstat (limited to 'feedgetter.py')
-rwxr-xr-xfeedgetter.py5
1 files changed, 4 insertions, 1 deletions
diff --git a/feedgetter.py b/feedgetter.py
index aa65df1f..f2d3f290 100755
--- a/feedgetter.py
+++ b/feedgetter.py
@@ -28,6 +28,7 @@ __license__ = "GPLv3"
import sqlite3
import threading
import feedparser
+from BeautifulSoup import BeautifulSoup
from datetime import datetime
@@ -128,12 +129,14 @@ class FeedGetter(object):
description = article.description.encode('utf-8')
except Exception, e:
description = ""
+ description = str(BeautifulSoup(description))
+ title = str(BeautifulSoup(article.title.encode('utf-8')))
try:
# try. Will only succeed if the article is not already in the database
self.c.execute('insert into articles values (?, ?, ?, ?, ?, ?, ?)', (\
datetime(*article.updated_parsed[:6]), \
- utils.clear_string(article.title.encode('utf-8')), \
+ title, \
article.link.encode('utf-8'), \
description, \
"0", \
bgstack15