aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorcedricbonhomme <devnull@localhost>2010-07-05 21:39:53 +0200
committercedricbonhomme <devnull@localhost>2010-07-05 21:39:53 +0200
commitdb632991434cf688012e2af0d877cd5a5a5b71a6 (patch)
treeb9b51a92ff1ceda4c82cc5370451cb0e665835a9
parentRemoved useless __future__ import (diff)
downloadnewspipe-db632991434cf688012e2af0d877cd5a5a5b71a6.tar.gz
newspipe-db632991434cf688012e2af0d877cd5a5a5b71a6.tar.bz2
newspipe-db632991434cf688012e2af0d877cd5a5a5b71a6.zip
Better regular expression to remove HTML tags, special characters and consecutive white spaces.
-rwxr-xr-xfeedgetter.py4
-rwxr-xr-xpyAggr3g470r.py14
-rwxr-xr-xutils.py16
3 files changed, 18 insertions, 16 deletions
diff --git a/feedgetter.py b/feedgetter.py
index bea01d28..267246db 100755
--- a/feedgetter.py
+++ b/feedgetter.py
@@ -98,7 +98,7 @@ class FeedGetter(object):
feed_image = "/css/img/feed-icon-28x28.png"
try:
self.c.execute('insert into feeds values (?,?,?,?,?)', (\
- utils.remove_html_tags(a_feed.feed.title.encode('utf-8')), \
+ utils.clear_string(a_feed.feed.title.encode('utf-8')), \
a_feed.feed.link.encode('utf-8'), \
feed_link, \
feed_image,
@@ -115,7 +115,7 @@ class FeedGetter(object):
try:
self.c.execute('insert into articles values (?,?,?,?,?,?,?)', (\
datetime(*article.updated_parsed[:6]), \
- utils.remove_html_tags(article.title.encode('utf-8')), \
+ utils.clear_string(article.title.encode('utf-8')), \
article.link.encode('utf-8'), \
description, \
"0", \
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index abe1624c..cc649cf1 100755
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -297,9 +297,9 @@ class Root:
if feed_id is not None:
for article in self.articles[rss_feed_id]:
- article_content = utils.remove_html_tags(article[4].encode('utf-8'))
+ article_content = utils.clear_string(article[4].encode('utf-8'))
if not article_content:
- utils.remove_html_tags(article[2].encode('utf-8'))
+ utils.clear_string(article[2].encode('utf-8'))
if querystring.lower() in article_content.lower():
if article[5] == "0":
# not readed articles are in bold
@@ -317,9 +317,9 @@ class Root:
else:
for rss_feed_id in self.articles.keys():
for article in self.articles[rss_feed_id]:
- article_content = utils.remove_html_tags(article[4].encode('utf-8'))
+ article_content = utils.clear_string(article[4].encode('utf-8'))
if not article_content:
- utils.remove_html_tags(article[2].encode('utf-8'))
+ utils.clear_string(article[2].encode('utf-8'))
if querystring.lower() in article_content.lower():
if article[5] == "0":
# not readed articles are in bold
@@ -479,7 +479,7 @@ class Root:
" - " + not_read_begin + \
"""<a href="/description/%s:%s" rel="noreferrer" target="_blank">%s</a>""" % \
(feed_id, article[0].encode('utf-8'), \
- utils.remove_html_tags(article[2].encode('utf-8'))) + \
+ utils.clear_string(article[2].encode('utf-8'))) + \
not_read_end + like + \
"<br />\n"
@@ -583,7 +583,7 @@ class Root:
html += """<h1><i>%s</i> from <a href="/all_articles/%s">%s</a></h1>\n<br />\n"""% \
(article[2].encode('utf-8'), feed_id, \
self.feeds[feed_id][3].encode('utf-8'))
- description = utils.remove_html_tags(article[4].encode('utf-8'))
+ description = utils.clear_string(article[4].encode('utf-8'))
if description:
html += description
else:
@@ -828,7 +828,7 @@ class Root:
name = folder + "/" + article[1] + ".txt"
f = open(name.replace(' ', '_'), "w")
content = "Title: " + article[2].encode('utf-8') + "\n\n\n"
- content += utils.remove_html_tags(article[4].encode('utf-8'))
+ content += utils.clear_string(article[4].encode('utf-8'))
f.write(content)
except IOError:
pass
diff --git a/utils.py b/utils.py
index 24291812..f62a0e9a 100755
--- a/utils.py
+++ b/utils.py
@@ -74,13 +74,15 @@ def detect_language(text):
else:
return 'other'
-def remove_html_tags(data):
+def clear_string(data):
"""
- Remove HTML tags for the search.
+ Clear a string by removing HTML tags, HTML special caracters
+ and consecutive white spaces (more that one).
"""
p = re.compile(r'<[^<]*?/?>')
q = re.compile(r'&#[0-9]+;')
- return p.sub('', q.sub('', data))
+ r = re.compile(r's+')
+ return p.sub('', q.sub('', r.sub('', data)))
def top_words(dic_articles, n=10, size=5):
"""
@@ -90,7 +92,7 @@ def top_words(dic_articles, n=10, size=5):
articles_content = ""
for rss_feed_id in dic_articles.keys():
for article in dic_articles[rss_feed_id]:
- articles_content += remove_html_tags(article[4].encode('utf-8'))
+ articles_content += clear_string(article[4].encode('utf-8'))
words_gen = [word for word in articles_content.split() if len(word) > size]
words_gen = [word.strip(punctuation).lower() for word in words_gen]
@@ -300,10 +302,10 @@ def load_feed():
if "oice" not in IMPORT_ERROR:
if article[3] != "":
- language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8') + \
- remove_html_tags(article[1]).encode('utf-8'))
+ language = detect_language(clear_string(article[3][:80]).encode('utf-8') + \
+ clear_string(article[1]).encode('utf-8'))
else:
- language = detect_language(remove_html_tags(article[1]).encode('utf-8'))
+ language = detect_language(clear_string(article[1]).encode('utf-8'))
else:
language = "IMPORT_ERROR"
bgstack15