| author | cedricbonhomme <devnull@localhost> | 2010-07-05 21:39:53 +0200 |
|---|---|---|
| committer | cedricbonhomme <devnull@localhost> | 2010-07-05 21:39:53 +0200 |
| commit | db632991434cf688012e2af0d877cd5a5a5b71a6 (patch) | |
| tree | b9b51a92ff1ceda4c82cc5370451cb0e665835a9 | |
| parent | Removed useless __future__ import (diff) | |
| download | newspipe-db632991434cf688012e2af0d877cd5a5a5b71a6.tar.gz, newspipe-db632991434cf688012e2af0d877cd5a5a5b71a6.tar.bz2, newspipe-db632991434cf688012e2af0d877cd5a5a5b71a6.zip | |
Better regular expression to remove HTML tags, special characters and consecutive white spaces.
| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | feedgetter.py | 4 |
| -rwxr-xr-x | pyAggr3g470r.py | 14 |
| -rwxr-xr-x | utils.py | 16 |
3 files changed, 18 insertions, 16 deletions
```diff
diff --git a/feedgetter.py b/feedgetter.py
index bea01d28..267246db 100755
--- a/feedgetter.py
+++ b/feedgetter.py
@@ -98,7 +98,7 @@ class FeedGetter(object):
             feed_image = "/css/img/feed-icon-28x28.png"
         try:
             self.c.execute('insert into feeds values (?,?,?,?,?)', (\
-                utils.remove_html_tags(a_feed.feed.title.encode('utf-8')), \
+                utils.clear_string(a_feed.feed.title.encode('utf-8')), \
                 a_feed.feed.link.encode('utf-8'), \
                 feed_link, \
                 feed_image,
@@ -115,7 +115,7 @@
             try:
                 self.c.execute('insert into articles values (?,?,?,?,?,?,?)', (\
                     datetime(*article.updated_parsed[:6]), \
-                    utils.remove_html_tags(article.title.encode('utf-8')), \
+                    utils.clear_string(article.title.encode('utf-8')), \
                     article.link.encode('utf-8'), \
                     description, \
                     "0", \
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index abe1624c..cc649cf1 100755
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -297,9 +297,9 @@ class Root:
         if feed_id is not None:
             for article in self.articles[rss_feed_id]:
-                article_content = utils.remove_html_tags(article[4].encode('utf-8'))
+                article_content = utils.clear_string(article[4].encode('utf-8'))
                 if not article_content:
-                    utils.remove_html_tags(article[2].encode('utf-8'))
+                    utils.clear_string(article[2].encode('utf-8'))
                 if querystring.lower() in article_content.lower():
                     if article[5] == "0":
                         # not readed articles are in bold
@@ -317,9 +317,9 @@
         else:
             for rss_feed_id in self.articles.keys():
                 for article in self.articles[rss_feed_id]:
-                    article_content = utils.remove_html_tags(article[4].encode('utf-8'))
+                    article_content = utils.clear_string(article[4].encode('utf-8'))
                     if not article_content:
-                        utils.remove_html_tags(article[2].encode('utf-8'))
+                        utils.clear_string(article[2].encode('utf-8'))
                     if querystring.lower() in article_content.lower():
                         if article[5] == "0":
                             # not readed articles are in bold
@@ -479,7 +479,7 @@
                         " - " + not_read_begin + \
                         """<a href="/description/%s:%s" rel="noreferrer" target="_blank">%s</a>""" % \
                         (feed_id, article[0].encode('utf-8'), \
-                        utils.remove_html_tags(article[2].encode('utf-8'))) + \
+                        utils.clear_string(article[2].encode('utf-8'))) + \
                         not_read_end + like + \
                         "<br />\n"
@@ -583,7 +583,7 @@
         html += """<h1><i>%s</i> from <a href="/all_articles/%s">%s</a></h1>\n<br />\n"""% \
                 (article[2].encode('utf-8'), feed_id, \
                 self.feeds[feed_id][3].encode('utf-8'))
-        description = utils.remove_html_tags(article[4].encode('utf-8'))
+        description = utils.clear_string(article[4].encode('utf-8'))
         if description:
             html += description
         else:
@@ -828,7 +828,7 @@
                 name = folder + "/" + article[1] + ".txt"
                 f = open(name.replace(' ', '_'), "w")
                 content = "Title: " + article[2].encode('utf-8') + "\n\n\n"
-                content += utils.remove_html_tags(article[4].encode('utf-8'))
+                content += utils.clear_string(article[4].encode('utf-8'))
                 f.write(content)
             except IOError:
                 pass
diff --git a/utils.py b/utils.py
--- a/utils.py
+++ b/utils.py
@@ -74,13 +74,15 @@ def detect_language(text):
     else:
         return 'other'

-def remove_html_tags(data):
+def clear_string(data):
     """
-    Remove HTML tags for the search.
+    Clear a string by removing HTML tags, HTML special caracters
+    and consecutive white spaces (more that one).
     """
     p = re.compile(r'<[^<]*?/?>')
     q = re.compile(r'&#[0-9]+;')
-    return p.sub('', q.sub('', data))
+    r = re.compile(r's+')
+    return p.sub('', q.sub('', r.sub('', data)))

 def top_words(dic_articles, n=10, size=5):
     """
@@ -90,7 +92,7 @@
     articles_content = ""
     for rss_feed_id in dic_articles.keys():
         for article in dic_articles[rss_feed_id]:
-            articles_content += remove_html_tags(article[4].encode('utf-8'))
+            articles_content += clear_string(article[4].encode('utf-8'))
     words_gen = [word for word in articles_content.split() if len(word) > size]
     words_gen = [word.strip(punctuation).lower() for word in words_gen]
@@ -300,10 +302,10 @@ def load_feed():
             if "oice" not in IMPORT_ERROR:
                 if article[3] != "":
-                    language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8') + \
-                                remove_html_tags(article[1]).encode('utf-8'))
+                    language = detect_language(clear_string(article[3][:80]).encode('utf-8') + \
+                                clear_string(article[1]).encode('utf-8'))
                 else:
-                    language = detect_language(remove_html_tags(article[1]).encode('utf-8'))
+                    language = detect_language(clear_string(article[1]).encode('utf-8'))
             else:
                 language = "IMPORT_ERROR"
```