author     cedricbonhomme <devnull@localhost>    2010-09-19 20:22:22 +0200
committer  cedricbonhomme <devnull@localhost>    2010-09-19 20:22:22 +0200
commit     c7ec934df972b43440bd8591a16679a466969aaa (patch)
tree       00610f93239d6c60a7402366846b7ca76fb381dc
parent     Minor bugfix in topwords() function. (diff)
download   newspipe-c7ec934df972b43440bd8591a16679a466969aaa.tar.gz
           newspipe-c7ec934df972b43440bd8591a16679a466969aaa.tar.bz2
           newspipe-c7ec934df972b43440bd8591a16679a466969aaa.zip
Unescape HTML entities.
-rwxr-xr-x  pyAggr3g470r.py    5
-rwxr-xr-x  utils.py          37
2 files changed, 34 insertions, 8 deletions
diff --git a/pyAggr3g470r.py b/pyAggr3g470r.py
index 4ade91b9..f4964947 100755
--- a/pyAggr3g470r.py
+++ b/pyAggr3g470r.py
@@ -408,9 +408,10 @@ class Root:
         html += """<a href="/delete_article/%s:%s"><img src="/css/img/cross.png" title="Delete this article" /></a>""" % \
                     (feed_id, article_id)
         html += "<br /><br />"
-        description = article[4].encode('utf-8')
+        description = article[4]
         if description:
-            html += description
+            #html += utils.unescape(description).encode('utf-8')
+            html += description.encode('utf-8')
         else:
             html += "No description available."
         html += "\n</div>\n<hr />\n"
diff --git a/utils.py b/utils.py
--- a/utils.py
+++ b/utils.py
@@ -16,6 +16,7 @@ import sqlite3
 import operator
 import urlparse
 import calendar
+import htmlentitydefs
 
 try:
     # for high performance on list
@@ -108,9 +109,32 @@ def clear_string(data):
     and consecutive white spaces (more that one).
     """
     p = re.compile(r'<[^<]*?/?>')
-    q = re.compile(r'&#[0-9]+;')
-    r = re.compile(r'\s')
-    return p.sub('', q.sub('', r.sub(' ', data)))
+    q = re.compile(r'\s')
+    return p.sub('', q.sub(' ', data))
+
+def unescape(text):
+    """
+    Removes HTML or XML character references and entities from a text string.
+    """
+    def fixup(m):
+        text = m.group(0)
+        if text[:2] == "&#":
+            # character reference
+            try:
+                if text[:3] == "&#x":
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except ValueError:
+                pass
+        else:
+            # named entity
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                pass
+        return text # leave as is
+    return re.sub("&#?\w+;", fixup, text)
 
 def top_words(dic_articles, n=10, size=5):
     """
@@ -121,7 +145,7 @@ def top_words(dic_articles, n=10, size=5):
         for article in dic_articles[rss_feed_id]:
             words_gen.extend([word.strip(punctuation).lower() \
                     for word in clear_string(article[4].encode('utf-8')).split() \
-                    if len(word) >= size])
+                    if len(word.strip(punctuation)) >= size])
     words = Counter()
     for word in words_gen:
         words[word] += 1
@@ -314,8 +338,9 @@ def load_feed():
         else:
            language = "IMPORT_ERROR"

-        article_list = [article_id, article[0], article[1], \
-                article[2], article[3], article[4], language, article[6]]
+        article_list = [article_id, article[0], unescape(article[1]), \
+                article[2], unescape(article[3]), \
+                article[4], language, article[6]]

         if feed_id not in articles:
            try:
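As context for the change, a minimal usage sketch of the unescape() helper introduced in utils.py by this commit. It assumes Python 2 (the helper relies on unichr and htmlentitydefs) and that the snippet runs from the repository root so "import utils" resolves; the sample description string and its output are purely illustrative, not taken from real feed data.

# Usage sketch for the unescape() helper added to utils.py in this commit.
# Assumes Python 2 and that this file sits next to utils.py.
import utils

# Hypothetical feed description mixing named entities and numeric
# character references.
description = "Tom &amp; Jerry &#8211; &quot;chase&quot; scene"

# unescape() resolves both forms and leaves unrecognized entities untouched;
# it returns a unicode string, so encode before printing, as the commented-out
# call in pyAggr3g470r.py does.
print utils.unescape(description).encode('utf-8')
# Expected output: Tom & Jerry – "chase" scene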