diff options
author | cedricbonhomme <devnull@localhost> | 2010-09-19 20:22:22 +0200 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-09-19 20:22:22 +0200 |
commit | c7ec934df972b43440bd8591a16679a466969aaa (patch) | |
tree | 00610f93239d6c60a7402366846b7ca76fb381dc /utils.py | |
parent | Minor bugfix in topwords() function. (diff) | |
download | newspipe-c7ec934df972b43440bd8591a16679a466969aaa.tar.gz newspipe-c7ec934df972b43440bd8591a16679a466969aaa.tar.bz2 newspipe-c7ec934df972b43440bd8591a16679a466969aaa.zip |
Unescape HTML entities.
Diffstat (limited to 'utils.py')
-rwxr-xr-x | utils.py | 37 |
1 files changed, 31 insertions, 6 deletions
```diff
@@ -16,6 +16,7 @@ import sqlite3
 import operator
 import urlparse
 import calendar
+import htmlentitydefs
 
 try:
     # for high performance on list
@@ -108,9 +109,32 @@ def clear_string(data):
     and consecutive white spaces (more that one).
     """
     p = re.compile(r'<[^<]*?/?>')
-    q = re.compile(r'&#[0-9]+;')
-    r = re.compile(r'\s')
-    return p.sub('', q.sub('', r.sub(' ', data)))
+    q = re.compile(r'\s')
+    return p.sub('', q.sub(' ', data))
+
+def unescape(text):
+    """
+    Removes HTML or XML character references and entities from a text string.
+    """
+    def fixup(m):
+        text = m.group(0)
+        if text[:2] == "&#":
+            # character reference
+            try:
+                if text[:3] == "&#x":
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except ValueError:
+                pass
+        else:
+            # named entity
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                pass
+        return text # leave as is
+    return re.sub("&#?\w+;", fixup, text)
 
 def top_words(dic_articles, n=10, size=5):
     """
@@ -121,7 +145,7 @@ def top_words(dic_articles, n=10, size=5):
         for article in dic_articles[rss_feed_id]:
             words_gen.extend([word.strip(punctuation).lower() \
                 for word in clear_string(article[4].encode('utf-8')).split() \
-                if len(word) >= size])
+                if len(word.strip(punctuation)) >= size])
     words = Counter()
     for word in words_gen:
         words[word] += 1
@@ -314,8 +338,9 @@ def load_feed():
             else:
                 language = "IMPORT_ERROR"
 
-            article_list = [article_id, article[0], article[1], \
-                article[2], article[3], article[4], language, article[6]]
+            article_list = [article_id, article[0], unescape(article[1]), \
+                article[2], unescape(article[3]), \
+                article[4], language, article[6]]
 
             if feed_id not in articles:
                 try:
```