diff options
author | Cédric Bonhomme <kimble.mandel@gmail.com> | 2012-12-20 23:40:29 +0100 |
---|---|---|
committer | Cédric Bonhomme <kimble.mandel@gmail.com> | 2012-12-20 23:40:29 +0100 |
commit | a10ddebe50e1650d8699fe5c9f08184337ba4dd7 (patch) | |
tree | 1a47cb4053e36b634c29313da220deb5770f17f1 | |
parent | test get_words for the generation of the tag cloud. (diff) | |
download | newspipe-a10ddebe50e1650d8699fe5c9f08184337ba4dd7.tar.gz newspipe-a10ddebe50e1650d8699fe5c9f08184337ba4dd7.tar.bz2 newspipe-a10ddebe50e1650d8699fe5c9f08184337ba4dd7.zip |
Optimization of the regular expression used by clear_string in utils.py.
-rwxr-xr-x | source/utils.py | 15 |
1 file changed, 2 insertions, 13 deletions
```diff
diff --git a/source/utils.py b/source/utils.py
index 58adcfad..a2f3dfb6 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -83,22 +83,12 @@ def detect_url_errors(list_of_urls):
             errors.append((url, e.reason.errno ,e.reason.strerror))
     return errors
 
-def get_words(html):
-    # Remove all the HTML tags
-    txt=re.compile(r'<[^>]+>').sub('',html)
-
-    # Split words by all non-alpha characters
-    words=re.compile(r'[^A-Z^a-z]+').split(txt)
-
-    # Convert to lowercase
-    return [word.lower() for word in words if word!='']
-
 def clear_string(data):
     """
     Clear a string by removing HTML tags, HTML special caracters
     and consecutive white spaces (more that one).
     """
-    p = re.compile(b'<[^<]*?/?>') # HTML tags
+    p = re.compile(b'<[^>]+>') # HTML tags
     q = re.compile(b'\s') # consecutive white spaces
     return p.sub(b'', q.sub(b' ', bytes(data, "utf-8"))).decode("utf-8", "strict")
 
@@ -170,8 +160,7 @@ def top_words(articles, n=10, size=5):
     words = Counter()
     wordre = re.compile(r'\b\w{%s,}\b' % size, re.I)
     for article in articles:
-        #for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
-        for word in [elem.lower() for elem in get_words(article["article_content"]) if elem.lower() not in stop_words]:
+        for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
             words[word] += 1
     return words.most_common(n)
 
```