Test get_words() for the generation of the tag cloud.

author: Cédric Bonhomme <kimble.mandel@gmail.com> 2012-12-20 23:34:56 +0100
committer: Cédric Bonhomme <kimble.mandel@gmail.com> 2012-12-20 23:34:56 +0100
commit: d3b4f3cf0c7e9f71f9bee6a0eb35719e13ab2e03 (patch)
tree: f2acd878ab267d8ef22c80bc5f6cab4bad48b970
parent: Added getwords(html) function. (diff)
download: newspipe-d3b4f3cf0c7e9f71f9bee6a0eb35719e13ab2e03.tar.gz
newspipe-d3b4f3cf0c7e9f71f9bee6a0eb35719e13ab2e03.tar.bz2
newspipe-d3b4f3cf0c7e9f71f9bee6a0eb35719e13ab2e03.zip
1 file changed, 3 insertions, 2 deletions
diff --git a/source/utils.py b/source/utils.py
index 14c0096f..58adcfad 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -83,7 +83,7 @@ def detect_url_errors(list_of_urls):
             errors.append((url, e.reason.errno ,e.reason.strerror))
     return errors
 
-def getwords(html):
+def get_words(html):
     # Remove all the HTML tags
     txt=re.compile(r'<[^>]+>').sub('',html)
 
@@ -170,7 +170,8 @@ def top_words(articles, n=10, size=5):
     words = Counter()
     wordre = re.compile(r'\b\w{%s,}\b' % size, re.I)
     for article in articles:
-        for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
+        #for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
+        for word in [elem.lower() for elem in get_words(article["article_content"]) if elem.lower() not in stop_words]:
             words[word] += 1
     return words.most_common(n)
author	Cédric Bonhomme <kimble.mandel@gmail.com>	2012-12-20 23:34:56 +0100
committer	Cédric Bonhomme <kimble.mandel@gmail.com>	2012-12-20 23:34:56 +0100
commit	d3b4f3cf0c7e9f71f9bee6a0eb35719e13ab2e03 (patch)
tree	f2acd878ab267d8ef22c80bc5f6cab4bad48b970
parent	Added getwords(html) function. (diff)
download	newspipe-d3b4f3cf0c7e9f71f9bee6a0eb35719e13ab2e03.tar.gz newspipe-d3b4f3cf0c7e9f71f9bee6a0eb35719e13ab2e03.tar.bz2 newspipe-d3b4f3cf0c7e9f71f9bee6a0eb35719e13ab2e03.zip