From d3b4f3cf0c7e9f71f9bee6a0eb35719e13ab2e03 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Thu, 20 Dec 2012 23:34:56 +0100
Subject: test get_words for the generation of the tag cloud.

---
 source/utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'source/utils.py')

diff --git a/source/utils.py b/source/utils.py
index 14c0096f..58adcfad 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -83,7 +83,7 @@ def detect_url_errors(list_of_urls):
             errors.append((url, e.reason.errno ,e.reason.strerror))
     return errors
 
-def getwords(html):
+def get_words(html):
     # Remove all the HTML tags
     txt=re.compile(r'<[^>]+>').sub('',html)
 
@@ -170,7 +170,8 @@ def top_words(articles, n=10, size=5):
     words = Counter()
     wordre = re.compile(r'\b\w{%s,}\b' % size, re.I)
     for article in articles:
-        for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
+        #for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
+        for word in [elem.lower() for elem in get_words(article["article_content"]) if elem.lower() not in stop_words]:
             words[word] += 1
     return words.most_common(n)
 
--
cgit