about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x source/utils.py 5
1 files changed, 3 insertions, 2 deletions
diff --git a/source/utils.py b/source/utils.py
index 14c0096f..58adcfad 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -83,7 +83,7 @@ def detect_url_errors(list_of_urls):
errors.append((url, e.reason.errno ,e.reason.strerror))
return errors
-def getwords(html):
+def get_words(html):
# Remove all the HTML tags
txt=re.compile(r'<[^>]+>').sub('',html)
@@ -170,7 +170,8 @@ def top_words(articles, n=10, size=5):
words = Counter()
wordre = re.compile(r'\b\w{%s,}\b' % size, re.I)
for article in articles:
- for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
+ #for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
+ for word in [elem.lower() for elem in get_words(article["article_content"]) if elem.lower() not in stop_words]:
words[word] += 1
return words.most_common(n)
bgstack15