aboutsummaryrefslogtreecommitdiff
path: root/source/utils.py
diff options
context:
space:
mode:
authorCédric Bonhomme <kimble.mandel@gmail.com>2012-11-28 11:39:23 +0100
committerCédric Bonhomme <kimble.mandel@gmail.com>2012-11-28 11:39:23 +0100
commit84a79ec06541c7db92af48b43d1d4d379cded730 (patch)
treeacbaa6aa38153717d6cf360e519325e56f054492 /source/utils.py
parentFix: number of feeds was no longer displayed in the navigation bar. (diff)
downloadnewspipe-84a79ec06541c7db92af48b43d1d4d379cded730.tar.gz
newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.tar.bz2
newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.zip
Ignore stop words when calculating top words.
Diffstat (limited to 'source/utils.py')
-rwxr-xr-xsource/utils.py17
1 files changed, 16 insertions, 1 deletions
diff --git a/source/utils.py b/source/utils.py
index 7681fea7..475f3c06 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -36,6 +36,7 @@ __license__ = "GPLv3"
import os
import re
+import glob
import operator
import urllib.parse
import calendar
@@ -139,15 +140,29 @@ def normalize_filename(name):
file_name = strip_accents(file_name, "utf-8")
return os.path.normpath(file_name)
def load_stop_words():
    """
    Load the stop words and return them in a list.

    Every ``*.txt`` file matching ``./var/stop_words/*.txt`` (relative to
    the current working directory) is read as a ``;``-separated list of
    words. Whitespace around each entry, including the newlines that end
    or wrap the files, is stripped and empty entries are dropped, so the
    returned words can be compared directly against tokenised words.

    Returns an empty list when no stop word file is found.
    """
    stop_words = []

    # glob returns paths in arbitrary order; sorting keeps the result
    # identical from one run / file system to another.
    for stop_words_path in sorted(glob.glob('./var/stop_words/*.txt')):
        # NOTE(review): the files are assumed to be UTF-8 encoded; an
        # explicit encoding avoids depending on the process locale.
        with open(stop_words_path, "r", encoding="utf-8") as stop_words_file:
            for entry in stop_words_file.read().split(";"):
                word = entry.strip()
                # Skip the empty entries produced by a trailing ";" or by
                # blank lines.
                if word:
                    stop_words.append(word)
    return stop_words
+
def top_words(articles, n=10, size=5):
    """
    Return the n most frequent words found in the given articles.

    articles is an iterable of mappings exposing an "article_content"
    entry. Each content is passed through clear_string() and split into
    words of at least `size` word characters. Words are counted
    case-insensitively (lower-cased) and the ones returned by
    load_stop_words() are ignored.

    The result is the list of (word, count) pairs given by
    Counter.most_common(n).
    """
    # A set makes each membership test constant-time; the stop words are
    # looked up once per word of every article.
    stop_words = set(load_stop_words())
    words = Counter()
    wordre = re.compile(r'\b\w{%s,}\b' % size, re.I)
    for article in articles:
        for word in wordre.findall(clear_string(article["article_content"])):
            # Lower-case once and reuse it for both the filter and the key.
            lowered = word.lower()
            if lowered not in stop_words:
                words[lowered] += 1
    return words.most_common(n)
def tag_cloud(tags, query="word_count"):
bgstack15