Ignore stop words when calculating top words.

author: Cédric Bonhomme <kimble.mandel@gmail.com> 2012-11-28 11:39:23 +0100
committer: Cédric Bonhomme <kimble.mandel@gmail.com> 2012-11-28 11:39:23 +0100
commit: 84a79ec06541c7db92af48b43d1d4d379cded730 (patch)
tree: acbaa6aa38153717d6cf360e519325e56f054492 /source/utils.py
parent: Fix: number of feeds was no longer displayed in the navigation bar. (diff)
download: newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.tar.gz newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.tar.bz2 newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.zip
1 file changed, 16 insertions, 1 deletion
diff --git a/source/utils.py b/source/utils.py
index 7681fea7..475f3c06 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -36,6 +36,7 @@ __license__ = "GPLv3"
 
 import os
 import re
+import glob
 import operator
 import urllib.parse
 import calendar
@@ -139,15 +140,29 @@ def normalize_filename(name):
     file_name = strip_accents(file_name, "utf-8")
     return os.path.normpath(file_name)
 
+def load_stop_words():
+    """
+    Load the stop words and return them in a list.
+    """
+    stop_words_lists = glob.glob('./var/stop_words/*.txt')
+    stop_words = []
+
+    for stop_wods_list in stop_words_lists:
+        with open(stop_wods_list, "r") as stop_wods_file:
+            stop_words += stop_wods_file.read().split(";")
+    return stop_words
+
 def top_words(articles, n=10, size=5):
     """
     Return the n most frequent words in a list.
     """
+    stop_words = load_stop_words()
     words = Counter()
     wordre = re.compile(r'\b\w{%s,}\b' % size, re.I)
     for article in articles:
         for word in wordre.findall(clear_string(article["article_content"])):
-            words[word.lower()] += 1
+            if word.lower() not in stop_words:
+                words[word.lower()] += 1
     return words.most_common(n)
 
 def tag_cloud(tags, query="word_count"):
author	Cédric Bonhomme <kimble.mandel@gmail.com>	2012-11-28 11:39:23 +0100
committer	Cédric Bonhomme <kimble.mandel@gmail.com>	2012-11-28 11:39:23 +0100
commit	84a79ec06541c7db92af48b43d1d4d379cded730 (patch)
tree	acbaa6aa38153717d6cf360e519325e56f054492 /source/utils.py
parent	Fix: number of feeds was no longer displayed in the navigation bar. (diff)
download	newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.tar.gz newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.tar.bz2 newspipe-84a79ec06541c7db92af48b43d1d4d379cded730.zip