Improve the top_words() function by using a regular expression.

author: cedricbonhomme <devnull@localhost> 2010-10-18 13:33:55 +0200
committer: cedricbonhomme <devnull@localhost> 2010-10-18 13:33:55 +0200
commit: 1bc92be33516bdf3a9f92c5793863d5d20a5619b (patch)
tree: 396acc0438246e72cca342e5fc6ff7882043d130 /utils.py
parent: Improvements of the page of unread articles and optimization of the top_words... (diff)
download: newspipe-1bc92be33516bdf3a9f92c5793863d5d20a5619b.tar.gz
newspipe-1bc92be33516bdf3a9f92c5793863d5d20a5619b.tar.bz2
newspipe-1bc92be33516bdf3a9f92c5793863d5d20a5619b.zip
1 files changed, 3 insertions, 4 deletions
diff --git a/utils.py b/utils.py
index 18deff58..0b33b4dd 100755
--- a/utils.py
+++ b/utils.py
@@ -162,12 +162,11 @@ def top_words(dic_articles, n=10, size=5):
     Return the n most frequent words in a list.
     """
     words = Counter()
+    wordre = re.compile(r'\b\w{%s,}\b' % size)
     for rss_feed_id in dic_articles.keys():
         for article in dic_articles[rss_feed_id]:
-            for good_word in [word.strip(punctuation).lower() \
-                            for word in clear_string(article[4].encode('utf-8')).split() \
-                            if len(word.strip(punctuation)) >= size]:
-                words[good_word] += 1
+            for word in wordre.findall(clear_string(article[4].encode('utf-8'))):
+                words[word.lower()] += 1
     return words.most_common(n)
 
 def tag_cloud(tags, query="word_count"):
author	cedricbonhomme <devnull@localhost>	2010-10-18 13:33:55 +0200
committer	cedricbonhomme <devnull@localhost>	2010-10-18 13:33:55 +0200
commit	1bc92be33516bdf3a9f92c5793863d5d20a5619b (patch)
tree	396acc0438246e72cca342e5fc6ff7882043d130 /utils.py
parent	Improvements of the page of unread articles and optimization of the top_words... (diff)
download	newspipe-1bc92be33516bdf3a9f92c5793863d5d20a5619b.tar.gz newspipe-1bc92be33516bdf3a9f92c5793863d5d20a5619b.tar.bz2 newspipe-1bc92be33516bdf3a9f92c5793863d5d20a5619b.zip