Faster top_words function: collect words per article instead of concatenating all article content first.

author: cedricbonhomme <devnull@localhost> 2010-07-06 10:32:11 +0200
committer: cedricbonhomme <devnull@localhost> 2010-07-06 10:32:11 +0200
commit: bac1066d1b9c865c97f9e01445a332968f321784 (patch)
tree: d8123f516e9c8f326b936126c5bca617b4ef82c3 /utils.py
parent: Regular expression improvement. (diff)
download: newspipe-bac1066d1b9c865c97f9e01445a332968f321784.tar.gz
newspipe-bac1066d1b9c865c97f9e01445a332968f321784.tar.bz2
newspipe-bac1066d1b9c865c97f9e01445a332968f321784.zip
1 file changed, 4 insertions, 6 deletions
diff --git a/utils.py b/utils.py
index 91f61d72..fc6d6891 100755
--- a/utils.py
+++ b/utils.py
@@ -88,14 +88,12 @@ def top_words(dic_articles, n=10, size=5):
     """
     Return the n most frequent words in a list.
     """
-    articles_content = ""
+    words_gen = []
     for rss_feed_id in dic_articles.keys():
         for article in dic_articles[rss_feed_id]:
-            articles_content += clear_string(article[4].encode('utf-8'))
-
-    words_gen = [word for word in articles_content.split() if len(word) > size]
-    words_gen = [word.strip(punctuation).lower() for word in words_gen]
-
+            words_gen.extend([word.strip(punctuation).lower() \
+                            for word in clear_string(article[4].encode('utf-8')).split() \
+                            if len(word) > size])
     words = Counter()
     for word in words_gen:
         words[word] += 1
author	cedricbonhomme <devnull@localhost>	2010-07-06 10:32:11 +0200
committer	cedricbonhomme <devnull@localhost>	2010-07-06 10:32:11 +0200
commit	bac1066d1b9c865c97f9e01445a332968f321784 (patch)
tree	d8123f516e9c8f326b936126c5bca617b4ef82c3 /utils.py
parent	Regular expression improvement. (diff)
download	newspipe-bac1066d1b9c865c97f9e01445a332968f321784.tar.gz newspipe-bac1066d1b9c865c97f9e01445a332968f321784.tar.bz2 newspipe-bac1066d1b9c865c97f9e01445a332968f321784.zip