diff options
author | cedricbonhomme <devnull@localhost> | 2010-07-06 10:32:11 +0200 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-07-06 10:32:11 +0200 |
commit | bac1066d1b9c865c97f9e01445a332968f321784 (patch) | |
tree | d8123f516e9c8f326b936126c5bca617b4ef82c3 /utils.py | |
parent | Regular expression improvement. (diff) | |
download | newspipe-bac1066d1b9c865c97f9e01445a332968f321784.tar.gz newspipe-bac1066d1b9c865c97f9e01445a332968f321784.tar.bz2 newspipe-bac1066d1b9c865c97f9e01445a332968f321784.zip |
Faster top_words function.
Diffstat (limited to 'utils.py')
-rwxr-xr-x | utils.py | 10 |
1 file changed, 4 insertions, 6 deletions
```diff
@@ -88,14 +88,12 @@ def top_words(dic_articles, n=10, size=5):
     """
     Return the n most frequent words in a list.
     """
-    articles_content = ""
+    words_gen = []
     for rss_feed_id in dic_articles.keys():
         for article in dic_articles[rss_feed_id]:
-            articles_content += clear_string(article[4].encode('utf-8'))
-
-    words_gen = [word for word in articles_content.split() if len(word) > size]
-    words_gen = [word.strip(punctuation).lower() for word in words_gen]
-
+            words_gen.extend([word.strip(punctuation).lower() \
+                        for word in clear_string(article[4].encode('utf-8')).split() \
+                        if len(word) > size])
     words = Counter()
     for word in words_gen:
         words[word] += 1
```