diff options
author | cedricbonhomme <devnull@localhost> | 2010-10-18 13:33:55 +0200 |
---|---|---|
committer | cedricbonhomme <devnull@localhost> | 2010-10-18 13:33:55 +0200 |
commit | 1bc92be33516bdf3a9f92c5793863d5d20a5619b (patch) | |
tree | 396acc0438246e72cca342e5fc6ff7882043d130 | |
parent | Improvements of the page of unread articles and optimization of the top_words... (diff) | |
download | newspipe-1bc92be33516bdf3a9f92c5793863d5d20a5619b.tar.gz newspipe-1bc92be33516bdf3a9f92c5793863d5d20a5619b.tar.bz2 newspipe-1bc92be33516bdf3a9f92c5793863d5d20a5619b.zip |
Improvements of the top_words() function using a regular expression.
-rwxr-xr-x | utils.py | 7 |
1 file changed, 3 insertions, 4 deletions
```diff
@@ -162,12 +162,11 @@ def top_words(dic_articles, n=10, size=5):
     Return the n most frequent words in a list.
     """
     words = Counter()
+    wordre = re.compile(r'\b\w{%s,}\b' % size)
     for rss_feed_id in dic_articles.keys():
         for article in dic_articles[rss_feed_id]:
-            for good_word in [word.strip(punctuation).lower() \
-                    for word in clear_string(article[4].encode('utf-8')).split() \
-                    if len(word.strip(punctuation)) >= size]:
-                words[good_word] += 1
+            for word in wordre.findall(clear_string(article[4].encode('utf-8'))):
+                words[word.lower()] += 1
     return words.most_common(n)
 
 def tag_cloud(tags, query="word_count"):
```