about | summary | refs | log | tree | commit | diff
path: root/source/generatefeedvector.py
diff options
context:
space:
mode:
author Cédric Bonhomme <kimble.mandel@gmail.com> 2012-11-16 14:20:34 +0100
committer Cédric Bonhomme <kimble.mandel@gmail.com> 2012-11-16 14:20:34 +0100
commit 9c7a6eca5f92b0cc3cc23c40b23f8c982fccdd06 (patch)
tree e9009be19dae759459e2272794fab28179d740cd /source/generatefeedvector.py
parent Moved section "Script of installation". (diff)
download newspipe-9c7a6eca5f92b0cc3cc23c40b23f8c982fccdd06.tar.gz
newspipe-9c7a6eca5f92b0cc3cc23c40b23f8c982fccdd06.tar.bz2
newspipe-9c7a6eca5f92b0cc3cc23c40b23f8c982fccdd06.zip
Added tests with clusters.
Diffstat (limited to 'source/generatefeedvector.py')
-rwxr-xr-x source/generatefeedvector.py 65
1 files changed, 65 insertions, 0 deletions
diff --git a/source/generatefeedvector.py b/source/generatefeedvector.py
new file mode 100755
index 00000000..3c33efa5
--- /dev/null
+++ b/source/generatefeedvector.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+import feedparser
+import re
+
+import conf
+import mongodb
+
+def getwordcounts(feed_id):
+    """Return (feed title, {word: occurrences}) for the feed feed_id.
+
+    NOTE(review): reads the module-level names `mongo` and `feed`; the
+    title comes from the global `feed`, not from feed_id -- confirm.
+    """
+    counts = {}
+    for entry in mongo.get_articles_from_collection(feed_id):
+        body = entry["article_content"]
+        # The feed title is prepended to every article before splitting.
+        for token in getwords(feed["feed_title"] + ' ' + body):
+            counts[token] = counts.get(token, 0) + 1
+    return feed["feed_title"], counts
+
+def getwords(html):
+    """Return the lowercased ASCII-letter words of html, tags stripped."""
+    # Drop anything that looks like an HTML tag.
+    txt = re.sub(r'<[^>]+>', '', html)
+    # Split on runs of non-letters. The old class [^A-Z^a-z] also held a
+    # literal '^', so carets were kept inside words instead of splitting.
+    words = re.split(r'[^A-Za-z]+', txt)
+    # Leading/trailing separators yield empty strings; skip them.
+    return [word.lower() for word in words if word]
+
+
+# Script body: count words per feed, keep mid-frequency words, and dump a
+# tab-separated feed-by-word matrix to blogdata1.txt.
+apcount = {}     # word -> number of feeds using it more than once
+wordcounts = {}  # feed title -> {word: occurrences}
+mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT,
+                         conf.MONGODB_DBNAME, conf.MONGODB_USER,
+                         conf.MONGODB_PASSWORD)
+feeds = mongo.get_all_feeds()
+for feed in feeds:  # getwordcounts() reads this loop variable as a global
+    try:
+        title, wc = getwordcounts(feed["feed_id"])
+        wordcounts[title] = wc
+        for word, count in wc.items():
+            apcount.setdefault(word, 0)
+            if count > 1:
+                apcount[word] += 1
+    except Exception:  # was a bare except, which also trapped Ctrl-C
+        print('Failed to parse feed %s' % feed["feed_title"])
+
+# Keep words seen (more than once) in over 10% but under 50% of the feeds.
+wordlist = [w for w, bc in apcount.items() if 0.1 < float(bc) / len(feeds) < 0.5]
+
+# The with-block closes (and so flushes) the file; it was never closed before.
+with open('blogdata1.txt', 'w') as out:
+    out.write('Blog')
+    for word in wordlist:
+        out.write('\t%s' % word)
+    out.write('\n')
+    for blog, wc in wordcounts.items():
+        print(blog)
+        out.write(blog)
+        out.write(''.join('\t%d' % wc.get(word, 0) for word in wordlist))
+        out.write('\n')
bgstack15