# source/generatefeedvector.py
# -*- coding: utf-8 -*-
import feedparser
import re

import conf
import mongodb

# Builds the word-frequency table for one feed; returns (feed title, counts)
def getwordcounts(feed_id):
  """Count the words in the stored articles of the feed `feed_id`.

  Besides its argument, this function reads two module-level names: the
  `mongo` handle (to fetch the articles) and `feed` (for the title). Both
  must be bound before it is called.

  Returns a `(feed["feed_title"], counts)` tuple, where `counts` maps each
  word produced by `getwords` to its number of occurrences.
  """
  counts = {}
  for entry in mongo.get_articles_from_collection(feed_id):
    body = entry["article_content"]
    # The feed title is prepended to every article body before tokenising,
    # so title words are counted once per article.
    text = feed["feed_title"] + ' ' + body
    for token in getwords(text):
      counts[token] = counts.get(token, 0) + 1
  return feed["feed_title"], counts

# Compiled once at import time instead of on every call.
_TAG_RE = re.compile(r'<[^>]+>')
# A run of anything that is not an ASCII letter. The previous pattern,
# [^A-Z^a-z], had a second '^' inside the class, which made a literal caret
# count as a word character.
_NON_ALPHA_RE = re.compile(r'[^A-Za-z]+')


def getwords(html):
  """Return the lowercase ASCII-alphabetic words of `html`, in order.

  HTML tags are removed first (replaced by the empty string, so text on
  either side of a tag is joined), then the text is split on every run of
  non-letter characters. Empty fragments are dropped, so an empty or
  letter-free input yields an empty list.
  """
  # Remove all the HTML tags
  txt = _TAG_RE.sub('', html)

  # Split on non-letters, drop empty pieces, convert to lowercase
  return [word.lower() for word in _NON_ALPHA_RE.split(txt) if word]


# Script body: count words per feed, keep the words that appear in a
# moderate share of feeds, and dump the feed/word matrix to blogdata1.txt.

apcount={}     # word -> number of feeds in which it occurs more than once
wordcounts={}  # feed title -> {word: occurrences}
mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \
                        conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD)
feeds = mongo.get_all_feeds()
# NOTE: the loop variable must stay named `feed`; getwordcounts() reads it
# as a module-level global to obtain the feed title.
for feed in feeds:
  try:
    title,wc=getwordcounts(feed["feed_id"])
  except Exception:
    # Best effort: report the feed and move on. `Exception` (rather than a
    # bare except) lets Ctrl-C and SystemExit still stop the script.
    print('Failed to parse feed %s' % feed["feed_title"])
    continue
  wordcounts[title]=wc
  for word,count in wc.items():
    apcount.setdefault(word,0)
    if count>1:
      apcount[word]+=1

# Keep only words whose feed share lies strictly between 10% and 50%.
# Feeds that failed above still count in the denominator.
wordlist=[]
for w,bc in apcount.items():
  frac=float(bc)/len(feeds)
  if 0.1<frac<0.5:
    wordlist.append(w)

# Tab-separated matrix: a header row of words, then one row per feed.
# The `with` block guarantees the file is flushed and closed.
with open('blogdata1.txt','w') as out:
  out.write('Blog')
  for word in wordlist:
    out.write('\t%s' % word)
  out.write('\n')
  for blog,wc in wordcounts.items():
    print(blog)
    out.write(blog)
    for word in wordlist:
      # Words absent from this feed are written as 0.
      out.write('\t%d' % wc.get(word,0))
    out.write('\n')