diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-11-28 19:06:14 +0100 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-11-28 19:06:14 +0100 |
commit | cb35a9b23a855b75ea7c429b44f320a4ab9c29f3 (patch) | |
tree | 0a7d9c3e63e5a8849476c6011fd98222e78c6513 /pyaggr3g470r/compare.py | |
parent | Updated README.rst and vagrant bootstrap file. (diff) | |
download | newspipe-cb35a9b23a855b75ea7c429b44f320a4ab9c29f3.tar.gz newspipe-cb35a9b23a855b75ea7c429b44f320a4ab9c29f3.tar.bz2 newspipe-cb35a9b23a855b75ea7c429b44f320a4ab9c29f3.zip |
Misc fixes for the installation/usage of scipy/numpy/nltk.
Diffstat (limited to 'pyaggr3g470r/compare.py')
-rw-r--r-- | pyaggr3g470r/compare.py | 52 |
1 files changed, 0 insertions, 52 deletions
diff --git a/pyaggr3g470r/compare.py b/pyaggr3g470r/compare.py
deleted file mode 100644
index 80f3d694..00000000
--- a/pyaggr3g470r/compare.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#! /usr/bin/env python
-#-*- coding: utf-8 -*-
-
-import itertools
-import nltk, string
-from sklearn.feature_extraction.text import TfidfVectorizer
-
-import utils
-
-# tokenizers/punkt/english.pickle
-
-
-stemmer = nltk.stem.porter.PorterStemmer()
-remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
-
-def stem_tokens(tokens):
-    return [stemmer.stem(item) for item in tokens]
-
-def normalize(text):
-    """
-    Remove punctuation, lowercase, stem
-    """
-    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
-
-vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
-
-def cosine_sim(article1, article2):
-    try:
-        tfidf = vectorizer.fit_transform([utils.clear_string(article1.content),
-                                          utils.clear_string(article2.content)])
-    except ValueError as e:
-        raise e
-    return ((tfidf * tfidf.T).A)[0,1]
-
-
-def compare_documents(feed):
-    """
-    Compare a list of documents by pair.
-    """
-    nltk.download("punkt")
-    duplicates = []
-    for pair in itertools.combinations(feed.articles, 2):
-        try:
-            result = cosine_sim(*pair)
-            if abs(result.item() - 1.0) < 1e-10:
-                duplicates.append(pair)
-                #print pair[0].id, pair[0].title, pair[0].link
-                #print pair[1].id, pair[1].title, pair[1].link
-                #print
-        except ValueError:
-            continue
-    return duplicates
\ No newline at end of file |