about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- README.rst 3
-rw-r--r-- pyaggr3g470r/duplicate.py 44
-rw-r--r-- requirements.txt 6
-rw-r--r-- vagrant/bootstrap.sh 4
4 files changed, 3 insertions, 54 deletions
diff --git a/README.rst b/README.rst
index 52a22cfd..eeb75a6d 100644
--- a/README.rst
+++ b/README.rst
@@ -69,7 +69,6 @@ The geek way
$ cd pyaggr3g470r
$ heroku create
$ heroku addons:add heroku-postgresql:dev
- $ heroku config:set BUILDPACK_URL=https://github.com/cedricbonhomme/heroku-buildpack-scipy
$ heroku config:set HEROKU=1
$ git push heroku master
$ heroku run init
@@ -110,9 +109,7 @@ Deploying the application on a traditional server
.. code:: bash
$ sudo apt-get install python libpq-dev python-dev python-pip build-essential git
- $ sudo apt-get install libatlas-base-dev gfortran # for scipy
$ sudo apt-get install libxml2-dev libxslt1-dev # for lxml
- $ sudo apt-get install python-nose # for scikit-learn
$ git clone https://bitbucket.org/cedricbonhomme/pyaggr3g470r.git
$ cd pyaggr3g470r
$ sudo pip install --upgrade -r requirements.txt
diff --git a/pyaggr3g470r/duplicate.py b/pyaggr3g470r/duplicate.py
index 6220a3cb..23f4adc0 100644
--- a/pyaggr3g470r/duplicate.py
+++ b/pyaggr3g470r/duplicate.py
@@ -2,55 +2,15 @@
#-*- coding: utf-8 -*-
import itertools
-import nltk, string
-from sklearn.feature_extraction.text import TfidfVectorizer
-
import utils
-# tokenizers/punkt/english.pickle
-
-
-stemmer = nltk.stem.porter.PorterStemmer()
-remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
-
-def stem_tokens(tokens):
- return [stemmer.stem(item) for item in tokens]
-
-def normalize(text):
- """
- Remove punctuation, lowercase, stem
- """
- return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
-
-vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
-
-def cosine_sim(article1, article2):
- try:
- tfidf = vectorizer.fit_transform([utils.clear_string(article1.content),
- utils.clear_string(article2.content)])
- except ValueError as e:
- raise e
- return ((tfidf * tfidf.T).A)[0,1]
-
def compare_documents(feed):
"""
Compare a list of documents by pair.
"""
- downloaded = nltk.download("punkt")
- if not downloaded:
- # Ubuntu packaged version still uses old URL
- dl = nltk.downloader.Downloader("https://nltk.github.com/nltk_data/")
- dl.download("punkt")
duplicates = []
for pair in itertools.combinations(feed.articles, 2):
- try:
- result = cosine_sim(*pair)
- if abs(result.item() - 1.0) < 1e-10:
- duplicates.append(pair)
- #print pair[0].id, pair[0].title, pair[0].link
- #print pair[1].id, pair[1].title, pair[1].link
- #print
- except ValueError:
- continue
+ if pair[0].content != "" and pair[0].content == pair[1].content:
+ duplicates.append(pair)
return duplicates
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b2cae674..a9b3756f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,8 +17,4 @@ WTForms
python-postmark
gevent
whoosh
-python-dateutil
-nltk
-numpy==1.8.1
-scipy==0.14.0
-scikit-learn==0.13.1
\ No newline at end of file
+python-dateutil
\ No newline at end of file
diff --git a/vagrant/bootstrap.sh b/vagrant/bootstrap.sh
index 750fe4f5..f756c468 100644
--- a/vagrant/bootstrap.sh
+++ b/vagrant/bootstrap.sh
@@ -15,10 +15,6 @@ fi
cd pyaggr3g470r
# For lxml:
apt-get install -y libxml2-dev libxslt1-dev
-# For scipy:
-apt-get install -y libatlas-base-dev gfortran
-# For scikit-learn:
-apt-get install -y python-nose
# installation with pip
sudo pip install --upgrade -r requirements.txt
# copy of the default configuration files for vagrant
bgstack15