From 7af0dba1c342f099ce66fad2b4a2c61c95bbf437 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Fri, 28 Nov 2014 19:37:16 +0100
Subject: Finally, the method with nltk is really slow...

---
 README.rst                |  3 ---
 pyaggr3g470r/duplicate.py | 44 ++------------------------------------------
 requirements.txt          |  6 +-----
 vagrant/bootstrap.sh      |  4 ----
 4 files changed, 3 insertions(+), 54 deletions(-)

diff --git a/README.rst b/README.rst
index 52a22cfd..eeb75a6d 100644
--- a/README.rst
+++ b/README.rst
@@ -69,7 +69,6 @@ The geek way
     $ cd pyaggr3g470r
     $ heroku create
     $ heroku addons:add heroku-postgresql:dev
-    $ heroku config:set BUILDPACK_URL=https://github.com/cedricbonhomme/heroku-buildpack-scipy
     $ heroku config:set HEROKU=1
     $ git push heroku master
     $ heroku run init
@@ -110,9 +109,7 @@ Deploying the application on a traditional server
 .. code:: bash
 
     $ sudo apt-get install python libpq-dev python-dev python-pip build-essential git
-    $ sudo apt-get install libatlas-base-dev gfortran # for scipy
     $ sudo apt-get install libxml2-dev libxslt1-dev # for lxml
-    $ sudo apt-get install python-nose # for scikit-learn
     $ git clone https://bitbucket.org/cedricbonhomme/pyaggr3g470r.git
     $ cd pyaggr3g470r
     $ sudo pip install --upgrade -r requirements.txt
diff --git a/pyaggr3g470r/duplicate.py b/pyaggr3g470r/duplicate.py
index 6220a3cb..23f4adc0 100644
--- a/pyaggr3g470r/duplicate.py
+++ b/pyaggr3g470r/duplicate.py
@@ -2,55 +2,15 @@
 #-*- coding: utf-8 -*-
 
 import itertools
-import nltk, string
-from sklearn.feature_extraction.text import TfidfVectorizer
-
 import utils
 
-# tokenizers/punkt/english.pickle
-
-
-stemmer = nltk.stem.porter.PorterStemmer()
-remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
-
-def stem_tokens(tokens):
-    return [stemmer.stem(item) for item in tokens]
-
-def normalize(text):
-    """
-    Remove punctuation, lowercase, stem
-    """
-    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
-
-vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
-
-def cosine_sim(article1, article2):
-    try:
-        tfidf = vectorizer.fit_transform([utils.clear_string(article1.content),
-                                          utils.clear_string(article2.content)])
-    except ValueError as e:
-        raise e
-    return ((tfidf * tfidf.T).A)[0,1]
-
 
 def compare_documents(feed):
     """
     Compare a list of documents by pair.
""" - downloaded = nltk.download("punkt") - if not downloaded: - # Ubuntu packaged version still uses old URL - dl = nltk.downloader.Downloader("https://nltk.github.com/nltk_data/") - dl.download("punkt") duplicates = [] for pair in itertools.combinations(feed.articles, 2): - try: - result = cosine_sim(*pair) - if abs(result.item() - 1.0) < 1e-10: - duplicates.append(pair) - #print pair[0].id, pair[0].title, pair[0].link - #print pair[1].id, pair[1].title, pair[1].link - #print - except ValueError: - continue + if pair[0].content != "" and pair[0].content == pair[1].content: + duplicates.append(pair) return duplicates \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b2cae674..a9b3756f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,8 +17,4 @@ WTForms python-postmark gevent whoosh -python-dateutil -nltk -numpy==1.8.1 -scipy==0.14.0 -scikit-learn==0.13.1 \ No newline at end of file +python-dateutil \ No newline at end of file diff --git a/vagrant/bootstrap.sh b/vagrant/bootstrap.sh index 750fe4f5..f756c468 100644 --- a/vagrant/bootstrap.sh +++ b/vagrant/bootstrap.sh @@ -15,10 +15,6 @@ fi cd pyaggr3g470r # For lxml: apt-get install -y libxml2-dev libxslt1-dev -# For scipy: -apt-get install -y libatlas-base-dev gfortran -# For scikit-learn: -apt-get install -y python-nose # installation with pip sudo pip install --upgrade -r requirements.txt # copy of the default configuration files for vagrant -- cgit