From cb35a9b23a855b75ea7c429b44f320a4ab9c29f3 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Fri, 28 Nov 2014 19:06:14 +0100 Subject: Misc fixes for the installation/usage of scipy/numpy/nltk. --- README.rst | 2 ++ pyaggr3g470r/compare.py | 52 ------------------------------------------- pyaggr3g470r/duplicate.py | 56 +++++++++++++++++++++++++++++++++++++++++++++++ pyaggr3g470r/views.py | 4 ++-- vagrant/bootstrap.sh | 2 ++ 5 files changed, 62 insertions(+), 54 deletions(-) delete mode 100644 pyaggr3g470r/compare.py create mode 100644 pyaggr3g470r/duplicate.py diff --git a/README.rst b/README.rst index 46c8b866..52a22cfd 100644 --- a/README.rst +++ b/README.rst @@ -69,6 +69,7 @@ The geek way $ cd pyaggr3g470r $ heroku create $ heroku addons:add heroku-postgresql:dev + $ heroku config:set BUILDPACK_URL=https://github.com/cedricbonhomme/heroku-buildpack-scipy $ heroku config:set HEROKU=1 $ git push heroku master $ heroku run init @@ -111,6 +112,7 @@ Deploying the application on a traditional server $ sudo apt-get install python libpq-dev python-dev python-pip build-essential git $ sudo apt-get install libatlas-base-dev gfortran # for scipy $ sudo apt-get install libxml2-dev libxslt1-dev # for lxml + $ sudo apt-get install python-nose # for scikit-learn $ git clone https://bitbucket.org/cedricbonhomme/pyaggr3g470r.git $ cd pyaggr3g470r $ sudo pip install --upgrade -r requirements.txt diff --git a/pyaggr3g470r/compare.py b/pyaggr3g470r/compare.py deleted file mode 100644 index 80f3d694..00000000 --- a/pyaggr3g470r/compare.py +++ /dev/null @@ -1,52 +0,0 @@ -#! 
/usr/bin/env python -#-*- coding: utf-8 -*- - -import itertools -import nltk, string -from sklearn.feature_extraction.text import TfidfVectorizer - -import utils - -# tokenizers/punkt/english.pickle - - -stemmer = nltk.stem.porter.PorterStemmer() -remove_punctuation_map = dict((ord(char), None) for char in string.punctuation) - -def stem_tokens(tokens): - return [stemmer.stem(item) for item in tokens] - -def normalize(text): - """ - Remove punctuation, lowercase, stem - """ - return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map))) - -vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english') - -def cosine_sim(article1, article2): - try: - tfidf = vectorizer.fit_transform([utils.clear_string(article1.content), - utils.clear_string(article2.content)]) - except ValueError as e: - raise e - return ((tfidf * tfidf.T).A)[0,1] - - -def compare_documents(feed): - """ - Compare a list of documents by pair. - """ - nltk.download("punkt") - duplicates = [] - for pair in itertools.combinations(feed.articles, 2): - try: - result = cosine_sim(*pair) - if abs(result.item() - 1.0) < 1e-10: - duplicates.append(pair) - #print pair[0].id, pair[0].title, pair[0].link - #print pair[1].id, pair[1].title, pair[1].link - #print - except ValueError: - continue - return duplicates \ No newline at end of file diff --git a/pyaggr3g470r/duplicate.py b/pyaggr3g470r/duplicate.py new file mode 100644 index 00000000..6220a3cb --- /dev/null +++ b/pyaggr3g470r/duplicate.py @@ -0,0 +1,56 @@ +#! 
/usr/bin/env python +#-*- coding: utf-8 -*- + +import itertools +import nltk, string +from sklearn.feature_extraction.text import TfidfVectorizer + +import utils + +# tokenizers/punkt/english.pickle + + +stemmer = nltk.stem.porter.PorterStemmer() +remove_punctuation_map = dict((ord(char), None) for char in string.punctuation) + +def stem_tokens(tokens): + return [stemmer.stem(item) for item in tokens] + +def normalize(text): + """ + Remove punctuation, lowercase, stem + """ + return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map))) + +vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english') + +def cosine_sim(article1, article2): + try: + tfidf = vectorizer.fit_transform([utils.clear_string(article1.content), + utils.clear_string(article2.content)]) + except ValueError as e: + raise e + return ((tfidf * tfidf.T).A)[0,1] + + +def compare_documents(feed): + """ + Compare a list of documents by pair. + """ + downloaded = nltk.download("punkt") + if not downloaded: + # Ubuntu packaged version still uses old URL + dl = nltk.downloader.Downloader("https://nltk.github.com/nltk_data/") + dl.download("punkt") + duplicates = [] + for pair in itertools.combinations(feed.articles, 2): + try: + result = cosine_sim(*pair) + if abs(result.item() - 1.0) < 1e-10: + duplicates.append(pair) + #print pair[0].id, pair[0].title, pair[0].link + #print pair[1].id, pair[1].title, pair[1].link + #print + except ValueError: + continue + return duplicates \ No newline at end of file diff --git a/pyaggr3g470r/views.py b/pyaggr3g470r/views.py index c9acec2d..c6c7b5b3 100644 --- a/pyaggr3g470r/views.py +++ b/pyaggr3g470r/views.py @@ -42,7 +42,7 @@ from sqlalchemy.exc import IntegrityError from werkzeug import generate_password_hash import conf -from pyaggr3g470r import utils, notifications, export, compare +from pyaggr3g470r import utils, notifications, export, duplicate from pyaggr3g470r import app, db, allowed_file, babel from pyaggr3g470r.models 
import User, Feed, Article, Role from pyaggr3g470r.decorators import feed_access_required @@ -484,7 +484,7 @@ def duplicates(feed_id=None): """ feed = Feed.query.filter(Feed.user_id == g.user.id, Feed.id == feed_id).first() duplicates = [] - duplicates = compare.compare_documents(feed) + duplicates = duplicate.compare_documents(feed) return render_template('duplicates.html', duplicates=duplicates) @app.route('/index_database', methods=['GET']) diff --git a/vagrant/bootstrap.sh b/vagrant/bootstrap.sh index 38f3b689..750fe4f5 100644 --- a/vagrant/bootstrap.sh +++ b/vagrant/bootstrap.sh @@ -17,6 +17,8 @@ cd pyaggr3g470r apt-get install -y libxml2-dev libxslt1-dev # For scipy: apt-get install -y libatlas-base-dev gfortran +# For scikit-learn: +apt-get install -y python-nose # installation with pip sudo pip install --upgrade -r requirements.txt # copy of the default configuration files for vagrant -- cgit