diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-11-28 15:07:45 +0100 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-11-28 15:07:45 +0100 |
commit | e9f19f3df42d6ac4ddb97af5bb6753d93ff0ffa9 (patch) | |
tree | b3ab31fe98ca2ea61e5a610a6cdc2f182087548b | |
parent | Updated README (vagrant installation). (diff) | |
download | newspipe-e9f19f3df42d6ac4ddb97af5bb6753d93ff0ffa9.tar.gz newspipe-e9f19f3df42d6ac4ddb97af5bb6753d93ff0ffa9.tar.bz2 newspipe-e9f19f3df42d6ac4ddb97af5bb6753d93ff0ffa9.zip |
Compare files with cosine.
-rw-r--r-- | pyaggr3g470r/compare.py | 53 | ||||
-rw-r--r-- | pyaggr3g470r/templates/inactives.html | 25 | ||||
-rw-r--r-- | pyaggr3g470r/views.py | 8 | ||||
-rw-r--r-- | requirements.txt | 3 |
4 files changed, 72 insertions, 17 deletions
diff --git a/pyaggr3g470r/compare.py b/pyaggr3g470r/compare.py new file mode 100644 index 00000000..26ac57ab --- /dev/null +++ b/pyaggr3g470r/compare.py @@ -0,0 +1,53 @@ +#! /usr/bin/env python +#-*- coding: utf-8 -*- + +import itertools +import nltk, string +from sklearn.feature_extraction.text import TfidfVectorizer + +import utils + +# tokenizers/punkt/english.pickle + + +stemmer = nltk.stem.porter.PorterStemmer() +remove_punctuation_map = dict((ord(char), None) for char in string.punctuation) + +def stem_tokens(tokens): + return [stemmer.stem(item) for item in tokens] + +def normalize(text): + """ + Remove punctuation, lowercase, stem + """ + return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map))) + +vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english') + +def cosine_sim(article1, article2): + try: + tfidf = vectorizer.fit_transform([utils.clear_string(article1.content), + utils.clear_string(article2.content)]) + except ValueError as e: + raise e + return ((tfidf * tfidf.T).A)[0,1] + + +def compare_documents(feed): + """ + Compare a list of documents by pair. + """ + nltk.download("punkt") + duplicates = [] + for pair in [(elem[0], elem[1]) for elem in itertools.product(feed.articles, repeat=2) + if elem[0].id != elem[1].id]: + try: + result = cosine_sim(*pair) + if abs(result.item() - 1.0) < 1e-10: + duplicates.append(pair) + #print pair[0].id, pair[0].title, pair[0].link + #print pair[1].id, pair[1].title, pair[1].link + #print + except ValueError: + continue + return duplicates
\ No newline at end of file diff --git a/pyaggr3g470r/templates/inactives.html b/pyaggr3g470r/templates/inactives.html index e6897281..548f845d 100644 --- a/pyaggr3g470r/templates/inactives.html +++ b/pyaggr3g470r/templates/inactives.html @@ -1,21 +1,14 @@ {% extends "layout.html" %} {% block content %} <div class="container"> - <div class="jumbotron"> - <form method=get action="/inactives"> - <p>{{ _('Days of inactivity') }}:</p> - <input type="number" name="nb_days" class="form-control" value="{{ nb_days }}" min="0" max="1000000" step="1" size="4" style="text-align: center" /> - </form> - <br /> - {% if inactives != [] %} - <ul class="list-group"> - {% for item in inactives %} - <li class="list-group-item"><a href="/feed/{{ item[0].id }}">{{ item[0].title }}</a> - {{ item[1].days }} {{ _('days') }}</li> - {% endfor %} - </ul> - {% else %} - <p>{{ _('No inactive feeds.') }}<p> - {% endif %} - </div> + {% if duplicates != [] %} + <ul class="list-group"> + {% for pair in duplicates %} + <li><a href="/article/{{ pair[0].id }}">{{ pair[0].title }}</a> - <a href="/article/{{ pair[1].id }}">{{ pair[1].title }}</a></li> + {% endfor %} + </ul> + {% else %} + <p>{{ _('No duplicates.') }}<p> + {% endif %} </div><!-- /.container --> {% endblock %} diff --git a/pyaggr3g470r/views.py b/pyaggr3g470r/views.py index ad0cc914..a19b67dd 100644 --- a/pyaggr3g470r/views.py +++ b/pyaggr3g470r/views.py @@ -42,7 +42,7 @@ from sqlalchemy.exc import IntegrityError from werkzeug import generate_password_hash import conf -from pyaggr3g470r import utils, notifications, export +from pyaggr3g470r import utils, notifications, export, compare from pyaggr3g470r import app, db, allowed_file, babel from pyaggr3g470r.models import User, Feed, Article, Role from pyaggr3g470r.decorators import feed_access_required @@ -476,6 +476,12 @@ def inactives(): inactives.append((feed, elapsed)) return render_template('inactives.html', inactives=inactives, nb_days=nb_days) +@app.route('/duplicates/<int:feed_id>', methods=['GET']) +def duplicates(feed_id=None): + feed = Feed.query.filter(Feed.user_id == g.user.id, Feed.id == feed_id).first() + result = compare.compare_documents(feed) + return render_template('unread.html', duplicates=duplicates) + @app.route('/index_database', methods=['GET']) @login_required def index_database(): diff --git a/requirements.txt b/requirements.txt index 0a357c37..ea18a06d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,6 @@ python-postmark gevent whoosh python-dateutil +nltk +scikit-learn +scipy
\ No newline at end of file |