From e9f19f3df42d6ac4ddb97af5bb6753d93ff0ffa9 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Fri, 28 Nov 2014 15:07:45 +0100
Subject: Compare files with cosine.

---
 pyaggr3g470r/compare.py               | 74 +++++++++++++++++++++++++++++++++++
 pyaggr3g470r/templates/inactives.html | 25 ++++++-----------
 pyaggr3g470r/views.py                 |  8 +++++-
 3 files changed, 90 insertions(+), 17 deletions(-)
 create mode 100644 pyaggr3g470r/compare.py

(limited to 'pyaggr3g470r')

diff --git a/pyaggr3g470r/compare.py b/pyaggr3g470r/compare.py
new file mode 100644
index 00000000..26ac57ab
--- /dev/null
+++ b/pyaggr3g470r/compare.py
@@ -0,0 +1,74 @@
+#! /usr/bin/env python
+#-*- coding: utf-8 -*-
+
+import itertools
+import nltk, string
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+import utils
+
+# NLTK resource needed by nltk.word_tokenize()
+# (tokenizers/punkt/english.pickle).
+PUNKT_RESOURCE = "tokenizers/punkt"
+
+stemmer = nltk.stem.porter.PorterStemmer()
+remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
+
+def stem_tokens(tokens):
+    """
+    Return the Porter stem of each token.
+    """
+    return [stemmer.stem(item) for item in tokens]
+
+def normalize(text):
+    """
+    Lowercase, remove punctuation, tokenize, stem
+    """
+    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
+
+vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
+
+def cosine_sim(article1, article2):
+    """
+    Return the cosine similarity between the contents of two articles.
+
+    A ValueError raised by the vectorizer (e.g. an empty vocabulary) is
+    propagated to the caller.
+    """
+    tfidf = vectorizer.fit_transform([utils.clear_string(article1.content),
+                                      utils.clear_string(article2.content)])
+    # TfidfVectorizer L2-normalizes each row by default, so the products
+    # of the rows are the cosines; [0, 1] is article1 against article2.
+    return (tfidf * tfidf.T).toarray()[0, 1]
+
+def _ensure_punkt():
+    """
+    Download the punkt tokenizer only when it is not installed yet.
+    """
+    try:
+        nltk.data.find(PUNKT_RESOURCE)
+    except LookupError:
+        nltk.download("punkt")
+
+def compare_documents(feed):
+    """
+    Compare the articles of a feed by pair.
+
+    Return the list of (article, article) tuples whose cosine similarity
+    is 1. Each unordered pair is compared and reported once.
+    """
+    _ensure_punkt()
+    duplicates = []
+    # combinations() yields every unordered pair once, where product()
+    # yielded both (a, b) and (b, a).
+    for pair in itertools.combinations(feed.articles, 2):
+        if pair[0].id == pair[1].id:
+            continue
+        try:
+            result = cosine_sim(*pair)
+        except ValueError:
+            # This pair could not be vectorized, skip it.
+            continue
+        if abs(result.item() - 1.0) < 1e-10:
+            duplicates.append(pair)
+    return duplicates
diff --git a/pyaggr3g470r/templates/inactives.html b/pyaggr3g470r/templates/inactives.html
index e6897281..548f845d 100644
--- a/pyaggr3g470r/templates/inactives.html
+++ b/pyaggr3g470r/templates/inactives.html
@@ -1,21 +1,14 @@
 {% extends "layout.html" %}
 {% block content %}
-
-
-

{{ _('Days of inactivity') }}:

- -
-
- {% if inactives != [] %} -
    - {% for item in inactives %} -
  • {{ item[0].title }} - {{ item[1].days }} {{ _('days') }}
  • - {% endfor %} -
- {% else %} -

{{ _('No inactive feeds.') }}

- {% endif %} -

+ {% if duplicates != [] %} + + {% else %} +

{{ _('No duplicates.') }}

+ {% endif %}

{% endblock %}
diff --git a/pyaggr3g470r/views.py b/pyaggr3g470r/views.py
index ad0cc914..a19b67dd 100644
--- a/pyaggr3g470r/views.py
+++ b/pyaggr3g470r/views.py
@@ -42,7 +42,7 @@ from sqlalchemy.exc import IntegrityError
 from werkzeug import generate_password_hash
 
 import conf
-from pyaggr3g470r import utils, notifications, export
+from pyaggr3g470r import utils, notifications, export, compare
 from pyaggr3g470r import app, db, allowed_file, babel
 from pyaggr3g470r.models import User, Feed, Article, Role
 from pyaggr3g470r.decorators import feed_access_required
@@ -476,6 +476,14 @@ def inactives():
             inactives.append((feed, elapsed))
     return render_template('inactives.html', inactives=inactives, nb_days=nb_days)
 
+@app.route('/duplicates/<int:feed_id>', methods=['GET'])
+@login_required
+def duplicates(feed_id=None):
+    """Display the duplicated articles of one feed of the current user (404 if not found)."""
+    feed = Feed.query.filter(Feed.user_id == g.user.id, Feed.id == feed_id).first_or_404()
+    result = compare.compare_documents(feed)
+    return render_template('unread.html', duplicates=result)
+
 @app.route('/index_database', methods=['GET'])
 @login_required
 def index_database():
--
cgit