aboutsummaryrefslogtreecommitdiff
path: root/pyaggr3g470r
diff options
context:
space:
mode:
author: Cédric Bonhomme <cedric@cedricbonhomme.org>, 2014-11-28 15:07:45 +0100
committer: Cédric Bonhomme <cedric@cedricbonhomme.org>, 2014-11-28 15:07:45 +0100
commit: e9f19f3df42d6ac4ddb97af5bb6753d93ff0ffa9 (patch)
tree: b3ab31fe98ca2ea61e5a610a6cdc2f182087548b /pyaggr3g470r
parent: Updated README (vagrant installation). (diff)
download: newspipe-e9f19f3df42d6ac4ddb97af5bb6753d93ff0ffa9.tar.gz
newspipe-e9f19f3df42d6ac4ddb97af5bb6753d93ff0ffa9.tar.bz2
newspipe-e9f19f3df42d6ac4ddb97af5bb6753d93ff0ffa9.zip
Compare files with cosine.
Diffstat (limited to 'pyaggr3g470r')
-rw-r--r-- pyaggr3g470r/compare.py 53
-rw-r--r-- pyaggr3g470r/templates/inactives.html 25
-rw-r--r-- pyaggr3g470r/views.py 8
3 files changed, 69 insertions, 17 deletions
diff --git a/pyaggr3g470r/compare.py b/pyaggr3g470r/compare.py
new file mode 100644
index 00000000..26ac57ab
--- /dev/null
+++ b/pyaggr3g470r/compare.py
@@ -0,0 +1,53 @@
+#! /usr/bin/env python
+#-*- coding: utf-8 -*-
+
+import itertools
+import nltk, string
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+import utils
+
+# tokenizers/punkt/english.pickle
+
+
+# Porter stemmer instance shared by stem_tokens().
+stemmer = nltk.stem.porter.PorterStemmer()
+# Maps the code point of every ASCII punctuation character to None;
+# normalize() hands this table to translate() to remove punctuation.
+remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
+
+def stem_tokens(tokens):
+    """
+    Return the list of the stems of the given tokens, in the same order.
+    """
+    return list(map(stemmer.stem, tokens))
+
+def normalize(text):
+    """
+    Lowercase the text, remove its punctuation, then tokenize and stem it.
+
+    Returns the list of stemmed tokens.
+    """
+    lowered = text.lower()
+    without_punctuation = lowered.translate(remove_punctuation_map)
+    words = nltk.word_tokenize(without_punctuation)
+    return stem_tokens(words)
+
+# TF-IDF vectorizer used by cosine_sim(): documents are tokenized with
+# normalize() and the built-in English stop word list is applied.
+vectorizer = TfidfVectorizer(
+    stop_words='english',
+    tokenizer=normalize,
+)
+
+def cosine_sim(article1, article2):
+    """
+    Return the cosine similarity between the contents of two articles.
+
+    Both contents are cleaned with utils.clear_string() and vectorized
+    together with the module-level TF-IDF vectorizer. The vectorizer is
+    created without a "norm" argument, so it relies on scikit-learn's
+    default row normalization (L2 per its documentation); the product of
+    the matrix with its transpose then holds the pairwise cosine
+    similarities, and entry [0, 1] is the one between the two articles.
+
+    A ValueError raised by the vectorizer is propagated unchanged;
+    compare_documents() catches it and skips the pair.
+    """
+    # No try/except here: catching ValueError only to re-raise it added
+    # nothing and obscured the original traceback.
+    tfidf = vectorizer.fit_transform([utils.clear_string(article1.content),
+                                      utils.clear_string(article2.content)])
+    # toarray() is the explicit spelling of the ".A" shortcut.
+    return (tfidf * tfidf.T).toarray()[0, 1]
+
+
+def compare_documents(feed):
+    """
+    Compare the articles of a feed by pair.
+
+    Returns the list of (article, article) tuples whose contents have a
+    cosine similarity equal to 1 (within 1e-10). Each unordered pair of
+    distinct articles is examined, and reported, only once.
+    """
+    # Download the tokenizer models only when they are missing locally,
+    # instead of contacting the NLTK servers on every call.
+    try:
+        nltk.data.find("tokenizers/punkt")
+    except LookupError:
+        nltk.download("punkt")
+    duplicates = []
+    # combinations() yields each unordered pair once, whereas
+    # product(..., repeat=2) also yielded the mirrored pair, so every
+    # duplicate was computed and listed twice.
+    for pair in itertools.combinations(feed.articles, 2):
+        if pair[0].id == pair[1].id:
+            # Same article listed twice: not a duplicate of itself.
+            continue
+        try:
+            result = cosine_sim(*pair)
+        except ValueError:
+            # The pair could not be vectorized: skip it.
+            continue
+        if abs(result.item() - 1.0) < 1e-10:
+            duplicates.append(pair)
+    return duplicates
\ No newline at end of file
diff --git a/pyaggr3g470r/templates/inactives.html b/pyaggr3g470r/templates/inactives.html
index e6897281..548f845d 100644
--- a/pyaggr3g470r/templates/inactives.html
+++ b/pyaggr3g470r/templates/inactives.html
@@ -1,21 +1,14 @@
{% extends "layout.html" %}
{% block content %}
<div class="container">
- <div class="jumbotron">
- <form method=get action="/inactives">
- <p>{{ _('Days of inactivity') }}:</p>
- <input type="number" name="nb_days" class="form-control" value="{{ nb_days }}" min="0" max="1000000" step="1" size="4" style="text-align: center" />
- </form>
- <br />
- {% if inactives != [] %}
- <ul class="list-group">
- {% for item in inactives %}
- <li class="list-group-item"><a href="/feed/{{ item[0].id }}">{{ item[0].title }}</a> - {{ item[1].days }} {{ _('days') }}</li>
- {% endfor %}
- </ul>
- {% else %}
- <p>{{ _('No inactive feeds.') }}<p>
- {% endif %}
- </div>
+ {% if duplicates != [] %}
+ <ul class="list-group">
+ {% for pair in duplicates %}
+ <li><a href="/article/{{ pair[0].id }}">{{ pair[0].title }}</a> - <a href="/article/{{ pair[1].id }}">{{ pair[1].title }}</a></li>
+ {% endfor %}
+ </ul>
+ {% else %}
+ <p>{{ _('No duplicates.') }}</p>
+ {% endif %}
</div><!-- /.container -->
{% endblock %}
diff --git a/pyaggr3g470r/views.py b/pyaggr3g470r/views.py
index ad0cc914..a19b67dd 100644
--- a/pyaggr3g470r/views.py
+++ b/pyaggr3g470r/views.py
@@ -42,7 +42,7 @@ from sqlalchemy.exc import IntegrityError
from werkzeug import generate_password_hash
import conf
-from pyaggr3g470r import utils, notifications, export
+from pyaggr3g470r import utils, notifications, export, compare
from pyaggr3g470r import app, db, allowed_file, babel
from pyaggr3g470r.models import User, Feed, Article, Role
from pyaggr3g470r.decorators import feed_access_required
@@ -476,6 +476,12 @@ def inactives():
inactives.append((feed, elapsed))
return render_template('inactives.html', inactives=inactives, nb_days=nb_days)
+@app.route('/duplicates/<int:feed_id>', methods=['GET'])
+@login_required
+def duplicates(feed_id=None):
+    """
+    Display the duplicated articles of one of the current user's feeds.
+    """
+    feed = Feed.query.filter(Feed.user_id == g.user.id, Feed.id == feed_id).first()
+    # first() returns None when no feed with this id belongs to the
+    # current user: show an empty result instead of failing.
+    result = compare.compare_documents(feed) if feed is not None else []
+    # Pass the computed pairs to the template. Inside this function the
+    # name "duplicates" is the view function itself, not the result.
+    # 'inactives.html' is the template that iterates over "duplicates".
+    return render_template('inactives.html', duplicates=result)
+
@app.route('/index_database', methods=['GET'])
@login_required
def index_database():
bgstack15