about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Cédric Bonhomme <cedric@cedricbonhomme.org> 2014-11-28 19:59:50 +0100
committer Cédric Bonhomme <cedric@cedricbonhomme.org> 2014-11-28 19:59:50 +0100
commit 903030fc0a0d545e652337d543c6167e2bb192b1 (patch)
tree 8f279d6ab7eb19038eb7ef31c53822e789dc4db5
parent Finall the method with nltk is really slow... (diff)
download newspipe-903030fc0a0d545e652337d543c6167e2bb192b1.tar.gz
newspipe-903030fc0a0d545e652337d543c6167e2bb192b1.tar.bz2
newspipe-903030fc0a0d545e652337d543c6167e2bb192b1.zip
Test the equality of the contents and of the titles.
-rw-r--r-- pyaggr3g470r/duplicate.py 4
1 files changed, 3 insertions, 1 deletions
diff --git a/pyaggr3g470r/duplicate.py b/pyaggr3g470r/duplicate.py
index 23f4adc0..d63cb2e7 100644
--- a/pyaggr3g470r/duplicate.py
+++ b/pyaggr3g470r/duplicate.py
@@ -11,6 +11,8 @@ def compare_documents(feed):
"""
duplicates = []
for pair in itertools.combinations(feed.articles, 2):
- if pair[0].content != "" and pair[0].content == pair[1].content:
+ if pair[0].content != "" and \
+ (utils.clear_string(pair[0].title) == utils.clear_string(pair[1].title) or \
+ utils.clear_string(pair[0].content) == utils.clear_string(pair[1].content)):
duplicates.append(pair)
return duplicates
\ No newline at end of file
bgstack15