diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-11-28 19:59:50 +0100 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-11-28 19:59:50 +0100 |
commit | 903030fc0a0d545e652337d543c6167e2bb192b1 (patch) | |
tree | 8f279d6ab7eb19038eb7ef31c53822e789dc4db5 /pyaggr3g470r | |
parent | Finally, the method with nltk is really slow... (diff) | |
download | newspipe-903030fc0a0d545e652337d543c6167e2bb192b1.tar.gz newspipe-903030fc0a0d545e652337d543c6167e2bb192b1.tar.bz2 newspipe-903030fc0a0d545e652337d543c6167e2bb192b1.zip |
Test the equality of the contents and of the titles.
Diffstat (limited to 'pyaggr3g470r')
-rw-r--r-- | pyaggr3g470r/duplicate.py | 4 |
1 files changed, 3 insertions, 1 deletions
diff --git a/pyaggr3g470r/duplicate.py b/pyaggr3g470r/duplicate.py
index 23f4adc0..d63cb2e7 100644
--- a/pyaggr3g470r/duplicate.py
+++ b/pyaggr3g470r/duplicate.py
@@ -11,6 +11,8 @@ def compare_documents(feed):
     """
     duplicates = []
     for pair in itertools.combinations(feed.articles, 2):
-        if pair[0].content != "" and pair[0].content == pair[1].content:
+        if pair[0].content != "" and \
+            (utils.clear_string(pair[0].title) == utils.clear_string(pair[1].title) or \
+            utils.clear_string(pair[0].content) == utils.clear_string(pair[1].content)):
             duplicates.append(pair)
     return duplicates
\ No newline at end of file |