diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-11-28 20:08:32 +0100 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-11-28 20:08:32 +0100 |
commit | 4d3ced06e09c292e8fe97d6879b94ea12ccc0052 (patch) | |
tree | cf3471d665b6ddff760c4426d21928e8a44487b1 /pyaggr3g470r/duplicate.py | |
parent | Test the equality of the contents and of the titles. (diff) | |
download | newspipe-4d3ced06e09c292e8fe97d6879b94ea12ccc0052.tar.gz newspipe-4d3ced06e09c292e8fe97d6879b94ea12ccc0052.tar.bz2 newspipe-4d3ced06e09c292e8fe97d6879b94ea12ccc0052.zip |
Only compare published date and title of articles.
Diffstat (limited to 'pyaggr3g470r/duplicate.py')
-rw-r--r-- | pyaggr3g470r/duplicate.py | 8 |
1 files changed, 6 insertions, 2 deletions
diff --git a/pyaggr3g470r/duplicate.py b/pyaggr3g470r/duplicate.py
index d63cb2e7..6f78fd81 100644
--- a/pyaggr3g470r/duplicate.py
+++ b/pyaggr3g470r/duplicate.py
@@ -1,6 +1,8 @@
 #! /usr/bin/env python
 #-*- coding: utf-8 -*-
 
+from datetime import timedelta
+
 import itertools
 import utils
 
@@ -11,8 +13,10 @@ def compare_documents(feed):
     """
     duplicates = []
     for pair in itertools.combinations(feed.articles, 2):
+        date1 = pair[0].date
+        date2 = pair[1].date
         if pair[0].content != "" and \
-                (utils.clear_string(pair[0].title) == utils.clear_string(pair[1].title) or \
-                utils.clear_string(pair[0].content) == utils.clear_string(pair[1].content)):
+                utils.clear_string(pair[0].title) == utils.clear_string(pair[1].title) and \
+                (date1 - date2) < timedelta(days = 1):
             duplicates.append(pair)
     return duplicates
\ No newline at end of file |