aboutsummaryrefslogtreecommitdiff
path: root/pyaggr3g470r
diff options
context:
space:
mode:
author Cédric Bonhomme <cedric@cedricbonhomme.org> 2014-11-28 20:08:32 +0100
committer Cédric Bonhomme <cedric@cedricbonhomme.org> 2014-11-28 20:08:32 +0100
commit 4d3ced06e09c292e8fe97d6879b94ea12ccc0052 (patch)
tree cf3471d665b6ddff760c4426d21928e8a44487b1 /pyaggr3g470r
parent Test the equality of the contents and of the titles. (diff)
download newspipe-4d3ced06e09c292e8fe97d6879b94ea12ccc0052.tar.gz
newspipe-4d3ced06e09c292e8fe97d6879b94ea12ccc0052.tar.bz2
newspipe-4d3ced06e09c292e8fe97d6879b94ea12ccc0052.zip
Only compare published date and title of articles.
Diffstat (limited to 'pyaggr3g470r')
-rw-r--r-- pyaggr3g470r/duplicate.py 8
1 files changed, 6 insertions, 2 deletions
diff --git a/pyaggr3g470r/duplicate.py b/pyaggr3g470r/duplicate.py
index d63cb2e7..6f78fd81 100644
--- a/pyaggr3g470r/duplicate.py
+++ b/pyaggr3g470r/duplicate.py
@@ -1,6 +1,8 @@
#! /usr/bin/env python
#-*- coding: utf-8 -*-
+from datetime import timedelta
+
import itertools
import utils
@@ -11,8 +13,10 @@ def compare_documents(feed):
"""
duplicates = []
for pair in itertools.combinations(feed.articles, 2):
+ date1 = pair[0].date
+ date2 = pair[1].date
if pair[0].content != "" and \
- (utils.clear_string(pair[0].title) == utils.clear_string(pair[1].title) or \
- utils.clear_string(pair[0].content) == utils.clear_string(pair[1].content)):
+ utils.clear_string(pair[0].title) == utils.clear_string(pair[1].title) and \
+ (date1 - date2) < timedelta(days = 1):
duplicates.append(pair)
return duplicates
\ No newline at end of file
bgstack15