aboutsummaryrefslogtreecommitdiff
path: root/pyaggr3g470r/utils.py
diff options
context:
space:
mode:
authorCédric Bonhomme <cedric@cedricbonhomme.org>2015-03-08 12:07:36 +0100
committerCédric Bonhomme <cedric@cedricbonhomme.org>2015-03-08 12:07:36 +0100
commit2378de49ba37116c5bf93054fd6aed65fa44022a (patch)
tree90bb6efddb1d8fc4772c74fcf5dda4dccef74b1a /pyaggr3g470r/utils.py
parentBetter handling of the error logging in the crawler. (diff)
downloadnewspipe-2378de49ba37116c5bf93054fd6aed65fa44022a.tar.gz
newspipe-2378de49ba37116c5bf93054fd6aed65fa44022a.tar.bz2
newspipe-2378de49ba37116c5bf93054fd6aed65fa44022a.zip
Moved the duplicate() function to utils.py. Some minor cosmetic changes.
Diffstat (limited to 'pyaggr3g470r/utils.py')
-rwxr-xr-xpyaggr3g470r/utils.py13
1 files changed, 13 insertions, 0 deletions
diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 3ed89f55..ba440c78 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -41,12 +41,14 @@ import logging
import datetime
import operator
import urllib
+import itertools
import subprocess
try:
from urlparse import urlparse, parse_qs, urlunparse
except:
from urllib.parse import urlparse, parse_qs, urlunparse
from bs4 import BeautifulSoup
+from datetime import timedelta
from collections import Counter
from contextlib import contextmanager
@@ -283,6 +285,17 @@ def tag_cloud(tags):
(min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, format(count, ',d'), word)) \
for (word, count) in tags])
+def compare_documents(feed):
+    """
+    Compare the articles of a feed by pair and return the likely duplicates.
+
+    Two articles are reported as duplicates when their titles are equal
+    once passed through clear_string() and their dates are less than one
+    day apart, whichever of the two is the most recent.
+
+    feed: object exposing an iterable 'articles'; each article must provide
+          the attributes 'title' and 'date'.
+
+    Returns a list of 2-tuples (article, article).
+    """
+    duplicates = []
+    for pair in itertools.combinations(feed.articles, 2):
+        date1, date2 = pair[0].date, pair[1].date
+        # The chronological order inside a pair is arbitrary: without abs()
+        # a negative difference is always smaller than one day, so articles
+        # published months apart would wrongly be flagged.
+        if clear_string(pair[0].title) == clear_string(pair[1].title) and \
+                abs(date1 - date2) < timedelta(days=1):
+            duplicates.append(pair)
+    return duplicates
def search_feed(url):
"""
bgstack15