diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2015-03-08 12:07:36 +0100 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2015-03-08 12:07:36 +0100 |
commit | 2378de49ba37116c5bf93054fd6aed65fa44022a (patch) | |
tree | 90bb6efddb1d8fc4772c74fcf5dda4dccef74b1a /pyaggr3g470r/utils.py | |
parent | Better handling of the error logging in the crawler. (diff) | |
download | newspipe-2378de49ba37116c5bf93054fd6aed65fa44022a.tar.gz newspipe-2378de49ba37116c5bf93054fd6aed65fa44022a.tar.bz2 newspipe-2378de49ba37116c5bf93054fd6aed65fa44022a.zip |
Moved duplicate() function to utils.py. Some minor cosmetic changes.
Diffstat (limited to 'pyaggr3g470r/utils.py')
-rwxr-xr-x | pyaggr3g470r/utils.py | 13 |
1 file changed, 13 insertions, 0 deletions
```diff
diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 3ed89f55..ba440c78 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -41,12 +41,14 @@ import logging
 import datetime
 import operator
 import urllib
+import itertools
 import subprocess
 try:
     from urlparse import urlparse, parse_qs, urlunparse
 except:
     from urllib.parse import urlparse, parse_qs, urlunparse
 from bs4 import BeautifulSoup
+from datetime import timedelta
 from collections import Counter
 from contextlib import contextmanager
 
@@ -283,6 +285,17 @@ def tag_cloud(tags):
                 (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, format(count, ',d'), word)) \
                     for (word, count) in tags])
 
+def compare_documents(feed):
+    """
+    Compare a list of documents by pair.
+    """
+    duplicates = []
+    for pair in itertools.combinations(feed.articles, 2):
+        date1, date2 = pair[0].date, pair[1].date
+        if clear_string(pair[0].title) == clear_string(pair[1].title) and \
+                                (date1 - date2) < timedelta(days = 1):
+            duplicates.append(pair)
+    return duplicates
 
 def search_feed(url):
     """
```