aboutsummaryrefslogtreecommitdiff
path: root/pyaggr3g470r/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'pyaggr3g470r/utils.py')
-rwxr-xr-x pyaggr3g470r/utils.py 13
1 files changed, 13 insertions, 0 deletions
diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index 3ed89f55..ba440c78 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -41,12 +41,14 @@ import logging
import datetime
import operator
import urllib
+import itertools
import subprocess
try:
from urlparse import urlparse, parse_qs, urlunparse
except:
from urllib.parse import urlparse, parse_qs, urlunparse
from bs4 import BeautifulSoup
+from datetime import timedelta
from collections import Counter
from contextlib import contextmanager
@@ -283,6 +285,17 @@ def tag_cloud(tags):
(min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, format(count, ',d'), word)) \
for (word, count) in tags])
+def compare_documents(feed):
+    """
+    Return the pairs of feed.articles with equal cleared titles dated less than a day apart.
+    """
+    duplicates = []
+    for pair in itertools.combinations(feed.articles, 2):
+        # abs(): a negative date difference would otherwise always be below one day.
+        if clear_string(pair[0].title) == clear_string(pair[1].title) and \
+                abs(pair[0].date - pair[1].date) < timedelta(days=1):
+            duplicates.append(pair)
+    return duplicates
def search_feed(url):
"""
bgstack15