From 4142e50e512bf05bba7e47be232c3cc3ae4e32f5 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Tue, 10 Mar 2015 14:57:14 +0100 Subject: implementing cache construction on crawler side (limiting useless pushes) --- pyaggr3g470r/lib/crawler.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'pyaggr3g470r') diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 9df37993..99967671 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -19,6 +19,7 @@ import logging import requests import feedparser import dateutil.parser +from hashlib import md5 from functools import wraps from datetime import datetime from concurrent.futures import ThreadPoolExecutor @@ -29,6 +30,10 @@ logger = logging.getLogger(__name__) API_ROOT = "api/v2.0/" +def to_hash(text): + return md5(text.encode('utf8')).hexdigest() + + def extract_id(entry, keys=[('link', 'link'), ('published', 'retrieved_date'), ('updated', 'retrieved_date')], force_id=False): @@ -40,8 +45,8 @@ def extract_id(entry, keys=[('link', 'link'), if entry_id: return {'entry_id': entry_id} if not entry_id and force_id: - entry_id = hash("".join(entry[entry_key] for _, entry_key in keys - if entry_key in entry)) + entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys + if entry_key in entry).encode('utf8')) else: ids = {} for entry_key, pyagg_key in keys: @@ -218,18 +223,32 @@ class FeedCrawler(AbstractCrawler): future.add_done_callback(self.get_counter_callback()) return + etag_generated = False if response.status_code == 304: logger.info("%r %r - feed responded with 304", - self.feed['id'], self.feed['title']) + self.feed['id'], self.feed['title']) self.clean_feed() return - if self.feed['etag'] and response.headers.get('etag') \ - and response.headers.get('etag') == self.feed['etag']: - logger.info("%r %r - feed responded with same etag (%d)", - self.feed['id'], self.feed['title'], - response.status_code) + if not response.headers.get('etag'): + etag_generated = True + logger.debug('%r %r - manually generating etag', + self.feed['id'], self.feed['title']) + response.headers['etag'] = 'pyagg/"%s"' % to_hash(response.text) + if self.feed['etag'] and response.headers['etag'] == self.feed['etag']: + if etag_generated: + logger.info("%r %r - calculated hash matches (%d)", + self.feed['id'], self.feed['title'], + response.status_code) + else: + logger.info("%r %r - feed responded with same etag (%d)", + self.feed['id'], self.feed['title'], + response.status_code) self.clean_feed() return + else: + logger.info('%r %r - etag mismatch %r != %r', + self.feed['id'], self.feed['title'], + response.headers['etag'], self.feed['etag']) ids, entries = [], {} parsed_response = feedparser.parse(response.text) for entry in parsed_response['entries']: @@ -253,7 +272,7 @@ class CrawlerScheduler(AbstractCrawler): def prepare_headers(self, feed): """For a known feed, will construct some header dictionnary""" headers = {'User-Agent': 'pyaggr3g470r/crawler'} - if feed.get('etag', None): + if feed.get('etag') and 'pyagg' not in feed.get('etag', ''): headers['If-None-Match'] = feed['etag'] if feed.get('last_modified'): headers['If-Modified-Since'] = feed['last_modified'] -- cgit