From 4ad1b29d831633de1430a683c4ad37873007d34c Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Fri, 31 Jul 2015 13:20:55 +0200 Subject: redoing the etag matching mechanism --- pyaggr3g470r/lib/view_utils.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 pyaggr3g470r/lib/view_utils.py (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/view_utils.py b/pyaggr3g470r/lib/view_utils.py new file mode 100644 index 00000000..fa5e1eec --- /dev/null +++ b/pyaggr3g470r/lib/view_utils.py @@ -0,0 +1,20 @@ +from functools import wraps +from flask import request, Response, make_response +from pyaggr3g470r.lib.utils import to_hash + + +def etag_match(func): + @wraps(func) + def wrapper(*args, **kwargs): + response = func(*args, **kwargs) + if not type(response) is str: + return response + etag = to_hash(response) + if request.headers.get('if-none-match') == etag: + response = Response(status=304, headers={'etag': etag, + 'Cache-Control': 'pragma: no-cache'}) + else: + response = make_response(response) + response.headers['etag'] = etag + return response + return wrapper -- cgit From 60052ffca30ec33b79eb36b0fe4d49b338f73ca1 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Fri, 31 Jul 2015 13:21:33 +0200 Subject: ensuring the icon isn't empty and redoing a bit of logging --- pyaggr3g470r/lib/crawler.py | 7 ++++--- pyaggr3g470r/lib/utils.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index e5998776..e4dc5955 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -125,7 +125,7 @@ class PyAggUpdater(AbstractCrawler): entry = construct_article( self.entries[tuple(sorted(id_to_create.items()))], self.feed) - logger.warn('%r %r - creating %r for %r - %r', self.feed['id'], + logger.info('%r %r - creating %r for %r - %r', self.feed['id'], self.feed['title'], entry['title'], entry['user_id'], id_to_create) self.query_pyagg('post', 'article', entry) @@ -152,11 +152,12 @@ class PyAggUpdater(AbstractCrawler): up_feed['last_retrieved'] \ = (datetime.now() - timedelta(minutes=45)).isoformat() - logger.info('%r %r - pushing feed attrs %r', + if any([up_feed[key] != self.feed.get(key) for key in up_feed]): + logger.warn('%r %r - pushing feed attrs %r', self.feed['id'], self.feed['title'], {key: "%s -> %s" % (up_feed[key], self.feed.get(key)) for key in up_feed if up_feed[key] != self.feed.get(key)}) - if any([up_feed[key] != self.feed.get(key) for key in up_feed]): + future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], up_feed) future.add_done_callback(self.get_counter_callback()) diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py index a51b6c3e..b937b5a9 100644 --- a/pyaggr3g470r/lib/utils.py +++ b/pyaggr3g470r/lib/utils.py @@ -48,7 +48,7 @@ def try_get_b64icon(url, *splits): response = requests.get(rb_url, verify=False, timeout=10) # if html in content-type, we assume it's a fancy 404 page content_type = response.headers.get('content-type', '') - if response.ok and 'html' not in content_type: + if response.ok and 'html' not in content_type and response.content: return content_type + ( '\n%s' % base64.b64encode(response.content).decode('utf8')) return None -- cgit From 0caffceec8b58bc3f78c0d8ea36d2f7e9da668ec Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Mon, 3 Aug 2015 14:36:13 +0200 Subject: sqlalchemy was requesting icons everytime feed where listed so i choosed to move the icons into their own table --- pyaggr3g470r/lib/crawler.py | 2 +- pyaggr3g470r/lib/feed_utils.py | 33 +++++++++++++++++---------------- pyaggr3g470r/lib/utils.py | 9 ++++----- pyaggr3g470r/lib/view_utils.py | 18 ++++++++++++------ 4 files changed, 34 insertions(+), 28 deletions(-) (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index e4dc5955..216e7a96 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -141,7 +141,7 @@ class PyAggUpdater(AbstractCrawler): strftime('%a, %d %b %Y %X %Z', gmtime()))} fresh_feed = construct_feed_from(url=self.feed['link'], fp_parsed=self.parsed_feed) - for key in ('description', 'site_link', 'icon'): + for key in ('description', 'site_link', 'icon_url'): if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key): up_feed[key] = fresh_feed[key] if not self.feed.get('title'): diff --git a/pyaggr3g470r/lib/feed_utils.py b/pyaggr3g470r/lib/feed_utils.py index 28123f66..aa9db29c 100644 --- a/pyaggr3g470r/lib/feed_utils.py +++ b/pyaggr3g470r/lib/feed_utils.py @@ -4,7 +4,7 @@ import requests import feedparser from bs4 import BeautifulSoup, SoupStrainer -from pyaggr3g470r.lib.utils import try_keys, try_get_b64icon, rebuild_url +from pyaggr3g470r.lib.utils import try_keys, try_get_icon_url, rebuild_url logger = logging.getLogger(__name__) @@ -29,7 +29,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link') feed['title'] = fp_parsed['feed'].get('title') feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title') - feed['icon'] = try_keys(fp_parsed['feed'], 'icon') + feed['icon_url'] = try_keys(fp_parsed['feed'], 'icon') else: feed['site_link'] = url @@ -37,13 +37,14 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): feed['site_link'] = rebuild_url(feed['site_link'], feed_split) site_split = urllib.parse.urlsplit(feed['site_link']) - if feed.get('icon'): - feed['icon'] = try_get_b64icon(feed['icon'], site_split, feed_split) - if feed['icon'] is None: - del feed['icon'] + if feed.get('icon_url'): + feed['icon_url'] = try_get_icon_url( + feed['icon_url'], site_split, feed_split) + if feed['icon_url'] is None: + del feed['icon_url'] if not feed.get('site_link') or not query_site \ - or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')): + or all(bool(feed.get(k)) for k in ('link', 'title', 'icon_url')): return feed response = requests.get(feed['site_link'], verify=False) @@ -66,22 +67,22 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): return True return wrapper - if not feed.get('icon'): + if not feed.get('icon_url'): icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut'])) if not len(icons): icons = bs_parsed.find_all(check_keys(rel=['icon'])) if len(icons) >= 1: for icon in icons: - feed['icon'] = try_get_b64icon(icon.attrs['href'], - site_split, feed_split) - if feed['icon'] is not None: + feed['icon_url'] = try_get_icon_url(icon.attrs['href'], + site_split, feed_split) + if feed['icon_url'] is not None: break - if feed.get('icon') is None: - feed['icon'] = try_get_b64icon('/favicon.ico', - site_split, feed_split) - if 'icon' in feed and feed['icon'] is None: - del feed['icon'] + if feed.get('icon_url') is None: + feed['icon_url'] = try_get_icon_url('/favicon.ico', + site_split, feed_split) + if 'icon_url' in feed and feed['icon_url'] is None: + del feed['icon_url'] if not feed.get('link'): alternates = bs_parsed.find_all(check_keys(rel=['alternate'], diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py index b937b5a9..aa552a12 100644 --- a/pyaggr3g470r/lib/utils.py +++ b/pyaggr3g470r/lib/utils.py @@ -1,6 +1,5 @@ import types import urllib -import base64 import logging import requests from hashlib import md5 @@ -40,7 +39,7 @@ def rebuild_url(url, base_split): return urllib.parse.urlunsplit(new_split) -def try_get_b64icon(url, *splits): +def try_get_icon_url(url, *splits): for split in splits: if split is None: continue @@ -49,10 +48,10 @@ def try_get_b64icon(url, *splits): # if html in content-type, we assume it's a fancy 404 page content_type = response.headers.get('content-type', '') if response.ok and 'html' not in content_type and response.content: - return content_type + ( - '\n%s' % base64.b64encode(response.content).decode('utf8')) + return response.url return None def to_hash(text): - return md5(text.encode('utf8')).hexdigest() + return md5(text.encode('utf8') if hasattr(text, 'encode') else text)\ + .hexdigest() diff --git a/pyaggr3g470r/lib/view_utils.py b/pyaggr3g470r/lib/view_utils.py index fa5e1eec..0cfe62c4 100644 --- a/pyaggr3g470r/lib/view_utils.py +++ b/pyaggr3g470r/lib/view_utils.py @@ -7,14 +7,20 @@ def etag_match(func): @wraps(func) def wrapper(*args, **kwargs): response = func(*args, **kwargs) - if not type(response) is str: + if isinstance(response, Response): + etag = to_hash(response.data) + headers = response.headers + elif type(response) is str: + etag = to_hash(response) + headers = {} + else: return response - etag = to_hash(response) if request.headers.get('if-none-match') == etag: - response = Response(status=304, headers={'etag': etag, - 'Cache-Control': 'pragma: no-cache'}) - else: + response = Response(status=304) + response.headers['Cache-Control'] \ + = headers.get('Cache-Control', 'pragma: no-cache') + elif not isinstance(response, Response): response = make_response(response) - response.headers['etag'] = etag + response.headers['etag'] = etag return response return wrapper -- cgit