author     Cédric Bonhomme <kimble.mandel+bitbucket@gmail.com>  2015-08-03 23:55:46 +0200
committer  Cédric Bonhomme <kimble.mandel+bitbucket@gmail.com>  2015-08-03 23:55:46 +0200
commit     cdbd573500a365e290e88b50d7b0c2355b7f7e19 (patch)
tree       25ede52ae4b02a2377ae40d2c146c7ed2e9abe2a /pyaggr3g470r/lib
parent     The number of values of the split string is variable (sometimes the charset... (diff)
parent     sqlalchemy was requesting icons every time feeds were listed (diff)
download   newspipe-cdbd573500a365e290e88b50d7b0c2355b7f7e19.tar.gz
           newspipe-cdbd573500a365e290e88b50d7b0c2355b7f7e19.tar.bz2
           newspipe-cdbd573500a365e290e88b50d7b0c2355b7f7e19.zip
Merged in jaesivsm/pyaggr3g470r (pull request #20)
perf improvement
Diffstat (limited to 'pyaggr3g470r/lib')
-rw-r--r--  pyaggr3g470r/lib/crawler.py     |  9
-rw-r--r--  pyaggr3g470r/lib/feed_utils.py  | 33
-rw-r--r--  pyaggr3g470r/lib/utils.py       | 11
-rw-r--r--  pyaggr3g470r/lib/view_utils.py  | 26
4 files changed, 53 insertions(+), 26 deletions(-)
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index e5998776..216e7a96 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -125,7 +125,7 @@ class PyAggUpdater(AbstractCrawler):
             entry = construct_article(
                     self.entries[tuple(sorted(id_to_create.items()))],
                     self.feed)
-            logger.warn('%r %r - creating %r for %r - %r', self.feed['id'],
+            logger.info('%r %r - creating %r for %r - %r', self.feed['id'],
                         self.feed['title'], entry['title'], entry['user_id'],
                         id_to_create)
             self.query_pyagg('post', 'article', entry)
@@ -141,7 +141,7 @@ class PyAggUpdater(AbstractCrawler):
                        strftime('%a, %d %b %Y %X %Z', gmtime()))}
         fresh_feed = construct_feed_from(url=self.feed['link'],
                                          fp_parsed=self.parsed_feed)
-        for key in ('description', 'site_link', 'icon'):
+        for key in ('description', 'site_link', 'icon_url'):
             if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key):
                 up_feed[key] = fresh_feed[key]
         if not self.feed.get('title'):
if not self.feed.get('title'):
@@ -152,11 +152,12 @@ class PyAggUpdater(AbstractCrawler):
         up_feed['last_retrieved'] \
                 = (datetime.now() - timedelta(minutes=45)).isoformat()
 
-        logger.info('%r %r - pushing feed attrs %r',
+        if any([up_feed[key] != self.feed.get(key) for key in up_feed]):
+            logger.warn('%r %r - pushing feed attrs %r',
                 self.feed['id'], self.feed['title'],
                 {key: "%s -> %s" % (up_feed[key], self.feed.get(key))
                  for key in up_feed if up_feed[key] != self.feed.get(key)})
-        if any([up_feed[key] != self.feed.get(key) for key in up_feed]):
+
             future = self.query_pyagg('put',
                     'feed/%d' % self.feed['id'], up_feed)
             future.add_done_callback(self.get_counter_callback())
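
The crawler hunks above only reorder control flow: the "pushing feed attrs" message now fires at warn level and only when a feed attribute actually differs, while the PUT stays behind the same guard. A minimal sketch of the resulting flow, assuming the surrounding PyAggUpdater method from the diff (the `changed` dict is a hypothetical refactor that evaluates the comparison once rather than in both the test and the log call):

    # Sketch only: 'changed' is hypothetical; up_feed, self.feed and
    # query_pyagg are the names used in the diff above.
    changed = {key: '%s -> %s' % (up_feed[key], self.feed.get(key))
               for key in up_feed if up_feed[key] != self.feed.get(key)}
    if changed:
        logger.warn('%r %r - pushing feed attrs %r',
                    self.feed['id'], self.feed['title'], changed)
        future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], up_feed)
        future.add_done_callback(self.get_counter_callback())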
diff --git a/pyaggr3g470r/lib/feed_utils.py b/pyaggr3g470r/lib/feed_utils.py
index 28123f66..aa9db29c 100644
--- a/pyaggr3g470r/lib/feed_utils.py
+++ b/pyaggr3g470r/lib/feed_utils.py
@@ -4,7 +4,7 @@ import requests
 import feedparser
 from bs4 import BeautifulSoup, SoupStrainer
 
-from pyaggr3g470r.lib.utils import try_keys, try_get_b64icon, rebuild_url
+from pyaggr3g470r.lib.utils import try_keys, try_get_icon_url, rebuild_url
 
 logger = logging.getLogger(__name__)
@@ -29,7 +29,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
         feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
         feed['title'] = fp_parsed['feed'].get('title')
         feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title')
-        feed['icon'] = try_keys(fp_parsed['feed'], 'icon')
+        feed['icon_url'] = try_keys(fp_parsed['feed'], 'icon')
     else:
         feed['site_link'] = url
@@ -37,13 +37,14 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
         feed['site_link'] = rebuild_url(feed['site_link'], feed_split)
         site_split = urllib.parse.urlsplit(feed['site_link'])
 
-    if feed.get('icon'):
-        feed['icon'] = try_get_b64icon(feed['icon'], site_split, feed_split)
-        if feed['icon'] is None:
-            del feed['icon']
+    if feed.get('icon_url'):
+        feed['icon_url'] = try_get_icon_url(
+                feed['icon_url'], site_split, feed_split)
+        if feed['icon_url'] is None:
+            del feed['icon_url']
 
     if not feed.get('site_link') or not query_site \
-            or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
+            or all(bool(feed.get(k)) for k in ('link', 'title', 'icon_url')):
         return feed
 
     response = requests.get(feed['site_link'], verify=False)
@@ -66,22 +67,22 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
             return True
         return wrapper
 
-    if not feed.get('icon'):
+    if not feed.get('icon_url'):
         icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut']))
         if not len(icons):
             icons = bs_parsed.find_all(check_keys(rel=['icon']))
         if len(icons) >= 1:
             for icon in icons:
-                feed['icon'] = try_get_b64icon(icon.attrs['href'],
-                                               site_split, feed_split)
-                if feed['icon'] is not None:
+                feed['icon_url'] = try_get_icon_url(icon.attrs['href'],
+                                                    site_split, feed_split)
+                if feed['icon_url'] is not None:
                     break
-        if feed.get('icon') is None:
-            feed['icon'] = try_get_b64icon('/favicon.ico',
-                                           site_split, feed_split)
-        if 'icon' in feed and feed['icon'] is None:
-            del feed['icon']
+        if feed.get('icon_url') is None:
+            feed['icon_url'] = try_get_icon_url('/favicon.ico',
+                                                site_split, feed_split)
+        if 'icon_url' in feed and feed['icon_url'] is None:
+            del feed['icon_url']
 
     if not feed.get('link'):
         alternates = bs_parsed.find_all(check_keys(rel=['alternate'],
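
Taken together, the feed_utils.py hunks replace the inlined base64 icon with a stored icon_url, resolved through a three-step fallback. A condensed sketch of that lookup order, assuming the check_keys helper and the URL splits from the surrounding construct_feed_from (resolve_icon_url is a hypothetical name used here for illustration):

    def resolve_icon_url(feed, bs_parsed, site_split, feed_split):
        # 1. the icon the feed itself advertises (fp_parsed['feed']['icon'])
        if feed.get('icon_url'):
            feed['icon_url'] = try_get_icon_url(feed['icon_url'],
                                                site_split, feed_split)
        if not feed.get('icon_url'):
            # 2. <link rel="icon"> / rel="shortcut" tags in the site's HTML
            for icon in bs_parsed.find_all(check_keys(rel=['icon', 'shortcut'])):
                feed['icon_url'] = try_get_icon_url(icon.attrs['href'],
                                                    site_split, feed_split)
                if feed['icon_url'] is not None:
                    break
        if feed.get('icon_url') is None:
            # 3. the conventional /favicon.ico location as a last resort
            feed['icon_url'] = try_get_icon_url('/favicon.ico',
                                                site_split, feed_split)
        if feed.get('icon_url') is None:
            feed.pop('icon_url', None)  # nothing reachable: drop the key
        return feed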
diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index a51b6c3e..aa552a12 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -1,6 +1,5 @@
import types
import urllib
-import base64
import logging
import requests
from hashlib import md5
@@ -40,7 +39,7 @@ def rebuild_url(url, base_split):
     return urllib.parse.urlunsplit(new_split)
 
 
-def try_get_b64icon(url, *splits):
+def try_get_icon_url(url, *splits):
     for split in splits:
         if split is None:
             continue
@@ -48,11 +47,11 @@ def try_get_b64icon(url, *splits):
         response = requests.get(rb_url, verify=False, timeout=10)
         # if html in content-type, we assume it's a fancy 404 page
         content_type = response.headers.get('content-type', '')
-        if response.ok and 'html' not in content_type:
-            return content_type + (
-                '\n%s' % base64.b64encode(response.content).decode('utf8'))
+        if response.ok and 'html' not in content_type and response.content:
+            return response.url
     return None
 
 
 def to_hash(text):
-    return md5(text.encode('utf8')).hexdigest()
+    return md5(text.encode('utf8') if hasattr(text, 'encode') else text)\
+        .hexdigest()
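
The utils.py hunk is the heart of the optimization: try_get_icon_url now returns the icon's final URL (response.url, after redirects) instead of a "content-type\n<base64 payload>" blob, so the database stores a short string and browsers fetch the icon themselves. to_hash also learns to accept bytes as well as str, which the new view_utils.py relies on. A hedged interactive example (the digests are the well-known MD5 of 'hello', not output from this repo):

    >>> from pyaggr3g470r.lib.utils import to_hash
    >>> to_hash('hello')                 # str is encoded to UTF-8 first
    '5d41402abc4b2a76b9719d911017c592'
    >>> to_hash(b'hello')                # bytes now pass through unchanged
    '5d41402abc4b2a76b9719d911017c592'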
diff --git a/pyaggr3g470r/lib/view_utils.py b/pyaggr3g470r/lib/view_utils.py
new file mode 100644
index 00000000..0cfe62c4
--- /dev/null
+++ b/pyaggr3g470r/lib/view_utils.py
@@ -0,0 +1,26 @@
+from functools import wraps
+from flask import request, Response, make_response
+from pyaggr3g470r.lib.utils import to_hash
+
+
+def etag_match(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        response = func(*args, **kwargs)
+        if isinstance(response, Response):
+            etag = to_hash(response.data)
+            headers = response.headers
+        elif type(response) is str:
+            etag = to_hash(response)
+            headers = {}
+        else:
+            return response
+        if request.headers.get('if-none-match') == etag:
+            response = Response(status=304)
+            response.headers['Cache-Control'] \
+                = headers.get('Cache-Control', 'pragma: no-cache')
+        elif not isinstance(response, Response):
+            response = make_response(response)
+        response.headers['etag'] = etag
+        return response
+    return wrapper
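
The new etag_match decorator hashes a view's rendered body, stamps the digest into an etag header, and answers 304 Not Modified when the client's if-none-match header already carries the same hash, skipping the body transfer entirely. A hedged usage sketch (the route and view name are hypothetical, not from this commit):

    from flask import Flask, render_template
    from pyaggr3g470r.lib.view_utils import etag_match

    app = Flask(__name__)

    @app.route('/feeds')
    @etag_match                  # hashes the response; may short-circuit to 304
    def list_feeds():
        # both Response objects and plain strings are handled by the wrapper
        return render_template('feeds.html')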