From cbe51f5f50bceff02f48dfbdd3c09e1660063c4a Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Tue, 21 Jul 2015 10:51:45 +0200 Subject: retrieved date is now when retrieved and comparison are made on the date of articles --- pyaggr3g470r/lib/article_utils.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/article_utils.py b/pyaggr3g470r/lib/article_utils.py index 115b6058..dbe29659 100644 --- a/pyaggr3g470r/lib/article_utils.py +++ b/pyaggr3g470r/lib/article_utils.py @@ -2,7 +2,6 @@ import logging import requests import dateutil.parser from datetime import datetime -from bs4 import BeautifulSoup import conf from pyaggr3g470r.lib.utils import to_hash @@ -10,9 +9,8 @@ from pyaggr3g470r.lib.utils import to_hash logger = logging.getLogger(__name__) -def extract_id(entry, keys=[('link', 'link'), - ('published', 'retrieved_date'), - ('updated', 'retrieved_date')], force_id=False): +def extract_id(entry, keys=[('link', 'link'), ('published', 'date'), + ('updated', 'date')], force_id=False): """For a given entry will return a dict that allows to identify it. The dict will be constructed on the uid of the entry. if that identifier is absent, the dict will be constructed upon the values of "keys". @@ -21,7 +19,7 @@ def extract_id(entry, keys=[('link', 'link'), if entry_id: return {'entry_id': entry_id} if not entry_id and force_id: - entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys + return to_hash("".join(entry[entry_key] for _, entry_key in keys if entry_key in entry).encode('utf8')) else: ids = {} @@ -38,8 +36,8 @@ def construct_article(entry, feed): if hasattr(feed, 'dump'): # this way can be a sqlalchemy obj or a dict feed = feed.dump() "Safe method to transorm a feedparser entry into an article" - date = datetime.now() - + now = datetime.now() + date = None for date_key in ('published', 'updated'): if entry.get(date_key): try: @@ -72,5 +70,5 @@ def construct_article(entry, feed): 'title': entry.get('title', 'No title'), 'readed': False, 'like': False, 'content': content, - 'retrieved_date': date.isoformat(), - 'date': date.isoformat()} + 'retrieved_date': now.isoformat(), + 'date': (date or now).isoformat()} -- cgit From 92289b32248f4568579edfd5a301e571ade0c284 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Tue, 21 Jul 2015 14:44:11 +0200 Subject: fetching mimetypes with images --- pyaggr3g470r/lib/feed_utils.py | 9 +++++---- pyaggr3g470r/lib/utils.py | 9 ++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'pyaggr3g470r/lib') diff --git a/pyaggr3g470r/lib/feed_utils.py b/pyaggr3g470r/lib/feed_utils.py index 367fd4b5..28123f66 100644 --- a/pyaggr3g470r/lib/feed_utils.py +++ b/pyaggr3g470r/lib/feed_utils.py @@ -4,7 +4,7 @@ import requests import feedparser from bs4 import BeautifulSoup, SoupStrainer -from pyaggr3g470r.lib.utils import try_keys, try_splits, rebuild_url +from pyaggr3g470r.lib.utils import try_keys, try_get_b64icon, rebuild_url logger = logging.getLogger(__name__) @@ -38,7 +38,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): site_split = urllib.parse.urlsplit(feed['site_link']) if feed.get('icon'): - feed['icon'] = try_splits(feed['icon'], site_split, feed_split) + feed['icon'] = try_get_b64icon(feed['icon'], site_split, feed_split) if feed['icon'] is None: del feed['icon'] @@ -72,13 +72,14 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): icons = bs_parsed.find_all(check_keys(rel=['icon'])) if len(icons) >= 1: for icon in icons: - feed['icon'] = try_splits(icon.attrs['href'], + feed['icon'] = try_get_b64icon(icon.attrs['href'], site_split, feed_split) if feed['icon'] is not None: break if feed.get('icon') is None: - feed['icon'] = try_splits('/favicon.ico', site_split, feed_split) + feed['icon'] = try_get_b64icon('/favicon.ico', + site_split, feed_split) if 'icon' in feed and feed['icon'] is None: del feed['icon'] diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py index 62284de1..a51b6c3e 100644 --- a/pyaggr3g470r/lib/utils.py +++ b/pyaggr3g470r/lib/utils.py @@ -40,14 +40,17 @@ def rebuild_url(url, base_split): return urllib.parse.urlunsplit(new_split) -def try_splits(url, *splits): +def try_get_b64icon(url, *splits): for split in splits: if split is None: continue rb_url = rebuild_url(url, split) response = requests.get(rb_url, verify=False, timeout=10) - if response.ok and 'html' not in response.headers['content-type']: - return base64.b64encode(response.content).decode('utf8') + # if html in content-type, we assume it's a fancy 404 page + content_type = response.headers.get('content-type', '') + if response.ok and 'html' not in content_type: + return content_type + ( + '\n%s' % base64.b64encode(response.content).decode('utf8')) return None -- cgit