From 334737969cad57a916163b974cd6c1a412c70dd1 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Fri, 3 Jul 2015 15:00:33 +0200 Subject: fixing bug on reset link for feeds and hidding feed title for eXtraSmall device --- pyaggr3g470r/lib/crawler.py | 3 +-- pyaggr3g470r/lib/utils.py | 32 +++++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 2ba5403a..45b1acde 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -193,8 +193,7 @@ class PyAggUpdater(AbstractCrawler): 'last_modified': self.headers.get('last-modified', strftime('%a, %d %b %Y %X %Z', gmtime()))} fresh_feed = construct_feed_from(url=self.feed['link'], - fp_parsed=self.parsed_feed, - feed=self.feed) + fp_parsed=self.parsed_feed) for key in ('description', 'site_link', 'icon'): if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key): up_feed[key] = fresh_feed[key] diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py index a0154b7f..041a2d29 100644 --- a/pyaggr3g470r/lib/utils.py +++ b/pyaggr3g470r/lib/utils.py @@ -37,6 +37,13 @@ def rebuild_url(url, base_split): return urllib.parse.urlunsplit(new_split) +def try_splits(url, *splits): + for split in splits: + if requests.get(rebuild_url(url, split), verify=False).ok: + return rebuild_url(url, split) + return None + + def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): if url is None and fp_parsed is not None: url = fp_parsed.get('url') @@ -45,7 +52,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): fp_parsed = feedparser.parse(response.content) assert url is not None and fp_parsed is not None feed = feed or {} - split = urllib.parse.urlsplit(url) + feed_split = urllib.parse.urlsplit(url) if not fp_parsed['bozo']: feed['link'] = url feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link') @@ -56,11 +63,13 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): feed['site_link'] = url if feed.get('site_link'): - feed['site_link'] = rebuild_url(feed['site_link'], split) - split = urllib.parse.urlsplit(feed['site_link']) + feed['site_link'] = rebuild_url(feed['site_link'], feed_split) + site_split = urllib.parse.urlsplit(feed['site_link']) if feed.get('icon'): - feed['icon'] = rebuild_url(feed['icon'], split) + feed['icon'] = try_splits(feed['icon'], site_split, feed_split) + if feed['icon'] is None: + del feed['icon'] if not feed.get('site_link') or not query_site \ or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')): @@ -91,11 +100,16 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): if not len(icons): icons = bs_parsed.find_all(check_keys(rel=['icon'])) if len(icons) >= 1: - feed['icon'] = rebuild_url(icons[0].attrs['href'], split) - else: # trying the default one - icon = rebuild_url('/favicon.ico', split) - if requests.get(icon, verify=False).ok: - feed['icon'] = icon + for icon in icons: + feed['icon'] = try_splits(icon.attrs['href'], + site_split, feed_split) + if feed['icon'] is not None: + break + + if feed['icon'] is None: + feed['icon'] = try_splits('/favicon.ico', site_split, feed_split) + if feed['icon'] is None: + del feed['icon'] if not feed.get('link'): alternate = bs_parsed.find_all(check_keys(rel=['alternate'], -- cgit From d08cc46087d3349aff7b06908c70d97fecbdec8f Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Sun, 5 Jul 2015 16:38:26 +0200 Subject: constructing feed 
from normal url also --- pyaggr3g470r/lib/utils.py | 9 ++++----- pyaggr3g470r/views/feed.py | 7 ++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py index 041a2d29..6d6725c8 100644 --- a/pyaggr3g470r/lib/utils.py +++ b/pyaggr3g470r/lib/utils.py @@ -77,7 +77,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): response = requests.get(feed['site_link'], verify=False) bs_parsed = BeautifulSoup(response.content, 'html.parser', - parse_only=SoupStrainer('head')) + parse_only=SoupStrainer('head')) if not feed.get('title'): try: @@ -115,9 +115,8 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): alternate = bs_parsed.find_all(check_keys(rel=['alternate'], type=['application/rss+xml'])) if len(alternate) == 1: - feed['link'] = rebuild_url(alternate[0].attrs['href'], split) + feed['link'] = alternate[0].attrs['href'] elif len(alternate) > 1: - feed['link'] = rebuild_url(alternate[0].attrs['href'], split) - feed['other_link'] = [rebuild_url(al.attrs['href'], split) - for al in alternate[1:]] + feed['link'] = alternate[0].attrs['href'] + feed['other_link'] = [al.attrs['href'] for al in alternate[1:]] return feed diff --git a/pyaggr3g470r/views/feed.py b/pyaggr3g470r/views/feed.py index 8bd2f8e9..d31aa212 100644 --- a/pyaggr3g470r/views/feed.py +++ b/pyaggr3g470r/views/feed.py @@ -12,6 +12,7 @@ from flask.ext.login import login_required import conf from pyaggr3g470r import utils +from pyaggr3g470r.lib.utils import construct_feed_from from pyaggr3g470r.forms import AddFeedForm from pyaggr3g470r.controllers import FeedController, ArticleController @@ -94,14 +95,14 @@ def bookmarklet(): flash(gettext("Couldn't add feed: url missing."), "error") raise BadRequest("url is missing") - existing_feeds = list(feed_contr.read(link=url)) - if existing_feeds: + feed_exists = list(feed_contr.read(__or__={'link': url, 'site_link': url})) + if feed_exists: flash(gettext("Couldn't add feed: feed already exists."), "warning") return redirect(url_for('feed.form', feed_id=existing_feeds[0].id)) - feed = feed_contr.create(link=url) + feed = feed_contr.create(**construct_feed_from(url)) flash(gettext('Feed was successfully created.'), 'success') if conf.CRAWLING_METHOD == "classic": utils.fetch(g.user.id, feed.id) -- cgit From c1551acb30513f96d0053b96e240da7ab68833d2 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Sun, 5 Jul 2015 17:41:06 +0200 Subject: making bookmaklet work for any url --- pyaggr3g470r/lib/utils.py | 21 +++++++++++++-------- pyaggr3g470r/views/feed.py | 5 ++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py index 6d6725c8..a4f2e043 100644 --- a/pyaggr3g470r/lib/utils.py +++ b/pyaggr3g470r/lib/utils.py @@ -1,9 +1,12 @@ import types import urllib +import logging import requests import feedparser from bs4 import BeautifulSoup, SoupStrainer +logger = logging.getLogger(__name__) + def default_handler(obj): """JSON handler for default query formatting""" @@ -48,8 +51,13 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): if url is None and fp_parsed is not None: url = fp_parsed.get('url') if url is not None and fp_parsed is None: - response = requests.get(url, verify=False) - fp_parsed = feedparser.parse(response.content) + try: + response = requests.get(url, verify=False) + fp_parsed = feedparser.parse(response.content, + request_headers=response.headers) + except 
Exception: + logger.exception('failed to retreive that url') + fp_parsed = {'bozo': True} assert url is not None and fp_parsed is not None feed = feed or {} feed_split = urllib.parse.urlsplit(url) @@ -106,17 +114,14 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): if feed['icon'] is not None: break - if feed['icon'] is None: + if feed.get('icon') is None: feed['icon'] = try_splits('/favicon.ico', site_split, feed_split) - if feed['icon'] is None: + if 'icon' in feed and feed['icon'] is None: del feed['icon'] if not feed.get('link'): alternate = bs_parsed.find_all(check_keys(rel=['alternate'], type=['application/rss+xml'])) - if len(alternate) == 1: - feed['link'] = alternate[0].attrs['href'] - elif len(alternate) > 1: + if len(alternate) >= 1: feed['link'] = alternate[0].attrs['href'] - feed['other_link'] = [al.attrs['href'] for al in alternate[1:]] return feed diff --git a/pyaggr3g470r/views/feed.py b/pyaggr3g470r/views/feed.py index d31aa212..224e27fb 100644 --- a/pyaggr3g470r/views/feed.py +++ b/pyaggr3g470r/views/feed.py @@ -1,6 +1,6 @@ #! /usr/bin/env python # -*- coding: utf-8 - - +import logging from datetime import datetime from sqlalchemy import desc from werkzeug.exceptions import BadRequest @@ -99,8 +99,7 @@ def bookmarklet(): if feed_exists: flash(gettext("Couldn't add feed: feed already exists."), "warning") - return redirect(url_for('feed.form', - feed_id=existing_feeds[0].id)) + return redirect(url_for('feed.form', feed_id=feed_exists[0].id)) feed = feed_contr.create(**construct_feed_from(url)) flash(gettext('Feed was successfully created.'), 'success') -- cgit From 75df52051b167425adcfb68797f77fcbcad33c4e Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Mon, 6 Jul 2015 10:28:58 +0200 Subject: light refact * both crawlers now use the same utils methods * the original crawler use now more of the controllers, enabling the filters feature --- pyaggr3g470r/crawler.py | 164 ++++++++------------------------------ pyaggr3g470r/lib/article_utils.py | 94 ++++++++++++++++++++++ pyaggr3g470r/lib/crawler.py | 68 ++-------------- pyaggr3g470r/lib/feed_utils.py | 89 +++++++++++++++++++++ pyaggr3g470r/lib/utils.py | 83 +------------------ pyaggr3g470r/views/feed.py | 3 +- 6 files changed, 225 insertions(+), 276 deletions(-) create mode 100644 pyaggr3g470r/lib/article_utils.py create mode 100644 pyaggr3g470r/lib/feed_utils.py diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py index 23f9026e..b70b4e70 100644 --- a/pyaggr3g470r/crawler.py +++ b/pyaggr3g470r/crawler.py @@ -27,19 +27,18 @@ __copyright__ = "Copyright (c) Cedric Bonhomme" __license__ = "AGPLv3" import asyncio -import aiohttp import logging -import requests import feedparser import dateutil.parser from datetime import datetime -from bs4 import BeautifulSoup from sqlalchemy import or_ -from pyaggr3g470r import utils import conf from bootstrap import db -from pyaggr3g470r.models import User, Article +from pyaggr3g470r.models import User +from pyaggr3g470r.controllers import FeedController, ArticleController +from pyaggr3g470r.lib.feed_utils import construct_feed_from +from pyaggr3g470r.lib.article_utils import construct_article, extract_id logger = logging.getLogger(__name__) @@ -85,122 +84,26 @@ def parse_feed(user, feed): db.session.commit() return - #a_feed = feedparser.parse(data) + up_feed = {} if a_feed['bozo'] == 1: - #logger.error(a_feed['bozo_exception']) - feed.last_error = str(a_feed['bozo_exception']) - feed.error_count += 1 + up_feed['last_error'] = 
str(a_feed['bozo_exception']) + up_feed['error_count'] = feed.error_count + 1 db.session.commit() if a_feed['entries'] == []: return - feed.last_retrieved = datetime.now(dateutil.tz.tzlocal()) - feed.error_count = 0 - feed.last_error = "" + up_feed['last_retrieved'] = datetime.now(dateutil.tz.tzlocal()) + up_feed['error_count'] = 0 + up_feed['last_error'] = "" # Feed informations - try: - feed.title = a_feed.feed.title - except: - feed.title = "No title" - if feed.link == "": - try: - feed.link = a_feed.feed.link - except: - feed.link = "" - try: - feed.description = a_feed.feed.subtitle - except: - feed.description = "" - try: - - feed.icon = [a_feed.feed.get('image', False) and - a_feed.feed.image.get('href', "") or a_feed.feed.get('icon', "")][0] - except: - feed.icon = "" - - db.session.commit() - - articles = [] - for article in a_feed['entries']: - - try: - nice_url = article.link - except: - # if not able to get the link of the article, continue - continue - if conf.RESOLVE_ARTICLE_URL: - try: - # resolves URL behind proxies - # (like feedproxy.google.com) - r = requests.get(article.link, timeout=5.0) - nice_url = r.url - except Exception as error: - logger.warning( - "Unable to get the real URL of %s. Error: %s", - article.link, error) - pass - # remove utm_* parameters - nice_url = utils.clean_url(nice_url) - - try: - entry_id = article.id - except: - entry_id = nice_url - - description = "" - article_title = article.get('title', '') - try: - # article content - description = article.content[0].value - except AttributeError: - # article description - description = article.get('description', '') - - try: - soup = BeautifulSoup(description, "lxml") - - # Prevents BeautifulSoup4 from adding extra tags - # to the soup with the lxml parser. - if soup.html.body: - description = soup.html.body.decode_contents() - elif soup.html: - description = soup.html.decode_contents() - else: - description = soup.decode() - except: - logger.error("Problem when sanitizing the content of the article %s (%s)", - article_title, nice_url) - - # Get the date of publication of the article - post_date = None - for date_key in ('published_parsed', 'published', - 'updated_parsed', 'updated'): - if not date_key in article: - continue + up_feed.update(construct_feed_from(feed.link, a_feed)) + if feed.title and 'title' in up_feed: + del up_feed['title'] + FeedController().update({'id': feed.id}, up_feed) - try: - post_date = dateutil.parser.parse(article[date_key], - dayfirst=True) - break - except: - try: # trying to clean date field from letters - post_date = dateutil.parser.parse( - re.sub('[A-z]', '', article[date_key]), - dayfirst=True) - break - except: - pass - else: - post_date = datetime.now(dateutil.tz.tzlocal()) + return a_feed['entries'] - # create the models.Article object and append it to the list of articles - article = Article(entry_id=entry_id, link=nice_url, title=article_title, - content=description, readed=False, like=False, - date=post_date, user_id=user.id, - feed_id=feed.id) - articles.append(article) - return articles @asyncio.coroutine def insert_database(user, feed): @@ -209,34 +112,32 @@ def insert_database(user, feed): if None is articles: return [] - #print('inserting articles for {}'.format(feed.title)) + logger.debug('inserting articles for {}'.format(feed.title)) logger.info("Database insertion...") new_articles = [] - query1 = Article.query.filter(Article.user_id == user.id) - query2 = query1.filter(Article.feed_id == feed.id) + art_contr = ArticleController(user.id) for article 
in articles: - exist = query2.filter(or_(Article.entry_id==article.entry_id, Article.link==article.link)).count() != 0 + exist = art_contr.read(feed_id=feed.id, **extract_id(article)) if exist: - #logger.debug("Article %r (%r) already in the database.", article.title, article.link) + logger.debug("Article %r (%r) already in the database.", + article.title, article.link) continue - new_articles.append(article) + article = construct_article(article, feed) try: - feed.articles.append(article) - #db.session.merge(article) - db.session.commit() - #logger.info("New article % (%r) added.", article.title, article.link) - except Exception as e: - logger.error("Error when inserting article in database: " + str(e)) + new_articles.append(art_contr.create(**article)) + logger.info("New article % (%r) added.", + article.title, article.link) + except Exception: + logger.exception("Error when inserting article in database:") continue - #db.session.close() return new_articles @asyncio.coroutine def init_process(user, feed): # Fetch the feed and insert new articles in the database articles = yield from asyncio.async(insert_database(user, feed)) - #print('inserted articles for {}'.format(feed.title)) + logger.debug('inserted articles for %s', feed.title) return articles def retrieve_feed(loop, user, feed_id=None): @@ -248,24 +149,23 @@ def retrieve_feed(loop, user, feed_id=None): # Get the list of feeds to fetch user = User.query.filter(User.email == user.email).first() feeds = [feed for feed in user.feeds if - feed.error_count <= conf.DEFAULT_MAX_ERROR and \ - feed.enabled] + feed.error_count <= conf.DEFAULT_MAX_ERROR and feed.enabled] if feed_id is not None: feeds = [feed for feed in feeds if feed.id == feed_id] if feeds == []: return - import time # Launch the process for all the feeds tasks = [] try: # Python 3.5 (test) - tasks = [asyncio.ensure_future(init_process(user, feed)) for feed in feeds] + tasks = [asyncio.ensure_future(init_process(user, feed)) + for feed in feeds] except: tasks = [init_process(user, feed) for feed in feeds] try: loop.run_until_complete(asyncio.wait(tasks)) - except Exception as e: - print(e) + except Exception: + logger.exception('an error occured') logger.info("All articles retrieved. End of the processus.") diff --git a/pyaggr3g470r/lib/article_utils.py b/pyaggr3g470r/lib/article_utils.py new file mode 100644 index 00000000..023be9a7 --- /dev/null +++ b/pyaggr3g470r/lib/article_utils.py @@ -0,0 +1,94 @@ +import logging +import requests +import dateutil.parser +from datetime import datetime +from bs4 import BeautifulSoup + +import conf +from pyaggr3g470r.lib.utils import to_hash + +logger = logging.getLogger(__name__) + + +def extract_id(entry, keys=[('link', 'link'), + ('published', 'retrieved_date'), + ('updated', 'retrieved_date')], force_id=False): + """For a given entry will return a dict that allows to identify it. The + dict will be constructed on the uid of the entry. if that identifier is + absent, the dict will be constructed upon the values of "keys". 
+ """ + entry_id = entry.get('entry_id') or entry.get('id') + if entry_id: + return {'entry_id': entry_id} + if not entry_id and force_id: + entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys + if entry_key in entry).encode('utf8')) + else: + ids = {} + for entry_key, pyagg_key in keys: + if entry_key in entry and pyagg_key not in ids: + ids[pyagg_key] = entry[entry_key] + if 'date' in pyagg_key: + ids[pyagg_key] = dateutil.parser.parse(ids[pyagg_key])\ + .isoformat() + return ids + + +def construct_article(entry, feed): + "Safe method to transorm a feedparser entry into an article" + now = datetime.now() + + for date_key in ('published', 'updated'): + if entry.get(date_key): + try: + date = dateutil.parser.parse(entry[date_key]) + except Exception: + pass + else: + break + content = '' + if entry.get('content'): + content = entry['content'][0]['value'] + elif entry.get('summary'): + content = entry['summary'] + + description = entry.get('description', '') + try: + description = entry.content[0].value + except Exception: + pass + + try: + soup = BeautifulSoup(description, "lxml") + # Prevents BeautifulSoup4 from adding extra tags + # to the soup with the lxml parser. + if soup.html.body: + description = soup.html.body.decode_contents() + elif soup.html: + description = soup.html.decode_contents() + else: + description = soup.decode() + except Exception: + pass + + article_link = entry.get('link') + if conf.RESOLVE_ARTICLE_URL and article_link: + try: + # resolves URL behind proxies + # (like feedproxy.google.com) + response = requests.get(article_link, verify=False, timeout=5.0) + article_link = response.url + except Exception as error: + logger.warning("Unable to get the real URL of %s. Error: %s", + article_link, error) + + return {'feed_id': feed['id'], + 'user_id': feed['user_id'], + 'entry_id': extract_id(entry).get('entry_id', None), + 'link': entry.get('link', feed['site_link']), + 'title': entry.get('title', 'No title'), + 'readed': False, 'like': False, + 'description': description, + 'content': content, + 'retrieved_date': now.isoformat(), + 'date': (date or now).isoformat()} diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 45b1acde..8d2de15f 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -17,48 +17,19 @@ import conf import json import logging import feedparser -import dateutil.parser -from hashlib import md5 from functools import wraps -from datetime import datetime from time import strftime, gmtime from concurrent.futures import ThreadPoolExecutor from requests_futures.sessions import FuturesSession -from pyaggr3g470r.lib.utils import default_handler, construct_feed_from +from pyaggr3g470r.lib.utils import default_handler, to_hash +from pyaggr3g470r.lib.feed_utils import construct_feed_from +from pyaggr3g470r.lib.article_utils import extract_id, construct_article logger = logging.getLogger(__name__) logging.captureWarnings(True) API_ROOT = "api/v2.0/" -def to_hash(text): - return md5(text.encode('utf8')).hexdigest() - - -def extract_id(entry, keys=[('link', 'link'), - ('published', 'retrieved_date'), - ('updated', 'retrieved_date')], force_id=False): - """For a given entry will return a dict that allows to identify it. The - dict will be constructed on the uid of the entry. if that identifier is - absent, the dict will be constructed upon the values of "keys". 
- """ - entry_id = entry.get('entry_id') or entry.get('id') - if entry_id: - return {'entry_id': entry_id} - if not entry_id and force_id: - entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys - if entry_key in entry).encode('utf8')) - else: - ids = {} - for entry_key, pyagg_key in keys: - if entry_key in entry and pyagg_key not in ids: - ids[pyagg_key] = entry[entry_key] - if 'date' in pyagg_key: - ids[pyagg_key] = dateutil.parser.parse(ids[pyagg_key])\ - .isoformat() - return ids - - class AbstractCrawler: __session__ = None __counter__ = 0 @@ -139,34 +110,6 @@ class PyAggUpdater(AbstractCrawler): self.parsed_feed = parsed_feed super(PyAggUpdater, self).__init__(auth) - def to_article(self, entry): - "Safe method to transorm a feedparser entry into an article" - date = datetime.now() - - for date_key in ('published', 'updated'): - if entry.get(date_key): - try: - date = dateutil.parser.parse(entry[date_key]) - except Exception: - pass - else: - break - content = '' - if entry.get('content'): - content = entry['content'][0]['value'] - elif entry.get('summary'): - content = entry['summary'] - - return {'feed_id': self.feed['id'], - 'user_id': self.feed['user_id'], - 'entry_id': extract_id(entry).get('entry_id', None), - 'link': entry.get('link', self.feed['site_link']), - 'title': entry.get('title', 'No title'), - 'readed': False, 'like': False, - 'content': content, - 'retrieved_date': date.isoformat(), - 'date': date.isoformat()} - @AbstractCrawler.count_on_me def callback(self, response): """Will process the result from the challenge, creating missing article @@ -176,8 +119,9 @@ class PyAggUpdater(AbstractCrawler): logger.debug('%r %r - %d entries were not matched and will be created', self.feed['id'], self.feed['title'], len(results)) for id_to_create in results: - entry = self.to_article( - self.entries[tuple(sorted(id_to_create.items()))]) + entry = construct_article( + self.entries[tuple(sorted(id_to_create.items()))], + self.feed) logger.warn('%r %r - creating %r for %r - %r', self.feed['id'], self.feed['title'], entry['title'], entry['user_id'], id_to_create) diff --git a/pyaggr3g470r/lib/feed_utils.py b/pyaggr3g470r/lib/feed_utils.py new file mode 100644 index 00000000..a7149d79 --- /dev/null +++ b/pyaggr3g470r/lib/feed_utils.py @@ -0,0 +1,89 @@ +import urllib +import logging +import requests +import feedparser +from bs4 import BeautifulSoup, SoupStrainer + +from pyaggr3g470r.lib.utils import try_keys, try_splits, rebuild_url + +logger = logging.getLogger(__name__) + + +def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): + if url is None and fp_parsed is not None: + url = fp_parsed.get('url') + if url is not None and fp_parsed is None: + try: + response = requests.get(url, verify=False) + fp_parsed = feedparser.parse(response.content, + request_headers=response.headers) + except Exception: + logger.exception('failed to retreive that url') + fp_parsed = {'bozo': True} + assert url is not None and fp_parsed is not None + feed = feed or {} + feed_split = urllib.parse.urlsplit(url) + if not fp_parsed['bozo']: + feed['link'] = url + feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link') + feed['title'] = fp_parsed['feed'].get('title') + feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title') + feed['icon'] = try_keys(fp_parsed['feed'], 'icon') + else: + feed['site_link'] = url + + if feed.get('site_link'): + feed['site_link'] = rebuild_url(feed['site_link'], feed_split) + site_split = 
urllib.parse.urlsplit(feed['site_link']) + + if feed.get('icon'): + feed['icon'] = try_splits(feed['icon'], site_split, feed_split) + if feed['icon'] is None: + del feed['icon'] + + if not feed.get('site_link') or not query_site \ + or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')): + return feed + + response = requests.get(feed['site_link'], verify=False) + bs_parsed = BeautifulSoup(response.content, 'html.parser', + parse_only=SoupStrainer('head')) + + if not feed.get('title'): + try: + feed['title'] = bs_parsed.find_all('title')[0].text + except Exception: + pass + + def check_keys(**kwargs): + def wrapper(elem): + for key, vals in kwargs.items(): + if not elem.has_attr(key): + return False + if not all(val in elem.attrs[key] for val in vals): + return False + return True + return wrapper + + if not feed.get('icon'): + icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut'])) + if not len(icons): + icons = bs_parsed.find_all(check_keys(rel=['icon'])) + if len(icons) >= 1: + for icon in icons: + feed['icon'] = try_splits(icon.attrs['href'], + site_split, feed_split) + if feed['icon'] is not None: + break + + if feed.get('icon') is None: + feed['icon'] = try_splits('/favicon.ico', site_split, feed_split) + if 'icon' in feed and feed['icon'] is None: + del feed['icon'] + + if not feed.get('link'): + alternate = bs_parsed.find_all(check_keys(rel=['alternate'], + type=['application/rss+xml'])) + if len(alternate) >= 1: + feed['link'] = alternate[0].attrs['href'] + return feed diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py index a4f2e043..b7e5cafc 100644 --- a/pyaggr3g470r/lib/utils.py +++ b/pyaggr3g470r/lib/utils.py @@ -2,8 +2,7 @@ import types import urllib import logging import requests -import feedparser -from bs4 import BeautifulSoup, SoupStrainer +from hashlib import md5 logger = logging.getLogger(__name__) @@ -47,81 +46,5 @@ def try_splits(url, *splits): return None -def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): - if url is None and fp_parsed is not None: - url = fp_parsed.get('url') - if url is not None and fp_parsed is None: - try: - response = requests.get(url, verify=False) - fp_parsed = feedparser.parse(response.content, - request_headers=response.headers) - except Exception: - logger.exception('failed to retreive that url') - fp_parsed = {'bozo': True} - assert url is not None and fp_parsed is not None - feed = feed or {} - feed_split = urllib.parse.urlsplit(url) - if not fp_parsed['bozo']: - feed['link'] = url - feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link') - feed['title'] = fp_parsed['feed'].get('title') - feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title') - feed['icon'] = try_keys(fp_parsed['feed'], 'icon') - else: - feed['site_link'] = url - - if feed.get('site_link'): - feed['site_link'] = rebuild_url(feed['site_link'], feed_split) - site_split = urllib.parse.urlsplit(feed['site_link']) - - if feed.get('icon'): - feed['icon'] = try_splits(feed['icon'], site_split, feed_split) - if feed['icon'] is None: - del feed['icon'] - - if not feed.get('site_link') or not query_site \ - or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')): - return feed - - response = requests.get(feed['site_link'], verify=False) - bs_parsed = BeautifulSoup(response.content, 'html.parser', - parse_only=SoupStrainer('head')) - - if not feed.get('title'): - try: - feed['title'] = bs_parsed.find_all('title')[0].text - except Exception: - pass - - def check_keys(**kwargs): - def 
wrapper(elem): - for key, vals in kwargs.items(): - if not elem.has_attr(key): - return False - if not all(val in elem.attrs[key] for val in vals): - return False - return True - return wrapper - - if not feed.get('icon'): - icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut'])) - if not len(icons): - icons = bs_parsed.find_all(check_keys(rel=['icon'])) - if len(icons) >= 1: - for icon in icons: - feed['icon'] = try_splits(icon.attrs['href'], - site_split, feed_split) - if feed['icon'] is not None: - break - - if feed.get('icon') is None: - feed['icon'] = try_splits('/favicon.ico', site_split, feed_split) - if 'icon' in feed and feed['icon'] is None: - del feed['icon'] - - if not feed.get('link'): - alternate = bs_parsed.find_all(check_keys(rel=['alternate'], - type=['application/rss+xml'])) - if len(alternate) >= 1: - feed['link'] = alternate[0].attrs['href'] - return feed +def to_hash(text): + return md5(text.encode('utf8')).hexdigest() diff --git a/pyaggr3g470r/views/feed.py b/pyaggr3g470r/views/feed.py index 224e27fb..4c848b0e 100644 --- a/pyaggr3g470r/views/feed.py +++ b/pyaggr3g470r/views/feed.py @@ -1,6 +1,5 @@ #! /usr/bin/env python # -*- coding: utf-8 - -import logging from datetime import datetime from sqlalchemy import desc from werkzeug.exceptions import BadRequest @@ -12,7 +11,7 @@ from flask.ext.login import login_required import conf from pyaggr3g470r import utils -from pyaggr3g470r.lib.utils import construct_feed_from +from pyaggr3g470r.lib.feed.utils import construct_feed_from from pyaggr3g470r.forms import AddFeedForm from pyaggr3g470r.controllers import FeedController, ArticleController -- cgit From c47e8fbd86eebd4c61888b1cba6cf39670679fd0 Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Mon, 6 Jul 2015 11:47:45 +0200 Subject: the icon isn't a url but a b64 dump --- pyaggr3g470r/lib/utils.py | 7 +++++-- pyaggr3g470r/templates/home.html | 2 +- pyaggr3g470r/views/feed.py | 11 +++++++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py index b7e5cafc..280256f6 100644 --- a/pyaggr3g470r/lib/utils.py +++ b/pyaggr3g470r/lib/utils.py @@ -1,5 +1,6 @@ import types import urllib +import base64 import logging import requests from hashlib import md5 @@ -41,8 +42,10 @@ def rebuild_url(url, base_split): def try_splits(url, *splits): for split in splits: - if requests.get(rebuild_url(url, split), verify=False).ok: - return rebuild_url(url, split) + rb_url = rebuild_url(url, split) + response = requests.get(rb_url, verify=False, timeout=10) + if response.ok and 'html' not in response.headers['content-type']: + return base64.b64encode(response.content).decode('utf8') return None diff --git a/pyaggr3g470r/templates/home.html b/pyaggr3g470r/templates/home.html index 7e272b55..42a5d498 100644 --- a/pyaggr3g470r/templates/home.html +++ b/pyaggr3g470r/templates/home.html @@ -106,7 +106,7 @@ {% endif %} - {% if article.source.icon %}{% endif %} + {% if article.source.icon %}{% endif %} {{ article.title|safe }} diff --git a/pyaggr3g470r/views/feed.py b/pyaggr3g470r/views/feed.py index 4c848b0e..d84a68bc 100644 --- a/pyaggr3g470r/views/feed.py +++ b/pyaggr3g470r/views/feed.py @@ -1,17 +1,18 @@ #! 
/usr/bin/env python # -*- coding: utf-8 - +import base64 from datetime import datetime from sqlalchemy import desc from werkzeug.exceptions import BadRequest from flask import Blueprint, g, render_template, flash, \ - redirect, request, url_for + redirect, request, url_for, Response from flask.ext.babel import gettext from flask.ext.login import login_required import conf from pyaggr3g470r import utils -from pyaggr3g470r.lib.feed.utils import construct_feed_from +from pyaggr3g470r.lib.feed_utils import construct_feed_from from pyaggr3g470r.forms import AddFeedForm from pyaggr3g470r.controllers import FeedController, ArticleController @@ -182,3 +183,9 @@ def process_form(feed_id=None): flash(gettext("Downloading articles for the new feed..."), 'info') return redirect(url_for('feed.form', feed_id=new_feed.id)) + + +@feed_bp.route('/icon/', methods=['GET']) +def icon(feed_id): + return Response(base64.b64decode(FeedController().get(id=feed_id).icon), + mimetype='image') -- cgit From da4a415176c5d91a2310666ab597224591d3957c Mon Sep 17 00:00:00 2001 From: François Schmidts Date: Mon, 6 Jul 2015 14:20:21 +0200 Subject: adding cache control on icons --- pyaggr3g470r/views/feed.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pyaggr3g470r/views/feed.py b/pyaggr3g470r/views/feed.py index d84a68bc..3556d7c7 100644 --- a/pyaggr3g470r/views/feed.py +++ b/pyaggr3g470r/views/feed.py @@ -1,6 +1,7 @@ #! /usr/bin/env python # -*- coding: utf-8 - import base64 +from hashlib import md5 from datetime import datetime from sqlalchemy import desc from werkzeug.exceptions import BadRequest @@ -186,6 +187,11 @@ def process_form(feed_id=None): @feed_bp.route('/icon/', methods=['GET']) +@login_required def icon(feed_id): - return Response(base64.b64decode(FeedController().get(id=feed_id).icon), - mimetype='image') + icon = FeedController(g.user.id).get(id=feed_id).icon + etag = md5(icon.encode('utf8')).hexdigest() + headers = {'Cache-Control': 'max-age=86400', 'ETag': etag} + if request.headers.get('if-none-match') == etag: + return Response(status=304, headers=headers) + return Response(base64.b64decode(icon), mimetype='image', headers=headers) -- cgit
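A minimal sketch of the icon handling this series converges on (an illustration, not code from the patches): try_splits() rebuilds a possibly relative icon href against the site link and then the feed link, and, since "the icon isn't a url but a b64 dump", keeps the first non-HTML response as a base64 string rather than a URL. The body of rebuild_url() is not visible in these diffs, so the version below is an assumption that only fills in a missing scheme and netloc from the base; the example.com URLs are placeholders.

    import base64
    import urllib.parse

    import requests


    def rebuild_url(url, base_split):
        # assumed behaviour: complete a relative URL with the scheme and
        # netloc of the base it was found on
        split = urllib.parse.urlsplit(url)
        new_split = split._replace(scheme=split.scheme or base_split.scheme,
                                   netloc=split.netloc or base_split.netloc)
        return urllib.parse.urlunsplit(new_split)


    def try_splits(url, *splits):
        # try the icon against each base; the first answer that is not an
        # HTML page is kept as a base64 dump, as in the b64-dump commit
        for split in splits:
            rb_url = rebuild_url(url, split)
            response = requests.get(rb_url, verify=False, timeout=10)
            if response.ok and 'html' not in response.headers.get('content-type', ''):
                return base64.b64encode(response.content).decode('utf8')
        return None


    site_split = urllib.parse.urlsplit('http://example.com/blog/')
    feed_split = urllib.parse.urlsplit('http://example.com/feeds/all.atom.xml')
    # same call shape as in construct_feed_from(): site link first, feed link second
    icon = try_splits('/favicon.ico', site_split, feed_split)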
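The refactoring commit has both crawlers identify feedparser entries the same way through lib/article_utils.extract_id(): a feed-provided id wins, otherwise the link and the parsed publication date are used to look the article up before insertion. A self-contained rendition with made-up entries, leaving out the force_id branch, shows the two shapes of the returned dict:

    import dateutil.parser


    def extract_id(entry, keys=[('link', 'link'),
                                ('published', 'retrieved_date'),
                                ('updated', 'retrieved_date')]):
        # the entry id is the strongest identifier when the feed provides one
        entry_id = entry.get('entry_id') or entry.get('id')
        if entry_id:
            return {'entry_id': entry_id}
        # otherwise fall back on the link and the publication date
        ids = {}
        for entry_key, pyagg_key in keys:
            if entry_key in entry and pyagg_key not in ids:
                ids[pyagg_key] = entry[entry_key]
                if 'date' in pyagg_key:
                    ids[pyagg_key] = dateutil.parser.parse(
                            ids[pyagg_key]).isoformat()
        return ids


    print(extract_id({'id': 'tag:example.org,2015:entry-1'}))
    # {'entry_id': 'tag:example.org,2015:entry-1'}
    print(extract_id({'link': 'http://example.org/post',
                      'published': '2015-07-06'}))
    # {'link': 'http://example.org/post',
    #  'retrieved_date': '2015-07-06T00:00:00'}

In crawler.parse_feed the result is simply splatted into ArticleController.read() to check whether the article already exists before creating it.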
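The last commit turns the icon view into a conditional GET endpoint: the md5 of the stored base64 icon serves as an ETag, and a request carrying a matching If-None-Match header is answered with a bare 304. Below is a minimal Flask sketch of that pattern, with FEED_ICONS standing in for FeedController and the database, the blueprint and @login_required dropped for brevity, the <int:feed_id> route parameter assumed (it is not legible in the diff), and the generic 'image' mimetype kept as in the patch.

    import base64
    from hashlib import md5

    from flask import Flask, Response, request

    app = Flask(__name__)
    # made-up storage: feed id -> base64-encoded icon bytes
    FEED_ICONS = {1: base64.b64encode(b'\x89PNG fake icon bytes').decode('utf8')}


    @app.route('/icon/<int:feed_id>', methods=['GET'])
    def icon(feed_id):
        icon = FEED_ICONS[feed_id]
        etag = md5(icon.encode('utf8')).hexdigest()
        headers = {'Cache-Control': 'max-age=86400', 'ETag': etag}
        # client already holds this version: answer 304 with no body
        if request.headers.get('if-none-match') == etag:
            return Response(status=304, headers=headers)
        return Response(base64.b64decode(icon), mimetype='image',
                        headers=headers)


    if __name__ == '__main__':
        app.run()

With max-age=86400 the browser does not re-ask for a day; after that, a matching ETag costs a header round trip instead of a new image download.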