diff options
Diffstat (limited to 'pyaggr3g470r')
-rw-r--r-- | pyaggr3g470r/lib/crawler.py | 33 | ||||
-rw-r--r-- | pyaggr3g470r/lib/utils.py | 95 | ||||
-rw-r--r-- | pyaggr3g470r/models/feed.py | 2 | ||||
-rw-r--r-- | pyaggr3g470r/templates/home.html | 4 | ||||
-rw-r--r-- | pyaggr3g470r/views/api/feed.py | 1 |
5 files changed, 121 insertions, 14 deletions
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 324f0d8e..2ba5403a 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -24,7 +24,7 @@ from datetime import datetime from time import strftime, gmtime from concurrent.futures import ThreadPoolExecutor from requests_futures.sessions import FuturesSession -from pyaggr3g470r.lib.utils import default_handler +from pyaggr3g470r.lib.utils import default_handler, construct_feed_from logger = logging.getLogger(__name__) logging.captureWarnings(True) @@ -136,7 +136,7 @@ class PyAggUpdater(AbstractCrawler): self.feed = feed self.entries = entries self.headers = headers - self.parsed_feed = parsed_feed.get('feed', {}) + self.parsed_feed = parsed_feed super(PyAggUpdater, self).__init__(auth) def to_article(self, entry): @@ -188,19 +188,26 @@ class PyAggUpdater(AbstractCrawler): self.headers.get('etag', ''), self.headers.get('last-modified', '')) - dico = {'error_count': 0, 'last_error': None, - 'etag': self.headers.get('etag', ''), - 'last_modified': self.headers.get('last-modified', - strftime('%a, %d %b %Y %X %Z', gmtime())), - 'site_link': self.parsed_feed.get('link')} + up_feed = {'error_count': 0, 'last_error': None, + 'etag': self.headers.get('etag', ''), + 'last_modified': self.headers.get('last-modified', + strftime('%a, %d %b %Y %X %Z', gmtime()))} + fresh_feed = construct_feed_from(url=self.feed['link'], + fp_parsed=self.parsed_feed, + feed=self.feed) + for key in ('description', 'site_link', 'icon'): + if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key): + up_feed[key] = fresh_feed[key] if not self.feed.get('title'): - dico['title'] = self.parsed_feed.get('title', '') + up_feed['title'] = fresh_feed.get('title', '') + logger.info('%r %r - pushing feed attrs %r', self.feed['id'], self.feed['title'], - {key: "%s -> %s" % (dico[key], self.feed.get(key)) - for key in dico if dico[key] != self.feed.get(key)}) - if any([dico[key] != self.feed.get(key) for key in dico]): - future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico) + {key: "%s -> %s" % (up_feed[key], self.feed.get(key)) + for key in up_feed if up_feed[key] != self.feed.get(key)}) + if any([up_feed[key] != self.feed.get(key) for key in up_feed]): + future = self.query_pyagg('put', + 'feed/%d' % self.feed['id'], up_feed) future.add_done_callback(self.get_counter_callback()) @@ -265,7 +272,7 @@ class FeedCrawler(AbstractCrawler): self.feed['id'], self.feed['title']) ids, entries = [], {} - parsed_response = feedparser.parse(response.text) + parsed_response = feedparser.parse(response.content) for entry in parsed_response['entries']: entry_ids = extract_id(entry) entry_ids['feed_id'] = self.feed['id'] diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py index a4f4b3ec..a0154b7f 100644 --- a/pyaggr3g470r/lib/utils.py +++ b/pyaggr3g470r/lib/utils.py @@ -1,4 +1,9 @@ import types +import urllib +import requests +import feedparser +from bs4 import BeautifulSoup, SoupStrainer + def default_handler(obj): """JSON handler for default query formatting""" @@ -12,3 +17,93 @@ def default_handler(obj): return str(obj) raise TypeError("Object of type %s with value of %r " "is not JSON serializable" % (type(obj), obj)) + + +def try_keys(dico, *keys): + for key in keys: + if key in dico: + return dico[key] + return + + +def rebuild_url(url, base_split): + split = urllib.parse.urlsplit(url) + if split.scheme and split.netloc: + return url # url is fine + new_split = urllib.parse.SplitResult( + scheme=split.scheme or base_split.scheme, + netloc=split.netloc or base_split.netloc, + path=split.path, query='', fragment='') + return urllib.parse.urlunsplit(new_split) + + +def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): + if url is None and fp_parsed is not None: + url = fp_parsed.get('url') + if url is not None and fp_parsed is None: + response = requests.get(url, verify=False) + fp_parsed = feedparser.parse(response.content) + assert url is not None and fp_parsed is not None + feed = feed or {} + split = urllib.parse.urlsplit(url) + if not fp_parsed['bozo']: + feed['link'] = url + feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link') + feed['title'] = fp_parsed['feed'].get('title') + feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title') + feed['icon'] = try_keys(fp_parsed['feed'], 'icon') + else: + feed['site_link'] = url + + if feed.get('site_link'): + feed['site_link'] = rebuild_url(feed['site_link'], split) + split = urllib.parse.urlsplit(feed['site_link']) + + if feed.get('icon'): + feed['icon'] = rebuild_url(feed['icon'], split) + + if not feed.get('site_link') or not query_site \ + or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')): + return feed + + response = requests.get(feed['site_link'], verify=False) + bs_parsed = BeautifulSoup(response.content, 'html.parser', + parse_only=SoupStrainer('head')) + + if not feed.get('title'): + try: + feed['title'] = bs_parsed.find_all('title')[0].text + except Exception: + pass + + def check_keys(**kwargs): + def wrapper(elem): + for key, vals in kwargs.items(): + if not elem.has_attr(key): + return False + if not all(val in elem.attrs[key] for val in vals): + return False + return True + return wrapper + + if not feed.get('icon'): + icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut'])) + if not len(icons): + icons = bs_parsed.find_all(check_keys(rel=['icon'])) + if len(icons) >= 1: + feed['icon'] = rebuild_url(icons[0].attrs['href'], split) + else: # trying the default one + icon = rebuild_url('/favicon.ico', split) + if requests.get(icon, verify=False).ok: + feed['icon'] = icon + + if not feed.get('link'): + alternate = bs_parsed.find_all(check_keys(rel=['alternate'], + type=['application/rss+xml'])) + if len(alternate) == 1: + feed['link'] = rebuild_url(alternate[0].attrs['href'], split) + elif len(alternate) > 1: + feed['link'] = rebuild_url(alternate[0].attrs['href'], split) + feed['other_link'] = [rebuild_url(al.attrs['href'], split) + for al in alternate[1:]] + return feed diff --git a/pyaggr3g470r/models/feed.py b/pyaggr3g470r/models/feed.py index 793642fb..75e55df1 100644 --- a/pyaggr3g470r/models/feed.py +++ b/pyaggr3g470r/models/feed.py @@ -43,6 +43,7 @@ class Feed(db.Model): enabled = db.Column(db.Boolean(), default=True) created_date = db.Column(db.DateTime(), default=datetime.now) filters = db.Column(db.PickleType, default=[]) + icon = db.Column(db.String(), default="") # cache handling etag = db.Column(db.String(), default="") @@ -70,6 +71,7 @@ class Feed(db.Model): "link": self.link, "site_link": self.site_link, "etag": self.etag, + "icon": self.icon, "error_count": self.error_count, "last_modified": self.last_modified, "last_retrieved": self.last_retrieved} diff --git a/pyaggr3g470r/templates/home.html b/pyaggr3g470r/templates/home.html index 6d1ca85e..5c961888 100644 --- a/pyaggr3g470r/templates/home.html +++ b/pyaggr3g470r/templates/home.html @@ -105,7 +105,9 @@ {% if filter_ == 'all' %}</b>{% endif %} {% endif %} </td> - <td><a class="open-article" href="/article/redirect/{{ article.id}}" target="_blank">{{ article.source.title|safe }}</a></td> + <td><a class="open-article" href="/article/redirect/{{ article.id}}" target="_blank"> + {% if article.source.icon %}<img src="{{ article.source.icon }}" width="16px" />{% endif %} + {{ article.source.title|safe }}</a></td> <td {%if filter_ == 'all' and article.readed == False %}style='font-weight:bold'{% endif %}> <a href="/article/{{ article.id }}">{{ article.title|safe }}</a> </td> diff --git a/pyaggr3g470r/views/api/feed.py b/pyaggr3g470r/views/api/feed.py index 68f3a12c..f9060263 100644 --- a/pyaggr3g470r/views/api/feed.py +++ b/pyaggr3g470r/views/api/feed.py @@ -19,6 +19,7 @@ FEED_ATTRS = {'title': {'type': str}, 'site_link': {'type': str}, 'enabled': {'type': bool, 'default': True}, 'etag': {'type': str, 'default': ''}, + 'icon': {'type': str, 'default': ''}, 'last_modified': {'type': str}, 'last_retrieved': {'type': str}, 'last_error': {'type': str}, |