about | summary | refs | log | tree | commit | diff
path: root/pyaggr3g470r
diff options
context:
space:
mode:
Diffstat (limited to 'pyaggr3g470r')
-rw-r--r-- pyaggr3g470r/lib/crawler.py 33
-rw-r--r-- pyaggr3g470r/lib/utils.py 95
-rw-r--r-- pyaggr3g470r/models/feed.py 2
-rw-r--r-- pyaggr3g470r/templates/home.html 4
-rw-r--r-- pyaggr3g470r/views/api/feed.py 1
5 files changed, 121 insertions, 14 deletions
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 324f0d8e..2ba5403a 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -24,7 +24,7 @@ from datetime import datetime
from time import strftime, gmtime
from concurrent.futures import ThreadPoolExecutor
from requests_futures.sessions import FuturesSession
-from pyaggr3g470r.lib.utils import default_handler
+from pyaggr3g470r.lib.utils import default_handler, construct_feed_from
logger = logging.getLogger(__name__)
logging.captureWarnings(True)
@@ -136,7 +136,7 @@ class PyAggUpdater(AbstractCrawler):
self.feed = feed
self.entries = entries
self.headers = headers
- self.parsed_feed = parsed_feed.get('feed', {})
+ self.parsed_feed = parsed_feed
super(PyAggUpdater, self).__init__(auth)
def to_article(self, entry):
@@ -188,19 +188,26 @@ class PyAggUpdater(AbstractCrawler):
self.headers.get('etag', ''),
self.headers.get('last-modified', ''))
- dico = {'error_count': 0, 'last_error': None,
- 'etag': self.headers.get('etag', ''),
- 'last_modified': self.headers.get('last-modified',
- strftime('%a, %d %b %Y %X %Z', gmtime())),
- 'site_link': self.parsed_feed.get('link')}
+ up_feed = {'error_count': 0, 'last_error': None,
+ 'etag': self.headers.get('etag', ''),
+ 'last_modified': self.headers.get('last-modified',
+ strftime('%a, %d %b %Y %X %Z', gmtime()))}
+ fresh_feed = construct_feed_from(url=self.feed['link'],
+ fp_parsed=self.parsed_feed,
+ feed=self.feed)
+ for key in ('description', 'site_link', 'icon'):
+ if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key):
+ up_feed[key] = fresh_feed[key]
if not self.feed.get('title'):
- dico['title'] = self.parsed_feed.get('title', '')
+ up_feed['title'] = fresh_feed.get('title', '')
+
logger.info('%r %r - pushing feed attrs %r',
self.feed['id'], self.feed['title'],
- {key: "%s -> %s" % (dico[key], self.feed.get(key))
- for key in dico if dico[key] != self.feed.get(key)})
- if any([dico[key] != self.feed.get(key) for key in dico]):
- future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico)
+ {key: "%s -> %s" % (up_feed[key], self.feed.get(key))
+ for key in up_feed if up_feed[key] != self.feed.get(key)})
+ if any([up_feed[key] != self.feed.get(key) for key in up_feed]):
+ future = self.query_pyagg('put',
+ 'feed/%d' % self.feed['id'], up_feed)
future.add_done_callback(self.get_counter_callback())
@@ -265,7 +272,7 @@ class FeedCrawler(AbstractCrawler):
self.feed['id'], self.feed['title'])
ids, entries = [], {}
- parsed_response = feedparser.parse(response.text)
+ parsed_response = feedparser.parse(response.content)
for entry in parsed_response['entries']:
entry_ids = extract_id(entry)
entry_ids['feed_id'] = self.feed['id']
diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index a4f4b3ec..a0154b7f 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -1,4 +1,9 @@
import types
+import urllib
+import requests
+import feedparser
+from bs4 import BeautifulSoup, SoupStrainer
+
def default_handler(obj):
"""JSON handler for default query formatting"""
@@ -12,3 +17,93 @@ def default_handler(obj):
return str(obj)
raise TypeError("Object of type %s with value of %r "
"is not JSON serializable" % (type(obj), obj))
+
+
def try_keys(dico, *keys):
    """Return the value stored under the first of ``keys`` found in ``dico``.

    Keys are tried in the order given. Presence is tested with ``in``, so a
    key mapped to a falsy value (``''``, ``0``, ``None``) still wins over the
    keys that follow it.

    :param dico: mapping to look into; ``None`` is treated as an empty mapping.
    :param keys: candidate keys, by decreasing priority.
    :return: the value of the first key present, or ``None`` when none is.
    """
    if dico is None:
        return None
    for key in keys:
        if key in dico:
            return dico[key]
    return None
+
+
def rebuild_url(url, base_split):
    """Make ``url`` absolute by borrowing missing parts from ``base_split``.

    A URL that already has both a scheme and a network location is returned
    untouched. Otherwise the missing scheme and/or network location are taken
    from ``base_split``; the path, query string and fragment of ``url`` are
    kept as they are.

    :param url: absolute, protocol-relative (``//host/path``) or
        host-relative (``/path``) URL.
    :param base_split: ``urllib.parse.SplitResult`` (or any object exposing
        ``scheme`` and ``netloc``) supplying the fallback values.
    :return: the rebuilt URL as a string.
    """
    # Imported here so the ``parse`` submodule is guaranteed to be loaded;
    # a bare ``import urllib`` does not ensure that.
    import urllib.parse

    split = urllib.parse.urlsplit(url)
    if split.scheme and split.netloc:
        return url  # already absolute
    # Only the scheme and netloc are patched: dropping the query string would
    # turn e.g. ``/feed?format=rss`` into a different resource.
    return urllib.parse.urlunsplit(split._replace(
        scheme=split.scheme or base_split.scheme,
        netloc=split.netloc or base_split.netloc))
+
+
def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True,
                        timeout=30):
    """Build a dict describing a feed from its URL and/or its parsed content.

    At least one of ``url`` and ``fp_parsed`` must be given. When only
    ``url`` is given it is downloaded and handed to ``feedparser.parse``;
    when only ``fp_parsed`` is given the URL is read from its ``'url'`` key.

    If the parsed document is not flagged ``bozo``, ``link``, ``site_link``,
    ``title``, ``description`` and ``icon`` are filled from it. Otherwise
    ``url`` is taken to be the site itself and stored as ``site_link``.
    Unless ``query_site`` is false, or ``link``, ``title`` and ``icon`` are
    already all known, the ``<head>`` of the site page is then fetched to
    find a missing title, icon (falling back to ``/favicon.ico``) and
    RSS ``alternate`` link(s). Extra alternate links are stored under
    ``other_link``.

    :param url: URL of the feed, or of a site when it is not a feed.
    :param fp_parsed: result of ``feedparser.parse`` for that URL, if the
        caller already has it.
    :param feed: optional dict of already known values. It is copied, never
        modified, so the caller can compare it with the returned dict.
    :param query_site: when false, never fetch the site page or favicon.
    :param timeout: timeout in seconds passed to every ``requests.get`` call.
    :return: a new dict holding the feed attributes that could be determined.
    :raises requests.RequestException: if downloading ``url`` or the site
        page fails (a failing favicon probe is ignored).
    """
    # Imported here so the ``parse`` submodule is guaranteed to be loaded;
    # a bare ``import urllib`` does not ensure that.
    import urllib.parse

    # NOTE(review): every request in this function is made with verify=False,
    # which disables TLS certificate checks -- confirm this is intended.
    if url is None and fp_parsed is not None:
        url = fp_parsed.get('url')
    if url is not None and fp_parsed is None:
        response = requests.get(url, verify=False, timeout=timeout)
        fp_parsed = feedparser.parse(response.content)
    assert url is not None and fp_parsed is not None, \
        "construct_feed_from needs a url or a parsed feed carrying one"

    # Work on a copy: writing into the caller's dict would make the result
    # the very same object as the input, so a caller comparing old and new
    # values would never see a difference.
    feed = dict(feed) if feed else {}
    split = urllib.parse.urlsplit(url)

    if not fp_parsed['bozo']:
        parsed = fp_parsed['feed']
        feed['link'] = url
        feed['site_link'] = try_keys(parsed, 'href', 'link')
        feed['title'] = parsed.get('title')
        feed['description'] = try_keys(parsed, 'subtitle', 'title')
        feed['icon'] = try_keys(parsed, 'icon')
    else:
        feed['site_link'] = url

    if feed.get('site_link'):
        feed['site_link'] = rebuild_url(feed['site_link'], split)
        # from here on, relative URLs are resolved against the site
        split = urllib.parse.urlsplit(feed['site_link'])

    if feed.get('icon'):
        feed['icon'] = rebuild_url(feed['icon'], split)

    if not feed.get('site_link') or not query_site \
            or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
        return feed

    response = requests.get(feed['site_link'], verify=False, timeout=timeout)
    bs_parsed = BeautifulSoup(response.content, 'html.parser',
                              parse_only=SoupStrainer('head'))

    if not feed.get('title'):
        title_tag = bs_parsed.find('title')
        if title_tag is not None:
            feed['title'] = title_tag.text

    def check_keys(**kwargs):
        """Build an element filter requiring each attribute to hold all
        the listed values."""
        def wrapper(elem):
            for key, vals in kwargs.items():
                if not elem.has_attr(key):
                    return False
                if not all(val in elem.attrs[key] for val in vals):
                    return False
            return True
        return wrapper

    def find_links(**attrs):
        """Return the elements matching ``attrs`` that carry an ``href``."""
        # Elements without href are useless here and would raise KeyError
        # when their target is read.
        return [elem for elem in bs_parsed.find_all(check_keys(**attrs))
                if elem.has_attr('href')]

    if not feed.get('icon'):
        # prefer rel="shortcut icon", then any rel containing "icon"
        icons = find_links(rel=['icon', 'shortcut'])
        if not icons:
            icons = find_links(rel=['icon'])
        if icons:
            feed['icon'] = rebuild_url(icons[0].attrs['href'], split)
        else:  # trying the default one
            icon = rebuild_url('/favicon.ico', split)
            try:
                found = requests.get(icon, verify=False, timeout=timeout).ok
            except requests.RequestException:
                # an unreachable favicon is the same as no favicon
                found = False
            if found:
                feed['icon'] = icon

    if not feed.get('link'):
        alternate = find_links(rel=['alternate'],
                               type=['application/rss+xml'])
        if alternate:
            feed['link'] = rebuild_url(alternate[0].attrs['href'], split)
        if len(alternate) > 1:
            feed['other_link'] = [rebuild_url(al.attrs['href'], split)
                                  for al in alternate[1:]]
    return feed
diff --git a/pyaggr3g470r/models/feed.py b/pyaggr3g470r/models/feed.py
index 793642fb..75e55df1 100644
--- a/pyaggr3g470r/models/feed.py
+++ b/pyaggr3g470r/models/feed.py
@@ -43,6 +43,7 @@ class Feed(db.Model):
enabled = db.Column(db.Boolean(), default=True)
created_date = db.Column(db.DateTime(), default=datetime.now)
filters = db.Column(db.PickleType, default=[])
+ icon = db.Column(db.String(), default="")
# cache handling
etag = db.Column(db.String(), default="")
@@ -70,6 +71,7 @@ class Feed(db.Model):
"link": self.link,
"site_link": self.site_link,
"etag": self.etag,
+ "icon": self.icon,
"error_count": self.error_count,
"last_modified": self.last_modified,
"last_retrieved": self.last_retrieved}
diff --git a/pyaggr3g470r/templates/home.html b/pyaggr3g470r/templates/home.html
index 6d1ca85e..5c961888 100644
--- a/pyaggr3g470r/templates/home.html
+++ b/pyaggr3g470r/templates/home.html
@@ -105,7 +105,9 @@
{% if filter_ == 'all' %}</b>{% endif %}
{% endif %}
</td>
- <td><a class="open-article" href="/article/redirect/{{ article.id}}" target="_blank">{{ article.source.title|safe }}</a></td>
+ <td><a class="open-article" href="/article/redirect/{{ article.id}}" target="_blank">
+ {% if article.source.icon %}<img src="{{ article.source.icon }}" width="16px" />{% endif %}
+ {{ article.source.title|safe }}</a></td>
<td {%if filter_ == 'all' and article.readed == False %}style='font-weight:bold'{% endif %}>
<a href="/article/{{ article.id }}">{{ article.title|safe }}</a>
</td>
diff --git a/pyaggr3g470r/views/api/feed.py b/pyaggr3g470r/views/api/feed.py
index 68f3a12c..f9060263 100644
--- a/pyaggr3g470r/views/api/feed.py
+++ b/pyaggr3g470r/views/api/feed.py
@@ -19,6 +19,7 @@ FEED_ATTRS = {'title': {'type': str},
'site_link': {'type': str},
'enabled': {'type': bool, 'default': True},
'etag': {'type': str, 'default': ''},
+ 'icon': {'type': str, 'default': ''},
'last_modified': {'type': str},
'last_retrieved': {'type': str},
'last_error': {'type': str},
bgstack15