import logging
import types
import urllib.parse

import feedparser
import requests
from bs4 import BeautifulSoup, SoupStrainer

logger = logging.getLogger(__name__)


def default_handler(obj):
    """Fallback JSON serializer for objects the stdlib encoder can't handle."""
    if hasattr(obj, 'isoformat'):
        return obj.isoformat()
    if hasattr(obj, 'dump'):
        return obj.dump()
    if isinstance(obj, (set, frozenset, types.GeneratorType)):
        return list(obj)
    if isinstance(obj, BaseException):
        return str(obj)
    raise TypeError("Object of type %s with value of %r "
                    "is not JSON serializable" % (type(obj), obj))


def try_keys(dico, *keys):
    """Return the value of the first key present in dico, or None."""
    for key in keys:
        if key in dico:
            return dico[key]
    return None


def rebuild_url(url, base_split):
    """Make url absolute, borrowing scheme and netloc from base_split."""
    split = urllib.parse.urlsplit(url)
    if split.scheme and split.netloc:
        return url  # url is already absolute
    new_split = urllib.parse.SplitResult(
        scheme=split.scheme or base_split.scheme,
        netloc=split.netloc or base_split.netloc,
        path=split.path, query='', fragment='')
    return urllib.parse.urlunsplit(new_split)
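
# Hedged example (hypothetical values): with
# base_split = urllib.parse.urlsplit('https://example.org/rss'),
# rebuild_url('/favicon.ico', base_split) yields 'https://example.org/favicon.ico',
# while an already absolute URL is returned unchanged.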


def try_splits(url, *splits):
    """Return the first rebuilt URL that answers with a status below 400, or None."""
    for split in splits:
        candidate = rebuild_url(url, split)
        if requests.get(candidate, verify=False).ok:
            return candidate
    return None


def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
    """Build a feed dict (link, site_link, title, description, icon) from a URL
    and/or an existing feedparser result, optionally crawling the site itself."""
    if url is None and fp_parsed is not None:
        url = fp_parsed.get('url')
    if url is not None and fp_parsed is None:
        # only a URL was provided: fetch and parse the feed ourselves
        try:
            response = requests.get(url, verify=False)
            fp_parsed = feedparser.parse(response.content,
                                         request_headers=response.headers)
        except Exception:
            logger.exception('failed to retrieve that url')
            fp_parsed = {'bozo': True}
    assert url is not None and fp_parsed is not None
    feed = feed or {}
    feed_split = urllib.parse.urlsplit(url)
    if not fp_parsed['bozo']:
        # parsing succeeded: take the metadata the feed advertises about itself
        feed['link'] = url
        feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
        feed['title'] = fp_parsed['feed'].get('title')
        feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title')
        feed['icon'] = try_keys(fp_parsed['feed'], 'icon')
    else:
        # the document is not a valid feed: treat the URL as the site itself
        feed['site_link'] = url

    if feed.get('site_link'):
        feed['site_link'] = rebuild_url(feed['site_link'], feed_split)
        site_split = urllib.parse.urlsplit(feed['site_link'])

    if feed.get('icon'):
        feed['icon'] = try_splits(feed['icon'], site_split, feed_split)
        if feed['icon'] is None:
            del feed['icon']

    # stop here if there is no site to crawl, crawling is disabled,
    # or every piece of metadata is already known
    if not feed.get('site_link') or not query_site \
            or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
        return feed
    # crawl the site's <head> to complete the missing metadata
    response = requests.get(feed['site_link'], verify=False)
    bs_parsed = BeautifulSoup(response.content, 'html.parser',
                              parse_only=SoupStrainer('head'))

    if not feed.get('title'):
        try:
            feed['title'] = bs_parsed.find_all('title')[0].text
        except Exception:
            pass

    def check_keys(**kwargs):
        """Return a BeautifulSoup filter matching tags whose attributes
        contain all of the given values."""
        def wrapper(elem):
            for key, vals in kwargs.items():
                if not elem.has_attr(key):
                    return False
                if not all(val in elem.attrs[key] for val in vals):
                    return False
            return True
        return wrapper
    if not feed.get('icon'):
        # prefer an icon advertised in the page head, then fall back to /favicon.ico
        icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut']))
        if not icons:
            icons = bs_parsed.find_all(check_keys(rel=['icon']))
        for icon in icons:
            feed['icon'] = try_splits(icon.attrs['href'],
                                      site_split, feed_split)
            if feed['icon'] is not None:
                break

        if feed.get('icon') is None:
            feed['icon'] = try_splits('/favicon.ico', site_split, feed_split)
        if 'icon' in feed and feed['icon'] is None:
            del feed['icon']

    if not feed.get('link'):
        # fall back to an advertised RSS alternate link
        alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
                                                  type=['application/rss+xml']))
        if alternate:
            feed['link'] = alternate[0].attrs['href']
    return feed
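

if __name__ == '__main__':
    # Hedged usage sketch, not part of the original module: the feed URL below is
    # hypothetical and the call performs real (unverified) HTTP requests.
    import json
    logging.basicConfig(level=logging.INFO)
    result = construct_feed_from(url='https://example.org/feed.xml')
    print(json.dumps(result, indent=2, default=default_handler))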