From 92645115f3d8c4286acfe2f81c049b8c539093c4 Mon Sep 17 00:00:00 2001 From: "B. Stack" Date: Fri, 2 Apr 2021 23:02:17 -0400 Subject: support har files in lib Still need to add harfile support to front-end --- savewebfonts_lib.py | 139 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 98 insertions(+), 41 deletions(-) (limited to 'savewebfonts_lib.py') diff --git a/savewebfonts_lib.py b/savewebfonts_lib.py index 2e0a55a..a8bcb11 100755 --- a/savewebfonts_lib.py +++ b/savewebfonts_lib.py @@ -14,7 +14,7 @@ # Handle using tinycss old? # Dependencies: # req-fedora: python3-beautifulsoup4, python3-tinycss2 -import requests, os +import requests, os, json from sys import stderr from bs4 import BeautifulSoup as bs # python3-beautifulsoup4 from urllib.parse import urljoin, urlparse @@ -32,7 +32,7 @@ def get_session(): session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36" return session -def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun = False): +def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun=False): """ Return all css links from a given page """ @@ -51,7 +51,7 @@ def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun = css_files.append(css_url) return css_files -def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun = False): +def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun=False): """ Return a list of urls of all webfonts specified in this css file """ @@ -90,7 +90,7 @@ def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryru # c is a flat list of all font files, many of which are duplicates return c -def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun = False): +def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun=False): """ Given a url, and destination dir, and optionally an existing http session, download the url and save to a file """ @@ -101,48 +101,44 @@ def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun = False) filepath = os.path.join(destdir, filename) if not os.path.exists(filepath): - if not dryrun: - # Download content - if session: - response = session.get(url) - else: - response = requests.get(url) - - if 'Content-Disposition' in response.headers: - filename=response.headers['Content-Disposition'] - eprint(f"Using content-disposition value of {response.headers['Content-Disposition']}") - filepath = os.path.join(destdir, filename) - - # Future: logic for woff2 to ttf conversion goes here, approximately - - try: - if debuglevel >= 1: - sstring = "Saving" if not dryrun else "Save" - eprint(f"{sstring} {url} to file {filepath}") + if url.startswith("data:"): + # not supported! + eprint(f"Warning: Url {url[:120]} is unsupported.") + else: if not dryrun: - with open(filepath,'wb') as thisfile: - thisfile.write(response.content) - return 0 - except Exception as E: - eprint(f"Error when downloading {url}, {E}") - return -1 + # Download content + if session: + response = session.get(url) + else: + response = requests.get(url) + + if 'Content-Disposition' in response.headers: + filename=response.headers['Content-Disposition'] + eprint(f"Using content-disposition value of {response.headers['Content-Disposition']}") + filepath = os.path.join(destdir, filename) + + # Future: logic for woff2 to ttf conversion goes here, approximately + + try: + if debuglevel >= 1: + sstring = "Saving" if not dryrun else "Save" + eprint(f"{sstring} {url} to file {filepath}") + if not dryrun: + with open(filepath,'wb') as thisfile: + thisfile.write(response.content) + return 0 + except Exception as E: + eprint(f"Error when downloading {url}, {E}") + return -1 else: # filepath does exist if debuglevel >= 2: eprint(f"File {filepath} exists for {url}. Skipping.") return 0 -def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False): - """ - For the given URL, Save all listed webfonts to a directory named - after the domain, underneath the given fontdir. - """ +def get_all_fonts_from_csslist(all_css, session=None, debuglevel=debuglevel, dryrun=False): all_fonts = [] - session = get_session() - - # List all webfonts called by the given page - all_css = list_all_css_for_page(url, session, debuglevel = debuglevel, dryrun = dryrun) for this_css in all_css: - webfonts = get_webfonts_for_one_css(this_css, session, debuglevel = debuglevel, dryrun = dryrun) + webfonts = get_webfonts_for_one_css(this_css, session, debuglevel=debuglevel, dryrun=dryrun) for webfont in webfonts: # filter accepted extensions here. Technically fontconfig only uses ttf. # Always exclude svg, because those are really big, and not usable files for fontconfig. @@ -150,9 +146,12 @@ def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False): if debuglevel >= 2: eprint(f"Found font {webfont}") all_fonts.append(webfont) + return all_fonts - # Prepare destination dir - destdir = os.path.join(fontdir,urlparse(url).netloc) +def save_all_fonts(all_fonts, destdir, session=None, debuglevel=debuglevel, dryrun=False): + """ + Given a list of font urls, and the destdir, save all these fonts + """ #print(f"Found {len(all_fonts)} font files for page {url}") #print(f"Will save to {destdir}") @@ -171,5 +170,63 @@ def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False): # Loop through all webfont files and save them for font in all_fonts: - save_font(font, destdir, debuglevel = debuglevel, dryrun = dryrun) + save_font(font, destdir, session=session, debuglevel=debuglevel, dryrun=dryrun) return 0 + +def whitelist_page(url, fontdir, session=None, debuglevel=debuglevel, dryrun=False): + """ + For the given URL, Save all listed webfonts to a directory named + after the domain, underneath the given fontdir. + """ + all_fonts = [] + if not session: + session = get_session() + + # List all webfonts called by the given page + all_css = list_all_css_for_page(url, session, debuglevel=debuglevel, dryrun=dryrun) + all_fonts = get_all_fonts_from_csslist(all_css, session, debuglevel=debuglevel, dryrun=dryrun) + + # Prepare destination dir + destdir = os.path.join(fontdir,urlparse(url).netloc) + + # Save all fonts to that dir + return save_all_fonts(all_fonts, destdir, session, debuglevel=debuglevel, dryrun=dryrun) + +def whitelist_harfile(harfile, fontdir, session=None, debuglevel=debuglevel, dryrun=False): + """ + Given the harfile, save all fonts listed in the discovered css files + """ + all_fonts = [] + if not session: + session = get_session() + + # List all css in the har file + all_css = extract_css_urls_from_harfile(harfile) + all_fonts = get_all_fonts_from_csslist(all_css, session, debuglevel=debuglevel, dryrun=dryrun) + + # Prepare destination dir + destdir = os.path.join(fontdir,"harfiles") + + # Save all fonts to that dir + return save_all_fonts(all_fonts, destdir, session, debuglevel=debuglevel, dryrun=dryrun) + +def extract_css_urls_from_harfile(harfile): + """ + Extract all urls that match string "css" from a har file + """ + css_files = [] + + with open(harfile,'r') as o: + har_contents = o.read() + + a = json.loads(har_contents) + c = a['log']['entries'] + x = 0 + for d in c: + e = c[x]['request']['url'] + if "css" in e and e not in css_files: + if debuglevel >= 5: + eprint(e) + css_files.append(e) + x = x + 1 + return css_files -- cgit