#!/usr/bin/env python3
# File: /usr/libexec/savewebfonts/savewebfonts_lib.py
# Location: save-webfonts package
# Author: bgstack15
# Startdate: 2021-04-02 07:20
# SPDX-License-Identifier: CC-BY-SA 4.0
# Title: Library for Saving Webfonts
# Purpose: library for whitelisting a page's webfonts by downloading them for current user
# Usage: See save-webfonts (1)
# Reference:
#    https://github.com/fonttools/fonttools/issues/1694
# Improve:
#    accept a list of filetypes to save, or exclude? Such as, ['ttf','woff2']
#    Convert woff2 fonts?
#    Handle using tinycss old?
# Dependencies:
#    req-fedora: python3-beautifulsoup4, python3-tinycss2
#    rec-fedora: python3-fonttools

import errno
import io
import json
import os
import tempfile
from email.message import Message
from sys import stderr
from urllib.parse import urljoin, urlparse

import requests
import tinycss2 # python3-tinycss2
from bs4 import BeautifulSoup as bs # python3-beautifulsoup4

# defaults for library
debuglevel = 8
MAX_STRING_PRINT_LENGTH = 180

# Extensions that ttfify_filename() replaces with ".ttf"
_CONVERTIBLE_EXTENSIONS = (".woff", ".woff2", ".svg", ".eot")


# Functions
def eprint(*args, **kwargs):
    """ Print to stderr, accepting the same arguments as print() """
    print(*args, file=stderr, **kwargs)


def ttfify_filename(filename):
    """
    Return filename with a trailing .woff, .woff2, .svg or .eot extension
    (matched case-insensitively) replaced by ".ttf". Any other name simply
    gets ".ttf" appended.
    """
    # str.rstrip() removes a *set of characters*, not a suffix, so it would
    # eat into the base name ("off.woff" -> ""). Split the extension instead.
    stem, ext = os.path.splitext(filename)
    if ext.lower() in _CONVERTIBLE_EXTENSIONS:
        filename = stem
    return filename + ".ttf"


def get_session():
    """ Return a requests session that sends a desktop-browser User-Agent """
    session = requests.Session()
    session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    return session


def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun=False):
    """
    Return all css links from a given page, as absolute urls, in document
    order and without duplicates. The dryrun parameter is accepted for
    symmetry with the other functions and is not used.
    """
    # Reference: https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python
    if not session:
        session = get_session()
    html = session.get(url).content
    soup = bs(html, "html.parser")
    css_files = []
    for link in soup.find_all("link"):
        href = link.attrs.get("href")
        # A <link> tag is not required to carry an href attribute at all
        if not href or ".css" not in href:
            continue
        css_url = urljoin(url, href)
        if css_url in css_files:
            continue
        if debuglevel >= 8:
            eprint(f"Found css: {css_url}")
        css_files.append(css_url)
    return css_files


def _url_from_css_token(token):
    """ Return the url text carried by one tinycss2 component value, or None """
    token_type = getattr(token, "type", "")
    if token_type == "url":
        # unquoted form: url(font.woff2)
        return token.value
    if token_type == "function" and getattr(token, "lower_name", "") == "url":
        # quoted form: url("font.woff2"). Newer tinycss2 releases report this
        # as a function block whose first string argument is the url.
        for argument in token.arguments:
            if getattr(argument, "type", "") == "string":
                return argument.value
    return None


def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun=False):
    """
    Return a list of urls of all webfonts specified in this css file.
    Urls are made absolute against the css url, and each appears only once.
    The dryrun parameter is accepted for symmetry and is not used.
    """
    if not session:
        session = get_session()
    css = session.get(url).content
    # parse_stylesheet_bytes returns a (rules, encoding) tuple
    rules, _encoding = tinycss2.parse_stylesheet_bytes(css)
    font_urls = []
    for index, rule in enumerate(rules, start=1):
        # Only the @font-face rules are interesting
        if getattr(rule, "type", "") != "at-rule":
            continue
        if getattr(rule, "lower_at_keyword", "") != "font-face":
            continue
        if debuglevel >= 10:
            eprint(str(index) + " " + str(rule))
        # content is None for an at-rule that ends with ";" instead of a block
        for token in rule.content or []:
            value = _url_from_css_token(token)
            if value is None:
                continue
            # make absolute from relative
            thisurl = urljoin(url, value)
            if thisurl not in font_urls:
                if debuglevel >= 5:
                    eprint(f"get_webfonts_for_one_css: Found font url {thisurl}")
                font_urls.append(thisurl)
    return font_urls


def _filename_from_content_disposition(header_value):
    """ Return the bare filename named by a Content-Disposition header, or "" """
    message = Message()
    message["Content-Disposition"] = header_value
    name = message.get_filename()
    if not name:
        return ""
    # Keep only the last path component, so that a server cannot direct the
    # download outside of the destination directory.
    name = os.path.basename(name.replace("\\", "/"))
    if name in (".", ".."):
        return ""
    return name


def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun=False, convert=False):
    """
    Given a url, and destination dir, and optionally an existing http session,
    download the url and save to a file. If convert, save any woff/woff2 to ttf.
    The filename comes from the url path, unless the server names one in a
    Content-Disposition header.
    Returns 0 when the file was saved, already exists, or dryrun is set.
    Returns -1 for an unsupported data: url or when download, conversion or
    writing fails; the failure is reported on stderr and not raised.
    """
    if url.startswith("data:"):
        # not supported!
        # WORKHERE: support saving to a tempfile this datastream, probably a base64encoded woff file. Then just convert.
        eprint(f"Warning: Url {url[:MAX_STRING_PRINT_LENGTH]} is unsupported.")
        return -1

    # Derive filename
    filename = os.path.basename(urlparse(url).path)
    ext = os.path.splitext(filename)[-1]
    # Do not try to convert .eot
    need_convert = bool(convert) and not filename.endswith(".ttf") and ext != ".eot"
    if need_convert:
        filename = ttfify_filename(filename)
    filepath = os.path.join(destdir, filename)

    if os.path.exists(filepath):
        if debuglevel >= 2:
            eprint(f"File {filepath} exists for {url}. Skipping.")
        return 0

    try:
        response = None
        if not dryrun:
            # Download content
            response = session.get(url) if session else requests.get(url)
            # Do not store an http error page as if it were a font
            response.raise_for_status()
            if 'Content-Disposition' in response.headers:
                header_value = response.headers['Content-Disposition']
                served_name = _filename_from_content_disposition(header_value)
                if served_name:
                    eprint(f"Using content-disposition value of {header_value}")
                    filename = served_name
                    if need_convert and not filename.endswith(".ttf"):
                        filename = ttfify_filename(filename)
                    # The name changed, so the destination path must follow it
                    filepath = os.path.join(destdir, filename)
        if debuglevel >= 1:
            sstring = "Saving" if not dryrun else "Save"
            eprint(f"{sstring} {url} to file {filepath}")
        if dryrun:
            return 0
        if need_convert:
            # Hand over the bytes already fetched, so the font is downloaded only once
            return convert_font(url, filepath, session=session, debuglevel=debuglevel, dryrun=dryrun, content=response.content)
        with open(filepath, 'wb') as thisfile:
            thisfile.write(response.content)
        return 0
    except Exception as E:
        # Deliberately broad: one bad font must not stop the remaining fonts
        eprint(f"Error when downloading {url}, {E}")
        return -1


def get_all_fonts_from_csslist(all_css, session=None, debuglevel=debuglevel, dryrun=False):
    """
    Given a list of css urls, return the font urls named by their @font-face
    rules, without duplicates and without any url that contains ".svg".
    """
    all_fonts = []
    for this_css in all_css:
        webfonts = get_webfonts_for_one_css(this_css, session, debuglevel=debuglevel, dryrun=dryrun)
        for webfont in webfonts:
            # filter accepted extensions here. Technically fontconfig only uses ttf.
            # Always exclude svg, because those are really big, and not usable files for fontconfig.
            # WORKHERE: allow svg, if convert_font works on svg.
            if webfont in all_fonts or '.svg' in webfont:
                continue
            if debuglevel >= 2:
                eprint(f"Found font {webfont}")
            all_fonts.append(webfont)
    return all_fonts


def save_all_fonts(all_fonts, destdir, session=None, debuglevel=debuglevel, dryrun=False, convert=False):
    """
    Given a list of font urls, and the destdir, save all these fonts.
    The directory, and any missing parent, is created unless dryrun is set.
    Raises NotADirectoryError if destdir exists and is not a directory.
    Returns 0. Failures of individual fonts are reported on stderr by
    save_font and do not change the return value.
    """
    if os.path.exists(destdir) and not os.path.isdir(destdir):
        raise NotADirectoryError(errno.ENOTDIR, "Please clean up this non-directory file and try again", destdir)
    if not dryrun:
        os.makedirs(destdir, exist_ok=True)
    # Loop through all webfont files and save them
    for font in all_fonts:
        save_font(font, destdir, session=session, debuglevel=debuglevel, dryrun=dryrun, convert=convert)
    return 0


def whitelist_page(url, fontdir, session=None, debuglevel=debuglevel, dryrun=False, convert = False):
    """
    For the given URL, Save all listed webfonts to a directory named
    after the domain, underneath the given fontdir. If convert, then
    convert all woff, woff2 files to ttf using fontTools.
    Returns the result of save_all_fonts.
    """
    if not session:
        session = get_session()
    # List all webfonts called by the given page
    all_css = list_all_css_for_page(url, session, debuglevel=debuglevel, dryrun=dryrun)
    all_fonts = get_all_fonts_from_csslist(all_css, session, debuglevel=debuglevel, dryrun=dryrun)
    # Prepare destination dir
    destdir = os.path.join(fontdir,urlparse(url).netloc)
    # Save all fonts to that dir
    return save_all_fonts(all_fonts, destdir, session, debuglevel=debuglevel, dryrun=dryrun, convert=convert)


def whitelist_harfile(harfile, fontdir, session=None, debuglevel=debuglevel, dryrun=False, convert=False):
    """
    Given the harfile, save all fonts listed in the discovered css files
    to the "harfiles" directory underneath the given fontdir.
    Returns the result of save_all_fonts.
    """
    if not session:
        session = get_session()
    # List all css in the har file
    all_css = extract_css_urls_from_harfile(harfile, debuglevel=debuglevel)
    all_fonts = get_all_fonts_from_csslist(all_css, session, debuglevel=debuglevel, dryrun=dryrun)
    # Prepare destination dir
    destdir = os.path.join(fontdir,"harfiles")
    # Save all fonts to that dir
    return save_all_fonts(all_fonts, destdir, session, debuglevel=debuglevel, dryrun=dryrun, convert=convert)


def extract_css_urls_from_harfile(harfile, debuglevel=debuglevel):
    """
    Extract all urls that match string "css" from a har file.
    Returns the request urls in file order, without duplicates.
    """
    # utf-8-sig reads plain UTF-8 and also tolerates a leading byte order mark
    with open(harfile, 'r', encoding="utf-8-sig") as o:
        har = json.load(o)
    css_files = []
    for entry in har['log']['entries']:
        entry_url = entry['request']['url']
        if "css" in entry_url and entry_url not in css_files:
            if debuglevel >= 5:
                eprint(entry_url)
            css_files.append(entry_url)
    return css_files


def convert_font(url, filename, session=None, debuglevel=debuglevel, dryrun=False, content=None):
    """
    Save the given url to filename, with filetype ttf.
    If content (bytes) is given, it is converted as-is and nothing is
    downloaded. If dryrun, nothing is downloaded or written.
    Returns 0 on success or dryrun, and -1 if fontTools cannot be imported.
    Download and conversion errors are raised to the caller.
    """
    if dryrun:
        if debuglevel >= 3:
            eprint(f"Convert {url[:MAX_STRING_PRINT_LENGTH]} to ttf as file {filename}")
        return 0
    # Check for the optional library before spending a download on it
    try:
        from fontTools import ttLib
    except ModuleNotFoundError:
        eprint("Warning: cannot load fontTools. Try installing python3-fonttools")
        return -1
    if content is None:
        response = session.get(url) if session else requests.get(url)
        response.raise_for_status()
        content = response.content
    font = ttLib.TTFont(io.BytesIO(content))
    if debuglevel >= 3:
        eprint(f"Converting {url[:MAX_STRING_PRINT_LENGTH]} from {font.flavor} to ttf as file {filename}")
    font.flavor = None # restores default value, for non-compressed OpenType
    font.save(filename)
    return 0