#!/usr/bin/env python3
# File: /usr/libexec/savewebfonts/savewebfonts_lib.py
# Location: save-webfonts package
# Author: bgstack15
# Startdate: 2021-04-02 07:20
# SPDX-License-Identifier: CC-BY-SA 4.0
# Title: Library for Saving Webfonts
# Purpose: library for whitelisting a page's webfonts by downloading them for current user
# Usage: See save-webfonts (1)
# Reference:
#    https://github.com/fonttools/fonttools/issues/1694
# Improve:
#    Handle using tinycss old?
# Dependencies:
#    req-fedora: python3-beautifulsoup4, python3-tinycss2
#    rec-fedora: python3-fonttools, libeot-tools
#    req-devuan: python3-bs4, python3-tinycss2
#    rec-devuan: python3-fonttools, eot2ttf

import base64
import json
import os
import shutil
import subprocess
import tempfile
from email.message import Message
from sys import stderr
from urllib.parse import unquote_to_bytes, urljoin, urlparse

import requests
import tinycss2  # python3-tinycss2
from bs4 import BeautifulSoup as bs  # python3-beautifulsoup4

# Recognised inline ("data:") font types, checked in order:
# (substrings identifying the mime type, file extension, needs conversion to ttf)
_INLINE_FONT_TYPES = (
    (("/x-font-woff2;", "/font-woff2;", "font/woff2;"), ".woff2", True),
    (("/x-font-woff;", "/font-woff;", "font/woff;"), ".woff", True),
    (("/x-font-ttf;", "/font-ttf;", "font/ttf;"), ".ttf", False),
)

_REMOTE_SCHEMES = ("http://", "https://", "ftp://")


def get_session():
    """Return a requests session that presents a desktop browser User-Agent."""
    session = requests.Session()
    session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    return session


# defaults for library
class swf_config:
    """
    Settings shared by every function in this library.

    debuglevel: verbosity; messages are printed to stderr when it is high enough
    session: requests-style session used for all downloads (a new one if None)
    MAX_STRING_PRINT_LENGTH: urls are truncated to this length in messages
    eot2ttf_binary: name or path of the eot2ttf executable
    dryrun: when True, report what would be saved without writing anything
    convert: when True, convert downloaded fonts to ttf where possible
    """

    def __init__(
        self,
        debuglevel=8,
        session=None,
        MAX_STRING_PRINT_LENGTH=180,
        eot2ttf_binary="eot2ttf",
        dryrun=True,
        convert=False,
    ):
        self.debuglevel = debuglevel
        self.MAX_STRING_PRINT_LENGTH = MAX_STRING_PRINT_LENGTH
        self.eot2ttf_binary = eot2ttf_binary
        self.dryrun = dryrun
        self.session = get_session() if session is None else session
        self.convert = convert

    def __repr__(self):
        # The session object is omitted; its repr carries no useful settings.
        return (
            f"<swf_config debuglevel={self.debuglevel!r}"
            f" MAX_STRING_PRINT_LENGTH={self.MAX_STRING_PRINT_LENGTH!r}"
            f" eot2ttf_binary={self.eot2ttf_binary!r}"
            f" dryrun={self.dryrun!r}"
            f" convert={self.convert!r}>"
        )


config_default = swf_config()


# Functions
def eprint(*args, **kwargs):
    """Print to stderr; accepts the same arguments as print()."""
    print(*args, file=stderr, **kwargs)


def ttfify_filename(filename):
    """
    Return filename with any known webfont suffix removed and ".ttf" appended.
    """
    response = filename
    for end in [".woff2", ".woff", ".eot", ".svg"]:
        if response.endswith(end):
            # str.removesuffix would do this, but needs python 3.9 or higher
            response = response[:-len(end)]
    return response + ".ttf"


def list_all_css_for_page(url, config):
    """
    Return all css links from a given page
    """
    # Reference: https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python
    css_files = []
    html = config.session.get(url).content
    soup = bs(html, "html.parser")
    for link in soup.find_all("link"):
        href = link.attrs.get("href")
        # A link tag is not required to carry an href attribute
        if not href or ".css" not in href:
            continue
        css_url = urljoin(url, href)
        if config.debuglevel >= 8:
            eprint(f"Found css: {css_url}")
        css_files.append(css_url)
    return css_files


def get_webfonts_for_one_css(url, config):
    """
    Return a list of urls of all webfonts specified in this css file
    """
    css = config.session.get(url).content
    # parse_stylesheet_bytes returns the rules first; the second item is not needed
    rules = tinycss2.parse_stylesheet_bytes(css)[0]

    # extract only the font-face rules
    font_face_rules = []
    for index, rule in enumerate(rules, start=1):
        try:
            if "at-rule" in rule.type and "font-face" in rule.at_keyword:
                font_face_rules.append(rule)
                if config.debuglevel >= 10:
                    eprint(str(index) + " " + str(rule))
        except (AttributeError, TypeError):
            # not a node shaped like an at-rule; ignore it
            continue

    font_urls = []
    seen = set()
    for rule in font_face_rules:
        # content is None for an at-rule that has no {} block
        for token in rule.content or []:
            if "url" not in token.type:
                continue
            # make absolute from relative
            thisurl = urljoin(url, token.value)
            if thisurl in seen:
                continue
            seen.add(thisurl)
            if config.debuglevel >= 5:
                eprint(f"get_webfonts_for_one_css: Found font url {thisurl[:config.MAX_STRING_PRINT_LENGTH]}")
            font_urls.append(thisurl)
    # each font url is listed once, in the order first seen
    return font_urls


def _inline_font_header(url):
    """
    Return the "data:<type>;...;base64," prefix of url when it is a supported
    inline font, or None otherwise.
    """
    header, comma, _payload = url.partition(",")
    if not comma:
        return None
    header += comma
    lowered = header.lower()
    if not lowered.endswith(";base64,"):
        return None
    for markers, _ext, _convert in _INLINE_FONT_TYPES:
        if any(marker in lowered for marker in markers):
            return header
    return None


def save_inline_font_helper(intro, url):
    """
    Call this from save_font if url.startswith("data:")
    intro is the leading part of url up to and including the comma.
    Return need_convert, ext, tf, and filename, where tf is an open
    NamedTemporaryFile holding the decoded font; the caller must close it.
    """
    need_convert = False
    ext = ""
    lowered = intro.lower()
    for markers, font_ext, convert in _INLINE_FONT_TYPES:
        if any(marker in lowered for marker in markers):
            ext = font_ext
            need_convert = convert
            break

    contents = url[len(intro):]
    # The name comes from the encoded payload; base64 text may contain "/",
    # which cannot appear in a file name.
    filename = ttfify_filename(contents[:20].replace("/", "_"))

    # Decode before creating the temp file so a decode error leaves nothing open
    if ";base64," in lowered:
        data = base64.b64decode(contents)
    else:
        data = unquote_to_bytes(contents)

    tf = tempfile.NamedTemporaryFile()
    try:
        tf.write(data)
        # The file is read again later, possibly by name, so the data must be on disk
        tf.flush()
        tf.seek(0)
    except Exception:
        tf.close()
        raise
    return need_convert, ext, tf, filename


def _content_disposition_filename(header_value):
    """
    Return the bare file name given by a Content-Disposition header value,
    or None when the header is empty or has no filename parameter.
    """
    if not header_value:
        return None
    message = Message()
    message["Content-Disposition"] = header_value
    name = message.get_filename()
    if not name:
        return None
    # Keep only the last path component so the name cannot leave destdir
    return os.path.basename(name.replace("\\", "/")) or None


def _plan_filename(filename, config):
    """
    Return (need_convert, ext, target filename) for a font file called filename.
    """
    ext = os.path.splitext(filename)[-1]
    # Do not try to convert .svg
    need_convert = bool(config.convert) and not filename.endswith(".ttf") and ext not in [".svg"]
    if need_convert:
        filename = ttfify_filename(filename)
    return need_convert, ext, filename


def _write_bytes(filepath, file_contents):
    """Write file_contents to filepath, replacing any existing file."""
    with open(filepath, "wb") as thisfile:
        thisfile.write(file_contents)


def save_font(url, destdir, config):
    """
    Given a url and destination dir, download the url with config.session and
    save it to a file in destdir. If config.convert, save any woff/woff2/eot as
    ttf. Supported inline (data:) fonts are decoded instead of downloaded.
    Nothing is written when config.dryrun is set.
    Return 0 when the font was saved, skipped because the file exists, or
    dryrun; return -1 when the url is unsupported or a conversion failed.
    """
    # Derive filename
    need_convert, ext, filename = _plan_filename(os.path.basename(urlparse(url).path), config)

    tf = None
    is_inline = url.startswith("data:")
    if is_inline:
        header = _inline_font_header(url)
        if header is not None:
            need_convert, ext, tf, filename = save_inline_font_helper(header, url)

    try:
        filepath = os.path.join(destdir, filename)
        if os.path.exists(filepath):
            if config.debuglevel >= 2:
                eprint(f"File {filepath} exists for {url[:config.MAX_STRING_PRINT_LENGTH]}. Skipping.")
            return 0

        if is_inline and tf is None:
            # not supported yet! The path is built from the url itself, so truncate it too.
            eprint(f"Warning: Url {url[:config.MAX_STRING_PRINT_LENGTH]} is unsupported, for file {filepath[:config.MAX_STRING_PRINT_LENGTH]}.")
            return -1

        file_contents = None
        if not config.dryrun:
            if tf is not None:
                tf.seek(0)
                file_contents = tf.read()
            else:
                # Download content
                response = config.session.get(url)
                served_name = _content_disposition_filename(response.headers.get("Content-Disposition"))
                if served_name:
                    eprint(f"Using content-disposition value of {served_name}")
                    # The served name decides the real type, so plan again from it
                    need_convert, ext, filename = _plan_filename(served_name, config)
                    filepath = os.path.join(destdir, filename)
                file_contents = response.content

        if config.debuglevel >= 1:
            sstring = "Saving" if not config.dryrun else "Save"
            eprint(f"{sstring} {url[:config.MAX_STRING_PRINT_LENGTH]} to file {filepath}")
        if config.dryrun:
            return 0

        if not need_convert:
            _write_bytes(filepath, file_contents)
            return 0

        # need_convert is true, and not dryrun, so call the matching converter
        if ext in [".woff", ".woff2"]:
            # The converter warns and returns -1 itself when fontTools is missing
            convert_in = tf.name if tf is not None else url
            return convert_woffwoff2_ttf(convert_in, filepath, config=config)
        if ext in [".eot"]:
            return convert_eot_ttf(url, filepath, config=config)

        # no plan for conversion!
        eprint(f"Warning: no conversion plan for ext {ext} of {url[:config.MAX_STRING_PRINT_LENGTH]}. Saving as-is.")
        _write_bytes(filepath, file_contents)
        return 0
    finally:
        # Closing the temp file also deletes it
        if tf is not None:
            tf.close()


def get_all_fonts_from_csslist(all_css, config):
    """
    Return the font urls found in all of the given css urls, without
    duplicates, in the order first seen.
    """
    all_fonts = []
    for this_css in all_css:
        for webfont in get_webfonts_for_one_css(this_css, config):
            # filter accepted extensions here. Technically fontconfig only uses ttf.
            # Always exclude svg, because those are really big, and not usable files for fontconfig.
            # WORKHERE: allow svg, if convert_woffwoff2_ttf works on svg.
            if webfont in all_fonts or '.svg' in webfont:
                continue
            if config.debuglevel >= 2:
                eprint(f"Found font {webfont[:config.MAX_STRING_PRINT_LENGTH+30]}")
            all_fonts.append(webfont)
    return all_fonts


def save_all_fonts(all_fonts, destdir, config):
    """
    Given a list of font urls, and the destdir, save all these fonts
    Create destdir (and any missing parents) first, unless config.dryrun.
    Raise NotADirectoryError if destdir exists but is not a directory.
    Return 0.
    """
    if os.path.exists(destdir) and not os.path.isdir(destdir):
        raise NotADirectoryError(20, "Please clean up this non-directory file and try again", destdir)

    if not config.dryrun:
        os.makedirs(destdir, exist_ok=True)

    # Loop through all webfont files and save them
    for font in all_fonts:
        save_font(font, destdir, config)
    return 0


def whitelist_page(url, fontdir, config=config_default):
    """
    For the given URL, Save all listed webfonts to a directory named
    after the domain, underneath the given fontdir. If convert, then
    convert all woff, woff2 files to ttf using woffTools
    """
    # List all webfonts called by the given page
    all_css = list_all_css_for_page(url, config)
    all_fonts = get_all_fonts_from_csslist(all_css, config)

    # Prepare destination dir
    destdir = os.path.join(fontdir, urlparse(url).netloc)

    # Save all fonts to that dir
    return save_all_fonts(all_fonts, destdir, config)


def _load_har_entries(harfile):
    """Return the list stored at log.entries of the har file at path harfile."""
    # json detects the encoding of bytes input, including a leading BOM
    with open(harfile, 'rb') as o:
        har = json.load(o)
    return har['log']['entries']


def whitelist_harfile(harfile, fontdir, config=config_default):
    """
    Given the harfile, save all fonts listed in the discovered css files
    to the directory "harfile-<domain>" underneath fontdir, where domain
    is taken from the first entry of the har file.
    """
    entries = _load_har_entries(harfile)
    if not entries:
        eprint(f"Warning: har file {harfile} has no entries.")
        return 0

    # List all css in the har file
    all_css = extract_css_urls_from_harfile(harfile, config)
    all_fonts = get_all_fonts_from_csslist(all_css, config)

    # Prepare destination dir
    domain = urlparse(entries[0]['request']['url']).netloc  # get first entry's domain name
    destdir = os.path.join(fontdir, "harfile-" + domain)

    # Save all fonts to that dir
    return save_all_fonts(all_fonts, destdir, config)


def extract_css_urls_from_harfile(harfile, config):
    """
    Extract all urls that match string "css" from a har file
    """
    css_files = []
    for entry in _load_har_entries(harfile):
        entry_url = entry['request']['url']
        if "css" in entry_url and entry_url not in css_files:
            if config.debuglevel >= 5:
                eprint(entry_url)
            css_files.append(entry_url)
    return css_files


def convert_woffwoff2_ttf(url, filename, config):
    """
    Save the given url to filename, with filetype ttf
    url may be an http(s)/ftp url or the path of a local file.
    Return 0 on success, or -1 when fontTools is not installed or the
    input is not a font that fontTools can read.
    """
    # Check for the optional library first, so nothing is fetched needlessly
    try:
        from fontTools import ttLib
    except ModuleNotFoundError:
        eprint("Warning: cannot load fontTools. Try installing python3-fonttools")
        return -1

    if url.startswith(_REMOTE_SCHEMES):
        file_contents = config.session.get(url).content
    else:
        # assume local file
        with open(url, 'rb') as o:
            file_contents = o.read()

    with tempfile.TemporaryFile() as tf:
        tf.write(file_contents)
        tf.seek(0)
        try:
            font = ttLib.TTFont(tf)
        except ttLib.TTLibError:
            eprint(f"Warning: not a woff/woff2: {url[:config.MAX_STRING_PRINT_LENGTH]} for file {filename}")
            return -1
        if config.debuglevel >= 3:
            eprint(f"Converting {url[:config.MAX_STRING_PRINT_LENGTH]} from {font.flavor} to ttf as file {filename}")
        font.flavor = None  # restores default value, for non-compressed OpenType
        # Save while tf is still open; the font is read from it
        font.save(filename)
    return 0


def convert_eot_ttf(uri, filename, config):
    """
    Save the given uri of an eot file to filename, with filetype ttf
    uri may be an http(s)/ftp url or the path of a local file.
    Return 0 on success, or -1 when config.eot2ttf_binary cannot be found
    or exits with a non-zero status.
    """
    if shutil.which(config.eot2ttf_binary) is None:
        eprint(f"Warning: Cannot convert {uri} because cannot find eot2ttf. Please set --eotbin.")
        return -1

    tf = None
    try:
        if uri.startswith(_REMOTE_SCHEMES):
            response = config.session.get(uri)
            tf = tempfile.NamedTemporaryFile()
            tf.write(response.content)
            # eot2ttf opens the file by name, so the data must be on disk
            tf.flush()
            infile = tf.name
        else:
            # local file, or some uri scheme not planned yet
            infile = uri

        r = subprocess.run([config.eot2ttf_binary, infile, filename])
        if r.returncode != 0:
            eprint(f"Warning: eot2ttf failed on {uri}")
            return -1
        return 0
    finally:
        # clean up temp file
        if tf is not None:
            tf.close()