diff options
Diffstat (limited to 'savewebfonts_lib.py')
-rwxr-xr-x | savewebfonts_lib.py | 202 |
1 files changed, 135 insertions, 67 deletions
diff --git a/savewebfonts_lib.py b/savewebfonts_lib.py index 19790df..0c5943e 100755 --- a/savewebfonts_lib.py +++ b/savewebfonts_lib.py @@ -10,62 +10,90 @@ # Reference: # https://github.com/fonttools/fonttools/issues/1694 # Improve: -# accept a list of filetypes to save, or exclude? Such as, ['ttf','woff2'] -# Convert woff2 fonts? # Handle using tinycss old? # Dependencies: # req-fedora: python3-beautifulsoup4, python3-tinycss2 -# rec-fedora: python3-fonttools -import requests, os, json, tempfile +# rec-fedora: python3-fonttools, libeot-tools +# req-devuan: python3-bs4, python3-tinycss2 +# rec-devuan: python3-fonttools, eot2ttf + +import requests, os, json, tempfile, subprocess from sys import stderr from bs4 import BeautifulSoup as bs # python3-beautifulsoup4 from urllib.parse import urljoin, urlparse import tinycss2 # python3-tinycss2 # defaults for library -debuglevel = 8 -MAX_STRING_PRINT_LENGTH = 180 +class swf_config: + def __init__( + self + , debuglevel = 8 + , session = None + , MAX_STRING_PRINT_LENGTH = 180 + , eot2ttf_binary = "eot2ttf" + , dryrun = False + , convert = False + ): + self.debuglevel = debuglevel + self.MAX_STRING_PRINT_LENGTH = MAX_STRING_PRINT_LENGTH + self.eot2ttf_binary = eot2ttf_binary + self.dryrun = dryrun + if session is None: + self.session = get_session() + else: + self.session = session + self.convert = convert + + def __repr__(self): + response = "<swf_config" + for i in self.__dict__: + # omit printing session + if "session" not in i: + response = response + " " + (str(i)) + "=\"" + str(self.__dict__[i]) + "\"," + response = response.rstrip(",") + ">" + return response # Functions def eprint(*args, **kwargs): print(*args, file=stderr, **kwargs) def ttfify_filename(filename): - return filename.rstrip(".woff").rstrip(".woff2").rstrip(".svg").rstrip(".eot") + ".ttf" - + response = filename + for end in [".woff2",".woff",".eot",".svg"]: + if response.endswith(end): + response = response[:-len(end)] + # For python 3.9 and higher only: + #response = response.removesuffix(end) + return response + ".ttf" def get_session(): session = requests.Session() session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36" return session -def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun=False): +def list_all_css_for_page(url, config): """ Return all css links from a given page """ # Reference: https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python css_files = [] - if not session: - session = get_session() - html = session.get(url).content + html = config.session.get(url).content soup = bs(html, "html.parser") for css in soup.find_all("link"): if ".css" in css.attrs.get("href"): # if the link tag has the 'href' attribute css_url = urljoin(url, css.attrs.get("href")) - if debuglevel >= 8: + if config.debuglevel >= 8: eprint(f"Found css: {css_url}") css_files.append(css_url) return css_files -def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun=False): +def get_webfonts_for_one_css(url, config): """ Return a list of urls of all webfonts specified in this css file """ #theseFonts = [] - if not session: - session = get_session() - css = session.get(url).content + css = config.session.get(url).content a = tinycss2.parse_stylesheet_bytes(css) a = a[0] b = [] @@ -76,7 +104,7 @@ def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryru try: if "at-rule" in i.type and "font-face" in i.at_keyword: b.append(i) - if debuglevel >= 10: + if config.debuglevel >= 10: eprint(str(x) + " " + str(i)) except: pass @@ -91,13 +119,13 @@ def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryru # make absolute from relative thisurl = urljoin(url,j.value) if thisurl not in c: - if debuglevel >= 5: + if config.debuglevel >= 5: eprint(f"get_webfonts_for_one_css: Found font url {thisurl}") c.append(thisurl) # c is a flat list of all font files, many of which are duplicates return c -def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun=False, convert=False): +def save_font(url,destdir,config): """ Given a url, and destination dir, and optionally an existing http session, download the url and save to a file. If convert, save any woff/woff2 to ttf. """ @@ -108,8 +136,8 @@ def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun=False, c filename="" filename=os.path.basename(urlparse(url).path) ext = os.path.splitext(filename)[-1] - # Do not try to convert .eot - if convert and not filename.endswith(".ttf") and ext not in [".eot"]: + # Do not try to convert .svg + if config.convert and not filename.endswith(".ttf") and ext not in [".svg"]: need_convert = True orig_filename = filename # in case we cannot load library later filename = ttfify_filename(filename) @@ -119,14 +147,11 @@ def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun=False, c if url.startswith("data:"): # not supported! # WORKHERE: support saving to a tempfile this datastream, probably a base64encoded woff file. Then just convert. - eprint(f"Warning: Url {url[:MAX_STRING_PRINT_LENGTH]} is unsupported.") + eprint(f"Warning: Url {url[:config.MAX_STRING_PRINT_LENGTH]} is unsupported.") else: - if not dryrun: + if not config.dryrun: # Download content - if session: - response = session.get(url) - else: - response = requests.get(url) + response = config.session.get(url) if 'Content-Disposition' in response.headers: filename=response.headers['Content-Disposition'] @@ -136,47 +161,53 @@ def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun=False, c filename = ttfify_filename(filename) filepath = os.path.join(destdir, filename) - # Future: logic for woff2 to ttf conversion goes here, approximately - try: - if debuglevel >= 1: - sstring = "Saving" if not dryrun else "Save" + if config.debuglevel >= 1: + sstring = "Saving" if not config.dryrun else "Save" eprint(f"{sstring} {url} to file {filepath}") - if not dryrun: + if not config.dryrun: if not need_convert: with open(filepath,'wb') as thisfile: thisfile.write(response.content) else: # need_convert is true, and not dryrun, so call function - try: - from fontTools import ttLib - except Exception as e: - raise e - convert_font(url,filepath,session=session,debuglevel=debuglevel,dryrun=dryrun) + if ext in [".woff",".woff2"]: + try: + from fontTools import ttLib + except Exception as e: + raise e + convert_woffwoff2_ttf(url,filepath,config=config) + elif ext in [".eot"]: + convert_eot_ttf(url,filepath,config=config) + else: + # no plan for conversion! + eprint(f"Warning: no conversion plan for ext {ext} of {url}. Saving as-is.") + with open(filepath,'wb') as thisfile: + thisfile.write(response.content) return 0 except Exception as E: eprint(f"Error when downloading {url}, {E}") return -1 else: # filepath does exist - if debuglevel >= 2: + if config.debuglevel >= 2: eprint(f"File {filepath} exists for {url}. Skipping.") return 0 -def get_all_fonts_from_csslist(all_css, session=None, debuglevel=debuglevel, dryrun=False): +def get_all_fonts_from_csslist(all_css, config): all_fonts = [] for this_css in all_css: - webfonts = get_webfonts_for_one_css(this_css, session, debuglevel=debuglevel, dryrun=dryrun) + webfonts = get_webfonts_for_one_css(this_css, config) for webfont in webfonts: # filter accepted extensions here. Technically fontconfig only uses ttf. # Always exclude svg, because those are really big, and not usable files for fontconfig. - # WORKHERE: allow svg, if convert_font works on svg. + # WORKHERE: allow svg, if convert_woffwoff2_ttf works on svg. if webfont not in all_fonts and '.svg' not in webfont: - if debuglevel >= 2: + if config.debuglevel >= 2: eprint(f"Found font {webfont}") all_fonts.append(webfont) return all_fonts -def save_all_fonts(all_fonts, destdir, session=None, debuglevel=debuglevel, dryrun=False, convert=False): +def save_all_fonts(all_fonts, destdir, config): """ Given a list of font urls, and the destdir, save all these fonts """ @@ -189,7 +220,7 @@ def save_all_fonts(all_fonts, destdir, session=None, debuglevel=debuglevel, dryr raise NotADirectoryError(20,destdir,"Please clean up this non-directory file and try again") return -1 try: - if not dryrun: + if not config.dryrun: os.mkdir(destdir) except FileExistsError: pass # it already exists @@ -198,48 +229,44 @@ def save_all_fonts(all_fonts, destdir, session=None, debuglevel=debuglevel, dryr # Loop through all webfont files and save them for font in all_fonts: - save_font(font, destdir, session=session, debuglevel=debuglevel, dryrun=dryrun, convert=convert) + save_font(font, destdir, config) return 0 -def whitelist_page(url, fontdir, session=None, debuglevel=debuglevel, dryrun=False, convert = False): +def whitelist_page(url, fontdir, config): """ For the given URL, Save all listed webfonts to a directory named after the domain, underneath the given fontdir. If convert, then convert all woff, woff2 files to ttf using woffTools """ all_fonts = [] - if not session: - session = get_session() # List all webfonts called by the given page - all_css = list_all_css_for_page(url, session, debuglevel=debuglevel, dryrun=dryrun) - all_fonts = get_all_fonts_from_csslist(all_css, session, debuglevel=debuglevel, dryrun=dryrun) + all_css = list_all_css_for_page(url, config) + all_fonts = get_all_fonts_from_csslist(all_css, config) # Prepare destination dir destdir = os.path.join(fontdir,urlparse(url).netloc) # Save all fonts to that dir - return save_all_fonts(all_fonts, destdir, session, debuglevel=debuglevel, dryrun=dryrun, convert=convert) + return save_all_fonts(all_fonts, destdir, config) -def whitelist_harfile(harfile, fontdir, session=None, debuglevel=debuglevel, dryrun=False, convert=False): +def whitelist_harfile(harfile, fontdir, config): """ Given the harfile, save all fonts listed in the discovered css files """ all_fonts = [] - if not session: - session = get_session() # List all css in the har file - all_css = extract_css_urls_from_harfile(harfile) - all_fonts = get_all_fonts_from_csslist(all_css, session, debuglevel=debuglevel, dryrun=dryrun) + all_css = extract_css_urls_from_harfile(harfile, config) + all_fonts = get_all_fonts_from_csslist(all_css, config) # Prepare destination dir destdir = os.path.join(fontdir,"harfiles") # Save all fonts to that dir - return save_all_fonts(all_fonts, destdir, session, debuglevel=debuglevel, dryrun=dryrun, convert=convert) + return save_all_fonts(all_fonts, destdir, config) -def extract_css_urls_from_harfile(harfile): +def extract_css_urls_from_harfile(harfile, config): """ Extract all urls that match string "css" from a har file """ @@ -254,21 +281,18 @@ def extract_css_urls_from_harfile(harfile): for d in c: e = c[x]['request']['url'] if "css" in e and e not in css_files: - if debuglevel >= 5: + if config.debuglevel >= 5: eprint(e) css_files.append(e) x = x + 1 return css_files -def convert_font(url, filename, session=None, debuglevel=debuglevel, dryrun=False): +def convert_woffwoff2_ttf(url, filename, config): """ Save the given url to filename, with filetype ttf """ # This will only be called from save_font when dryrun=False, so the dryrun flag here is useful only if called from some other usage. - if session: - response = session.get(url) - else: - response = requests.get(url) + response = config.session.get(url) try: from fontTools import ttLib except ModuleNotFoundError: @@ -280,10 +304,54 @@ def convert_font(url, filename, session=None, debuglevel=debuglevel, dryrun=Fals with tempfile.TemporaryFile() as tf: tf.write(response.content) font = ttLib.TTFont(tf) - if debuglevel >= 3: - eprint(f"Converting {url[:MAX_STRING_PRINT_LENGTH]} from {font.flavor} to ttf as file {filename}") + if config.debuglevel >= 3: + eprint(f"Converting {url[:config.MAX_STRING_PRINT_LENGTH]} from {font.flavor} to ttf as file {filename}") font.flavor = None # restores default value, for non-compressed OpenType font.save(filename) return 0 + +def convert_eot_ttf(uri, filename, config): + """ + Save the given uri of an eot file to filename, with filetype ttf + """ + # This will only be called from save_font when dryrun=False, so the dryrun flag here is useful only if called from some other usage. + tf = None + if "http://" in uri or "https://" in uri or "ftp://" in uri: + response = config.session.get(uri) + content = response.content + tf = tempfile.NamedTemporaryFile() + tf.write(content) + infile = tf.name # change to use this temp file + else: + # local file, or some uri scheme not planned yet + infile = uri + #with open(uri,'rb') as o: + # content = o.read() + try: + r = subprocess.run(["which",config.eot2ttf_binary], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + if r.returncode != 0: + #raise FileNotFoundError(18,"eot2ttf","cannot find") + eprint(f"Warning: Cannot convert {uri} because cannot find eot2ttf. Please set --eotbin.") + return -1 + # so proceed + except Exception as e: + try: # clean up temp file + if tf: + tf.close() + except: + pass + raise e + + r=subprocess.run([config.eot2ttf_binary,infile,filename]) + if r.returncode != 0: + eprint(f"Warning: eot2ttf failed on {uri}") + + # exit convert_eot_ttf + try: # clean up temp file + if tf: + tf.close() + except: + pass + return 0 |