diff options
Diffstat (limited to 'savewebfonts_lib.py')
-rwxr-xr-x | savewebfonts_lib.py | 175 |
1 files changed, 175 insertions, 0 deletions
#!/usr/bin/env python3
# File: /usr/libexec/savewebfonts/savewebfonts_lib.py
# Location: save-webfonts package
# Author: bgstack15
# Startdate: 2021-04-02 07:20
# SPDX-License-Identifier: CC-BY-SA 4.0
# Title: Library for Saving Webfonts
# Purpose: library for whitelisting a page's webfonts by downloading them for current user
# Usage: See save-webfonts (1)
# Reference:
# Improve:
#    accept a list of filetypes to save, or exclude? Such as, ['ttf','woff2']
#    Convert woff2 fonts?
#    Handle using tinycss old?
# Dependencies:
#    req-fedora: python3-beautifulsoup4, python3-tinycss2
import errno
import os
from email.message import Message
from sys import stderr
from urllib.parse import urljoin, urlparse

import requests
import tinycss2 # python3-tinycss2
from bs4 import BeautifulSoup as bs # python3-beautifulsoup4

# default for library
debuglevel = 8

# Functions
def eprint(*args, **kwargs):
    """
    Print to stderr. Takes the same arguments as print(), except "file".
    """
    print(*args, file=stderr, **kwargs)

def get_session():
    """
    Return a new requests.Session whose User-Agent header is set to a
    desktop browser string.
    """
    session = requests.Session()
    session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    return session

def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun = False):
    """
    Return all css links from a given page

    url: page to fetch.
    session: optional requests session; a new one from get_session() is used if omitted.
    debuglevel: at 8 or higher, each css url found is reported on stderr.
    dryrun: accepted for a uniform signature; not used here.
    Returns a list of absolute urls, one per <link> tag whose href contains ".css".
    """
    # Reference: https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python
    css_files = []
    if not session:
        session = get_session()
    html = session.get(url).content
    soup = bs(html, "html.parser")
    for link in soup.find_all("link"):
        href = link.attrs.get("href")
        # A <link> tag is not required to carry an href; skip those instead
        # of failing on a None membership test.
        if not href or ".css" not in href:
            continue
        # make absolute from relative
        css_url = urljoin(url, href)
        if debuglevel >= 8:
            eprint(f"Found css: {css_url}")
        css_files.append(css_url)
    return css_files

def _url_from_token(token):
    """
    Return the url carried by one tinycss2 component value, or None.
    """
    if token.type == "url":
        return token.value
    # NOTE(review): some tinycss2 releases appear to tokenise quoted
    # url("...") as a function block instead of a url token; confirm
    # against the installed version. Accepting both forms is harmless.
    if token.type == "function" and token.lower_name == "url":
        for argument in token.arguments:
            if argument.type == "string":
                return argument.value
    return None

def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun = False):
    """
    Return a list of urls of all webfonts specified in this css file

    url: css file to fetch; also the base for resolving relative font urls.
    session: optional requests session; a new one from get_session() is used if omitted.
    debuglevel: at 5 or higher each font url is reported on stderr, at 10
       or higher each font-face rule is dumped as well.
    dryrun: accepted for a uniform signature; not used here.
    Returns a list of absolute urls without duplicates, in order of appearance.
    """
    if not session:
        session = get_session()
    css = session.get(url).content
    # parse_stylesheet_bytes returns (rules, encoding); only the rules are needed
    rules = tinycss2.parse_stylesheet_bytes(css)[0]

    # extract only the font-face rules
    font_face_rules = []
    for position, rule in enumerate(rules, start=1):
        if rule.type == "at-rule" and "font-face" in rule.lower_at_keyword:
            font_face_rules.append(rule)
            if debuglevel >= 10:
                eprint(str(position) + " " + str(rule))

    font_urls = []
    for rule in font_face_rules:
        # content is None for an at-rule that ends in ";" without a {} block
        for token in rule.content or []:
            value = _url_from_token(token)
            if not value:
                continue
            # make absolute from relative
            thisurl = urljoin(url, value)
            # Embedded fonts (data: URIs) are not downloadable files
            if urlparse(thisurl).scheme == "data":
                continue
            if thisurl not in font_urls:
                if debuglevel >= 5:
                    eprint(f"get_webfonts_for_one_css: Found font url {thisurl}")
                font_urls.append(thisurl)
    return font_urls

def _filename_from_content_disposition(value):
    """
    Return the bare filename parameter of a Content-Disposition header
    value, or "" when the header is absent or names no file.
    """
    if not value:
        return ""
    message = Message()
    message["Content-Disposition"] = value
    # basename() drops directory components, so a header cannot point the
    # download outside of the destination directory.
    return os.path.basename(message.get_filename() or "")

def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun = False):
    """
    Given a url, and destination dir, and optionally an existing http session, download the url and save to a file

    The file name is the last path component of the url, unless the
    response carries a Content-Disposition header with a filename
    parameter, which then takes precedence.

    debuglevel: at 1 or higher each save is reported on stderr, at 2 or
       higher skipped files are reported too.
    dryrun: when true, nothing is downloaded or written; only the
       intended action is reported.
    Returns 0 on success, on dryrun, or when the file already exists;
    returns -1 when the download or the write fails, or when no file
    name can be determined.
    """
    # Derive filename
    filename = os.path.basename(urlparse(url).path)
    filepath = os.path.join(destdir, filename)

    if filename and os.path.exists(filepath):
        if debuglevel >= 2:
            eprint(f"File {filepath} exists for {url}. Skipping.")
        return 0

    if dryrun:
        if debuglevel >= 1:
            eprint(f"Save {url} to file {filepath}")
        return 0

    if not session:
        session = get_session()

    try:
        # Download content
        response = session.get(url)
        # Do not store an http error page under a font file name
        response.raise_for_status()

        header_name = _filename_from_content_disposition(response.headers.get('Content-Disposition'))
        if header_name:
            eprint(f"Using content-disposition value of {response.headers['Content-Disposition']}")
            filename = header_name
            filepath = os.path.join(destdir, filename)

        if not filename:
            eprint(f"Error when downloading {url}, cannot determine a file name")
            return -1

        # Future: logic for woff2 to ttf conversion goes here, approximately

        if debuglevel >= 1:
            eprint(f"Saving {url} to file {filepath}")
        with open(filepath,'wb') as thisfile:
            thisfile.write(response.content)
        return 0
    except (requests.RequestException, OSError) as E:
        eprint(f"Error when downloading {url}, {E}")
        return -1

def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False):
    """
    For the given URL, Save all listed webfonts to a directory named
    after the domain, underneath the given fontdir.

    debuglevel: passed to the helper functions; at 2 or higher each
       accepted font url is reported on stderr.
    dryrun: when true, no directory is created and no font is downloaded.
    Returns 0. Raises NotADirectoryError when the destination path
    exists but is not a directory.
    """
    all_fonts = []
    session = get_session()

    # List all webfonts called by the given page
    all_css = list_all_css_for_page(url, session, debuglevel = debuglevel, dryrun = dryrun)
    for this_css in all_css:
        webfonts = get_webfonts_for_one_css(this_css, session, debuglevel = debuglevel, dryrun = dryrun)
        for webfont in webfonts:
            # filter accepted extensions here. Technically fontconfig only uses ttf.
            # Always exclude svg, because those are really big, and not usable files for fontconfig.
            if webfont not in all_fonts and '.svg' not in webfont:
                if debuglevel >= 2:
                    eprint(f"Found font {webfont}")
                all_fonts.append(webfont)

    # Prepare destination dir
    destdir = os.path.join(fontdir,urlparse(url).netloc)

    if os.path.exists(destdir) and not os.path.isdir(destdir):
        # OSError argument order is (errno, strerror, filename)
        raise NotADirectoryError(errno.ENOTDIR,"Please clean up this non-directory file and try again",destdir)
    if not dryrun:
        # Also creates fontdir itself when missing; an existing directory is fine
        os.makedirs(destdir, exist_ok=True)

    # Loop through all webfont files and save them, reusing the session so
    # the font requests carry the same headers as the page requests
    for font in all_fonts:
        save_font(font, destdir, session, debuglevel = debuglevel, dryrun = dryrun)
    return 0