aboutsummaryrefslogtreecommitdiff
path: root/savewebfonts_lib.py
diff options
context:
space:
mode:
authorB. Stack <bgstack15@gmail.com>2021-04-02 21:14:08 -0400
committerB. Stack <bgstack15@gmail.com>2021-04-02 21:14:08 -0400
commitc15f82fa38458d76cec62b930b13b446829e3a2d (patch)
treed8df51f33bd2af5266eb19b6c00245aafab49bbb /savewebfonts_lib.py
downloadsave-webfonts-c15f82fa38458d76cec62b930b13b446829e3a2d.tar.gz
save-webfonts-c15f82fa38458d76cec62b930b13b446829e3a2d.tar.bz2
save-webfonts-c15f82fa38458d76cec62b930b13b446829e3a2d.zip
initial commit
Diffstat (limited to 'savewebfonts_lib.py')
-rwxr-xr-xsavewebfonts_lib.py175
1 files changed, 175 insertions, 0 deletions
diff --git a/savewebfonts_lib.py b/savewebfonts_lib.py
new file mode 100755
index 0000000..2e0a55a
--- /dev/null
+++ b/savewebfonts_lib.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+# File: /usr/libexec/savewebfonts/savewebfonts_lib.py
+# Location: save-webfonts package
+# Author: bgstack15
+# Startdate: 2021-04-02 07:20
+# SPDX-License-Identifier: CC-BY-SA 4.0
+# Title: Library for Saving Webfonts
+# Purpose: library for whitelisting a page's webfonts by downloading them for current user
+# Usage: See save-webfonts (1)
+# Reference:
+# Improve:
+# accept a list of filetypes to save, or exclude? Such as, ['ttf','woff2']
+# Convert woff2 fonts?
+# Handle using tinycss old?
+# Dependencies:
+# req-fedora: python3-beautifulsoup4, python3-tinycss2
+import requests, os
+from sys import stderr
+from bs4 import BeautifulSoup as bs # python3-beautifulsoup4
+from urllib.parse import urljoin, urlparse
+import tinycss2 # python3-tinycss2
+
# Default debug verbosity for the library. Every function below takes a
# debuglevel parameter that defaults to this value; higher values print more
# to stderr (the functions test thresholds of 1, 2, 5, 8 and 10).
debuglevel = 8
+
+# Functions
def eprint(*args, **kwargs):
    """
    Print to standard error.

    Takes the same positional and keyword arguments as print(). The
    destination is fixed to stderr, so file must not be passed.
    """
    print(*args, **kwargs, file=stderr)
+
def get_session():
    """
    Build a requests session that presents a desktop Chrome User-Agent
    string on every request made through it.
    """
    browser_session = requests.Session()
    browser_session.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    })
    return browser_session
+
def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun = False):
    """
    Return the absolute urls of all css files linked from a given page.

    url: address of the html page to inspect
    session: optional requests session; one from get_session() is used when omitted
    debuglevel: at 8 or higher, each css url found is reported on stderr
    dryrun: accepted so all library functions share a signature; not used here

    Returns a list of url strings, in document order.
    """
    # Reference: https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python
    css_files = []
    if not session:
        session = get_session()
    html = session.get(url).content
    soup = bs(html, "html.parser")
    for link in soup.find_all("link"):
        href = link.attrs.get("href")
        # A <link> tag without an href has nothing to fetch. Skip it instead
        # of failing on the substring test against None.
        if not href or ".css" not in href:
            continue
        # make absolute from relative
        css_url = urljoin(url, href)
        if debuglevel >= 8:
            eprint(f"Found css: {css_url}")
        css_files.append(css_url)
    return css_files
+
def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun = False):
    """
    Return a list of urls of all webfonts specified in this css file.

    url: address of the css file; also the base for resolving relative font urls
    session: optional requests session; one from get_session() is used when omitted
    debuglevel: at 5 or higher each font url is reported on stderr, at 10 or
        higher each @font-face rule is dumped as well
    dryrun: accepted so all library functions share a signature; not used here

    Returns a list of absolute url strings in order of first appearance,
    without duplicates.
    """
    if not session:
        session = get_session()
    css = session.get(url).content
    # parse_stylesheet_bytes returns (rules, encoding); only the rules are used
    rules = tinycss2.parse_stylesheet_bytes(css)[0]

    # extract only the font-face rules
    font_face_rules = []
    for position, rule in enumerate(rules, start=1):
        if rule.type == "at-rule" and rule.lower_at_keyword == "font-face":
            font_face_rules.append(rule)
            if debuglevel >= 10:
                eprint(str(position) + " " + str(rule))

    font_urls = []
    seen = set()
    for rule in font_face_rules:
        # content is None for an at-rule that ends in ';' rather than a {} block
        for token in rule.content or []:
            if token.type == "url":
                target = token.value
            elif token.type == "function" and token.lower_name == "url":
                # NOTE(review): depending on the tinycss2 version, a quoted
                # url("...") may arrive as a function block whose argument
                # is a string token rather than as a url token.
                target = next(
                    (arg.value for arg in token.arguments if arg.type == "string"),
                    None,
                )
            else:
                target = None
            if target is None:
                continue
            # make absolute from relative
            thisurl = urljoin(url, target)
            if thisurl not in seen:
                if debuglevel >= 5:
                    eprint(f"get_webfonts_for_one_css: Found font url {thisurl}")
                seen.add(thisurl)
                font_urls.append(thisurl)
    return font_urls
+
def _filename_from_content_disposition(header_value):
    """
    Return the bare filename named by a Content-Disposition header value, or
    an empty string when the header does not name a usable one.
    """
    from email.message import Message
    parsed = Message()
    parsed["Content-Disposition"] = header_value
    # basename() drops any directory part, so the server cannot steer the
    # download outside of the destination directory.
    name = os.path.basename(parsed.get_filename("") or "")
    if name in (".", ".."):
        return ""
    return name

def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun = False):
    """
    Given a url, and destination dir, and optionally an existing http session, download the url and save to a file.

    url: address of the font file
    destdir: existing directory to save into
    session: optional requests session; plain requests.get() is used when omitted
    debuglevel: at 1 or higher the save is reported on stderr, at 2 or higher
        a skipped (already present) file is reported as well
    dryrun: when true, only report what would be saved; nothing is downloaded or written

    The filename comes from the last path component of the url, unless the
    response carries a Content-Disposition header that names a file.

    Returns 0 when the file was saved, already existed, or dryrun is set;
    returns -1 when the download or the write failed.
    """

    # Derive filename
    filename = os.path.basename(urlparse(url).path)
    filepath = os.path.join(destdir, filename)

    if os.path.exists(filepath):
        if debuglevel >= 2:
            eprint(f"File {filepath} exists for {url}. Skipping.")
        return 0

    # Best effort: any failure for this one font is reported and turned into
    # a -1 return so the caller can carry on with the remaining fonts.
    try:
        response = None
        if not dryrun:
            # Download content
            response = session.get(url) if session else requests.get(url)
            # Do not store an http error page under a font's filename
            response.raise_for_status()

            disposition = response.headers.get('Content-Disposition')
            if disposition:
                suggested = _filename_from_content_disposition(disposition)
                if suggested:
                    filename = suggested
                    eprint(f"Using content-disposition value of {suggested}")
                    filepath = os.path.join(destdir, filename)

        # Future: logic for woff2 to ttf conversion goes here, approximately

        if debuglevel >= 1:
            sstring = "Saving" if not dryrun else "Save"
            eprint(f"{sstring} {url} to file {filepath}")
        if not dryrun:
            with open(filepath,'wb') as thisfile:
                thisfile.write(response.content)
        return 0
    except Exception as E:
        eprint(f"Error when downloading {url}, {E}")
        return -1
+
def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False):
    """
    For the given URL, Save all listed webfonts to a directory named
    after the domain, underneath the given fontdir.

    url: address of the html page whose webfonts should be saved
    fontdir: parent directory; the per-domain directory is created inside it
    debuglevel: passed on to the helper functions; at 2 or higher each
        accepted font url is reported on stderr
    dryrun: when true, no directory is created and nothing is downloaded

    Returns 0. Raises NotADirectoryError when the destination path exists
    but is not a directory.
    """
    all_fonts = []
    session = get_session()

    # List all webfonts called by the given page
    all_css = list_all_css_for_page(url, session, debuglevel = debuglevel, dryrun = dryrun)
    for this_css in all_css:
        webfonts = get_webfonts_for_one_css(this_css, session, debuglevel = debuglevel, dryrun = dryrun)
        for webfont in webfonts:
            # filter accepted extensions here. Technically fontconfig only uses ttf.
            # Always exclude svg, because those are really big, and not usable files for fontconfig.
            if webfont not in all_fonts and '.svg' not in webfont:
                if debuglevel >= 2:
                    eprint(f"Found font {webfont}")
                all_fonts.append(webfont)

    # Prepare destination dir
    destdir = os.path.join(fontdir,urlparse(url).netloc)

    if os.path.exists(destdir) and not os.path.isdir(destdir):
        # OSError argument order is (errno, strerror, filename); 20 is ENOTDIR
        raise NotADirectoryError(20,"Please clean up this non-directory file and try again",destdir)
    if not dryrun:
        try:
            os.mkdir(destdir)
        except FileExistsError:
            pass # it already exists

    # Loop through all webfont files and save them, reusing the session so the
    # font requests carry the same headers as the page and css requests
    for font in all_fonts:
        save_font(font, destdir, session = session, debuglevel = debuglevel, dryrun = dryrun)
    return 0
bgstack15