1 files changed, 175 insertions, 0 deletions
diff --git a/savewebfonts_lib.py b/savewebfonts_lib.py
new file mode 100755
index 0000000..2e0a55a
--- /dev/null
+++ b/savewebfonts_lib.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+# File: /usr/libexec/savewebfonts/savewebfonts_lib.py
+# Location: save-webfonts package
+# Author: bgstack15
+# Startdate: 2021-04-02 07:20
+# SPDX-License-Identifier: CC-BY-SA 4.0
+# Title: Library for Saving Webfonts
+# Purpose: library for whitelisting a page's webfonts by downloading them for current user
+# Usage: See save-webfonts (1)
+# Reference:
+# Improve:
+#    accept a list of filetypes to save, or exclude? Such as, ['ttf','woff2']
+#    Convert woff2 fonts?
+#    Handle using tinycss old?
+# Dependencies:
+#    req-fedora: python3-beautifulsoup4, python3-tinycss2
+import requests, os
+from sys import stderr
+from bs4 import BeautifulSoup as bs # python3-beautifulsoup4
+from urllib.parse import urljoin, urlparse
+import tinycss2 # python3-tinycss2
+
+# default for library
+# This value is the default for the debuglevel argument of the functions below.
+# Messages are printed to stderr when debuglevel is at least:
+#     1: each font file being saved
+#     2: each font accepted for a page, and each file skipped because it exists
+#     5: each font url found inside a css file
+#     8: each css url found on a page
+#    10: each @font-face rule as parsed
+debuglevel = 8
+
+# Functions
+def eprint(*args, **kwargs):
+   """
+   Print to stderr. Takes the same arguments as print(), except for file.
+   """
+   print(*args, **kwargs, file=stderr)
+
+def get_session():
+   """
+   Return a new requests session whose User-Agent header is set to a
+   desktop Chrome browser string.
+   """
+   browser_session = requests.Session()
+   browser_session.headers.update({
+      "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
+   })
+   return browser_session
+
+def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun = False):
+   """
+   Return all css links from a given page
+
+   url: address of the html page to fetch
+   session: optional requests session to reuse; a new one is made if omitted
+   debuglevel: at 8 or higher, print each css url found to stderr
+   dryrun: accepted so all functions share the same arguments; not used here
+   Returns a list of absolute urls, one for each <link> tag whose href
+   contains ".css".
+   """
+   # Reference: https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python
+   if not session:
+      session = get_session()
+   html = session.get(url).content
+   soup = bs(html, "html.parser")
+   css_files = []
+   for link in soup.find_all("link"):
+      # A <link> tag is not guaranteed to have an href attribute. Treat a
+      # missing one as empty instead of testing membership against None.
+      href = link.attrs.get("href") or ""
+      if ".css" not in href:
+         continue
+      # make absolute from relative
+      css_url = urljoin(url, href)
+      if debuglevel >= 8:
+         eprint(f"Found css: {css_url}")
+      css_files.append(css_url)
+   return css_files
+
+def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun = False):
+   """
+   Return a list of urls of all webfonts specified in this css file
+
+   url: address of the css file to fetch and parse
+   session: optional requests session to reuse; a new one is made if omitted
+   debuglevel: at 5 or higher, print each font url found to stderr; at 10 or
+      higher, also print each @font-face rule
+   dryrun: accepted so all functions share the same arguments; not used here
+   Returns a list of absolute urls in the order first seen, without
+   duplicates. Embedded data: uris are left out because there is nothing
+   to download for them.
+   """
+   if not session:
+      session = get_session()
+   css = session.get(url).content
+   # parse_stylesheet_bytes returns (rules, encoding); only the rules are needed
+   rules = tinycss2.parse_stylesheet_bytes(css)[0]
+
+   # extract only the font-face rules
+   fontface_rules = []
+   for count, rule in enumerate(rules, start=1):
+      # Only at-rule nodes carry a keyword, so check the node type first.
+      # The lower-cased keyword also matches @FONT-FACE.
+      if rule.type == "at-rule" and "font-face" in rule.lower_at_keyword:
+         fontface_rules.append(rule)
+         if debuglevel >= 10:
+            eprint(f"{count} {rule}")
+
+   font_urls = []
+   seen = set()
+   for rule in fontface_rules:
+      # content is None for an at-rule that ends with ; instead of a {} block
+      for token in rule.content or []:
+         if token.type != "url":
+            continue
+         # make absolute from relative
+         thisurl = urljoin(url, token.value)
+         if thisurl.lower().startswith("data:"):
+            # the font is embedded in the css itself
+            continue
+         if thisurl in seen:
+            continue
+         seen.add(thisurl)
+         if debuglevel >= 5:
+            eprint(f"get_webfonts_for_one_css: Found font url {thisurl}")
+         font_urls.append(thisurl)
+   # font_urls is a flat list of all font files, with duplicates removed
+   return font_urls
+
+def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun = False):
+   """
+   Given a url, and destination dir, and optionally an existing http session, download the url and save to a file
+
+   url: address of the font file
+   destdir: directory to write the file into; it is not created here
+   session: optional requests session to reuse; a new one is made if omitted
+   debuglevel: at 1 or higher, print each save to stderr; at 2 or higher,
+      also print each file skipped because it already exists
+   dryrun: if True, only report what would be saved; nothing is downloaded
+      or written
+   Returns 0 if the file was saved, already existed, or dryrun is set.
+   Returns -1 if the url has no file name, or the download or write failed.
+   """
+   from email.message import Message # stdlib, used to parse Content-Disposition
+
+   # Derive filename
+   filename = os.path.basename(urlparse(url).path)
+   if not filename:
+      # Without a name the path would be destdir itself
+      eprint(f"Error when downloading {url}, no file name in url")
+      return -1
+   filepath = os.path.join(destdir, filename)
+
+   if os.path.exists(filepath):
+      if debuglevel >= 2:
+         eprint(f"File {filepath} exists for {url}. Skipping.")
+      return 0
+
+   try:
+      response = None
+      if not dryrun:
+         # Download content
+         if not session:
+            session = get_session()
+         response = session.get(url)
+         # Do not store an http error page as if it were a font
+         response.raise_for_status()
+
+         disposition = response.headers.get('Content-Disposition')
+         if disposition:
+            # The header looks like: attachment; filename="name.ttf"
+            # so only the filename parameter is wanted, not the whole value.
+            header = Message()
+            header['Content-Disposition'] = disposition
+            # The name comes from the server, so drop any directory parts
+            suggested = os.path.basename(header.get_filename() or "")
+            if suggested and suggested not in (".", ".."):
+               eprint(f"Using content-disposition value of {suggested}")
+               filepath = os.path.join(destdir, suggested)
+
+      # Future: logic for woff2 to ttf conversion goes here, approximately
+
+      if debuglevel >= 1:
+         sstring = "Saving" if not dryrun else "Save"
+         eprint(f"{sstring} {url} to file {filepath}")
+      if not dryrun:
+         with open(filepath,'wb') as thisfile:
+            thisfile.write(response.content)
+      return 0
+   except Exception as E:
+      # Report and continue, so one bad font does not stop the rest
+      eprint(f"Error when downloading {url}, {E}")
+      return -1
+
+def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False):
+   """
+   For the given URL, Save all listed webfonts to a directory named
+   after the domain, underneath the given fontdir.
+
+   url: address of the html page to inspect
+   fontdir: parent directory; fonts go in a subdirectory named after the
+      network location part of url
+   debuglevel: passed to the helper functions; at 2 or higher, also print
+      each font accepted for saving
+   dryrun: if True, report what would be saved without creating the
+      directory or writing files
+   Returns 0.
+   Raises NotADirectoryError if the destination exists and is not a directory.
+   """
+   import errno # stdlib, only needed for the error raised below
+
+   session = get_session()
+
+   # List all webfonts called by the given page
+   all_fonts = []
+   all_css = list_all_css_for_page(url, session, debuglevel = debuglevel, dryrun = dryrun)
+   for this_css in all_css:
+      webfonts = get_webfonts_for_one_css(this_css, session, debuglevel = debuglevel, dryrun = dryrun)
+      for webfont in webfonts:
+         # filter accepted extensions here. Technically fontconfig only uses ttf.
+         # Always exclude svg, because those are really big, and not usable files for fontconfig.
+         if webfont not in all_fonts and '.svg' not in webfont:
+            if debuglevel >= 2:
+               eprint(f"Found font {webfont}")
+            all_fonts.append(webfont)
+
+   # Prepare destination dir
+   destdir = os.path.join(fontdir,urlparse(url).netloc)
+
+   if os.path.exists(destdir) and not os.path.isdir(destdir):
+      # OSError arguments are (errno, message, filename), in that order
+      raise NotADirectoryError(errno.ENOTDIR, "Please clean up this non-directory file and try again", destdir)
+   if not dryrun:
+      # Also creates fontdir if it is missing. An existing directory is fine.
+      os.makedirs(destdir, exist_ok = True)
+
+   # Loop through all webfont files and save them, reusing the same session
+   for font in all_fonts:
+      save_font(font, destdir, session = session, debuglevel = debuglevel, dryrun = dryrun)
+   return 0