#!/usr/bin/env python3
# File: /usr/libexec/savewebfonts/savewebfonts_lib.py
# Location: save-webfonts package
# Author: bgstack15
# Startdate: 2021-04-02 07:20
# SPDX-License-Identifier: CC-BY-SA 4.0
# Title: Library for Saving Webfonts
# Purpose: library for whitelisting a page's webfonts by downloading them for current user
# Usage: See save-webfonts (1)
# Reference:
#    https://github.com/fonttools/fonttools/issues/1694
# Improve:
#    Handle using tinycss old?
# Dependencies:
#    req-fedora: python3-beautifulsoup4, python3-tinycss2
#    rec-fedora: python3-fonttools, libeot-tools
#    req-devuan: python3-bs4, python3-tinycss2
#    rec-devuan: python3-fonttools, eot2ttf

import requests, os, json, tempfile, subprocess, base64
from sys import stderr
from bs4 import BeautifulSoup as bs # python3-beautifulsoup4
from urllib.parse import urljoin, urlparse
import tinycss2 # python3-tinycss2

def get_session():
   session = requests.Session()
   session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
   return session

# defaults for library
class swf_config:
   def __init__(
         self
         , debuglevel = 8
         , session = None
         , MAX_STRING_PRINT_LENGTH = 180
         , eot2ttf_binary = "eot2ttf"
         , dryrun = True
         , convert = False
      ):
      self.debuglevel = debuglevel
      self.MAX_STRING_PRINT_LENGTH = MAX_STRING_PRINT_LENGTH
      self.eot2ttf_binary = eot2ttf_binary
      self.dryrun = dryrun
      if session is None:
         self.session = get_session()
      else:
         self.session = session
      self.convert = convert

   def __repr__(self):
      response = "<swf_config"
      for i in self.__dict__:
         # omit printing session
         if "session" not in i:
            response = response + " " + (str(i)) + "=\"" + str(self.__dict__[i]) + "\","
      response = response.rstrip(",") + ">"
      return response

config_default = swf_config()

# Functions
def eprint(*args, **kwargs):
   print(*args, file=stderr, **kwargs)

def ttfify_filename(filename):
   response = filename
   for end in [".woff2",".woff",".eot",".svg"]:
      if response.endswith(end):
         response = response[:-len(end)]
      # For python 3.9 and higher only:
      #response = response.removesuffix(end)
   return response + ".ttf"

def list_all_css_for_page(url, config):
   """
   Return all css links from a given page
   """
   # Reference: https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python
   css_files = []
   html = config.session.get(url).content
   soup = bs(html, "html.parser")
   for css in soup.find_all("link"):
      if ".css" in css.attrs.get("href"):
         # if the link tag has the 'href' attribute
         css_url = urljoin(url, css.attrs.get("href"))
         if config.debuglevel >= 8:
            eprint(f"Found css: {css_url}")
         css_files.append(css_url)
   return css_files

def get_webfonts_for_one_css(url, config):
   """
   Return a list of urls of all webfonts specified in this css file
   """
   #theseFonts = []
   css = config.session.get(url).content
   a = tinycss2.parse_stylesheet_bytes(css)
   a = a[0]
   b = []
   x=0
   # extract only the font-face rules
   for i in a:
      x = x + 1
      try:
         if "at-rule" in i.type and "font-face" in i.at_keyword:
            b.append(i)
            if config.debuglevel >= 10:
               eprint(str(x) + " " + str(i))
      except:
         pass
   # now list b is only the font-face rules
   c = []
   for i in b:
      x=0
      marker=-1
      for j in i.content:
         x = x + 1
         if "url" in j.type:
            # make absolute from relative
            thisurl = urljoin(url,j.value)
            if thisurl not in c:
               if config.debuglevel >= 5:
                  eprint(f"get_webfonts_for_one_css: Found font url {thisurl[:config.MAX_STRING_PRINT_LENGTH]}")
               c.append(thisurl)
   # c is a flat list of all font files, many of which are duplicates
   return c

def save_inline_font_helper(intro,url):
   """
   Call this from save_font if url.startswith("data:")
   Return need_convert, ext, tf, and filename
   """
   need_convert = False
   ext = ""
   tf = tempfile.NamedTemporaryFile()
   if "/x-font-woff2;" in intro or "/font-woff2;" in intro:
      need_convert = True
      ext = ".woff2"
   elif "/x-font-woff;" in intro or "/font-woff;" in intro:
      need_convert = True
      ext = ".woff"
   elif "/x-font-ttf;" in intro or "/font-ttf;" in intro:
      ext = ".ttf"

   contents = url[len(intro):]
   filename = ttfify_filename(contents[:20])
   #print(f"TEMP, sifh: {contents}")
   if ";base64," in intro:
      contents = base64.b64decode(contents)
      #print(f"TEMP, did the base64 conversion")
   tf.write(contents)
   return need_convert, ext, tf, filename

def save_font(url,destdir,config):
   """
   Given a url, and destination dir, and optionally an existing http session, download the url and save to a file. If convert, save any woff/woff2 to ttf.
   """

   need_convert = False

   # Derive filename
   filename=""
   filename=os.path.basename(urlparse(url).path)
   ext = os.path.splitext(filename)[-1]
   tf = None
   # Do not try to convert .svg
   if config.convert and not filename.endswith(".ttf") and ext not in [".svg"]:
      need_convert = True
      orig_filename = filename # in case we cannot load library later
      filename = ttfify_filename(filename)

   if url.startswith("data:"):
      # FINDTHIS data
      for fheader in ["data:application/x-font-woff;charset=utf-8;base64,","data:application/x-font-ttf;charset=utf-8;base64,","data:application/font-woff;charset=utf-8;base64,","data:font/ttf;base64,"]:
         if url.startswith(fheader):
            need_convert, ext, tf, filename = save_inline_font_helper(fheader, url)
            #print(f"TEMP: FOR {url}, {need_convert} {ext} {tf} {filename}")
            break

   filepath = os.path.join(destdir, filename)

   if not os.path.exists(filepath):
      if url.startswith("data:"):
         # Yes, some repetition here.
         #if url.startswith("data:application/x-font-woff;charset=utf-8;base64,"):
         if tf is not None:
            pass
         else:
            # not supported yet!
            eprint(f"Warning: Url {url[:config.MAX_STRING_PRINT_LENGTH]} is unsupported, for file {filepath}.")
            return -1

      if not config.dryrun:
         if tf:
            with open(tf.name,'rb') as otf:
               file_contents = otf.read()
         else:
            # Download content
            response = config.session.get(url)
            if 'Content-Disposition' in response.headers:
               filename=response.headers['Content-Disposition']
               eprint(f"Using content-disposition value of {response.headers['Content-Disposition']}")
               if need_convert and not filename.endswith(".ttf"):
                  orig_filename = filename # in case we cannot load library later
                  filename = ttfify_filename(filename)
            file_contents = response.content

      filepath = os.path.join(destdir, filename)
      #try:
      if True:
         if config.debuglevel >= 1:
            sstring = "Saving" if not config.dryrun else "Save"
            eprint(f"{sstring} {url[:config.MAX_STRING_PRINT_LENGTH]} to file {filepath}")
         if not config.dryrun:
            if not need_convert:
               with open(filepath,'wb') as thisfile:
                  thisfile.write(file_contents)
            else:
               # need_convert is true, and not dryrun, so call function
               if ext in [".woff",".woff2"]:
                  try:
                     from fontTools import ttLib
                  except Exception as e:
                     raise e
                  convert_in = url
                  if tf:
                     convert_in = tf.name
                  convert_woffwoff2_ttf(convert_in,filepath,config=config)
               elif ext in [".eot"]:
                  convert_eot_ttf(url,filepath,config=config)
               else:
                  # no plan for conversion!
                  eprint(f"Warning: no conversion plan for ext {ext} of {url[:config.MAX_STRING_PRINT_LENGTH]}. Saving as-is.")
                  with open(filepath,'wb') as thisfile:
                     thisfile.write(file_contents)
         if tf: tf.close()
         return 0
      #except Exception as E:
      #   eprint(f"Error when downloading {url}, {E}")
      #   if tf: tf.close()
      #   return -1
      if tf: tf.close()
   else: # filepath does exist
      if config.debuglevel >= 2:
         eprint(f"File {filepath} exists for {url[:config.MAX_STRING_PRINT_LENGTH]}. Skipping.")
      if tf: tf.close()
      return 0

def get_all_fonts_from_csslist(all_css, config):
   all_fonts = []
   for this_css in all_css:
      webfonts = get_webfonts_for_one_css(this_css, config)
      for webfont in webfonts:
         # filter accepted extensions here. Technically fontconfig only uses ttf.
         # Always exclude svg, because those are really big, and not usable files for fontconfig.
         # WORKHERE: allow svg, if convert_woffwoff2_ttf works on svg.
         if webfont not in all_fonts and '.svg' not in webfont:
            if config.debuglevel >= 2:
               eprint(f"Found font {webfont[:config.MAX_STRING_PRINT_LENGTH+30]}")
            all_fonts.append(webfont)
   return all_fonts

def save_all_fonts(all_fonts, destdir, config):
   """
   Given a list of font urls, and the destdir, save all these fonts
   """

   #print(f"Found {len(all_fonts)} font files for page {url}")
   #print(f"Will save to {destdir}")

   if os.path.exists(destdir):
      if not os.path.isdir(destdir):
         raise NotADirectoryError(20,destdir,"Please clean up this non-directory file and try again")
         return -1
   try:
      if not config.dryrun:
         os.mkdir(destdir)
   except FileExistsError:
      pass # it already exists
   except Exception as E:
      raise E

   # Loop through all webfont files and save them
   for font in all_fonts:
      save_font(font, destdir, config)
   return 0

def whitelist_page(url, fontdir, config = config_default):
   """
   For the given URL, Save all listed webfonts to a directory named
   after the domain, underneath the given fontdir. If convert, then
   convert all woff, woff2 files to ttf using woffTools
   """
   all_fonts = []

   # List all webfonts called by the given page
   all_css = list_all_css_for_page(url, config)
   all_fonts = get_all_fonts_from_csslist(all_css, config)

   # Prepare destination dir
   destdir = os.path.join(fontdir,urlparse(url).netloc)

   # Save all fonts to that dir
   return save_all_fonts(all_fonts, destdir, config)

def whitelist_harfile(harfile, fontdir, config = config_default):
   """
   Given the harfile, save all fonts listed in the discovered css files
   """
   all_fonts = []

   # List all css in the har file
   all_css = extract_css_urls_from_harfile(harfile, config)
   all_fonts = get_all_fonts_from_csslist(all_css, config)

   # Prepare destination dir
   with open(harfile,'r') as o:
      har_contents = o.read()
   a = json.loads(har_contents)
   domain = urlparse(a['log']['entries'][0]['request']['url']).netloc # get first entry's domain name
   destdir = os.path.join(fontdir,"harfile-"+domain)

   # Save all fonts to that dir
   return save_all_fonts(all_fonts, destdir, config)

def extract_css_urls_from_harfile(harfile, config):
   """
   Extract all urls that match string "css" from a har file
   """
   css_files = []

   with open(harfile,'r') as o:
      har_contents = o.read()

   a = json.loads(har_contents)
   c = a['log']['entries']
   x = 0
   for d in c:
      e = c[x]['request']['url']
      if "css" in e and e not in css_files:
         if config.debuglevel >= 5:
            eprint(e)
         css_files.append(e)
      x = x + 1
   return css_files

def convert_woffwoff2_ttf(url, filename, config):
   """
   Save the given url to filename, with filetype ttf
   """
   # This will only be called from save_font when dryrun=False, so the dryrun flag here is useful only if called from some other usage.
   if (url.startswith("http://") or url.startswith("https://") or url.startswith("ftp://")):
      response = config.session.get(url)
      file_contents = response.content
   else:
      # assume local file
      with open(url,'rb') as o:
         file_contents = o.read()
   try:
      from fontTools import ttLib
   except ModuleNotFoundError:
      eprint("Warning: cannot load fontTools. Try installing python3-fonttools")
      return -1
   except Exception as e:
      raise e

   with tempfile.TemporaryFile() as tf:
      tf.write(file_contents)
      try:
         font = ttLib.TTFont(tf)
      except ttLib.TTLibError as e:
         eprint(f"Warning: not a woff/woff2: {url[:config.MAX_STRING_PRINT_LENGTH]} for file {filename}")
         return -1
   if config.debuglevel >= 3:
      eprint(f"Converting {url[:config.MAX_STRING_PRINT_LENGTH]} from {font.flavor} to ttf as file {filename}")

   font.flavor = None # restores default value, for non-compressed OpenType
   font.save(filename)

   return 0

def convert_eot_ttf(uri, filename, config):
   """
   Save the given uri of an eot file to filename, with filetype ttf
   """
   # This will only be called from save_font when dryrun=False, so the dryrun flag here is useful only if called from some other usage.
   tf = None
   if "http://" in uri or "https://" in uri or "ftp://" in uri:
      response = config.session.get(uri)
      content = response.content
      tf = tempfile.NamedTemporaryFile()
      tf.write(content)
      infile = tf.name # change to use this temp file
   else:
      # local file, or some uri scheme not planned yet
      infile = uri
      #with open(uri,'rb') as o:
      #   content = o.read()
   try:
      r = subprocess.run(["which",config.eot2ttf_binary], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
      if r.returncode != 0:
         #raise FileNotFoundError(18,"eot2ttf","cannot find")
         eprint(f"Warning: Cannot convert {uri} because cannot find eot2ttf. Please set --eotbin.")
         return -1
      # so proceed
   except Exception as e:
      try: # clean up temp file
         if tf:
            tf.close()
      except:
         pass
      raise e

   r=subprocess.run([config.eot2ttf_binary,infile,filename])
   if r.returncode != 0:
      eprint(f"Warning: eot2ttf failed on {uri}")

   # exit convert_eot_ttf
   try: # clean up temp file
      if tf:
         tf.close()
   except:
      pass
   return 0