path: root/savewebfonts_lib.py
author    B. Stack <bgstack15@gmail.com>  2021-04-02 23:02:17 -0400
committer B. Stack <bgstack15@gmail.com>  2021-04-02 23:02:17 -0400
commit    92645115f3d8c4286acfe2f81c049b8c539093c4 (patch)
tree      b6328d6c7d3460c6f05f3ac24cb4fdd71fcd78b3 /savewebfonts_lib.py
parent    add bugs and improve sections (diff)
support har files in lib
Still need to add harfile support to front-end
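
Since the front-end does not call the new HAR path yet, here is a rough sketch of how the new library entry point could be driven directly. The HAR filename and the /tmp font directory are placeholder values, not part of this commit:

    from savewebfonts_lib import get_session, whitelist_harfile

    session = get_session()
    # Parse a saved HAR capture, collect the CSS requests it recorded, and
    # download any webfonts those stylesheets reference into <fontdir>/harfiles/.
    # dryrun=True skips the actual file writes.
    whitelist_harfile("capture.har", "/tmp/webfonts", session=session, dryrun=True)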
Diffstat (limited to 'savewebfonts_lib.py')
-rwxr-xr-x  savewebfonts_lib.py  139
1 file changed, 98 insertions, 41 deletions
diff --git a/savewebfonts_lib.py b/savewebfonts_lib.py
index 2e0a55a..a8bcb11 100755
--- a/savewebfonts_lib.py
+++ b/savewebfonts_lib.py
@@ -14,7 +14,7 @@
# Handle using tinycss old?
# Dependencies:
# req-fedora: python3-beautifulsoup4, python3-tinycss2
-import requests, os
+import requests, os, json
from sys import stderr
from bs4 import BeautifulSoup as bs # python3-beautifulsoup4
from urllib.parse import urljoin, urlparse
@@ -32,7 +32,7 @@ def get_session():
session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
return session
-def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun = False):
+def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun=False):
"""
Return all css links from a given page
"""
@@ -51,7 +51,7 @@ def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun =
css_files.append(css_url)
return css_files
-def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun = False):
+def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun=False):
"""
Return a list of urls of all webfonts specified in this css file
"""
@@ -90,7 +90,7 @@ def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryru
# c is a flat list of all font files, many of which are duplicates
return c
-def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun = False):
+def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun=False):
"""
Given a url, and destination dir, and optionally an existing http session, download the url and save to a file
"""
@@ -101,48 +101,44 @@ def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun = False)
filepath = os.path.join(destdir, filename)
if not os.path.exists(filepath):
- if not dryrun:
- # Download content
- if session:
- response = session.get(url)
- else:
- response = requests.get(url)
-
- if 'Content-Disposition' in response.headers:
- filename=response.headers['Content-Disposition']
- eprint(f"Using content-disposition value of {response.headers['Content-Disposition']}")
- filepath = os.path.join(destdir, filename)
-
- # Future: logic for woff2 to ttf conversion goes here, approximately
-
- try:
- if debuglevel >= 1:
- sstring = "Saving" if not dryrun else "Save"
- eprint(f"{sstring} {url} to file {filepath}")
+ if url.startswith("data:"):
+ # not supported!
+ eprint(f"Warning: Url {url[:120]} is unsupported.")
+ else:
if not dryrun:
- with open(filepath,'wb') as thisfile:
- thisfile.write(response.content)
- return 0
- except Exception as E:
- eprint(f"Error when downloading {url}, {E}")
- return -1
+ # Download content
+ if session:
+ response = session.get(url)
+ else:
+ response = requests.get(url)
+
+ if 'Content-Disposition' in response.headers:
+ filename=response.headers['Content-Disposition']
+ eprint(f"Using content-disposition value of {response.headers['Content-Disposition']}")
+ filepath = os.path.join(destdir, filename)
+
+ # Future: logic for woff2 to ttf conversion goes here, approximately
+
+ try:
+ if debuglevel >= 1:
+ sstring = "Saving" if not dryrun else "Save"
+ eprint(f"{sstring} {url} to file {filepath}")
+ if not dryrun:
+ with open(filepath,'wb') as thisfile:
+ thisfile.write(response.content)
+ return 0
+ except Exception as E:
+ eprint(f"Error when downloading {url}, {E}")
+ return -1
else: # filepath does exist
if debuglevel >= 2:
eprint(f"File {filepath} exists for {url}. Skipping.")
return 0
-def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False):
- """
- For the given URL, Save all listed webfonts to a directory named
- after the domain, underneath the given fontdir.
- """
+def get_all_fonts_from_csslist(all_css, session=None, debuglevel=debuglevel, dryrun=False):
all_fonts = []
- session = get_session()
-
- # List all webfonts called by the given page
- all_css = list_all_css_for_page(url, session, debuglevel = debuglevel, dryrun = dryrun)
for this_css in all_css:
- webfonts = get_webfonts_for_one_css(this_css, session, debuglevel = debuglevel, dryrun = dryrun)
+ webfonts = get_webfonts_for_one_css(this_css, session, debuglevel=debuglevel, dryrun=dryrun)
for webfont in webfonts:
# filter accepted extensions here. Technically fontconfig only uses ttf.
# Always exclude svg, because those are really big, and not usable files for fontconfig.
@@ -150,9 +146,12 @@ def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False):
if debuglevel >= 2:
eprint(f"Found font {webfont}")
all_fonts.append(webfont)
+ return all_fonts
- # Prepare destination dir
- destdir = os.path.join(fontdir,urlparse(url).netloc)
+def save_all_fonts(all_fonts, destdir, session=None, debuglevel=debuglevel, dryrun=False):
+ """
+ Given a list of font urls, and the destdir, save all these fonts
+ """
#print(f"Found {len(all_fonts)} font files for page {url}")
#print(f"Will save to {destdir}")
@@ -171,5 +170,63 @@ def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False):
# Loop through all webfont files and save them
for font in all_fonts:
- save_font(font, destdir, debuglevel = debuglevel, dryrun = dryrun)
+ save_font(font, destdir, session=session, debuglevel=debuglevel, dryrun=dryrun)
return 0
+
+def whitelist_page(url, fontdir, session=None, debuglevel=debuglevel, dryrun=False):
+ """
+ For the given URL, Save all listed webfonts to a directory named
+ after the domain, underneath the given fontdir.
+ """
+ all_fonts = []
+ if not session:
+ session = get_session()
+
+ # List all webfonts called by the given page
+ all_css = list_all_css_for_page(url, session, debuglevel=debuglevel, dryrun=dryrun)
+ all_fonts = get_all_fonts_from_csslist(all_css, session, debuglevel=debuglevel, dryrun=dryrun)
+
+ # Prepare destination dir
+ destdir = os.path.join(fontdir,urlparse(url).netloc)
+
+ # Save all fonts to that dir
+ return save_all_fonts(all_fonts, destdir, session, debuglevel=debuglevel, dryrun=dryrun)
+
+def whitelist_harfile(harfile, fontdir, session=None, debuglevel=debuglevel, dryrun=False):
+ """
+ Given the harfile, save all fonts listed in the discovered css files
+ """
+ all_fonts = []
+ if not session:
+ session = get_session()
+
+ # List all css in the har file
+ all_css = extract_css_urls_from_harfile(harfile)
+ all_fonts = get_all_fonts_from_csslist(all_css, session, debuglevel=debuglevel, dryrun=dryrun)
+
+ # Prepare destination dir
+ destdir = os.path.join(fontdir,"harfiles")
+
+ # Save all fonts to that dir
+ return save_all_fonts(all_fonts, destdir, session, debuglevel=debuglevel, dryrun=dryrun)
+
+def extract_css_urls_from_harfile(harfile):
+ """
+ Extract all urls that match string "css" from a har file
+ """
+ css_files = []
+
+ with open(harfile,'r') as o:
+ har_contents = o.read()
+
+ a = json.loads(har_contents)
+ c = a['log']['entries']
+ x = 0
+ for d in c:
+ e = c[x]['request']['url']
+ if "css" in e and e not in css_files:
+ if debuglevel >= 5:
+ eprint(e)
+ css_files.append(e)
+ x = x + 1
+ return css_files
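
For reference, extract_css_urls_from_harfile only reads the log.entries[].request.url fields of a HAR file, so a minimal hand-rolled capture like the one below (URLs are illustrative) is enough to exercise it:

    import json, tempfile
    from savewebfonts_lib import extract_css_urls_from_harfile

    # Minimal HAR-shaped document; only log.entries[].request.url is consulted.
    minimal_har = {
        "log": {
            "entries": [
                {"request": {"url": "https://example.com/static/site.css"}},  # kept: contains "css"
                {"request": {"url": "https://example.com/img/logo.png"}},     # ignored
            ]
        }
    }

    # Write it to a temporary file, since the function expects a path on disk.
    with tempfile.NamedTemporaryFile("w", suffix=".har", delete=False) as f:
        json.dump(minimal_har, f)
        harfile = f.name

    print(extract_css_urls_from_harfile(harfile))
    # expected: ['https://example.com/static/site.css']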