path: root/savewebfonts_lib.py
author    B. Stack <bgstack15@gmail.com>  2021-04-02 23:02:17 -0400
committer B. Stack <bgstack15@gmail.com>  2021-04-02 23:02:17 -0400
commit    92645115f3d8c4286acfe2f81c049b8c539093c4 (patch)
tree      b6328d6c7d3460c6f05f3ac24cb4fdd71fcd78b3 /savewebfonts_lib.py
parent    add bugs and improve sections (diff)
support har files in lib
Still need to add harfile support to front-end
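
Since the front-end does not call the new HAR path yet, here is a rough sketch of how the new library entry point could be driven directly. The HAR filename and the /tmp font directory are placeholder values, not part of this commit:

    from savewebfonts_lib import get_session, whitelist_harfile

    session = get_session()
    # Parse a saved HAR capture, collect the CSS requests it recorded, and
    # download any webfonts those stylesheets reference into <fontdir>/harfiles/.
    # dryrun=True skips the actual file writes.
    whitelist_harfile("capture.har", "/tmp/webfonts", session=session, dryrun=True)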
Diffstat (limited to 'savewebfonts_lib.py')
-rwxr-xr-x  savewebfonts_lib.py  139
1 file changed, 98 insertions, 41 deletions
diff --git a/savewebfonts_lib.py b/savewebfonts_lib.py
index 2e0a55a..a8bcb11 100755
--- a/savewebfonts_lib.py
+++ b/savewebfonts_lib.py
@@ -14,7 +14,7 @@
# Handle using tinycss old?
# Dependencies:
# req-fedora: python3-beautifulsoup4, python3-tinycss2
-import requests, os
+import requests, os, json
from sys import stderr
from bs4 import BeautifulSoup as bs # python3-beautifulsoup4
from urllib.parse import urljoin, urlparse
@@ -32,7 +32,7 @@ def get_session():
session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
return session
-def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun = False):
+def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun=False):
"""
Return all css links from a given page
"""
@@ -51,7 +51,7 @@ def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun =
css_files.append(css_url)
return css_files
-def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun = False):
+def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun=False):
"""
Return a list of urls of all webfonts specified in this css file
"""
@@ -90,7 +90,7 @@ def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryru
# c is a flat list of all font files, many of which are duplicates
return c
-def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun = False):
+def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun=False):
"""
Given a url, and destination dir, and optionally an existing http session, download the url and save to a file
"""
@@ -101,48 +101,44 @@ def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun = False)
filepath = os.path.join(destdir, filename)
if not os.path.exists(filepath):
- if not dryrun:
- # Download content
- if session:
- response = session.get(url)
- else:
- response = requests.get(url)
-
- if 'Content-Disposition' in response.headers:
- filename=response.headers['Content-Disposition']
- eprint(f"Using content-disposition value of {response.headers['Content-Disposition']}")
- filepath = os.path.join(destdir, filename)
-
- # Future: logic for woff2 to ttf conversion goes here, approximately
-
- try:
- if debuglevel >= 1:
- sstring = "Saving" if not dryrun else "Save"
- eprint(f"{sstring} {url} to file {filepath}")
+ if url.startswith("data:"):
+ # not supported!
+ eprint(f"Warning: Url {url[:120]} is unsupported.")
+ else:
if not dryrun:
- with open(filepath,'wb') as thisfile:
- thisfile.write(response.content)
- return 0
- except Exception as E:
- eprint(f"Error when downloading {url}, {E}")
- return -1
+ # Download content
+ if session:
+ response = session.get(url)
+ else:
+ response = requests.get(url)
+
+ if 'Content-Disposition' in response.headers:
+ filename=response.headers['Content-Disposition']
+ eprint(f"Using content-disposition value of {response.headers['Content-Disposition']}")
+ filepath = os.path.join(destdir, filename)
+
+ # Future: logic for woff2 to ttf conversion goes here, approximately
+
+ try:
+ if debuglevel >= 1:
+ sstring = "Saving" if not dryrun else "Save"
+ eprint(f"{sstring} {url} to file {filepath}")
+ if not dryrun:
+ with open(filepath,'wb') as thisfile:
+ thisfile.write(response.content)
+ return 0
+ except Exception as E:
+ eprint(f"Error when downloading {url}, {E}")
+ return -1
else: # filepath does exist
if debuglevel >= 2:
eprint(f"File {filepath} exists for {url}. Skipping.")
return 0
-def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False):
- """
- For the given URL, Save all listed webfonts to a directory named
- after the domain, underneath the given fontdir.
- """
+def get_all_fonts_from_csslist(all_css, session=None, debuglevel=debuglevel, dryrun=False):
all_fonts = []
- session = get_session()
-
- # List all webfonts called by the given page
- all_css = list_all_css_for_page(url, session, debuglevel = debuglevel, dryrun = dryrun)
for this_css in all_css:
- webfonts = get_webfonts_for_one_css(this_css, session, debuglevel = debuglevel, dryrun = dryrun)
+ webfonts = get_webfonts_for_one_css(this_css, session, debuglevel=debuglevel, dryrun=dryrun)
for webfont in webfonts:
# filter accepted extensions here. Technically fontconfig only uses ttf.
# Always exclude svg, because those are really big, and not usable files for fontconfig.
@@ -150,9 +146,12 @@ def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False):
if debuglevel >= 2:
eprint(f"Found font {webfont}")
all_fonts.append(webfont)
+ return all_fonts
- # Prepare destination dir
- destdir = os.path.join(fontdir,urlparse(url).netloc)
+def save_all_fonts(all_fonts, destdir, session=None, debuglevel=debuglevel, dryrun=False):
+ """
+ Given a list of font urls, and the destdir, save all these fonts
+ """
#print(f"Found {len(all_fonts)} font files for page {url}")
#print(f"Will save to {destdir}")
@@ -171,5 +170,63 @@ def whitelist_page(url, fontdir, debuglevel = debuglevel, dryrun = False):
# Loop through all webfont files and save them
for font in all_fonts:
- save_font(font, destdir, debuglevel = debuglevel, dryrun = dryrun)
+ save_font(font, destdir, session=session, debuglevel=debuglevel, dryrun=dryrun)
return 0
+
+def whitelist_page(url, fontdir, session=None, debuglevel=debuglevel, dryrun=False):
+ """
+ For the given URL, Save all listed webfonts to a directory named
+ after the domain, underneath the given fontdir.
+ """
+ all_fonts = []
+ if not session:
+ session = get_session()
+
+ # List all webfonts called by the given page
+ all_css = list_all_css_for_page(url, session, debuglevel=debuglevel, dryrun=dryrun)
+ all_fonts = get_all_fonts_from_csslist(all_css, session, debuglevel=debuglevel, dryrun=dryrun)
+
+ # Prepare destination dir
+ destdir = os.path.join(fontdir,urlparse(url).netloc)
+
+ # Save all fonts to that dir
+ return save_all_fonts(all_fonts, destdir, session, debuglevel=debuglevel, dryrun=dryrun)
+
+def whitelist_harfile(harfile, fontdir, session=None, debuglevel=debuglevel, dryrun=False):
+ """
+ Given the harfile, save all fonts listed in the discovered css files
+ """
+ all_fonts = []
+ if not session:
+ session = get_session()
+
+ # List all css in the har file
+ all_css = extract_css_urls_from_harfile(harfile)
+ all_fonts = get_all_fonts_from_csslist(all_css, session, debuglevel=debuglevel, dryrun=dryrun)
+
+ # Prepare destination dir
+ destdir = os.path.join(fontdir,"harfiles")
+
+ # Save all fonts to that dir
+ return save_all_fonts(all_fonts, destdir, session, debuglevel=debuglevel, dryrun=dryrun)
+
+def extract_css_urls_from_harfile(harfile):
+ """
+ Extract all urls that match string "css" from a har file
+ """
+ css_files = []
+
+ with open(harfile,'r') as o:
+ har_contents = o.read()
+
+ a = json.loads(har_contents)
+ c = a['log']['entries']
+ x = 0
+ for d in c:
+ e = c[x]['request']['url']
+ if "css" in e and e not in css_files:
+ if debuglevel >= 5:
+ eprint(e)
+ css_files.append(e)
+ x = x + 1
+ return css_files
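
For reference, extract_css_urls_from_harfile only reads the log.entries[].request.url fields of a HAR file, so a minimal hand-rolled capture like the one below (URLs are illustrative) is enough to exercise it:

    import json, tempfile
    from savewebfonts_lib import extract_css_urls_from_harfile

    # Minimal HAR-shaped document; only log.entries[].request.url is consulted.
    minimal_har = {
        "log": {
            "entries": [
                {"request": {"url": "https://example.com/static/site.css"}},  # kept: contains "css"
                {"request": {"url": "https://example.com/img/logo.png"}},     # ignored
            ]
        }
    }

    # Write it to a temporary file, since the function expects a path on disk.
    with tempfile.NamedTemporaryFile("w", suffix=".har", delete=False) as f:
        json.dump(minimal_har, f)
        harfile = f.name

    print(extract_css_urls_from_harfile(harfile))
    # expected: ['https://example.com/static/site.css']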