aboutsummaryrefslogtreecommitdiff
path: root/savewebfonts_lib.py
blob: cb29ec7e738ec1588bd209f595f6b76aa88c26df (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
#!/usr/bin/env python3
# File: /usr/libexec/savewebfonts/savewebfonts_lib.py
# Location: save-webfonts package
# Author: bgstack15
# Startdate: 2021-04-02 07:20
# SPDX-License-Identifier: CC-BY-SA 4.0
# Title: Library for Saving Webfonts
# Purpose: library for whitelisting a page's webfonts by downloading them for current user
# Usage: See save-webfonts (1)
# Reference:
#    https://github.com/fonttools/fonttools/issues/1694
# Improve:
#    Handle using tinycss old?
# Dependencies:
#    req-fedora: python3-beautifulsoup4, python3-tinycss2
#    rec-fedora: python3-fonttools, libeot-tools
#    req-devuan: python3-bs4, python3-tinycss2
#    rec-devuan: python3-fonttools, eot2ttf

import requests, os, json, tempfile, subprocess, base64
from sys import stderr
from bs4 import BeautifulSoup as bs # python3-beautifulsoup4
from urllib.parse import urljoin, urlparse
import tinycss2 # python3-tinycss2

# defaults for library
class swf_config:
   """
   Bundle of settings passed to the library functions.

   Attributes set by the constructor:
      debuglevel: verbosity threshold compared against by the library's messages
      MAX_STRING_PRINT_LENGTH: maximum number of characters of a url shown in messages
      eot2ttf_binary: name or path of the eot-to-ttf converter program
      dryrun: when true, report what would be saved without writing files
      session: http session object; a new one from get_session() is used when None is given
      convert: when true, try to convert downloaded fonts to ttf
   """
   def __init__(self, debuglevel=8, session=None, MAX_STRING_PRINT_LENGTH=180,
         eot2ttf_binary="eot2ttf", dryrun=True, convert=False):
      # Assignment order matters: __repr__ walks __dict__ in insertion order.
      self.debuglevel = debuglevel
      self.MAX_STRING_PRINT_LENGTH = MAX_STRING_PRINT_LENGTH
      self.eot2ttf_binary = eot2ttf_binary
      self.dryrun = dryrun
      self.session = get_session() if session is None else session
      self.convert = convert

   def __repr__(self):
      """
      Return '<swf_config name="value", ...>' for every attribute whose
      name does not contain "session".
      """
      shown = [
         f' {name!s}="{value!s}",'
         for name, value in self.__dict__.items()
         if "session" not in name
      ]
      return ("<swf_config" + "".join(shown)).rstrip(",") + ">"

# Default configuration used by whitelist_page and whitelist_harfile when the
# caller passes no config. It is built once, when this module is imported, and
# because no session is given the constructor calls get_session() at that time.
config_default = swf_config()

# Functions
def eprint(*args, **kwargs):
   """
   Print to standard error; accepts the same arguments as print(), except file.
   """
   return print(*args, file=stderr, **kwargs)

def ttfify_filename(filename):
   """
   Return filename with known webfont suffixes removed and ".ttf" appended.

   The suffixes are tried once each, in the order .woff2, .woff, .eot, .svg,
   so more than one may be removed (e.g. "a.woff.woff2" becomes "a.ttf").
   """
   stem = filename
   for suffix in (".woff2", ".woff", ".eot", ".svg"):
      cut = len(suffix)
      if stem[-cut:] == suffix:
         stem = stem[:len(stem) - cut]
   return f"{stem}.ttf"

def get_session():
   """
   Return a new requests session whose User-Agent header is set to a desktop Chrome string.
   """
   browser_like = requests.Session()
   browser_like.headers.update({
      "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
   })
   return browser_like

def list_all_css_for_page(url, config):
   """
   Return all css links from a given page

   url: address of the html page to fetch with config.session
   config: settings object; session and debuglevel are used
   Returns a list of absolute urls, one for every <link> tag whose href
   contains ".css", in document order.
   """
   # Reference: https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python
   css_files = []
   html = config.session.get(url).content
   soup = bs(html, "html.parser")
   for link in soup.find_all("link"):
      href = link.attrs.get("href")
      # A <link> tag without an href yields None here; skip it rather than
      # fail on the substring test below.
      if not href or ".css" not in href:
         continue
      # make absolute from relative
      css_url = urljoin(url, href)
      if config.debuglevel >= 8:
         eprint(f"Found css: {css_url}")
      css_files.append(css_url)
   return css_files

def get_webfonts_for_one_css(url, config):
   """
   Return a list of urls of all webfonts specified in this css file

   url: address of the css file to fetch with config.session
   config: settings object; session, debuglevel and MAX_STRING_PRINT_LENGTH are used
   Returns a list of absolute urls, without duplicates, in order of first appearance.
   """
   css = config.session.get(url).content
   # parse_stylesheet_bytes returns a tuple; the first item is the list of rules
   rules = tinycss2.parse_stylesheet_bytes(css)[0]

   # extract only the font-face rules
   font_face_rules = []
   for position, rule in enumerate(rules, 1):
      if rule.type == "at-rule" and "font-face" in rule.lower_at_keyword:
         font_face_rules.append(rule)
         if config.debuglevel >= 10:
            eprint(str(position) + " " + str(rule))

   font_urls = []
   for rule in font_face_rules:
      # An at-rule that ends with ";" instead of a {} block has no content (None).
      for token in rule.content or []:
         if token.type != "url":
            continue
         # make absolute from relative
         thisurl = urljoin(url,token.value)
         if thisurl not in font_urls:
            if config.debuglevel >= 5:
               eprint(f"get_webfonts_for_one_css: Found font url {thisurl[:config.MAX_STRING_PRINT_LENGTH]}")
            font_urls.append(thisurl)
   return font_urls

def save_font(url,destdir,config):
   """
   Given a url and destination dir, download the url and save it to a file
   in destdir. If config.convert, save any woff/woff2/eot as ttf.

   url: http(s) address of a font, or an inline "data:application/x-font-woff;charset=utf-8;base64," url.
      Other data: urls are not supported.
   destdir: directory to save into
   config: settings object; session, dryrun, convert, debuglevel and MAX_STRING_PRINT_LENGTH are used
   Returns 0 when the file was saved, already existed, or would be saved (dryrun).
   Returns -1 for an unsupported url or when a conversion reports failure.
   """
   # The only inline font form handled so far.
   inline_woff_prefix = "data:application/x-font-woff;charset=utf-8;base64,"
   need_convert = False

   # Derive filename from the url
   filename = os.path.basename(urlparse(url).path)
   ext = os.path.splitext(filename)[-1]
   tf = None
   # Do not try to convert .svg
   if config.convert and not filename.endswith(".ttf") and ext not in [".svg"]:
      need_convert = True
      filename = ttfify_filename(filename)

   try:
      if url.startswith(inline_woff_prefix):
         # Inline fonts are always converted, because the decoded bytes are woff.
         need_convert = True
         ext = ".woff"
         # no worries about dryrun; we have already downloaded the font contents which are inline in the css file itself.
         contents = url[len(inline_woff_prefix):]
         tf = tempfile.NamedTemporaryFile()
         tf.write(base64.b64decode(contents))
         # The converter reopens this file by name, so the bytes must be on disk.
         tf.flush()
         # Name the file after the start of the base64 text. "/" is a valid
         # base64 character but must not end up in a file name.
         filename = ttfify_filename(contents[:20].replace("/","_"))

      filepath = os.path.join(destdir, filename)

      if os.path.exists(filepath):
         if config.debuglevel >= 2:
            eprint(f"File {filepath} exists for {url[:config.MAX_STRING_PRINT_LENGTH]}. Skipping.")
         return 0

      if url.startswith("data:") and tf is None:
         # not supported yet!
         eprint(f"Warning: Url {url[:config.MAX_STRING_PRINT_LENGTH]} is unsupported, for file {filepath}.")
         return -1

      file_contents = None
      if not config.dryrun and tf is None:
         # Download content
         response = config.session.get(url)
         if 'Content-Disposition' in response.headers:
            disposition = response.headers['Content-Disposition']
            if config.debuglevel >= 1:
               eprint(f"Using content-disposition value of {disposition}")
            # The header looks like: attachment; filename="name.woff"
            # Only the filename parameter is wanted, and only its last path component.
            from email.message import Message
            parsed = Message()
            parsed["Content-Disposition"] = disposition
            served_name = os.path.basename((parsed.get_filename() or "").strip())
            if served_name:
               filename = served_name
               served_ext = os.path.splitext(served_name)[-1]
               if served_ext:
                  ext = served_ext # choose the converter from the served name
               if need_convert and not filename.endswith(".ttf"):
                  filename = ttfify_filename(filename)
         file_contents = response.content

      # the name may have changed because of content-disposition
      filepath = os.path.join(destdir, filename)
      if config.debuglevel >= 1:
         sstring = "Saving" if not config.dryrun else "Save"
         eprint(f"{sstring} {url[:config.MAX_STRING_PRINT_LENGTH]} to file {filepath}")
      if config.dryrun:
         return 0

      result = 0
      if not need_convert:
         with open(filepath,'wb') as thisfile:
            thisfile.write(file_contents)
      elif ext in [".woff",".woff2"]:
         # inline fonts are converted from the temp file, others from the url
         convert_in = tf.name if tf else url
         result = convert_woffwoff2_ttf(convert_in,filepath,config=config)
      elif ext in [".eot"]:
         result = convert_eot_ttf(url,filepath,config=config)
      else:
         # no plan for conversion!
         eprint(f"Warning: no conversion plan for ext {ext} of {url[:config.MAX_STRING_PRINT_LENGTH]}. Saving as-is.")
         with open(filepath,'wb') as thisfile:
            thisfile.write(file_contents)
      return 0 if result == 0 else -1
   finally:
      # closing the temp file also deletes it
      if tf: tf.close()

def get_all_fonts_from_csslist(all_css, config):
   """
   Return the font urls found in all the given css urls, without duplicates.

   Any url containing ".svg" is left out.
   """
   collected = []
   for css_url in all_css:
      for font_url in get_webfonts_for_one_css(css_url, config):
         # filter accepted extensions here. Technically fontconfig only uses ttf.
         # Always exclude svg, because those are really big, and not usable files for fontconfig.
         # WORKHERE: allow svg, if convert_woffwoff2_ttf works on svg.
         if font_url in collected or '.svg' in font_url:
            continue
         if config.debuglevel >= 2:
            eprint(f"Found font {font_url[:config.MAX_STRING_PRINT_LENGTH+30]}")
         collected.append(font_url)
   return collected

def save_all_fonts(all_fonts, destdir, config):
   """
   Save every font url in all_fonts into destdir.

   Raises NotADirectoryError when destdir exists but is not a directory.
   Unless config.dryrun, destdir itself is created when missing.
   Returns 0.
   """
   if os.path.exists(destdir) and not os.path.isdir(destdir):
      raise NotADirectoryError(20,destdir,"Please clean up this non-directory file and try again")

   if not config.dryrun:
      try:
         os.mkdir(destdir)
      except FileExistsError:
         pass # it already exists

   # Loop through all webfont files and save them
   for font_url in all_fonts:
      save_font(font_url, destdir, config)
   return 0

def whitelist_page(url, fontdir, config = config_default):
   """
   Save all webfonts listed by the css files of the given page url.

   The fonts go to a directory named after the url's domain, underneath
   fontdir. Returns the result of save_all_fonts.
   """
   # List all webfonts called by the given page
   font_urls = get_all_fonts_from_csslist(list_all_css_for_page(url, config), config)

   # Destination dir is named after the domain
   domain = urlparse(url).netloc
   return save_all_fonts(font_urls, os.path.join(fontdir, domain), config)

def whitelist_harfile(harfile, fontdir, config = config_default):
   """
   Save all fonts listed in the css files that the given har file mentions.

   The fonts go to fontdir/harfile-<domain>, where the domain is taken
   from the first entry of the har file. Returns the result of save_all_fonts.
   """
   # List all css in the har file, then the fonts in that css
   css_urls = extract_css_urls_from_harfile(harfile, config)
   font_urls = get_all_fonts_from_csslist(css_urls, config)

   # Destination dir is named after the domain of the first entry
   with open(harfile,'r') as handle:
      har = json.loads(handle.read())
   first_url = har['log']['entries'][0]['request']['url']
   destdir = os.path.join(fontdir, "harfile-" + urlparse(first_url).netloc)

   # Save all fonts to that dir
   return save_all_fonts(font_urls, destdir, config)

def extract_css_urls_from_harfile(harfile, config):
   """
   Return the request urls in a har file that contain the string "css".

   Duplicates are dropped; the order of first appearance is kept.
   """
   with open(harfile,'r') as handle:
      har = json.loads(handle.read())

   found = []
   for entry in har['log']['entries']:
      request_url = entry['request']['url']
      if "css" not in request_url or request_url in found:
         continue
      if config.debuglevel >= 5:
         eprint(request_url)
      found.append(request_url)
   return found

def convert_woffwoff2_ttf(url, filename, config):
   """
   Save the given url to filename, with filetype ttf

   url: http://, https:// or ftp:// address fetched with config.session;
      anything else is opened as a local file
   filename: path of the ttf file to write
   config: settings object; session, debuglevel and MAX_STRING_PRINT_LENGTH are used
   Returns 0 on success, or -1 when fontTools cannot be loaded.
   """
   # This will only be called from save_font when dryrun=False, so the dryrun flag here is useful only if called from some other usage.
   # Check for the library first, so nothing is downloaded when it is missing.
   try:
      from fontTools import ttLib
   except ModuleNotFoundError:
      eprint("Warning: cannot load fontTools. Try installing python3-fonttools")
      return -1

   if url.startswith(("http://","https://","ftp://")):
      file_contents = config.session.get(url).content
   else:
      # assume local file
      with open(url,'rb') as o:
         file_contents = o.read()

   with tempfile.TemporaryFile() as tf:
      tf.write(file_contents)
      tf.seek(0) # rewind so the font is read from the start
      font = ttLib.TTFont(tf)
      if config.debuglevel >= 3:
         eprint(f"Converting {url[:config.MAX_STRING_PRINT_LENGTH]} from {font.flavor} to ttf as file {filename}")

      font.flavor = None # restores default value, for non-compressed OpenType
      # Save while the temp file is still open, in case table data has not
      # been read from it yet.
      font.save(filename)

   return 0

def convert_eot_ttf(uri, filename, config):
   """
   Save the given uri of an eot file to filename, with filetype ttf

   uri: http://, https:// or ftp:// address fetched with config.session;
      anything else is passed to the converter as a local path
   filename: path of the ttf file to write
   config: settings object; session and eot2ttf_binary are used
   Returns 0 on success, or -1 when the converter program cannot be found
   or exits with a non-zero status.
   """
   # This will only be called from save_font when dryrun=False, so the dryrun flag here is useful only if called from some other usage.
   import shutil

   # Look for the converter before downloading anything.
   if shutil.which(config.eot2ttf_binary) is None:
      eprint(f"Warning: Cannot convert {uri} because cannot find eot2ttf. Please set --eotbin.")
      return -1

   tf = None
   try:
      if uri.startswith(("http://","https://","ftp://")):
         tf = tempfile.NamedTemporaryFile()
         tf.write(config.session.get(uri).content)
         # The converter opens the file by name, so the bytes must be on disk.
         tf.flush()
         infile = tf.name # change to use this temp file
      else:
         # local file, or some uri scheme not planned yet
         infile = uri

      r = subprocess.run([config.eot2ttf_binary,infile,filename])
      if r.returncode != 0:
         eprint(f"Warning: eot2ttf failed on {uri}")
         return -1
      return 0
   finally:
      # clean up temp file; closing it also deletes it
      if tf:
         try:
            tf.close()
         except OSError:
            pass
bgstack15