aboutsummaryrefslogtreecommitdiff
path: root/savewebfonts_lib.py
blob: 19790df4bc271a5f434f566d4ace64934e0139bf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
#!/usr/bin/env python3
# File: /usr/libexec/savewebfonts/savewebfonts_lib.py
# Location: save-webfonts package
# Author: bgstack15
# Startdate: 2021-04-02 07:20
# SPDX-License-Identifier: CC-BY-SA 4.0
# Title: Library for Saving Webfonts
# Purpose: library for whitelisting a page's webfonts by downloading them for current user
# Usage: See save-webfonts (1)
# Reference:
#    https://github.com/fonttools/fonttools/issues/1694
# Improve:
#    accept a list of filetypes to save, or exclude? Such as, ['ttf','woff2']
#    Convert woff2 fonts?
#    Handle using tinycss old?
# Dependencies:
#    req-fedora: python3-beautifulsoup4, python3-tinycss2
#    rec-fedora: python3-fonttools
import requests, os, json, tempfile
from sys import stderr
from bs4 import BeautifulSoup as bs # python3-beautifulsoup4
from urllib.parse import urljoin, urlparse
import tinycss2 # python3-tinycss2

# defaults for library
# Default verbosity, used as the default for the debuglevel parameter of the
# functions below. Those functions print progressively more to stderr as the
# value rises; the thresholds they test range from 1 to 10.
debuglevel = 8
# Longest prefix of a url that is echoed in warning and conversion messages.
MAX_STRING_PRINT_LENGTH = 180

# Functions
def eprint(*args, **kwargs):
   """
   Print to standard error.

   Takes the same positional and keyword arguments as print(), except
   file, which is always set to stderr here.
   """
   return print(*args, **kwargs, file=stderr)

def ttfify_filename(filename):
   """
   Return filename with a trailing .woff, .woff2, .svg or .eot extension
   replaced by .ttf. A filename with any other ending just gets .ttf appended.
   """
   # str.rstrip() removes a set of characters, not a suffix, so it would also
   # eat trailing letters of the stem (e.g. "font.woff2" -> "fon"). Compare
   # whole suffixes instead.
   for suffix in (".woff2", ".woff", ".svg", ".eot"):
      if filename.endswith(suffix):
         filename = filename[:-len(suffix)]
         break
   return filename + ".ttf"


def get_session():
   """
   Build a new requests session whose User-Agent header is set to a desktop Chrome string.
   """
   browser_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
   http = requests.Session()
   http.headers["User-Agent"] = browser_agent
   return http

def list_all_css_for_page(url, session = None, debuglevel = debuglevel, dryrun=False):
   """
   Return all css links from a given page

   url: the page to download and scan for <link> tags
   session: optional http session; one is made with get_session() when omitted
   debuglevel: at 8 or higher, each css url found is printed to stderr
   dryrun: accepted so all library functions share a signature; unused here
   Returns a list of absolute css urls in document order. Duplicates are kept.
   """
   # Reference: https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python
   css_files = []
   if not session:
      session = get_session()
   html = session.get(url).content
   soup = bs(html, "html.parser")
   for link in soup.find_all("link"):
      href = link.attrs.get("href")
      # A <link> tag without an href attribute yields None here; skip it
      # rather than fail on the substring test.
      if not href or ".css" not in href:
         continue
      # make absolute from relative
      css_url = urljoin(url, href)
      if debuglevel >= 8:
         eprint(f"Found css: {css_url}")
      css_files.append(css_url)
   return css_files

def get_webfonts_for_one_css(url, session = None, debuglevel = debuglevel, dryrun=False):
   """
   Return a list of urls of all webfonts specified in this css file

   url: the css file to download and parse
   session: optional http session; one is made with get_session() when omitted
   debuglevel: at 10 or higher each font-face rule is printed to stderr, at 5 or higher each new font url is printed
   dryrun: accepted so all library functions share a signature; unused here
   Returns a list of absolute urls, in order of first appearance, without duplicates.
   """
   if not session:
      session = get_session()
   css = session.get(url).content
   # parse_stylesheet_bytes returns a pair; the first item is the list of rules
   rules = tinycss2.parse_stylesheet_bytes(css)[0]

   # extract only the font-face rules
   font_face_rules = []
   for position, rule in enumerate(rules, start=1):
      # Not every node has at_keyword, so read both attributes with a
      # default instead of hiding AttributeError behind a bare except.
      if "at-rule" in getattr(rule, "type", "") and "font-face" in getattr(rule, "at_keyword", ""):
         font_face_rules.append(rule)
         if debuglevel >= 10:
            eprint(str(position) + " " + str(rule))

   # NOTE(review): only tokens whose type contains "url" are collected. Confirm
   # whether the installed tinycss2 reports quoted url("...") values as such
   # tokens or as function blocks, which this loop would not pick up.
   font_urls = []
   for rule in font_face_rules:
      # tinycss2 documents content as None when the at-rule ends with ';'
      # instead of a {} block, so treat that as an empty rule.
      for token in rule.content or []:
         if "url" in token.type:
            # make absolute from relative
            thisurl = urljoin(url,token.value)
            if thisurl not in font_urls:
               if debuglevel >= 5:
                  eprint(f"get_webfonts_for_one_css: Found font url {thisurl}")
               font_urls.append(thisurl)
   return font_urls

def _filename_from_content_disposition(header):
   """
   Return the bare filename named in a Content-Disposition header value, or None when the header has no usable filename parameter.
   """
   # Function-scope import: only this helper needs the stdlib header parser.
   from email.message import Message
   msg = Message()
   msg["Content-Disposition"] = header
   name = msg.get_filename()
   if not name:
      return None
   # Keep only the last path component so a server-supplied name cannot
   # place the file outside the destination directory.
   name = os.path.basename(name.replace("\\", "/"))
   return name or None

def save_font(url,destdir,session=None, debuglevel = debuglevel, dryrun=False, convert=False):
   """
   Given a url, and destination dir, and optionally an existing http session, download the url and save to a file. If convert, save any woff/woff2 to ttf.

   url: font url to download; data: urls are not supported
   destdir: directory the file is written to
   session: optional http session; plain requests.get is used when omitted
   debuglevel: at 1 or higher the save is announced on stderr, at 2 or higher a skipped existing file is announced
   dryrun: if true, announce what would be saved but neither download nor write
   convert: if true, anything that is not .ttf or .eot is passed to convert_font and stored with a .ttf name
   Returns 0 if the file was saved, already existed, or would be saved in a dry run.
   Returns -1 if the url is a data: url, or if downloading, writing or converting failed.
   """
   if url.startswith("data:"):
      # not supported!
      # WORKHERE: support saving to a tempfile this datastream, probably a base64encoded woff file. Then just convert.
      eprint(f"Warning: Url {url[:MAX_STRING_PRINT_LENGTH]} is unsupported.")
      return -1

   # Derive filename
   filename = os.path.basename(urlparse(url).path)
   ext = os.path.splitext(filename)[-1]
   # Do not try to convert .eot
   need_convert = bool(convert) and not filename.endswith(".ttf") and ext not in [".eot"]
   if need_convert:
      filename = ttfify_filename(filename)
   filepath = os.path.join(destdir, filename)

   if os.path.exists(filepath):
      if debuglevel >= 2:
         eprint(f"File {filepath} exists for {url}. Skipping.")
      return 0

   # The download happens inside the try so that network failures are
   # reported by the "Error when downloading" handler below.
   try:
      response = None
      if not dryrun:
         # Download content
         response = session.get(url) if session else requests.get(url)

         header = response.headers.get('Content-Disposition')
         if header:
            eprint(f"Using content-disposition value of {header}")
            # Use only the filename parameter of the header, not the whole
            # header value. Keep the url-derived name if there is none.
            served_name = _filename_from_content_disposition(header)
            if served_name:
               if need_convert and not served_name.endswith(".ttf"):
                  served_name = ttfify_filename(served_name)
               filepath = os.path.join(destdir, served_name)

      if debuglevel >= 1:
         sstring = "Saving" if not dryrun else "Save"
         eprint(f"{sstring} {url} to file {filepath}")
      if dryrun:
         return 0
      if need_convert:
         # convert_font fetches the url itself and reports a missing
         # fontTools library on its own; pass its result code through.
         return convert_font(url,filepath,session=session,debuglevel=debuglevel,dryrun=dryrun)
      with open(filepath,'wb') as thisfile:
         thisfile.write(response.content)
      return 0
   except Exception as E:
      eprint(f"Error when downloading {url}, {E}")
      return -1

def get_all_fonts_from_csslist(all_css, session=None, debuglevel=debuglevel, dryrun=False):
   """
   Given a list of css urls, return the font urls found in all of them.

   Each css url is passed to get_webfonts_for_one_css. The result keeps
   first-seen order, holds no duplicates, and leaves out every url that
   contains ".svg". At debuglevel 2 or higher each accepted font is printed.
   """
   collected = []
   for css_url in all_css:
      for candidate in get_webfonts_for_one_css(css_url, session, debuglevel=debuglevel, dryrun=dryrun):
         # filter accepted extensions here. Technically fontconfig only uses ttf.
         # Always exclude svg, because those are really big, and not usable files for fontconfig.
         # WORKHERE: allow svg, if convert_font works on svg.
         if candidate in collected or '.svg' in candidate:
            continue
         if debuglevel >= 2:
            eprint(f"Found font {candidate}")
         collected.append(candidate)
   return collected

def save_all_fonts(all_fonts, destdir, session=None, debuglevel=debuglevel, dryrun=False, convert=False):
   """
   Given a list of font urls, and the destdir, save all these fonts

   all_fonts: iterable of font urls, each handed to save_font
   destdir: directory to save into; created (with parents) unless dryrun
   session, debuglevel, dryrun, convert: passed through to save_font
   Returns 0. The result of each individual save_font call is not inspected.
   Raises NotADirectoryError if destdir exists but is not a directory.
   """
   if os.path.exists(destdir) and not os.path.isdir(destdir):
      # OSError arguments are (errno, strerror, filename); 20 is ENOTDIR on Linux
      raise NotADirectoryError(20,"Please clean up this non-directory file and try again",destdir)

   if not dryrun:
      # Also creates missing parent directories, and is a no-op when the
      # directory is already there.
      os.makedirs(destdir, exist_ok=True)

   # Loop through all webfont files and save them
   for font in all_fonts:
      save_font(font, destdir, session=session, debuglevel=debuglevel, dryrun=dryrun, convert=convert)
   return 0

def whitelist_page(url, fontdir, session=None, debuglevel=debuglevel, dryrun=False, convert = False):
   """
   Save every webfont referenced by the css files of the given page.

   Fonts go to a directory named after the network location of url,
   underneath fontdir. The convert flag is handed on to save_all_fonts,
   whose return value is returned.
   """
   http = session if session else get_session()

   # Stylesheets linked from the page, then the fonts those stylesheets name
   stylesheets = list_all_css_for_page(url, http, debuglevel=debuglevel, dryrun=dryrun)
   font_urls = get_all_fonts_from_csslist(stylesheets, http, debuglevel=debuglevel, dryrun=dryrun)

   # One directory per site
   site_dir = os.path.join(fontdir, urlparse(url).netloc)

   return save_all_fonts(font_urls, site_dir, http, debuglevel=debuglevel, dryrun=dryrun, convert=convert)

def whitelist_harfile(harfile, fontdir, session=None, debuglevel=debuglevel, dryrun=False, convert=False):
   """
   Save every webfont referenced by the css urls recorded in a har file.

   Fonts go to the "harfiles" directory underneath fontdir. The convert
   flag is handed on to save_all_fonts, whose return value is returned.
   """
   http = session if session else get_session()

   # Css urls recorded in the har file, then the fonts those stylesheets name
   stylesheets = extract_css_urls_from_harfile(harfile)
   font_urls = get_all_fonts_from_csslist(stylesheets, http, debuglevel=debuglevel, dryrun=dryrun)

   # All har-derived fonts share one directory
   har_dir = os.path.join(fontdir, "harfiles")

   return save_all_fonts(font_urls, har_dir, http, debuglevel=debuglevel, dryrun=dryrun, convert=convert)

def extract_css_urls_from_harfile(harfile):
   """
   Extract all urls that match string "css" from a har file

   harfile: path to a har (json) file with a log.entries list
   Returns the request urls that contain "css", in file order, without duplicates.
   At module debuglevel 5 or higher each url found is printed to stderr.
   """
   # Read as UTF-8 rather than the platform default encoding; utf-8-sig also
   # accepts a leading byte order mark, which json would otherwise reject.
   with open(harfile,'r',encoding='utf-8-sig') as o:
      har = json.load(o)

   css_files = []
   for entry in har['log']['entries']:
      entry_url = entry['request']['url']
      if "css" in entry_url and entry_url not in css_files:
         if debuglevel >= 5:
            eprint(entry_url)
         css_files.append(entry_url)
   return css_files

def convert_font(url, filename, session=None, debuglevel=debuglevel, dryrun=False):
   """
   Save the given url to filename, with filetype ttf

   url: font url to download
   filename: path of the ttf file to write
   session: optional http session; plain requests.get is used when omitted
   debuglevel: at 3 or higher the conversion is announced on stderr
   dryrun: if true, download and parse the font but do not write filename
   Returns 0 on success, or -1 if the fontTools library cannot be imported.
   """
   # Check for the optional library first, so nothing is downloaded when the
   # conversion cannot happen anyway.
   try:
      from fontTools import ttLib
   except ModuleNotFoundError:
      eprint("Warning: cannot load fontTools. Try installing python3-fonttools")
      return -1
   from io import BytesIO

   if session:
      response = session.get(url)
   else:
      response = requests.get(url)

   # Parse from an in-memory buffer that stays open until after save().
   # TTFont may read table data from its source stream on demand, so a
   # temporary file closed before save() is not safe to rely on.
   font = ttLib.TTFont(BytesIO(response.content))
   if debuglevel >= 3:
      eprint(f"Converting {url[:MAX_STRING_PRINT_LENGTH]} from {font.flavor} to ttf as file {filename}")

   font.flavor = None # restores default value, for non-compressed OpenType
   if not dryrun:
      font.save(filename)

   return 0
bgstack15