diff options
author | B Stack <bgstack15@gmail.com> | 2020-06-16 13:40:39 -0400 |
---|---|---|
committer | B Stack <bgstack15@gmail.com> | 2020-06-16 13:40:39 -0400 |
commit | 0ca931489f98b65e1025a4c4f00ae9eb8484dc27 (patch) | |
tree | 231f9fa86e40a62ef59ca0cc1ba81d52086b3597 /fetch-images.sh | |
parent | initial commit (diff) | |
download | glip-0ca931489f98b65e1025a4c4f00ae9eb8484dc27.tar.gz glip-0ca931489f98b65e1025a4c4f00ae9eb8484dc27.tar.bz2 glip-0ca931489f98b65e1025a4c4f00ae9eb8484dc27.zip |
add minor fixes and major image fixes
perform better unicode removal/conversion
fix image retrieval, including svg and minor graphics assets
remove even more html elements not necessary for archival display
Diffstat (limited to 'fetch-images.sh')
-rwxr-xr-x | fetch-images.sh | 19 |
1 files changed, 12 insertions, 7 deletions
```diff
diff --git a/fetch-images.sh b/fetch-images.sh
index 4f4884b..bec3a6f 100755
--- a/fetch-images.sh
+++ b/fetch-images.sh
@@ -1,33 +1,38 @@
 #!/bin/sh
 # startdate 2020-05-29 20:04
 # After running this, be sure to do the sed.
-# sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html
+# sed -i -f fix-images-in-html.sed /mnt/public/www/gitlab-issues/*.html
 # Improve:
 # It is probably an artifact of the weird way the asset svgs are embedded, but I cannot get them to display at all even though they are downloaded successfully. I have seen this before, the little embedded images you cannot easily download and simply display.
-INDIR=/mnt/public/www/issues
+INDIR=/mnt/public/www/gitlab-issues
 INGLOB=*.html
 SEDSCRIPT=/mnt/public/work/devuan/fix-images-in-html.sed
 INSERVER=https://git.devuan.org
+INSERVERREGEX="https://git(lab)?\.devuan\.org"
 cd "${INDIR}"
 # could use this line to get all the assets, but they do not display regardless due to html weirdness
 #orig_src="$( grep -oE '(\<src|xlink:href)="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$0]++{print $2}' )"
-orig_src="$( grep -oE '\<src="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$2]++{print $2}' )"
+orig_src="$( grep -oE "(src|xlink:href)=\"(${INSERVERREGEX})?\/[^\"]*\"" ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$2]++{print $2}' )"
 cat /dev/null > "${SEDSCRIPT}"
 echo "${orig_src}" | while read line ; do
  #echo "${line}" | awk -F'"' '{print $2}'
- getpath="${INSERVER}${line}"
- outdir="$( echo "${line}" | awk -F'/' '{print $2}' )"
+ getpath="${line}"
+ echo "${getpath}" | grep -q -E "https?:\/\/" || getpath="${INSERVER}${getpath}"
+ temp="$( echo "${line}" | sed -r -e "s@${INSERVERREGEX}@@g" -e 's@/-/@/@g' )"
+ outdir="$( dirname "${temp}" | sed -r -e 's@^/@@' )"
  test ! -d "${outdir}" && mkdir -p "${outdir}"
- targetfile="${outdir}/$( basename "${line}" )"
+ targetfile="${temp%%\?*}" # for output file itself remove the question mark which is normally width attribute
+ targetfile="${targetfile%%\#*}" # for output file itself remove the question mark which is normally width attribute
+ targetfile="${targetfile##/}" # for output file itself remove the pound sign which is some svg thing
  test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2
  test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}"
  # dynamically build a sed script
- echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}"
+ echo "s@${line}@${temp##/}@g;" | tee -a "${SEDSCRIPT}"
 done
```