diff options
author | B Stack <bgstack15@gmail.com> | 2020-06-16 13:40:39 -0400 |
---|---|---|
committer | B Stack <bgstack15@gmail.com> | 2020-06-16 13:40:39 -0400 |
commit | 0ca931489f98b65e1025a4c4f00ae9eb8484dc27 (patch) | |
tree | 231f9fa86e40a62ef59ca0cc1ba81d52086b3597 | |
parent | initial commit (diff) | |
download | glip-0ca931489f98b65e1025a4c4f00ae9eb8484dc27.tar.gz glip-0ca931489f98b65e1025a4c4f00ae9eb8484dc27.tar.bz2 glip-0ca931489f98b65e1025a4c4f00ae9eb8484dc27.zip |
add minor fixes and major image fixes
perform better unicode removal/conversion
fix image retrieval, including svg and minor graphics assets
remove even more html elements not necessary for archival display
-rwxr-xr-x | conversion.sh | 10 | ||||
-rwxr-xr-x | fetch-css.sh | 3 | ||||
-rwxr-xr-x | fetch-images.sh | 19 | ||||
-rwxr-xr-x | fetch-issue-webpages.py | 2 | ||||
-rwxr-xr-x | fix-timestamps.py | 2 | ||||
-rwxr-xr-x | flow-part2.sh | 10 | ||||
-rw-r--r-- | flow.md | 22 | ||||
-rwxr-xr-x | remove-useless.py | 7 | ||||
-rw-r--r-- | remove-useless.sed | 1 | ||||
-rwxr-xr-x | use-datasrc-instead-src.py | 34 |
10 files changed, 90 insertions, 20 deletions
diff --git a/conversion.sh b/conversion.sh new file mode 100755 index 0000000..f739eaf --- /dev/null +++ b/conversion.sh @@ -0,0 +1,10 @@ +# startdate 2020-06-15 +# References: +# iconv(1), charsets(7) +for oldfile in $@ ; do + test -w "${oldfile}" && { + newfile="${oldfile}.2" + iconv -c -f utf-8 -t iso-8859-1 < "${oldfile}" > "${newfile}" + mv "${newfile}" "${oldfile}" + } +done diff --git a/fetch-css.sh b/fetch-css.sh index 06718c2..28a6ac9 100755 --- a/fetch-css.sh +++ b/fetch-css.sh @@ -1,7 +1,7 @@ #!/bin/sh # Startdate: 2020-05-29 20:18 -INDIR=/mnt/public/www/issues +INDIR=/mnt/public/www/gitlab-issues INGLOB=*.html SEDSCRIPT=/mnt/public/work/devuan/fix-css-in-html.sed @@ -14,6 +14,7 @@ INSERVER=https://git.devuan.org cd "${INDIR}" +#orig_css="$( sed -n -r -e 's/^.*rel="stylesheet".*(href="[^"]+\.css").*/\1/p' ${INGLOB} | awk -F'"' '!x[$2]++{print $2}' )" orig_css="$( sed -n -r -e 's/^.*<link.*(href="[^"]+\.css").*/\1/p' ${INGLOB} | awk -F'"' '!x[$2]++{print $2}' )" cat /dev/null > "${SEDSCRIPT}" diff --git a/fetch-images.sh b/fetch-images.sh index 4f4884b..bec3a6f 100755 --- a/fetch-images.sh +++ b/fetch-images.sh @@ -1,33 +1,38 @@ #!/bin/sh # startdate 2020-05-29 20:04 # After running this, be sure to do the sed. -# sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html +# sed -i -f fix-images-in-html.sed /mnt/public/www/gitlab-issues/*.html # Improve: # It is probably an artifact of the weird way the asset svgs are embedded, but I cannot get them to display at all even though they are downloaded successfully. I have seen this before, the little embedded images you cannot easily download and simply display. 
-INDIR=/mnt/public/www/issues +INDIR=/mnt/public/www/gitlab-issues INGLOB=*.html SEDSCRIPT=/mnt/public/work/devuan/fix-images-in-html.sed INSERVER=https://git.devuan.org +INSERVERREGEX="https://git(lab)?\.devuan\.org" cd "${INDIR}" # could use this line to get all the assets, but they do not display regardless due to html weirdness #orig_src="$( grep -oE '(\<src|xlink:href)="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$0]++{print $2}' )" -orig_src="$( grep -oE '\<src="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$2]++{print $2}' )" +orig_src="$( grep -oE "(src|xlink:href)=\"(${INSERVERREGEX})?\/[^\"]*\"" ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$2]++{print $2}' )" cat /dev/null > "${SEDSCRIPT}" echo "${orig_src}" | while read line ; do #echo "${line}" | awk -F'"' '{print $2}' - getpath="${INSERVER}${line}" - outdir="$( echo "${line}" | awk -F'/' '{print $2}' )" + getpath="${line}" + echo "${getpath}" | grep -q -E "https?:\/\/" || getpath="${INSERVER}${getpath}" + temp="$( echo "${line}" | sed -r -e "s@${INSERVERREGEX}@@g" -e 's@/-/@/@g' )" + outdir="$( dirname "${temp}" | sed -r -e 's@^/@@' )" test ! 
-d "${outdir}" && mkdir -p "${outdir}" - targetfile="${outdir}/$( basename "${line}" )" + targetfile="${temp%%\?*}" # for output file itself remove the question mark which is normally width attribute + targetfile="${targetfile%%\#*}" # for output file itself remove the question mark which is normally width attribute + targetfile="${targetfile##/}" # for output file itself remove the pound sign which is some svg thing test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2 test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}" # dynamically build a sed script - echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}" + echo "s@${line}@${temp##/}@g;" | tee -a "${SEDSCRIPT}" done diff --git a/fetch-issue-webpages.py b/fetch-issue-webpages.py index 86d6b71..3291cdc 100755 --- a/fetch-issue-webpages.py +++ b/fetch-issue-webpages.py @@ -44,7 +44,7 @@ def scrollDownAllTheWay(driver): return True server_string="https://git.devuan.org" -outdir="/mnt/public/www/issues" +outdir="/mnt/public/www/gitlab-issues" with open("output/files-to-fetch.txt") as f: lines=[line.rstrip() for line in f] diff --git a/fix-timestamps.py b/fix-timestamps.py index a564257..a5a4441 100755 --- a/fix-timestamps.py +++ b/fix-timestamps.py @@ -4,7 +4,7 @@ # History: # 2020-05-30 09:24 add loop through files listed in output/files-for-timestamps.txt # Usage: -# ls -1 /mnt/public/www/issues/output*.html > output/files-for-timestamps.txt +# ls -1 /mnt/public/www/gitlab-issues/*.html > output/files-for-timestamps.txt # ./fix-timestamps.py # References: # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#pretty-printing diff --git a/flow-part2.sh b/flow-part2.sh index db27d98..e56bdf2 100755 --- a/flow-part2.sh +++ b/flow-part2.sh @@ -3,7 +3,7 @@ # Startdate: 2020-05-30 16:46 # Purpose: automate the post-download components of the issue-saving process -OUTDIR=/mnt/public/www/issues +OUTDIR=/mnt/public/www/gitlab-issues cd 
/mnt/public/work/devuan @@ -12,10 +12,10 @@ sed -i -r -e 's/\\n/\n/g;' "${OUTDIR}"/*.html ls -1 "${OUTDIR}"/*.html > output/files-for-timestamps.txt ./fix-timestamps.py -./fetch-images.sh +DEBUG=1 ./fetch-images.sh 2>~/log/flow-part2.$( date "+%FT%H%M%S" ).log sed -i -f fix-images-in-html.sed "${OUTDIR}"/*.html -mkdir -p /mnt/public/www/issues/css +mkdir -p "${OUTDIR}/css" ./fetch-css.sh sed -i -f fix-css-in-html.sed "${OUTDIR}"/*.html @@ -24,3 +24,7 @@ sed -i -f remove-useless.sed "${OUTDIR}"/*.html ./remove-useless.py sed -i -r -f fix-without-systemd-links.sed "${OUTDIR}"/*.html + +./conversion.sh "${OUTDIR}"/*.html + +./use-datasrc-instead-src.py diff --git a/flow.md b/flow.md --- a/flow.md +++ b/flow.md @@ -24,27 +24,27 @@ Everything on this page, for jq filtering. https://stedolan.github.io/jq/manual/ * fix newlines - sed -i -r -e 's/\\n/\n/g;' /mnt/public/www/issues/*.html + sed -i -r -e 's/\\n/\n/g;' /mnt/public/www/gitlab-issues/*.html * find data-original-titles and replace the <time> tag contents with the value of its data-original-title. Also, this will BeautifulSoup pretty-print the html so some of the following commands work correctly. - ls -1 /mnt/public/www/issues/*.html > output/files-for-timestamps.txt + ls -1 /mnt/public/www/gitlab-issues/*.html > output/files-for-timestamps.txt ./fix-timestamps.py * download all relevant images, and then fix them. ./fetch-images.sh - sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html + sed -i -f fix-images-in-html.sed /mnt/public/www/gitlab-issues/*.html * download all stylesheets and then fix them. 
- mkdir -p /mnt/public/www/issues/css + mkdir -p /mnt/public/www/gitlab-issues/css ./fetch-css.sh - sed -i -f fix-css-in-html.sed /mnt/public/www/issues/*.html + sed -i -f fix-css-in-html.sed /mnt/public/www/gitlab-issues/*.html * fix some encoding oddities - sed -i -f remove-useless.sed /mnt/public/www/issues/*.html + sed -i -f remove-useless.sed /mnt/public/www/gitlab-issues/*.html * remove html components that are not necessary @@ -52,6 +52,14 @@ Everything on this page, for jq filtering. https://stedolan.github.io/jq/manual/ * Fix links that point to defunct domain without-systemd.org. - sed -i -r -f fix-without-systemd-links.sed /mnt/public/www/issues/*.html + sed -i -r -f fix-without-systemd-links.sed /mnt/public/www/gitlab-issues/*.html + + * Perform final encoding conversion to remove any remaining broken characters + + ./conversion.sh /mnt/public/www/gitlab-issues/*.html + + * Fix some images that have a src="data:" that do not load, but the data-src property is the proper link + + ./use-datasrc-instead-src.py * build some sort of index? 
diff --git a/remove-useless.py b/remove-useless.py index e68f458..a5dd17e 100755 --- a/remove-useless.py +++ b/remove-useless.py @@ -67,6 +67,13 @@ def remove_useless(contents): for i in mylist: i.replace_with("") except: + pass + try: + mylist = soup.find_all(class_="broadcast-message") + for i in mylist: + i.replace_with("") + except: + pass return soup # this works, for the single file called diff --git a/remove-useless.sed b/remove-useless.sed index 3dbe856..861fcbe 100644 --- a/remove-useless.sed +++ b/remove-useless.sed @@ -10,3 +10,4 @@ s/\\xe1\\xb4\\x84\\xe1\\xb4\\xa0\\xe1\\xb4\\x87/CVE/g s/\\xe2\\x80\\x99/'/g s/\\xe2\\x80\\xa6/.../g s/(\\x..)*\\xb7/·/g # two characters here +s/\xc3.·/·/g # do not ask how I made this one diff --git a/use-datasrc-instead-src.py b/use-datasrc-instead-src.py new file mode 100755 index 0000000..90fca9f --- /dev/null +++ b/use-datasrc-instead-src.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# Startdate: 2020-06-15 +# Purpose: fix some images that somehow havea src="data:" that is rendered wrong, but the data-src attribute has the local, valid image uri! +from bs4 import BeautifulSoup +import re +import sys + +def fiximgs(contents): + soup = BeautifulSoup(contents,"html.parser") + try: + images = soup.find_all("img") + for image in images: + try: + if re.match(".*data:.*",image["src"]): + if image["data-src"]: + image["src"]=image["data-src"] + except: + print("Was not able to transfer data-src to src where src contained 'data:'") + #print("src",image["src"]) + #print("data-src",image.datasrc) + except: + print("Error of some sort.") + return soup + +with open("output/files-for-timestamps.txt") as f: + lines = [line.rstrip() for line in f] + +for thisfile in lines: + print("Fixing images with src=\"data:\" tag",thisfile) + with open(thisfile) as tf: + output=fiximgs(tf.read()) + #with open(thisfile,"w",encoding='utf-8') as tf: + with open(thisfile,"w") as tf: + tf.write(str(output.prettify())) |