summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorB Stack <bgstack15@gmail.com>2020-06-16 13:40:39 -0400
committerB Stack <bgstack15@gmail.com>2020-06-16 13:40:39 -0400
commit0ca931489f98b65e1025a4c4f00ae9eb8484dc27 (patch)
tree231f9fa86e40a62ef59ca0cc1ba81d52086b3597
parentinitial commit (diff)
downloadglip-0ca931489f98b65e1025a4c4f00ae9eb8484dc27.tar.gz
glip-0ca931489f98b65e1025a4c4f00ae9eb8484dc27.tar.bz2
glip-0ca931489f98b65e1025a4c4f00ae9eb8484dc27.zip
add minor fixes and major image fixes
Perform better unicode removal/conversion. Fix image retrieval, including svg and minor graphics assets. Remove even more html elements not necessary for archival display.
-rwxr-xr-xconversion.sh10
-rwxr-xr-xfetch-css.sh3
-rwxr-xr-xfetch-images.sh19
-rwxr-xr-xfetch-issue-webpages.py2
-rwxr-xr-xfix-timestamps.py2
-rwxr-xr-xflow-part2.sh10
-rw-r--r--flow.md22
-rwxr-xr-xremove-useless.py7
-rw-r--r--remove-useless.sed1
-rwxr-xr-xuse-datasrc-instead-src.py34
10 files changed, 90 insertions, 20 deletions
diff --git a/conversion.sh b/conversion.sh
new file mode 100755
index 0000000..f739eaf
--- /dev/null
+++ b/conversion.sh
@@ -0,0 +1,10 @@
+# startdate 2020-06-15
+# References:
+# iconv(1), charsets(7)
+for oldfile in $@ ; do
+ test -w "${oldfile}" && {
+ newfile="${oldfile}.2"
+ iconv -c -f utf-8 -t iso-8859-1 < "${oldfile}" > "${newfile}"
+ mv "${newfile}" "${oldfile}"
+ }
+done
diff --git a/fetch-css.sh b/fetch-css.sh
index 06718c2..28a6ac9 100755
--- a/fetch-css.sh
+++ b/fetch-css.sh
@@ -1,7 +1,7 @@
#!/bin/sh
# Startdate: 2020-05-29 20:18
-INDIR=/mnt/public/www/issues
+INDIR=/mnt/public/www/gitlab-issues
INGLOB=*.html
SEDSCRIPT=/mnt/public/work/devuan/fix-css-in-html.sed
@@ -14,6 +14,7 @@ INSERVER=https://git.devuan.org
cd "${INDIR}"
+#orig_css="$( sed -n -r -e 's/^.*rel="stylesheet".*(href="[^"]+\.css").*/\1/p' ${INGLOB} | awk -F'"' '!x[$2]++{print $2}' )"
orig_css="$( sed -n -r -e 's/^.*<link.*(href="[^"]+\.css").*/\1/p' ${INGLOB} | awk -F'"' '!x[$2]++{print $2}' )"
cat /dev/null > "${SEDSCRIPT}"
diff --git a/fetch-images.sh b/fetch-images.sh
index 4f4884b..bec3a6f 100755
--- a/fetch-images.sh
+++ b/fetch-images.sh
@@ -1,33 +1,38 @@
#!/bin/sh
# startdate 2020-05-29 20:04
# After running this, be sure to do the sed.
-# sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html
+# sed -i -f fix-images-in-html.sed /mnt/public/www/gitlab-issues/*.html
# Improve:
# It is probably an artifact of the weird way the asset svgs are embedded, but I cannot get them to display at all even though they are downloaded successfully. I have seen this before, the little embedded images you cannot easily download and simply display.
-INDIR=/mnt/public/www/issues
+INDIR=/mnt/public/www/gitlab-issues
INGLOB=*.html
SEDSCRIPT=/mnt/public/work/devuan/fix-images-in-html.sed
INSERVER=https://git.devuan.org
+INSERVERREGEX="https://git(lab)?\.devuan\.org"
cd "${INDIR}"
# could use this line to get all the assets, but they do not display regardless due to html weirdness
#orig_src="$( grep -oE '(\<src|xlink:href)="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$0]++{print $2}' )"
-orig_src="$( grep -oE '\<src="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$2]++{print $2}' )"
+orig_src="$( grep -oE "(src|xlink:href)=\"(${INSERVERREGEX})?\/[^\"]*\"" ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$2]++{print $2}' )"
cat /dev/null > "${SEDSCRIPT}"
echo "${orig_src}" | while read line ; do
#echo "${line}" | awk -F'"' '{print $2}'
- getpath="${INSERVER}${line}"
- outdir="$( echo "${line}" | awk -F'/' '{print $2}' )"
+ getpath="${line}"
+ echo "${getpath}" | grep -q -E "https?:\/\/" || getpath="${INSERVER}${getpath}"
+ temp="$( echo "${line}" | sed -r -e "s@${INSERVERREGEX}@@g" -e 's@/-/@/@g' )"
+ outdir="$( dirname "${temp}" | sed -r -e 's@^/@@' )"
test ! -d "${outdir}" && mkdir -p "${outdir}"
- targetfile="${outdir}/$( basename "${line}" )"
+ targetfile="${temp%%\?*}" # for output file itself remove everything from the first question mark onward, which is normally a width attribute
+ targetfile="${targetfile%%\#*}" # for output file itself remove everything from the first pound sign onward, which is some svg thing
+ targetfile="${targetfile##/}" # for output file itself remove one leading slash so the path is relative to the current directory
test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2
test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}"
# dynamically build a sed script
- echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}"
+ echo "s@${line}@${temp##/}@g;" | tee -a "${SEDSCRIPT}"
done
diff --git a/fetch-issue-webpages.py b/fetch-issue-webpages.py
index 86d6b71..3291cdc 100755
--- a/fetch-issue-webpages.py
+++ b/fetch-issue-webpages.py
@@ -44,7 +44,7 @@ def scrollDownAllTheWay(driver):
return True
server_string="https://git.devuan.org"
-outdir="/mnt/public/www/issues"
+outdir="/mnt/public/www/gitlab-issues"
with open("output/files-to-fetch.txt") as f:
lines=[line.rstrip() for line in f]
diff --git a/fix-timestamps.py b/fix-timestamps.py
index a564257..a5a4441 100755
--- a/fix-timestamps.py
+++ b/fix-timestamps.py
@@ -4,7 +4,7 @@
# History:
# 2020-05-30 09:24 add loop through files listed in output/files-for-timestamps.txt
# Usage:
-# ls -1 /mnt/public/www/issues/output*.html > output/files-for-timestamps.txt
+# ls -1 /mnt/public/www/gitlab-issues/*.html > output/files-for-timestamps.txt
# ./fix-timestamps.py
# References:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#pretty-printing
diff --git a/flow-part2.sh b/flow-part2.sh
index db27d98..e56bdf2 100755
--- a/flow-part2.sh
+++ b/flow-part2.sh
@@ -3,7 +3,7 @@
# Startdate: 2020-05-30 16:46
# Purpose: automate the post-download components of the issue-saving process
-OUTDIR=/mnt/public/www/issues
+OUTDIR=/mnt/public/www/gitlab-issues
cd /mnt/public/work/devuan
@@ -12,10 +12,10 @@ sed -i -r -e 's/\\n/\n/g;' "${OUTDIR}"/*.html
ls -1 "${OUTDIR}"/*.html > output/files-for-timestamps.txt
./fix-timestamps.py
-./fetch-images.sh
+DEBUG=1 ./fetch-images.sh 2>~/log/flow-part2.$( date "+%FT%H%M%S" ).log
sed -i -f fix-images-in-html.sed "${OUTDIR}"/*.html
-mkdir -p /mnt/public/www/issues/css
+mkdir -p "${OUTDIR}/css"
./fetch-css.sh
sed -i -f fix-css-in-html.sed "${OUTDIR}"/*.html
@@ -24,3 +24,7 @@ sed -i -f remove-useless.sed "${OUTDIR}"/*.html
./remove-useless.py
sed -i -r -f fix-without-systemd-links.sed "${OUTDIR}"/*.html
+
+./conversion.sh "${OUTDIR}"/*.html
+
+./use-datasrc-instead-src.py
diff --git a/flow.md b/flow.md
index 5c81d5e..ca81d52 100644
--- a/flow.md
+++ b/flow.md
@@ -24,27 +24,27 @@ Everything on this page, for jq filtering. https://stedolan.github.io/jq/manual/
* fix newlines
- sed -i -r -e 's/\\n/\n/g;' /mnt/public/www/issues/*.html
+ sed -i -r -e 's/\\n/\n/g;' /mnt/public/www/gitlab-issues/*.html
* find data-original-titles and replace the <time> tag contents with the value of its data-original-title. Also, this will BeautifulSoup pretty-print the html so some of the following commands work correctly.
- ls -1 /mnt/public/www/issues/*.html > output/files-for-timestamps.txt
+ ls -1 /mnt/public/www/gitlab-issues/*.html > output/files-for-timestamps.txt
./fix-timestamps.py
* download all relevant images, and then fix them.
./fetch-images.sh
- sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html
+ sed -i -f fix-images-in-html.sed /mnt/public/www/gitlab-issues/*.html
* download all stylesheets and then fix them.
- mkdir -p /mnt/public/www/issues/css
+ mkdir -p /mnt/public/www/gitlab-issues/css
./fetch-css.sh
- sed -i -f fix-css-in-html.sed /mnt/public/www/issues/*.html
+ sed -i -f fix-css-in-html.sed /mnt/public/www/gitlab-issues/*.html
* fix some encoding oddities
- sed -i -f remove-useless.sed /mnt/public/www/issues/*.html
+ sed -i -f remove-useless.sed /mnt/public/www/gitlab-issues/*.html
* remove html components that are not necessary
@@ -52,6 +52,14 @@ Everything on this page, for jq filtering. https://stedolan.github.io/jq/manual/
* Fix links that point to defunct domain without-systemd.org.
- sed -i -r -f fix-without-systemd-links.sed /mnt/public/www/issues/*.html
+ sed -i -r -f fix-without-systemd-links.sed /mnt/public/www/gitlab-issues/*.html
+
+ * Perform final encoding conversion to remove any remaining broken characters
+
+ ./conversion.sh /mnt/public/www/gitlab-issues/*.html
+
+ * Fix some images that have a src="data:" that do not load, but the data-src property is the proper link
+
+ ./use-datasrc-instead-src.py
* build some sort of index?
diff --git a/remove-useless.py b/remove-useless.py
index e68f458..a5dd17e 100755
--- a/remove-useless.py
+++ b/remove-useless.py
@@ -67,6 +67,13 @@ def remove_useless(contents):
for i in mylist:
i.replace_with("")
except:
+ pass
+ try:
+ mylist = soup.find_all(class_="broadcast-message")
+ for i in mylist:
+ i.replace_with("")
+ except:
+ pass
return soup
# this works, for the single file called
diff --git a/remove-useless.sed b/remove-useless.sed
index 3dbe856..861fcbe 100644
--- a/remove-useless.sed
+++ b/remove-useless.sed
@@ -10,3 +10,4 @@ s/\\xe1\\xb4\\x84\\xe1\\xb4\\xa0\\xe1\\xb4\\x87/CVE/g
s/\\xe2\\x80\\x99/'/g
s/\\xe2\\x80\\xa6/.../g
s/(\\x..)*\\xb7/·/g # two characters here
+s/\xc3.·/·/g # do not ask how I made this one
diff --git a/use-datasrc-instead-src.py b/use-datasrc-instead-src.py
new file mode 100755
index 0000000..90fca9f
--- /dev/null
+++ b/use-datasrc-instead-src.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# Startdate: 2020-06-15
+# Purpose: fix some images that somehow have a src="data:" that is rendered wrong, even though the data-src attribute has the local, valid image uri!
+from bs4 import BeautifulSoup
+import re
+import sys
+
+def fiximgs(contents):
+ soup = BeautifulSoup(contents,"html.parser")
+ try:
+ images = soup.find_all("img")
+ for image in images:
+ try:
+ if re.match(".*data:.*",image["src"]):
+ if image["data-src"]:
+ image["src"]=image["data-src"]
+ except:
+ print("Was not able to transfer data-src to src where src contained 'data:'")
+ #print("src",image["src"])
+ #print("data-src",image.datasrc)
+ except:
+ print("Error of some sort.")
+ return soup
+
+with open("output/files-for-timestamps.txt") as f:
+ lines = [line.rstrip() for line in f]
+
+for thisfile in lines:
+ print("Fixing images with src=\"data:\" tag",thisfile)
+ with open(thisfile) as tf:
+ output=fiximgs(tf.read())
+ #with open(thisfile,"w",encoding='utf-8') as tf:
+ with open(thisfile,"w") as tf:
+ tf.write(str(output.prettify()))
bgstack15