-rwxr-xr-x  fetch-css.sh                    30
-rwxr-xr-x  fetch-images.sh                 34
-rwxr-xr-x  fetch-issue-webpages.py         81
-rw-r--r--  fix-css-in-html.sed              2
-rw-r--r--  fix-images-in-html.sed           6
-rwxr-xr-x  fix-timestamps.py               48
-rw-r--r--  fix-without-systemd-links.sed    5
-rwxr-xr-x  flow-part2.sh                   27
-rw-r--r--  flow.md                         66
-rwxr-xr-x  gitlablib.sh                    98
-rwxr-xr-x  remove-useless.py               43
-rw-r--r--  remove-useless.sed              13
12 files changed, 453 insertions, 0 deletions
diff --git a/fetch-css.sh b/fetch-css.sh
new file mode 100755
index 0000000..06718c2
--- /dev/null
+++ b/fetch-css.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+# Startdate: 2020-05-29 20:18
+
+INDIR=/mnt/public/www/issues
+INGLOB=*.html
+
+SEDSCRIPT=/mnt/public/work/devuan/fix-css-in-html.sed
+
+INSERVER=https://git.devuan.org
+
+cd "${INDIR}"
+
+# OUTDIR is created inside INDIR, thanks to the `cd` above.
+OUTDIR=css
+test ! -d "${OUTDIR}" && mkdir -p "${OUTDIR}"
+
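+# list each stylesheet href exactly once (the awk dedupes on the quoted path)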
+orig_css="$( sed -n -r -e 's/^.*<link.*(href="[^"]+\.css").*/\1/p' ${INGLOB} | awk -F'"' '!x[$2]++{print $2}' )"
+
+cat /dev/null > "${SEDSCRIPT}"
+
+echo "${orig_css}" | while read line ; do
+ getpath="${INSERVER}${line}"
+ targetfile="${OUTDIR}/$( basename "${line}" )"
+ test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2
+ test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}"
+ # dynamically build a sed script
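+ # each generated rule looks like: s:/assets/<name>.css:css/<name>.css:g; (cf. fix-css-in-html.sed)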
+ echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}"
+done
diff --git a/fetch-images.sh b/fetch-images.sh
new file mode 100755
index 0000000..4f4884b
--- /dev/null
+++ b/fetch-images.sh
@@ -0,0 +1,34 @@
+#!/bin/sh
+# Startdate: 2020-05-29 20:04
+# After running this, be sure to do the sed:
+#   sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html
+# Improve:
+#   the asset svgs download successfully but do not display, probably an artifact of the odd way they are embedded; I have seen this kind of small embedded image resist easy downloading and direct display before.
+
+INDIR=/mnt/public/www/issues
+INGLOB=*.html
+
+SEDSCRIPT=/mnt/public/work/devuan/fix-images-in-html.sed
+
+INSERVER=https://git.devuan.org
+
+cd "${INDIR}"
+
+# could use this line to get all the assets, but they do not display regardless due to html weirdness
+#orig_src="$( grep -oE '(\<src|xlink:href)="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$0]++{print $2}' )"
+orig_src="$( grep -oE '\<src="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$2]++{print $2}' )"
+
+cat /dev/null > "${SEDSCRIPT}"
+
+echo "${orig_src}" | while read line ; do
+ #echo "${line}" | awk -F'"' '{print $2}'
+ getpath="${INSERVER}${line}"
+ outdir="$( echo "${line}" | awk -F'/' '{print $2}' )"
+ test ! -d "${outdir}" && mkdir -p "${outdir}"
+ targetfile="${outdir}/$( basename "${line}" )"
+ test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2
+ test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}"
+ # dynamically build a sed script
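+ # e.g. s:/uploads/-/system/appearance/header_logo/1/gdo-icon.png:uploads/gdo-icon.png:g; (cf. fix-images-in-html.sed)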
+ echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}"
+done
diff --git a/fetch-issue-webpages.py b/fetch-issue-webpages.py
new file mode 100755
index 0000000..86d6b71
--- /dev/null
+++ b/fetch-issue-webpages.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# Startdate: 2020-05-29 16:22
+# History:
+# Usage:
+# ln -s issues.all.web_url output/files-to-fetch.txt
+# ./fetch-issue-webpages.py
+# How to make this work:
+# apt-get install python3-pyvirtualdisplay python3-selenium
+# download the geckodriver from the References below and place it in /usr/local/bin
+# References:
+# basic guide https://web.archive.org/web/20191031110759/http://scraping.pro/use-headless-firefox-scraping-linux/
+# https://stackoverflow.com/questions/40302006/no-such-file-or-directory-geckodriver-for-a-python-simple-selenium-applicatio
+# geckodriver https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
+# https://www.selenium.dev/selenium/docs/api/py/index.html?highlight=get
+# page source https://www.selenium.dev/selenium/docs/api/py/webdriver_remote/selenium.webdriver.remote.webdriver.html?highlight=title#selenium.webdriver.remote.webdriver.WebDriver.title
+# make sure all comments load https://stackoverflow.com/questions/26566799/wait-until-page-is-loaded-with-selenium-webdriver-for-python/44998503#44998503
+# https://crossbrowsertesting.com/blog/test-automation/automate-login-with-selenium/
+# Improve:
+from pyvirtualdisplay import Display
+from selenium import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+import re, time, getpass
+
+def ask_password(prompt):
+    #return input(prompt+": ")
+    return getpass.getpass(prompt+": ")
+
+def scrollDown(driver, value):
+    driver.execute_script("window.scrollBy(0,"+str(value)+")")
+
+# Scroll down the page until its rendered source stops changing
+def scrollDownAllTheWay(driver):
+    old_page = driver.page_source
+    while True:
+        #logging.debug("Scrolling loop")
+        for i in range(2):
+            scrollDown(driver, 500)
+            time.sleep(2)
+        new_page = driver.page_source
+        if new_page != old_page:
+            old_page = new_page
+        else:
+            break
+    return True
+
+server_string="https://git.devuan.org"
+outdir="/mnt/public/www/issues"
+
+with open("output/files-to-fetch.txt") as f:
+    lines=[line.rstrip() for line in f]
+
+# ask password now instead of after the delay
+password = ask_password("Enter password for "+server_string)
+
+display = Display(visible=0, size=(800, 600))
+display.start()
+
+browser = webdriver.Firefox()
+
+# log in to gitlab instance
+browser.get(server_string+"/users/sign_in")
+browser.find_element_by_id("user_login").send_keys('bgstack15')
+browser.find_element_by_id("user_password").send_keys(password)
+browser.find_element_by_class_name("qa-sign-in-button").click()
+browser.get(server_string+"/profile") # a page that always requires authentication, to confirm the login worked
+scrollDownAllTheWay(browser)
+
+for thisfile in lines:
+    # e.g. https://git.devuan.org/devuan/devuan-project/issues/20 -> devuan.devuan-project.20.html
+    destfile=re.sub(r"\.+",".",re.sub(r"/|issues",".",re.sub("^"+re.escape(server_string)+"/","",thisfile)))+".html"
+    print("Saving",thisfile,outdir+"/"+destfile)
+    browser.get(thisfile)
+    scrollDownAllTheWay(browser)
+    with open(outdir+"/"+destfile,"w") as text_file:
+        # page_source is written as its Python bytes repr (b'...'); flow-part2.sh
+        # and remove-useless.sed strip those artifacts back out afterward.
+        print(browser.page_source.encode('utf-8'),file=text_file)
+
+# done with loop
+browser.quit()
+display.stop()
diff --git a/fix-css-in-html.sed b/fix-css-in-html.sed
new file mode 100644
index 0000000..eb07dd9
--- /dev/null
+++ b/fix-css-in-html.sed
@@ -0,0 +1,2 @@
+s:/assets/application-5e11f017cc719f3e7c77f1024cdea9381ee9b21f47fe391ef5d29a7f3c94ce4a.css:css/application-5e11f017cc719f3e7c77f1024cdea9381ee9b21f47fe391ef5d29a7f3c94ce4a.css:g;
+s:/assets/print-c8ff536271f8974b8a9a5f75c0ca25d2b8c1dceb4cff3c01d1603862a0bdcbfc.css:css/print-c8ff536271f8974b8a9a5f75c0ca25d2b8c1dceb4cff3c01d1603862a0bdcbfc.css:g;
diff --git a/fix-images-in-html.sed b/fix-images-in-html.sed
new file mode 100644
index 0000000..aedf89f
--- /dev/null
+++ b/fix-images-in-html.sed
@@ -0,0 +1,6 @@
+s:/uploads/-/system/appearance/header_logo/1/gdo-icon.png:uploads/gdo-icon.png:g;
+s:/uploads/-/system/user/avatar/4095/avatar.png?width=23:uploads/avatar.png?width=23:g;
+s:/assets/illustrations/cluster_popover-9830388038d966d8d64d43576808f9d5ba05f639a78a40bae9a5ddc7cbf72f24.svg:assets/cluster_popover-9830388038d966d8d64d43576808f9d5ba05f639a78a40bae9a5ddc7cbf72f24.svg:g;
+s:/uploads/-/system/user/avatar/4095/avatar.png?width=40:uploads/avatar.png?width=40:g;
+s:/uploads/-/system/user/avatar/17/cph-devuan.png?width=24:uploads/cph-devuan.png?width=24:g;
+# TRUNCATED FOR GIT REPO
diff --git a/fix-timestamps.py b/fix-timestamps.py
new file mode 100755
index 0000000..a564257
--- /dev/null
+++ b/fix-timestamps.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# Startdate: 2020-05-29 20:40
+# Purpose: convert timestamps on gitlab issue web page into UTC
+# History:
+# 2020-05-30 09:24 add loop through files listed in output/files-for-timestamps.txt
+# Usage:
+# ls -1 /mnt/public/www/issues/output*.html > output/files-for-timestamps.txt
+# ./fix-timestamps.py
+# References:
+# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#pretty-printing
+# https://gitlab.com/bgstack15/vooblystats/-/blob/master/vooblystats.py
+# https://bgstack15.wordpress.com/2020/02/16/python3-convert-relative-date-to-utc-timestamp/
+# Improve:
+# the timestamp parsing assumes the pages render timestamps in US Eastern (EDT/EST) or UTC.
+from bs4 import BeautifulSoup
+from datetime import timedelta
+from parsedatetime import Calendar
+from pytz import timezone
+
+def fix_timestamps(page_text):
+    soup = BeautifulSoup(page_text,"html.parser")
+    cal = Calendar()
+    x = 0
+    for i in soup.find_all(name='time'):
+        x = x + 1
+        j = i.attrs["data-original-title"]
+        if 'EDT' == j[-3:] or 'EST' == j[-3:]:
+            tzobject=timezone("US/Eastern")
+        else:
+            tzobject=timezone("UTC")
+        dto, _ = cal.parseDT(datetimeString=j,tzinfo=tzobject)
+        # add_hours is the UTC offset parsed back out of dto, e.g. -4 for EDT,
+        # so an illustrative "May 30, 2020 2:10pm EDT" becomes "2020-05-30T18:10Z"
+        add_hours = int((str(dto)[-6:])[:3])
+        j = (timedelta(hours=-add_hours) + dto).strftime('%Y-%m-%dT%H:%MZ')
+        # second precision %S is not needed for this use case.
+        i.string = j
+    return soup
+
+with open("output/files-for-timestamps.txt") as f:
+    lines = [line.rstrip() for line in f]
+
+for thisfile in lines:
+    print("Fixing timestamps in file",thisfile)
+    with open(thisfile) as tf:
+        output=fix_timestamps(tf.read())
+    with open(thisfile,"w",encoding='utf-8') as tf:
+        tf.write(str(output.prettify()))
diff --git a/fix-without-systemd-links.sed b/fix-without-systemd-links.sed
new file mode 100644
index 0000000..d2b25c6
--- /dev/null
+++ b/fix-without-systemd-links.sed
@@ -0,0 +1,5 @@
+/without-systemd\.org/{
+ /archive\.org/!{
+ s@(http://without-systemd\.org)@https://web.archive.org/web/20190208013412/\1@g;
+ }
+}
diff --git a/flow-part2.sh b/flow-part2.sh
new file mode 100755
index 0000000..db27d98
--- /dev/null
+++ b/flow-part2.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Filename: flow-part2.sh
+# Startdate: 2020-05-30 16:46
+# Purpose: automate the post-download components of the issue-saving process
+
+OUTDIR=/mnt/public/www/issues
+
+cd /mnt/public/work/devuan
+
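+# convert the literal \n sequences (left by the python bytes repr) into real newlines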
+sed -i -r -e 's/\\n/\n/g;' "${OUTDIR}"/*.html
+
+ls -1 "${OUTDIR}"/*.html > output/files-for-timestamps.txt
+./fix-timestamps.py
+
+./fetch-images.sh
+sed -i -f fix-images-in-html.sed "${OUTDIR}"/*.html
+
+mkdir -p /mnt/public/www/issues/css
+./fetch-css.sh
+sed -i -f fix-css-in-html.sed "${OUTDIR}"/*.html
+
+sed -i -r -f remove-useless.sed "${OUTDIR}"/*.html
+
+./remove-useless.py
+
+sed -i -r -f fix-without-systemd-links.sed "${OUTDIR}"/*.html
diff --git a/flow.md b/flow.md
new file mode 100644
index 0000000..5c81d5e
--- /dev/null
+++ b/flow.md
@@ -0,0 +1,66 @@
+#### Metadata
+Startdate: 2020-05-30 15:51
+References:
+For jq filtering, everything on this page: https://stedolan.github.io/jq/manual/#Basicfilters
+
+
+# Flow
+
+1. Use gitlablib to list all issue web URLs, and then remove all the "build", "buildmodify", and similar CI/CD issues.
+
+ . gitlablib.sh
+ list_all_issues | tee output/issues.all
+ <output/issues.all jq '.[]| if(.title|test("build-?(a(ll)?|mod(ify)?|add|del)?$")) then empty else . end | .web_url' | sed -r -e 's/"//g;' > output/issues.all.web_url
+
+ Manually munge the data to put the devuan/devuan-project/issues/20 on top.
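+
+   Each line of `output/issues.all.web_url` is a bare issue URL, e.g. https://git.devuan.org/devuan/devuan-project/issues/20.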
+
+2. Use fetch-issue-webpages.py to fetch all those webpages
+
+ ln -s issues.all.web_url output/files-to-fetch.txt
+ ./fetch-issue-webpages.py
+
+3. Munge the downloaded html.
+   All of the following is performed by `flow-part2.sh`.
+
+ * fix newlines
+
+ sed -i -r -e 's/\\n/\n/g;' /mnt/public/www/issues/*.html
+
+ * find each data-original-title and replace the `<time>` tag contents with its value. This also makes BeautifulSoup pretty-print the html so some of the following commands work correctly.
+
+ ls -1 /mnt/public/www/issues/*.html > output/files-for-timestamps.txt
+ ./fix-timestamps.py
+
+ * download all relevant images, and then fix them.
+
+ ./fetch-images.sh
+ sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html
+
+ * download all stylesheets and then fix them.
+
+ mkdir -p /mnt/public/www/issues/css
+ ./fetch-css.sh
+ sed -i -f fix-css-in-html.sed /mnt/public/www/issues/*.html
+
+ * fix some encoding oddities
+
+ sed -i -r -f remove-useless.sed /mnt/public/www/issues/*.html
+
+ * remove html components that are not necessary
+
+ ./remove-useless.py
+
+ * Fix links that point to defunct domain without-systemd.org.
+
+ sed -i -r -f fix-without-systemd-links.sed /mnt/public/www/issues/*.html
+
+ * build some sort of index?
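+
+   One possible approach (an untested sketch; assumes the flat `*.html` filenames produced above):
+
+       cd /mnt/public/www/issues
+       { echo '<html><body><ul>'
+         for f in *.html ; do printf '<li><a href="%s">%s</a></li>\n' "$f" "$f" ; done
+         echo '</ul></body></html>' ; } > index.html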
diff --git a/gitlablib.sh b/gitlablib.sh
new file mode 100644
index 0000000..8f57b7d
--- /dev/null
+++ b/gitlablib.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+# Startdate: 2020-05-29
+# Dependencies:
+# jq
+# my private token
+# Library for interacting with Gitlab API
+# For manual work:
+# curl --header "${authheader}" "https://git.devuan.org/api/v4/projects/devuan%2Fdevuan-project/issues"
+# References:
+# https://docs.gitlab.com/ee/api/README.html#pagination
+# handle transforming the / in the path_with_namespace to %2F per https://docs.gitlab.com/ee/api/README.html#namespaced-path-encoding
+# https://docs.gitlab.com/ee/api/issues.html
+
+export token="$( cat /mnt/public/work/devuan/git.devuan.org.token.txt )"
+export authheader="Private-Token: ${token}"
+
+export server=git.devuan.org
+
+export GLL_TMPDIR="$( mktemp -d )"
+
+clean_gitlablib() {
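+ # the :-NOTHINGTODELETE fallback keeps this from expanding to "rm -rf /*" if GLL_TMPDIR is ever empty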
+ rm -rf "${GLL_TMPDIR:-NOTHINGTODELETE}"/*
+}
+
+# PRIVATE
+_handle_gitlab_pagination() {
+ # call: list_all_projects "${startUri}"
+ ___hgp_starturi="${1}"
+ test -n "${GLL_DEBUG}" && set -x
+ # BEGIN
+ rhfile="$( TMPDIR="${GLL_TMPDIR}" mktemp -t "headers.XXXXXXXXXX" )"
+ done=0
+ size=-1
+ uri="${___hgp_starturi}"
+
+ # LOOP
+ while test ${done} -eq 0 ;
+ do
+ response="$( curl -v -L --header "${authheader}" "${uri}" 2>"${rhfile}" )"
+ #grep -iE "^< link" "${rhfile}"
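+ # sample response-header lines (values illustrative) that the awk commands below parse:
+ #   < x-total: 120
+ #   < x-next-page: 3
+ #   < link: <https://git.devuan.org/api/v4/issues?page=3&scope=all>; rel="next", ...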
+ # determine size
+ if test "${size}" = "-1" ; then # run only if size is still undefined
+ tmpsize="$( awk '$2 == "x-total:" {print $3}' "${rhfile}" 2>/dev/null )"
+ test -n "${tmpsize}" && size="${tmpsize}"
+ echo "Number of items: ${size}" 1>&2
+ fi
+
+ tmpnextpage="$( awk '$2 == "x-next-page:" {print $3}' "${rhfile}" 2>/dev/null )"
+ # if x-next-page is blank, that means we are on the last page. Also, we could try x-total-pages compared to x-page.
+ test -z "${tmpnextpage}" && done=1
+ # if we are not already done, follow the link header's rel="next" uri
+ if test "${done}" -eq 0 ; then
+ nextUri="$( awk '{$1="";$2="";print}' "${rhfile}" | tr ',' '\n' | awk -F';' '/rel="next"/{print $1}' | sed -r -e 's/^\s*<//;' -e 's/>\s*$//;' )"
+ if test -n "${nextUri}" ; then
+ uri="${nextUri}"
+ else
+ echo "No next page provided! Error." 1>&2
+ done=1
+ fi
+ fi
+
+ # show contents
+ echo "${response}"
+ done
+
+ # cleanup
+ rm "${rhfile}"
+ set +x
+}
+
+list_all_projects() {
+ _handle_gitlab_pagination "https://${server}/api/v4/projects"
+}
+
+list_all_issues() {
+ _handle_gitlab_pagination "https://${server}/api/v4/issues?scope=all&state=all"
+}
+
+list_issues_for_project() {
+ ___lifp_project="${1}"
+ ___lifp_htmlencode_bool="${2}"
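+ # istruthy is an external helper from the author's shell libraries; it is not defined in this file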
+ istruthy "${___lifp_htmlencode_bool}" && ___lifp_project="$( echo "${___lifp_project}" | sed -r -e 's/\//%2F/g;' )"
+ _handle_gitlab_pagination "https://${server}/api/v4/projects/${___lifp_project}/issues"
+}
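+
+# example call: list_issues_for_project "devuan%2Fdevuan-project"  (path from the manual curl example above)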
+
+list_issues_for_all_projects_pipe() {
+ # call: <projects.path_with_namespace.txt list_issues_for_all_projects_pipe
+ echo "STUB"
+}
+
diff --git a/remove-useless.py b/remove-useless.py
new file mode 100755
index 0000000..e68f458
--- /dev/null
+++ b/remove-useless.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# Startdate: 2020-05-30 19:30
+# Purpose: remove key, useless html elements from slurped pages
+from bs4 import BeautifulSoup
+import sys
+
+def remove_useless(contents):
+    soup = BeautifulSoup(contents,"html.parser")
+    # remove the first element of each of these classes, where present
+    for classname in [
+        "nav-sidebar",
+        "navbar-gitlab",
+        "issuable-context-form",
+        "js-issuable-sidebar",
+        "js-issuable-actions",
+        "js-noteable-awards",
+        "disabled-comment",
+        "notes-form",
+        "btn-edit",
+        "js-issuable-edit",
+    ]:
+        element = soup.find(class_=classname)
+        if element is not None:
+            element.replace_with("")
+    # remove every element of each of these classes
+    for classname in ["note-actions", "emoji-block"]:
+        for element in soup.find_all(class_=classname):
+            element.replace_with("")
+    return soup
+
+# this works, for the single file called
+#with open(sys.argv[1],"r") as infile:
+#    lines = infile.read()
+
+with open("output/files-for-timestamps.txt") as f:
+    lines = [line.rstrip() for line in f]
+
+for thisfile in lines:
+    print("Removing useless html in file",thisfile)
+    with open(thisfile) as tf:
+        output=remove_useless(tf.read())
+    with open(thisfile,"w",encoding='utf-8') as tf:
+        tf.write(str(output.prettify()))
diff --git a/remove-useless.sed b/remove-useless.sed
new file mode 100644
index 0000000..3dbe856
--- /dev/null
+++ b/remove-useless.sed
@@ -0,0 +1,13 @@
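+# Strip artifacts of the python bytes-repr output from fetch-issue-webpages.py. Apply with sed -r (see flow-part2.sh).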
+$ {s/^'//}
+1 {s/^b'//}
+s/·/·/g # do not ask how I made this one
+s/Â//g
+s/\\'/'/g
+s/\xc2(\x91|\x82|\x)//g
+s/\\xc2\\xb7/·/g # two characters here
+s/\\xc3\\xab/ë/g
+s/\\xe1\\xb4\\x84\\xe1\\xb4\\xa0\\xe1\\xb4\\x87/CVE/g
+s/\\xe2\\x80\\x99/'/g
+s/\\xe2\\x80\\xa6/.../g
+s/(\\x..)*\\xb7/·/g # two characters here