From 0c80c29d0fde63d9617d5769038963375e698628 Mon Sep 17 00:00:00 2001
From: B Stack <bgstack15@gmail.com>
Date: Tue, 9 Jun 2020 16:55:15 -0400
Subject: initial commit

---
 fetch-css.sh                  | 28 ++++++++++++++
 fetch-images.sh               | 33 ++++++++++++++++
 fetch-issue-webpages.py       | 78 ++++++++++++++++++++++++++++++++++++++
 fix-css-in-html.sed           |  2 +
 fix-images-in-html.sed        |  6 +++
 fix-timestamps.py             | 46 ++++++++++++++++++++++
 fix-without-systemd-links.sed |  5 +++
 flow-part2.sh                 | 26 +++++++++++++
 flow.md                       | 57 ++++++++++++++++++++++++++++
 gitlablib.sh                  | 88 +++++++++++++++++++++++++++++++++++++++++++
 remove-useless.py             | 84 +++++++++++++++++++++++++++++++++++++++++
 remove-useless.sed            | 12 ++++++
 12 files changed, 465 insertions(+)
 create mode 100755 fetch-css.sh
 create mode 100755 fetch-images.sh
 create mode 100755 fetch-issue-webpages.py
 create mode 100644 fix-css-in-html.sed
 create mode 100644 fix-images-in-html.sed
 create mode 100755 fix-timestamps.py
 create mode 100644 fix-without-systemd-links.sed
 create mode 100755 flow-part2.sh
 create mode 100644 flow.md
 create mode 100644 gitlablib.sh
 create mode 100755 remove-useless.py
 create mode 100644 remove-useless.sed

diff --git a/fetch-css.sh b/fetch-css.sh
new file mode 100755
index 0000000..06718c2
--- /dev/null
+++ b/fetch-css.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+# Startdate: 2020-05-29 20:18
+# Purpose: download the stylesheets linked from the saved pages and write a sed script that points the links at the local copies
+INDIR=/mnt/public/www/issues
+INGLOB=*.html
+
+SEDSCRIPT=/mnt/public/work/devuan/fix-css-in-html.sed
+
+# OUTDIR is relative to INDIR, so it is created only after the `cd` below.
+OUTDIR=css
+INSERVER=https://git.devuan.org
+
+cd "${INDIR}" || exit 1 # do not scan or download into whatever directory we started in
+mkdir -p "${OUTDIR}"
+
+orig_css="$( sed -n -r -e 's/^.*<link.*(href="[^"]+\.css").*/\1/p' ${INGLOB} | awk -F'"' '!x[$2]++{print $2}' )"
+
+cat /dev/null > "${SEDSCRIPT}"
+
+echo "${orig_css}" | while read -r line ; do
+   test -z "${line}" && continue # nothing matched; an empty pattern would break the generated sed script
+   getpath="${INSERVER}${line}"
+   targetfile="${OUTDIR}/$( basename "${line}" )"
+   test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2
+   test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}"
+   # dynamically build a sed script
+   echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}"
+done
diff --git a/fetch-images.sh b/fetch-images.sh
new file mode 100755
index 0000000..4f4884b
--- /dev/null
+++ b/fetch-images.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+# startdate 2020-05-29 20:04
+# After running this, be sure to do the sed.
+#    sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html
+# Improve:
+#    It is probably an artifact of the weird way the asset svgs are embedded, but I cannot get them to display at all even though they are downloaded successfully. I have seen this before, the little embedded images you cannot easily download and simply display.
+
+INDIR=/mnt/public/www/issues
+INGLOB=*.html
+
+SEDSCRIPT=/mnt/public/work/devuan/fix-images-in-html.sed
+
+INSERVER=https://git.devuan.org
+
+cd "${INDIR}" || exit 1 # do not scan or download into whatever directory we started in
+
+# could use this line to get all the assets, but they do not display regardless due to html weirdness
+#orig_src="$( grep -oE '(\<src|xlink:href)="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$0]++{print $2}' )"
+orig_src="$( grep -oE '\<src="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$2]++{print $2}' )"
+
+cat /dev/null > "${SEDSCRIPT}"
+
+echo "${orig_src}" | while read -r line ; do
+   test -z "${line}" && continue # nothing matched; skip instead of running mkdir and wget on an empty path
+   getpath="${INSERVER}${line}"
+   outdir="$( echo "${line}" | awk -F'/' '{print $2}' )"
+   test ! -d "${outdir}" && mkdir -p "${outdir}"
+   targetfile="${outdir}/$( basename "${line}" )"
+   test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2
+   test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}"
+   # dynamically build a sed script
+   echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}"
+done
diff --git a/fetch-issue-webpages.py b/fetch-issue-webpages.py
new file mode 100755
index 0000000..86d6b71
--- /dev/null
+++ b/fetch-issue-webpages.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# Startdate: 2020-05-29 16:22
+# History:
+# Usage:
+#    ln -s issues.all.web_url output/files-to-fetch.txt
+#    ./fetch-issue-webpages.py
+# How to make this work:
+#    apt-get install python3-pyvirtualdisplay
+#    download this geckodriver, place in /usr/local/bin
+# References:
+#    basic guide https://web.archive.org/web/20191031110759/http://scraping.pro/use-headless-firefox-scraping-linux/
+#    https://stackoverflow.com/questions/40302006/no-such-file-or-directory-geckodriver-for-a-python-simple-selenium-applicatio
+#    geckodriver https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
+#    https://www.selenium.dev/selenium/docs/api/py/index.html?highlight=get
+#    page source https://www.selenium.dev/selenium/docs/api/py/webdriver_remote/selenium.webdriver.remote.webdriver.html?highlight=title#selenium.webdriver.remote.webdriver.WebDriver.title
+#    make sure all comments load https://stackoverflow.com/questions/26566799/wait-until-page-is-loaded-with-selenium-webdriver-for-python/44998503#44998503
+#    https://crossbrowsertesting.com/blog/test-automation/automate-login-with-selenium/
+# Improve:
+from pyvirtualdisplay import Display
+from selenium import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+import re, time, getpass
+
+def ask_password(prompt):
+    """Ask for a secret on the terminal, showing prompt followed by ': ', and return what was typed."""
+    return getpass.getpass(prompt+": ")
+
+def scrollDown(driver, value):
+   driver.execute_script("window.scrollBy(0,{})".format(str(value))) # scroll the window vertically by `value`
+
+# Scroll down the page until its source stops changing
+def scrollDownAllTheWay(driver):
+   """Scroll in steps until two consecutive snapshots of page_source match; return True."""
+   previous = driver.page_source
+   while True:
+      # two scrolls of 500, sleeping 2 seconds after each, before looking at the page again
+      for _ in range(2):
+         scrollDown(driver, 500)
+         time.sleep(2)
+      current = driver.page_source
+      if current == previous:
+         # nothing changed during this round, so stop
+         return True
+      previous = current
+
+server_string="https://git.devuan.org"
+outdir="/mnt/public/www/issues"
+
+with open("output/files-to-fetch.txt") as f:
+   lines=[line.rstrip() for line in f if line.strip()]
+
+# ask password now instead of after the delay
+password = ask_password("Enter password for "+server_string)
+
+display = Display(visible=0, size=(800, 600))
+display.start()
+browser = None
+try:
+   browser = webdriver.Firefox()
+   # log in to gitlab instance
+   browser.get(server_string+"/users/sign_in")
+   browser.find_element_by_id("user_login").send_keys('bgstack15')
+   browser.find_element_by_id("user_password").send_keys(password)
+   browser.find_element_by_class_name("qa-sign-in-button").click()
+   browser.get(server_string+"/profile") # always needs the authentication
+   scrollDownAllTheWay(browser)
+   for thisfile in lines:
+      destfile=re.sub(r"\.+",".",re.sub(r"/|issues",".",re.sub("^"+re.escape(server_string)+"/","",thisfile)))+".html"
+      print("Saving",thisfile,outdir+"/"+destfile)
+      browser.get(thisfile)
+      scrollDownAllTheWay(browser)
+      # write the text itself: print()-ing the .encode() result stored its b'...' repr, not the html
+      with open(outdir+"/"+destfile,"w",encoding="utf-8") as text_file:
+         text_file.write(browser.page_source)
+finally: # always shut down the browser and the virtual display, even after a failure
+   if browser is not None:
+      browser.quit()
+   display.stop()
diff --git a/fix-css-in-html.sed b/fix-css-in-html.sed
new file mode 100644
index 0000000..eb07dd9
--- /dev/null
+++ b/fix-css-in-html.sed
@@ -0,0 +1,2 @@
+s:/assets/application-5e11f017cc719f3e7c77f1024cdea9381ee9b21f47fe391ef5d29a7f3c94ce4a.css:css/application-5e11f017cc719f3e7c77f1024cdea9381ee9b21f47fe391ef5d29a7f3c94ce4a.css:g;
+s:/assets/print-c8ff536271f8974b8a9a5f75c0ca25d2b8c1dceb4cff3c01d1603862a0bdcbfc.css:css/print-c8ff536271f8974b8a9a5f75c0ca25d2b8c1dceb4cff3c01d1603862a0bdcbfc.css:g;
diff --git a/fix-images-in-html.sed b/fix-images-in-html.sed
new file mode 100644
index 0000000..aedf89f
--- /dev/null
+++ b/fix-images-in-html.sed
@@ -0,0 +1,6 @@
+s:/uploads/-/system/appearance/header_logo/1/gdo-icon.png:uploads/gdo-icon.png:g;
+s:/uploads/-/system/user/avatar/4095/avatar.png?width=23:uploads/avatar.png?width=23:g;
+s:/assets/illustrations/cluster_popover-9830388038d966d8d64d43576808f9d5ba05f639a78a40bae9a5ddc7cbf72f24.svg:assets/cluster_popover-9830388038d966d8d64d43576808f9d5ba05f639a78a40bae9a5ddc7cbf72f24.svg:g;
+s:/uploads/-/system/user/avatar/4095/avatar.png?width=40:uploads/avatar.png?width=40:g;
+s:/uploads/-/system/user/avatar/17/cph-devuan.png?width=24:uploads/cph-devuan.png?width=24:g;
+# TRUNCATED FOR GIT REPO
diff --git a/fix-timestamps.py b/fix-timestamps.py
new file mode 100755
index 0000000..a564257
--- /dev/null
+++ b/fix-timestamps.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# Startdate: 2020-05-29 20:40
+# Purpose: convert timestamps on gitlab issue web page into UTC
+# History:
+#    2020-05-30 09:24 add loop through files listed in output/files-for-timestamps.txt
+# Usage:
+#    ls -1 /mnt/public/www/issues/output*.html > output/files-for-timestamps.txt
+#    ./fix-timestamps.py
+# References:
+#    https://www.crummy.com/software/BeautifulSoup/bs4/doc/#pretty-printing
+#    https://gitlab.com/bgstack15/vooblystats/-/blob/master/vooblystats.py
+#    https://bgstack15.wordpress.com/2020/02/16/python3-convert-relative-date-to-utc-timestamp/
+# Improve:
+#    this is hardcoded to work when the pages are shown in EDT.
+from bs4 import BeautifulSoup
+from datetime import timedelta
+from parsedatetime import Calendar
+from pytz import timezone 
+
+def fix_timestamps(page_text):
+   """Parse page_text as html and rewrite the text of every <time> tag as a UTC timestamp.
+   The time is read from the tag's data-original-title attribute; tags without it are left alone.
+   Returns the BeautifulSoup tree."""
+   soup = BeautifulSoup(page_text,"html.parser")
+   cal = Calendar()
+   utc = timezone("UTC")
+   for i in soup.find_all(name='time'):
+      j = i.attrs.get("data-original-title")
+      if not j:
+         continue # nothing to convert on this tag
+      # interpret the text in the zone its suffix names; anything but EDT/EST is taken as UTC
+      tzobject = timezone("US/Eastern") if j[-3:] in ('EDT', 'EST') else utc
+      dto, _ = cal.parseDT(datetimeString=j,tzinfo=tzobject)
+      # second precision %S is not needed for this use case.
+      i.string = dto.astimezone(utc).strftime('%Y-%m-%dT%H:%MZ')
+   return soup
+
+with open("output/files-for-timestamps.txt") as f:
+   lines = [line.rstrip() for line in f]
+
+for thisfile in lines:
+   print("Fixing timestamps in file",thisfile)
+   with open(thisfile,encoding='utf-8') as tf: # read with the same encoding the file is written back in
+      output=fix_timestamps(tf.read())
+   with open(thisfile,"w",encoding='utf-8') as tf:
+      tf.write(str(output.prettify()))
diff --git a/fix-without-systemd-links.sed b/fix-without-systemd-links.sed
new file mode 100644
index 0000000..d2b25c6
--- /dev/null
+++ b/fix-without-systemd-links.sed
@@ -0,0 +1,5 @@
+/without-systemd\.org/{
+   /archive\.org/!{
+      s@(http://without-systemd\.org)@https://web.archive.org/web/20190208013412/\1@g; # the ( ) group needs sed -r
+   }
+}
diff --git a/flow-part2.sh b/flow-part2.sh
new file mode 100755
index 0000000..db27d98
--- /dev/null
+++ b/flow-part2.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+# Filename: flow-part2.sh
+# Startdate: 2020-05-30 16:46
+# Purpose: automate the post-download components of the issue-saving process
+
+OUTDIR=/mnt/public/www/issues
+
+cd /mnt/public/work/devuan || exit 1 # every script and sed file below is addressed relative to this directory
+
+sed -i -r -e 's/\\n/\n/g;' "${OUTDIR}"/*.html
+
+ls -1 "${OUTDIR}"/*.html > output/files-for-timestamps.txt
+./fix-timestamps.py
+
+./fetch-images.sh
+sed -i -f fix-images-in-html.sed "${OUTDIR}"/*.html
+
+mkdir -p "${OUTDIR}/css"
+./fetch-css.sh
+sed -i -f fix-css-in-html.sed "${OUTDIR}"/*.html
+
+sed -i -f remove-useless.sed "${OUTDIR}"/*.html
+
+./remove-useless.py
+
+sed -i -r -f fix-without-systemd-links.sed "${OUTDIR}"/*.html
diff --git a/flow.md b/flow.md
new file mode 100644
index 0000000..5c81d5e
--- /dev/null
+++ b/flow.md
@@ -0,0 +1,57 @@
+#### Metadata
+Startdate: 2020-05-30 15:51
+References:
+Everything on this page, for jq filtering. https://stedolan.github.io/jq/manual/#Basicfilters
+
+
+# Flow
+
+1. Use gitlablib to list all issue web urls, and then remove all the "build", "buildmodify" and similar CI/CD issues.
+
+    . gitlablib.sh
+    list_all_issues | tee output/issues.all
+    <output/issues.all jq '.[]| if(.title|test("build-?(a(ll)?|mod(ify)?|add|del)?$")) then empty else . end | .web_url' | sed -r -e 's/"//g;' > output/issues.all.web_url
+
+   Manually munge the data to put the devuan/devuan-project/issues/20 on top.
+
+2. Use fetch-issue-webpages.py to fetch all those webpages
+
+    ln -s issues.all.web_url output/files-to-fetch.txt
+    ./fetch-issue-webpages.py
+
+3. munge the downloaded html
+   All of the following is performed by `flow-part2.sh`
+
+  * fix newlines
+
+    sed -i -r -e 's/\\n/\n/g;' /mnt/public/www/issues/*.html
+
+  * find data-original-titles and replace the <time> tag contents with the value of its data-original-title. Also, this will BeautifulSoup pretty-print the html so some of the following commands work correctly.
+
+    ls -1 /mnt/public/www/issues/*.html > output/files-for-timestamps.txt
+    ./fix-timestamps.py
+
+  * download all relevant images, and then fix them.
+
+    ./fetch-images.sh
+    sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html
+
+  * download all stylesheets and then fix them.
+
+    mkdir -p /mnt/public/www/issues/css
+    ./fetch-css.sh
+    sed -i -f fix-css-in-html.sed /mnt/public/www/issues/*.html
+
+  * fix some encoding oddities
+
+    sed -i -f remove-useless.sed /mnt/public/www/issues/*.html
+
+  * remove html components that are not necessary
+
+    ./remove-useless.py
+
+  * Fix links that point to defunct domain without-systemd.org.
+
+    sed -i -r -f fix-without-systemd-links.sed /mnt/public/www/issues/*.html
+
+  * build some sort of index?
diff --git a/gitlablib.sh b/gitlablib.sh
new file mode 100644
index 0000000..8f57b7d
--- /dev/null
+++ b/gitlablib.sh
@@ -0,0 +1,88 @@
+#!/bin/sh
+# Startdate: 2020-05-29
+# Dependencies:
+#    jq
+#    my private token
+# Library for interacting with Gitlab API
+# For manual work:
+#    curl --header "${authheader}" "https://git.devuan.org/api/v4/projects/devuan%2Fdevuan-project/issues"
+# References:
+#    https://docs.gitlab.com/ee/api/README.html#pagination
+#    handle transforming the / in the path_with_namespace to %2F per https://docs.gitlab.com/ee/api/README.html#namespaced-path-encoding
+#    https://docs.gitlab.com/ee/api/issues.html
+
+export token="$( cat /mnt/public/work/devuan/git.devuan.org.token.txt )" # the private token is read from this file
+export authheader="Private-Token: ${token}" # passed to curl with --header
+
+export server=git.devuan.org
+
+export GLL_TMPDIR="$( mktemp -d )" # scratch directory for the saved response headers; emptied by clean_gitlablib
+
+clean_gitlablib() { # empty the scratch directory; the directory itself is left in place
+   rm -rf "${GLL_TMPDIR:-NOTHINGTODELETE}"/* # the fallback name keeps an unset variable from expanding to /*
+}
+
+# PRIVATE
+_handle_gitlab_pagination() {
+   # call: _handle_gitlab_pagination "${startUri}"
+   # Fetch startUri and each following page, printing every response body to stdout.
+   ___hgp_starturi="${1}"
+   test -n "${GLL_DEBUG}" && set -x
+   rhfile="$( TMPDIR="${GLL_TMPDIR}" mktemp -t "headers.XXXXXXXXXX" )"
+   done=0
+   size=-1
+   uri="${___hgp_starturi}"
+
+   while test ${done} -eq 0 ;
+   do
+      # curl -v prints the response headers on stderr (saved in rhfile); header lines can end in CR, which tr strips below
+      response="$( curl -v -L --header "${authheader}" "${uri}" 2>"${rhfile}" )"
+      if test "${size}" = "-1" ; then # determine size; run only if size is still undefined
+         tmpsize="$( tr -d '\r' < "${rhfile}" | awk 'tolower($2) == "x-total:" {print $3}' 2>/dev/null )"
+         test -n "${tmpsize}" && size="${tmpsize}"
+         echo "Number of items: ${size}" 1>&2
+      fi
+
+      # if x-next-page is blank, that means we are on the last page; that is the normal way to finish, not an error
+      tmpnextpage="$( tr -d '\r' < "${rhfile}" | awk 'tolower($2) == "x-next-page:" {print $3}' 2>/dev/null )"
+      if test -z "${tmpnextpage}" ; then
+         done=1
+      else
+         nextUri="$( awk '{$1="";$2="";print}' "${rhfile}" | tr ',' '\n' | awk -F';' '/rel="next"/{print $1}' | sed -r -e 's/^\s*<//;' -e 's/>\s*$//;' )"
+         if test -n "${nextUri}" ; then
+            uri="${nextUri}"
+         else
+            echo "No next page provided! Error." 1>&2
+            done=1
+         fi
+      fi
+
+      # show contents
+      echo "${response}"
+   done
+
+   # cleanup
+   rm "${rhfile}"
+   set +x
+}
+
+list_all_projects() { # print every page of the projects listing
+   _handle_gitlab_pagination "https://${server}/api/v4/projects"
+}
+
+list_all_issues() { # print every page of issues in any state; the API's filter parameter is named "state", not "status"
+   _handle_gitlab_pagination "https://${server}/api/v4/issues?scope=all&state=all"
+}
+
+list_issues_for_project() { # call: list_issues_for_project "${project}" "${encode_bool}"
+   ___lifp_project="${1}"
+   ___lifp_htmlencode_bool="${2}"
+   istruthy "${___lifp_htmlencode_bool}" && ___lifp_project="$( echo "${___lifp_project}" | sed -r -e 's/\//%2F/g;' )" # NOTE(review): istruthy is not defined in this file; confirm it is provided before use
+   _handle_gitlab_pagination "https://${server}/api/v4/projects/${___lifp_project}/issues"
+}
+
+list_issues_for_all_projects_pipe() {
+   # intended call: <projects.path_with_namespace.txt list_issues_for_all_projects_pipe
+   echo "STUB" # not implemented yet: only this marker is printed and stdin is not read
+}
+
diff --git a/remove-useless.py b/remove-useless.py
new file mode 100755
index 0000000..e68f458
--- /dev/null
+++ b/remove-useless.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# Startdate: 2020-05-30 19:30
+# Purpose: remove key, useless html elements from slurped pages
+from bs4 import BeautifulSoup
+import sys
+
+def remove_useless(contents):
+   """Strip GitLab page furniture from one saved issue page.
+
+   contents: the html text of the page.
+   Returns the parsed BeautifulSoup tree in which the unwanted elements
+   have been replaced by empty strings. A class that does not occur in
+   the page is skipped.
+   """
+   soup = BeautifulSoup(contents,"html.parser")
+   # only the first element of each of these classes is removed
+   first_only = (
+      "nav-sidebar",
+      "navbar-gitlab",
+      "issuable-context-form",
+      "js-issuable-sidebar",
+      "js-issuable-actions",
+      "js-noteable-awards",
+      "disabled-comment",
+      "notes-form",
+      "btn-edit",
+      "js-issuable-edit",
+   )
+   # every element of these classes is removed
+   every = ("note-actions", "emoji-block")
+   for class_name in first_only:
+      tag = soup.find(class_=class_name)
+      # find() gives None when the page has no such element
+      if tag is not None:
+         tag.replace_with("")
+   for class_name in every:
+      for tag in soup.find_all(class_=class_name):
+         tag.replace_with("")
+   return soup
+
+# this works, for the single file called
+#with open(sys.argv[1],"r") as infile:
+#   lines = infile.read()
+
+with open("output/files-for-timestamps.txt") as f:
+   lines = [line.rstrip() for line in f]
+
+for thisfile in lines:
+   print("Removing useless html in file",thisfile)
+   with open(thisfile,encoding='utf-8') as tf: # read with the same encoding the file is written back in
+      output=remove_useless(tf.read())
+   with open(thisfile,"w",encoding='utf-8') as tf:
+      tf.write(str(output.prettify()))
diff --git a/remove-useless.sed b/remove-useless.sed
new file mode 100644
index 0000000..3dbe856
--- /dev/null
+++ b/remove-useless.sed
@@ -0,0 +1,12 @@
+$ {s/^'//}
+1 {s/^b'//}
+s/Â·/·/g # do not ask how I made this one
+s/Ã//g
+s/\\'/'/g
+s/\xc2(\x91|\x82|\x)//g # NOTE(review): ( | ) is extended syntax but flow-part2.sh runs this file without -r, and the last \x has no digits
+s/\\xc2\\xb7/·/g # two characters here
+s/\\xc3\\xab/ë/g
+s/\\xe1\\xb4\\x84\\xe1\\xb4\\xa0\\xe1\\xb4\\x87/CVE/g
+s/\\xe2\\x80\\x99/'/g
+s/\\xe2\\x80\\xa6/.../g
+s/(\\x..)*\\xb7/·/g # NOTE(review): the ( )* group also needs sed -r to act as a group; confirm the intended invocation
-- 
cgit