-rwxr-xr-x | fetch-css.sh                  | 28
-rwxr-xr-x | fetch-images.sh               | 33
-rwxr-xr-x | fetch-issue-webpages.py       | 78
-rw-r--r-- | fix-css-in-html.sed           |  2
-rw-r--r-- | fix-images-in-html.sed        |  6
-rwxr-xr-x | fix-timestamps.py             | 46
-rw-r--r-- | fix-without-systemd-links.sed |  5
-rwxr-xr-x | flow-part2.sh                 | 26
-rw-r--r-- | flow.md                       | 57
-rw-r--r-- | gitlablib.sh                  | 88
-rwxr-xr-x | remove-useless.py             | 84
-rw-r--r-- | remove-useless.sed            | 12
12 files changed, 465 insertions, 0 deletions
diff --git a/fetch-css.sh b/fetch-css.sh
new file mode 100755
index 0000000..06718c2
--- /dev/null
+++ b/fetch-css.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+# Startdate: 2020-05-29 20:18
+
+INDIR=/mnt/public/www/issues
+INGLOB=*.html
+
+SEDSCRIPT=/mnt/public/work/devuan/fix-css-in-html.sed
+
+# OUTDIR will be made in INDIR, because of the `cd` below.
+OUTDIR=css
+test ! -d "${OUTDIR}" && mkdir -p "${OUTDIR}"
+
+INSERVER=https://git.devuan.org
+
+cd "${INDIR}"
+
+orig_css="$( sed -n -r -e 's/^.*<link.*(href="[^"]+\.css").*/\1/p' ${INGLOB} | awk -F'"' '!x[$2]++{print $2}' )"
+
+cat /dev/null > "${SEDSCRIPT}"
+
+echo "${orig_css}" | while read line ; do
+   getpath="${INSERVER}${line}"
+   targetfile="${OUTDIR}/$( basename "${line}" )"
+   test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2
+   test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}"
+   # dynamically build a sed script
+   echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}"
+done
diff --git a/fetch-images.sh b/fetch-images.sh
new file mode 100755
index 0000000..4f4884b
--- /dev/null
+++ b/fetch-images.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+# startdate 2020-05-29 20:04
+# After running this, be sure to do the sed.
+# sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html
+# Improve:
+# It is probably an artifact of the weird way the asset svgs are embedded, but I cannot get them to display at all even though they are downloaded successfully. I have seen this before, the little embedded images you cannot easily download and simply display.
+
+INDIR=/mnt/public/www/issues
+INGLOB=*.html
+
+SEDSCRIPT=/mnt/public/work/devuan/fix-images-in-html.sed
+
+INSERVER=https://git.devuan.org
+
+cd "${INDIR}"
+
+# could use this line to get all the assets, but they do not display regardless due to html weirdness
+#orig_src="$( grep -oE '(\<src|xlink:href)="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$0]++{print $2}' )"
+orig_src="$( grep -oE '\<src="?\/[^"]*"' ${INGLOB} | grep -vE '\.js' | awk -F'"' '!x[$2]++{print $2}' )"
+
+cat /dev/null > "${SEDSCRIPT}"
+
+echo "${orig_src}" | while read line ; do
+   #echo "${line}" | awk -F'"' '{print $2}'
+   getpath="${INSERVER}${line}"
+   outdir="$( echo "${line}" | awk -F'/' '{print $2}' )"
+   test ! -d "${outdir}" && mkdir -p "${outdir}"
+   targetfile="${outdir}/$( basename "${line}" )"
+   test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2
+   test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}"
+   # dynamically build a sed script
+   echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}"
+done
diff --git a/fetch-issue-webpages.py b/fetch-issue-webpages.py
new file mode 100755
index 0000000..86d6b71
--- /dev/null
+++ b/fetch-issue-webpages.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# Startdate: 2020-05-29 16:22
+# History:
+# Usage:
+#    ln -s issues.all.web_url output/files-to-fetch.txt
+#    ./fetch-issue-webpages.py
+# How to make this work:
+#    apt-get install python3-pyvirtualdisplay
+#    download this geckodriver, place in /usr/local/bin
+# References:
+#    basic guide https://web.archive.org/web/20191031110759/http://scraping.pro/use-headless-firefox-scraping-linux/
+#    https://stackoverflow.com/questions/40302006/no-such-file-or-directory-geckodriver-for-a-python-simple-selenium-applicatio
+#    geckodriver https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
+#    https://www.selenium.dev/selenium/docs/api/py/index.html?highlight=get
+#    page source https://www.selenium.dev/selenium/docs/api/py/webdriver_remote/selenium.webdriver.remote.webdriver.html?highlight=title#selenium.webdriver.remote.webdriver.WebDriver.title
+#    make sure all comments load https://stackoverflow.com/questions/26566799/wait-until-page-is-loaded-with-selenium-webdriver-for-python/44998503#44998503
+#    https://crossbrowsertesting.com/blog/test-automation/automate-login-with-selenium/
+# Improve:
+from pyvirtualdisplay import Display
+from selenium import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+import re, time, getpass
+
+def ask_password(prompt):
+    #return input(prompt+": ")
+    return getpass.getpass(prompt+": ")
+
+def scrollDown(driver, value):
+    driver.execute_script("window.scrollBy(0,"+str(value)+")")
+
+# Scroll down the page
+def scrollDownAllTheWay(driver):
+    old_page = driver.page_source
+    while True:
+        #logging.debug("Scrolling loop")
+        for i in range(2):
+            scrollDown(driver, 500)
+            time.sleep(2)
+        new_page = driver.page_source
+        if new_page != old_page:
+            old_page = new_page
+        else:
+            break
+    return True
+
+server_string="https://git.devuan.org"
+outdir="/mnt/public/www/issues"
+
+with open("output/files-to-fetch.txt") as f:
+    lines=[line.rstrip() for line in f]
+
+# ask password now instead of after the delay
+password = ask_password("Enter password for "+server_string)
+
+display = Display(visible=0, size=(800, 600))
+display.start()
+
+browser = webdriver.Firefox()
+
+# log in to gitlab instance
+browser.get(server_string+"/users/sign_in")
+browser.find_element_by_id("user_login").send_keys('bgstack15')
+browser.find_element_by_id("user_password").send_keys(password)
+browser.find_element_by_class_name("qa-sign-in-button").click()
+browser.get(server_string+"/profile") # always needs the authentication
+scrollDownAllTheWay(browser)
+
+for thisfile in lines:
+    destfile=re.sub("\.+",".",re.sub("\/|issues",".",re.sub("^"+re.escape(server_string)+"\/","",thisfile)))+".html"
+    print("Saving",thisfile,outdir+"/"+destfile)
+    browser.get(thisfile)
+    scrollDownAllTheWay(browser)
+    with open(outdir+"/"+destfile,"w") as text_file:
+        print(browser.page_source.encode('utf-8'),file=text_file)
+
+# done with loop
+browser.quit()
+display.stop()
diff --git a/fix-css-in-html.sed b/fix-css-in-html.sed
new file mode 100644
index 0000000..eb07dd9
--- /dev/null
+++ b/fix-css-in-html.sed
@@ -0,0 +1,2 @@
+s:/assets/application-5e11f017cc719f3e7c77f1024cdea9381ee9b21f47fe391ef5d29a7f3c94ce4a.css:css/application-5e11f017cc719f3e7c77f1024cdea9381ee9b21f47fe391ef5d29a7f3c94ce4a.css:g;
+s:/assets/print-c8ff536271f8974b8a9a5f75c0ca25d2b8c1dceb4cff3c01d1603862a0bdcbfc.css:css/print-c8ff536271f8974b8a9a5f75c0ca25d2b8c1dceb4cff3c01d1603862a0bdcbfc.css:g;
diff --git a/fix-images-in-html.sed b/fix-images-in-html.sed
new file mode 100644
index 0000000..aedf89f
--- /dev/null
+++ b/fix-images-in-html.sed
@@ -0,0 +1,6 @@
+s:/uploads/-/system/appearance/header_logo/1/gdo-icon.png:uploads/gdo-icon.png:g;
+s:/uploads/-/system/user/avatar/4095/avatar.png?width=23:uploads/avatar.png?width=23:g;
+s:/assets/illustrations/cluster_popover-9830388038d966d8d64d43576808f9d5ba05f639a78a40bae9a5ddc7cbf72f24.svg:assets/cluster_popover-9830388038d966d8d64d43576808f9d5ba05f639a78a40bae9a5ddc7cbf72f24.svg:g;
+s:/uploads/-/system/user/avatar/4095/avatar.png?width=40:uploads/avatar.png?width=40:g;
+s:/uploads/-/system/user/avatar/17/cph-devuan.png?width=24:uploads/cph-devuan.png?width=24:g;
+# TRUNCATED FOR GIT REPO
diff --git a/fix-timestamps.py b/fix-timestamps.py
new file mode 100755
index 0000000..a564257
--- /dev/null
+++ b/fix-timestamps.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# Startdate: 2020-05-29 20:40
+# Purpose: convert timestamps on gitlab issue web page into UTC
+# History:
+#    2020-05-30 09:24 add loop through files listed in output/files-for-timestamps.txt
+# Usage:
+#    ls -1 /mnt/public/www/issues/output*.html > output/files-for-timestamps.txt
+#    ./fix-timestamps.py
+# References:
+#    https://www.crummy.com/software/BeautifulSoup/bs4/doc/#pretty-printing
+#    https://gitlab.com/bgstack15/vooblystats/-/blob/master/vooblystats.py
+#    https://bgstack15.wordpress.com/2020/02/16/python3-convert-relative-date-to-utc-timestamp/
+# Improve:
+#    this is hardcoded to work when the pages are shown in EDT.
+from bs4 import BeautifulSoup
+from datetime import timedelta
+from parsedatetime import Calendar
+from pytz import timezone
+
+def fix_timestamps(page_text):
+    soup = BeautifulSoup(page_text,"html.parser")
+    cal = Calendar()
+    x = 0
+    for i in soup.find_all(name='time'):
+        x = x + 1
+        j = i.attrs["data-original-title"]
+        if 'EDT' == j[-3:] or 'EST' == j[-3:]:
+            tzobject=timezone("US/Eastern")
+        else:
+            tzobject=timezone("UTC")
+        dto, _ = cal.parseDT(datetimeString=j,tzinfo=tzobject) # use the tz detected above
+        add_hours = int((str(dto)[-6:])[:3])
+        j = (timedelta(hours=-add_hours) + dto).strftime('%Y-%m-%dT%H:%MZ')
+        # second precision %S is not needed for this use case.
+        i.string = j
+    return soup
+
+with open("output/files-for-timestamps.txt") as f:
+    lines = [line.rstrip() for line in f]
+
+for thisfile in lines:
+    print("Fixing timestamps in file",thisfile)
+    with open(thisfile) as tf:
+        output=fix_timestamps(tf.read())
+    with open(thisfile,"w",encoding='utf-8') as tf:
+        tf.write(str(output.prettify()))
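The offset arithmetic in fix_timestamps leans on the string form of a timezone-aware datetime ending in its UTC offset. A worked example with hardcoded illustrative values (not taken from the scraped pages):

    from datetime import datetime, timedelta, timezone

    # "May 29, 2020 4:22pm EDT" parsed as wall time carrying a -04:00 offset
    dto = datetime(2020, 5, 29, 16, 22, tzinfo=timezone(timedelta(hours=-4)))
    add_hours = int((str(dto)[-6:])[:3])   # '2020-05-29 16:22:00-04:00' -> '-04' -> -4
    print((timedelta(hours=-add_hours) + dto).strftime('%Y-%m-%dT%H:%MZ'))
    # 2020-05-29T20:22Z

Shifting the wall clock by the negated offset and then labeling the result Z is equivalent to converting to UTC, provided the parsed offset is correct.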
diff --git a/fix-without-systemd-links.sed b/fix-without-systemd-links.sed
new file mode 100644
index 0000000..d2b25c6
--- /dev/null
+++ b/fix-without-systemd-links.sed
@@ -0,0 +1,5 @@
+/without-systemd\.org/{
+   /archive\.org/!{
+      s@(http://without-systemd\.org)@https://web.archive.org/web/20190208013412/\1@g;
+   }
+}
diff --git a/flow-part2.sh b/flow-part2.sh
new file mode 100755
index 0000000..db27d98
--- /dev/null
+++ b/flow-part2.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+# Filename: flow-part2.sh
+# Startdate: 2020-05-30 16:46
+# Purpose: automate the post-download components of the issue-saving process
+
+OUTDIR=/mnt/public/www/issues
+
+cd /mnt/public/work/devuan
+
+sed -i -r -e 's/\\n/\n/g;' "${OUTDIR}"/*.html
+
+ls -1 "${OUTDIR}"/*.html > output/files-for-timestamps.txt
+./fix-timestamps.py
+
+./fetch-images.sh
+sed -i -f fix-images-in-html.sed "${OUTDIR}"/*.html
+
+mkdir -p /mnt/public/www/issues/css
+./fetch-css.sh
+sed -i -f fix-css-in-html.sed "${OUTDIR}"/*.html
+
+sed -i -f remove-useless.sed "${OUTDIR}"/*.html
+
+./remove-useless.py
+
+sed -i -r -f fix-without-systemd-links.sed "${OUTDIR}"/*.html
diff --git a/flow.md b/flow.md
new file mode 100644
--- /dev/null
+++ b/flow.md
@@ -0,0 +1,57 @@
+#### Metadata
+Startdate: 2020-05-30 15:51
+References:
+    Everything on this page, for jq filtering. https://stedolan.github.io/jq/manual/#Basicfilters
+
+
+# Flow
+
+1. Use gitlablib to list all issue web urls, and then remove all the "build", "buildmodify" and similar CI/CD issues.
+
+        . gitlablib.sh
+        list_all_issues | tee output/issues.all
+        <output/issues.all jq '.[]| if(.title|test("build-?(a(ll)?|mod(ify)?|add|del)?$")) then empty else . end | .web_url' | sed -r -e 's/"//g;' > output/issues.all.web_url
+
+   Manually munge the data to put the devuan/devuan-project/issues/20 on top.
+
+2. Use fetch-issue-webpages.py to fetch all those webpages
+
+        ln -s issues.all.web_url output/files-to-fetch.txt
+        ./fetch-issue-webpages.py
+
+3. munge the downloaded html
+   All of the following is performed by `flow-part2.sh`
+
+   * fix newlines
+
+        sed -i -r -e 's/\\n/\n/g;' /mnt/public/www/issues/*.html
+
+   * find data-original-titles and replace the <time> tag contents with the value of its data-original-title. Also, this will BeautifulSoup pretty-print the html so some of the following commands work correctly.
+
+        ls -1 /mnt/public/www/issues/*.html > output/files-for-timestamps.txt
+        ./fix-timestamps.py
+
+   * download all relevant images, and then fix them.
+
+        ./fetch-images.sh
+        sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html
+
+   * download all stylesheets and then fix them.
+
+        mkdir -p /mnt/public/www/issues/css
+        ./fetch-css.sh
+        sed -i -f fix-css-in-html.sed /mnt/public/www/issues/*.html
+
+   * fix some encoding oddities
+
+        sed -i -f remove-useless.sed /mnt/public/www/issues/*.html
+
+   * remove html components that are not necessary
+
+        ./remove-useless.py
+
+   * Fix links that point to defunct domain without-systemd.org.
+
+        sed -i -r -f fix-without-systemd-links.sed /mnt/public/www/issues/*.html
+
+   * build some sort of index?
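flow.md leaves the index-building step open. One plausible approach, given that BeautifulSoup is already a dependency of the other scripts, is to scan the saved pages and emit a single index.html; a hypothetical sketch (the name build-index.py and the use of each page's <title> as link text are my assumptions, not part of the repo):

    #!/usr/bin/env python3
    # hypothetical build-index.py: link every saved issue page from one index.html
    from bs4 import BeautifulSoup
    import glob, html, os

    outdir = "/mnt/public/www/issues"
    entries = []
    for path in sorted(glob.glob(outdir + "/*.html")):
        if os.path.basename(path) == "index.html":
            continue   # skip a previously generated index
        with open(path) as f:
            soup = BeautifulSoup(f.read(), "html.parser")
        # fall back to the filename when a page has no <title>
        title = soup.title.get_text(strip=True) if soup.title else os.path.basename(path)
        entries.append((os.path.basename(path), title))

    with open(outdir + "/index.html", "w", encoding="utf-8") as out:
        out.write("<html><body><ul>\n")
        for href, title in entries:
            out.write('<li><a href="%s">%s</a></li>\n' % (href, html.escape(title)))
        out.write("</ul></body></html>\n")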
diff --git a/gitlablib.sh b/gitlablib.sh
new file mode 100644
index 0000000..8f57b7d
--- /dev/null
+++ b/gitlablib.sh
@@ -0,0 +1,88 @@
+#!/bin/sh
+# Startdate: 2020-05-29
+# Dependencies:
+#    jq
+#    my private token
+# Library for interacting with Gitlab API
+# For manual work:
+#    curl --header "${authheader}" "https://git.devuan.org/api/v4/projects/devuan%2Fdevuan-project/issues"
+# References:
+#    https://docs.gitlab.com/ee/api/README.html#pagination
+#    handle transforming the / in the path_with_namespace to %2F per https://docs.gitlab.com/ee/api/README.html#namespaced-path-encoding
+#    https://docs.gitlab.com/ee/api/issues.html
+
+export token="$( cat /mnt/public/work/devuan/git.devuan.org.token.txt )"
+export authheader="Private-Token: ${token}"
+
+export server=git.devuan.org
+
+export GLL_TMPDIR="$( mktemp -d )"
+
+clean_gitlablib() {
+   rm -rf "${GLL_TMPDIR:-NOTHINGTODELETE}"/*
+}
+
+# PRIVATE
+_handle_gitlab_pagination() {
+   # call: list_all_projects "${startUri}"
+   ___hgp_starturi="${1}"
+   test -n "${GLL_DEBUG}" && set -x
+   # BEGIN
+   rhfile="$( TMPDIR="${GLL_TMPDIR}" mktemp -t "headers.XXXXXXXXXX" )"
+   done=0
+   size=-1
+   uri="${___hgp_starturi}"
+
+   # LOOP
+   while test ${done} -eq 0 ;
+   do
+      response="$( curl -v -L --header "${authheader}" "${uri}" 2>"${rhfile}" )"
+      #grep -iE "^< link" "${rhfile}"
+      # determine size
+      if test "${size}" = "-1" ; then # run only if size is still undefined
+         tmpsize="$( awk '$2 == "x-total:" {print $3}' "${rhfile}" 2>/dev/null )"
+         test -n "${tmpsize}" && size="${tmpsize}"
+         echo "Number of items: ${size}" 1>&2
+      fi
+
+      tmpnextpage="$( awk '$2 == "x-next-page:" {print $3}' "${rhfile}" 2>/dev/null )"
+      # if x-next-page is blank, that means we are on the last page. Also, we could try x-total-pages compared to x-page.
+      test -z "${tmpnextpage}" && done=1
+      # so if we have a next page, get that link
+      nextUri="$( awk '{$1="";$2="";print}' "${rhfile}" | tr ',' '\n' | awk -F';' '/rel="next"/{print $1}' | sed -r -e 's/^\s*<//;' -e 's/>\s*$//;' )"
+      if test -n "${nextUri}" ; then
+         uri="${nextUri}"
+      else
+         echo "No next page provided! Error." 1>&2
+         done=1
+      fi
+
+      # show contents
+      echo "${response}"
+   done
+
+   # cleanup
+   rm "${rhfile}"
+   set +x
+}
+
+list_all_projects() {
+   _handle_gitlab_pagination "https://${server}/api/v4/projects"
+}
+
+list_all_issues() {
+   _handle_gitlab_pagination "https://${server}/api/v4/issues?scope=all&status=all"
+}
+
+list_issues_for_project() {
+   ___lifp_project="${1}"
+   ___lifp_htmlencode_bool="${2}"
+   istruthy "${___lifp_htmlencode_bool}" && ___lifp_project="$( echo "${___lifp_project}" | sed -r -e 's/\//%2F/g;' )" # istruthy: helper assumed from the author's environment; not defined in this file
+   _handle_gitlab_pagination "https://${server}/api/v4/projects/${___lifp_project}/issues"
+}
+
+list_issues_for_all_projects_pipe() {
+   # call: <projects.path_with_namespace.txt list_issues_for_all_projects_pipe
+   echo "STUB"
+}
+
diff --git a/remove-useless.py b/remove-useless.py
new file mode 100755
index 0000000..e68f458
--- /dev/null
+++ b/remove-useless.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+# Startdate: 2020-05-30 19:30
+# Purpose: remove key, useless html elements from slurped pages
+from bs4 import BeautifulSoup
+import sys
+
+def remove_useless(contents):
+    soup = BeautifulSoup(contents,"html.parser")
+    try:
+        sidebar = soup.find(class_="nav-sidebar")
+        sidebar.replace_with("")
+    except:
+        pass
+    try:
+        navbar = soup.find(class_="navbar-gitlab")
+        navbar.replace_with("")
+    except:
+        pass
+    try:
+        rightbar = soup.find(class_="issuable-context-form")
+        rightbar.replace_with("")
+    except:
+        pass
+    try:
+        rightbar = soup.find(class_="js-issuable-sidebar")
+        rightbar.replace_with("")
+    except:
+        pass
+    try:
+        rightbar = soup.find(class_="js-issuable-actions")
+        rightbar.replace_with("")
+    except:
+        pass
+    try:
+        rightbar = soup.find(class_="js-noteable-awards")
+        rightbar.replace_with("")
+    except:
+        pass
+    try:
+        rightbar = soup.find(class_="disabled-comment")
+        rightbar.replace_with("")
+    except:
+        pass
+    try:
+        rightbar = soup.find(class_="notes-form")
+        rightbar.replace_with("")
+    except:
+        pass
+    try:
+        rightbar = soup.find(class_="btn-edit")
+        rightbar.replace_with("")
+    except:
+        pass
+    try:
+        rightbar = soup.find(class_="js-issuable-edit")
+        rightbar.replace_with("")
+    except:
+        pass
+    try:
+        mylist = soup.find_all(class_="note-actions")
+        for i in mylist:
+            i.replace_with("")
+    except:
+        pass
+    try:
+        mylist = soup.find_all(class_="emoji-block")
+        for i in mylist:
+            i.replace_with("")
+    except: pass
+    return soup # always return the cleaned soup
+
+# this works, for the single file called
+#with open(sys.argv[1],"r") as infile:
+#    lines = infile.read()
+
+with open("output/files-for-timestamps.txt") as f:
+    lines = [line.rstrip() for line in f]
+
+for thisfile in lines:
+    print("Removing useless html in file",thisfile)
+    with open(thisfile) as tf:
+        output=remove_useless(tf.read())
+    with open(thisfile,"w",encoding='utf-8') as tf:
+        tf.write(str(output.prettify()))
diff --git a/remove-useless.sed b/remove-useless.sed
new file mode 100644
index 0000000..3dbe856
--- /dev/null
+++ b/remove-useless.sed
@@ -0,0 +1,12 @@
+$ {s/^'//}
+1 {s/^b'//}
+s/·/·/g # do not ask how I made this one
+s/Â//g
+s/\\'/'/g
+s/\xc2(\x91|\x82|\x)//g
+s/\\xc2\\xb7/·/g # two characters here
+s/\\xc3\\xab/Ã/g
+s/\\xe1\\xb4\\x84\\xe1\\xb4\\xa0\\xe1\\xb4\\x87/CVE/g
+s/\\xe2\\x80\\x99/'/g
+s/\\xe2\\x80\\xa6/.../g
+s/(\\x..)*\\xb7/·/g # two characters here
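remove_useless in remove-useless.py repeats one try/except pattern ten times; the same behavior fits in a data-driven loop. A sketch, under the assumption that removing every match (find_all) is acceptable even for the classes the original removes only once (find):

    from bs4 import BeautifulSoup

    # class names copied from remove-useless.py above
    USELESS_CLASSES = [
        "nav-sidebar", "navbar-gitlab", "issuable-context-form",
        "js-issuable-sidebar", "js-issuable-actions", "js-noteable-awards",
        "disabled-comment", "notes-form", "btn-edit", "js-issuable-edit",
        "note-actions", "emoji-block",
    ]

    def remove_useless(contents):
        soup = BeautifulSoup(contents, "html.parser")
        for cls in USELESS_CLASSES:
            for tag in soup.find_all(class_=cls):
                tag.replace_with("")   # blank out every matching element
        return soup

find_all returns an empty list when nothing matches, so the bare except clauses become unnecessary.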
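The stranger substitutions in remove-useless.sed (for example `s/Â//g`) look like UTF-8 bytes that were decoded as Latin-1 at some point, in some cases apparently more than once; that is a plausible side effect of fetch-issue-webpages.py writing page_source through encode('utf-8') and print(). A quick demonstration of one round of that mistake (illustrative only):

    # UTF-8 bytes of '·' (U+00B7, middle dot) read back as Latin-1:
    mangled = '·'.encode('utf-8').decode('latin-1')
    print(mangled)   # 'Â·' -- the kind of residue the sed script strips
    # reversing the mistake:
    print(mangled.encode('latin-1').decode('utf-8'))   # '·'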