From 0c80c29d0fde63d9617d5769038963375e698628 Mon Sep 17 00:00:00 2001 From: B Stack Date: Tue, 9 Jun 2020 16:55:15 -0400 Subject: initial commit --- fetch-css.sh | 28 ++++++++++++++ fetch-images.sh | 33 ++++++++++++++++ fetch-issue-webpages.py | 78 ++++++++++++++++++++++++++++++++++++++ fix-css-in-html.sed | 2 + fix-images-in-html.sed | 6 +++ fix-timestamps.py | 46 ++++++++++++++++++++++ fix-without-systemd-links.sed | 5 +++ flow-part2.sh | 26 +++++++++++++ flow.md | 57 ++++++++++++++++++++++++++++ gitlablib.sh | 88 +++++++++++++++++++++++++++++++++++++++++++ remove-useless.py | 84 +++++++++++++++++++++++++++++++++++++++++ remove-useless.sed | 12 ++++++ 12 files changed, 465 insertions(+) create mode 100755 fetch-css.sh create mode 100755 fetch-images.sh create mode 100755 fetch-issue-webpages.py create mode 100644 fix-css-in-html.sed create mode 100644 fix-images-in-html.sed create mode 100755 fix-timestamps.py create mode 100644 fix-without-systemd-links.sed create mode 100755 flow-part2.sh create mode 100644 flow.md create mode 100644 gitlablib.sh create mode 100755 remove-useless.py create mode 100644 remove-useless.sed diff --git a/fetch-css.sh b/fetch-css.sh new file mode 100755 index 0000000..06718c2 --- /dev/null +++ b/fetch-css.sh @@ -0,0 +1,28 @@ +#!/bin/sh +# Startdate: 2020-05-29 20:18 + +INDIR=/mnt/public/www/issues +INGLOB=*.html + +SEDSCRIPT=/mnt/public/work/devuan/fix-css-in-html.sed + +# OUTDIR will be made in INDIR, because of the `cd` below. +OUTDIR=css +test ! 
-d "${INDIR}/${OUTDIR}" && mkdir -p "${INDIR}/${OUTDIR}" # still in the caller's directory here, so spell out the full path
+
+INSERVER=https://git.devuan.org
+
+# Everything below uses paths relative to INDIR; stop if we cannot get there
+# instead of downloading into whatever directory the caller was in.
+cd "${INDIR}" || exit 1
+
+orig_css="$( sed -n -r -e 's/^.* "${SEDSCRIPT}"
+
+# One stylesheet path per input line: download it into OUTDIR and append the
+# matching "old path -> local path" substitution to SEDSCRIPT.
+echo "${orig_css}" | while read -r line ; do
+   # an empty line would emit "s::...:g;", which sed rejects
+   test -z "${line}" && continue
+   getpath="${INSERVER}${line}"
+   targetfile="${OUTDIR}/$( basename "${line}" )"
+   test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2
+   test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}"
+   # dynamically build a sed script
+   echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}"
+done
diff --git a/fetch-images.sh b/fetch-images.sh
new file mode 100755
index 0000000..4f4884b
--- /dev/null
+++ b/fetch-images.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+# startdate 2020-05-29 20:04
+# After running this, be sure to do the sed.
+# sed -i -f fix-images-in-html.sed /mnt/public/www/issues/*.html
+# Improve:
+# It is probably an artifact of the weird way the asset svgs are embedded, but I cannot get them to display at all even though they are downloaded successfully. I have seen this before, the little embedded images you cannot easily download and simply display.
+
+INDIR=/mnt/public/www/issues
+INGLOB=*.html
+
+SEDSCRIPT=/mnt/public/work/devuan/fix-images-in-html.sed
+
+INSERVER=https://git.devuan.org
+
+# The per-image output directories are created relative to INDIR; stop if we
+# cannot get there.
+cd "${INDIR}" || exit 1
+
+# could use this line to get all the assets, but they do not display regardless due to html weirdness
+#orig_src="$( grep -oE '(\ "${SEDSCRIPT}"
+
+# One image path per input line: download it into a directory named after the
+# first path component and append the matching substitution to SEDSCRIPT.
+echo "${orig_src}" | while read -r line ; do
+   # an empty line would emit "s::...:g;", which sed rejects
+   test -z "${line}" && continue
+   #echo "${line}" | awk -F'"' '{print $2}'
+   getpath="${INSERVER}${line}"
+   outdir="$( echo "${line}" | awk -F'/' '{print $2}' )"
+   test ! 
-d "${outdir}" && mkdir -p "${outdir}" + targetfile="${outdir}/$( basename "${line}" )" + test -n "${DEBUG}" && echo "process ${getpath} and save to ${targetfile}" 1>&2 + test -z "${DRYRUN}" && wget --quiet --content-disposition -O "${targetfile}" "${getpath}" + # dynamically build a sed script + echo "s:${line}:${targetfile##/}:g;" | tee -a "${SEDSCRIPT}" +done diff --git a/fetch-issue-webpages.py b/fetch-issue-webpages.py new file mode 100755 index 0000000..86d6b71 --- /dev/null +++ b/fetch-issue-webpages.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# Startdate: 2020-05-29 16:22 +# History: +# Usage: +# ln -s issues.all.web_url output/files-to-fetch.txt +# ./fetch-issues-webpages.py +# How to make this work: +# apt-get install python3-pyvirtualdisplay +# download this geckodriver, place in /usr/local/bin +# References: +# basic guide https://web.archive.org/web/20191031110759/http://scraping.pro/use-headless-firefox-scraping-linux/ +# https://stackoverflow.com/questions/40302006/no-such-file-or-directory-geckodriver-for-a-python-simple-selenium-applicatio +# geckodriver https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz +# https://www.selenium.dev/selenium/docs/api/py/index.html?highlight=get +# page source https://www.selenium.dev/selenium/docs/api/py/webdriver_remote/selenium.webdriver.remote.webdriver.html?highlight=title#selenium.webdriver.remote.webdriver.WebDriver.title +# make sure all comments load https://stackoverflow.com/questions/26566799/wait-until-page-is-loaded-with-selenium-webdriver-for-python/44998503#44998503 +# https://crossbrowsertesting.com/blog/test-automation/automate-login-with-selenium/ +# Improve: +from pyvirtualdisplay import Display +from selenium import webdriver +from selenium.webdriver.support.ui import WebDriverWait +import re, time, getpass + +def ask_password(prompt): + #return input(prompt+": ") + return getpass.getpass(prompt+": ") + +def scrollDown(driver, value): + 
driver.execute_script("window.scrollBy(0,"+str(value)+")")
+
+# Scroll down the page
+def scrollDownAllTheWay(driver):
+    """Scroll the page until its source stops changing, then return True.
+
+    Each pass scrolls down twice by 500 pixels, sleeping 2 seconds after
+    every step, and then compares driver.page_source with the snapshot
+    taken on the previous pass. The loop ends on the first pass that
+    leaves the source unchanged.
+
+    driver: a selenium WebDriver with a page already loaded.
+    """
+    old_page = driver.page_source
+    while True:
+        #logging.debug("Scrolling loop")
+        for _ in range(2):
+            scrollDown(driver, 500)
+            time.sleep(2)
+        new_page = driver.page_source
+        if new_page == old_page:
+            return True
+        old_page = new_page
+
+def issue_filename(url, server):
+    """Return the flat output file name for an issue URL.
+
+    The leading "<server>/" is removed, every "/" and every occurrence of
+    "issues" becomes ".", runs of dots are collapsed to one, and ".html"
+    is appended, e.g.
+    <server>/devuan/devuan-project/issues/20 -> devuan.devuan-project.20.html
+    """
+    path = re.sub("^" + re.escape(server) + "/", "", url)
+    return re.sub(r"\.+", ".", re.sub("/|issues", ".", path)) + ".html"
+
+server_string="https://git.devuan.org"
+outdir="/mnt/public/www/issues"
+
+with open("output/files-to-fetch.txt") as f:
+    # skip blank lines so an empty URL is never handed to the browser
+    lines=[line.rstrip() for line in f if line.strip()]
+
+# ask password now instead of after the delay
+password = ask_password("Enter password for "+server_string)
+
+display = Display(visible=0, size=(800, 600))
+display.start()
+try:
+    browser = webdriver.Firefox()
+    try:
+        # log in to gitlab instance
+        browser.get(server_string+"/users/sign_in")
+        browser.find_element_by_id("user_login").send_keys('bgstack15')
+        browser.find_element_by_id("user_password").send_keys(password)
+        browser.find_element_by_class_name("qa-sign-in-button").click()
+        browser.get(server_string+"/profile") # always needs the authentication
+        scrollDownAllTheWay(browser)
+
+        for thisfile in lines:
+            destfile=issue_filename(thisfile, server_string)
+            print("Saving",thisfile,outdir+"/"+destfile)
+            browser.get(thisfile)
+            scrollDownAllTheWay(browser)
+            # Write the page itself as UTF-8 text. Printing the encoded bytes
+            # object stored its repr (b'...' with literal \n and \x.. escapes)
+            # rather than the HTML.
+            # NOTE(review): flow-part2.sh's 's/\\n/\n/g' pass was compensating
+            # for the old format; confirm remove-useless.sed/.py do not rely
+            # on it either.
+            with open(outdir+"/"+destfile,"w",encoding="utf-8") as text_file:
+                text_file.write(browser.page_source)
+    finally:
+        # shut Firefox down even when a page fails to load
+        browser.quit()
+finally:
+    # tear the virtual display down even when the browser could not start
+    display.stop()
diff --git a/fix-css-in-html.sed b/fix-css-in-html.sed
new file mode 100644
index 0000000..eb07dd9
--- /dev/null
+++ b/fix-css-in-html.sed
@@ -0,0 +1,2 @@
+s:/assets/application-5e11f017cc719f3e7c77f1024cdea9381ee9b21f47fe391ef5d29a7f3c94ce4a.css:css/application-5e11f017cc719f3e7c77f1024cdea9381ee9b21f47fe391ef5d29a7f3c94ce4a.css:g;
+s:/assets/print-c8ff536271f8974b8a9a5f75c0ca25d2b8c1dceb4cff3c01d1603862a0bdcbfc.css:css/print-c8ff536271f8974b8a9a5f75c0ca25d2b8c1dceb4cff3c01d1603862a0bdcbfc.css:g; diff --git a/fix-images-in-html.sed b/fix-images-in-html.sed new file mode 100644 index 0000000..aedf89f --- /dev/null +++ b/fix-images-in-html.sed @@ -0,0 +1,6 @@ +s:/uploads/-/system/appearance/header_logo/1/gdo-icon.png:uploads/gdo-icon.png:g; +s:/uploads/-/system/user/avatar/4095/avatar.png?width=23:uploads/avatar.png?width=23:g; +s:/assets/illustrations/cluster_popover-9830388038d966d8d64d43576808f9d5ba05f639a78a40bae9a5ddc7cbf72f24.svg:assets/cluster_popover-9830388038d966d8d64d43576808f9d5ba05f639a78a40bae9a5ddc7cbf72f24.svg:g; +s:/uploads/-/system/user/avatar/4095/avatar.png?width=40:uploads/avatar.png?width=40:g; +s:/uploads/-/system/user/avatar/17/cph-devuan.png?width=24:uploads/cph-devuan.png?width=24:g; +# TRUNCATED FOR GIT REPO diff --git a/fix-timestamps.py b/fix-timestamps.py new file mode 100755 index 0000000..a564257 --- /dev/null +++ b/fix-timestamps.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +# Startdate: 2020-05-29 20:40 +# Purpose: convert timestamps on gitlab issue web page into UTC +# History: +# 2020-05-30 09:24 add loop through files listed in output/files-for-timestamps.txt +# Usage: +# ls -1 /mnt/public/www/issues/output*.html > output/files-for-timestamps.txt +# ./fix-timestamps.py +# References: +# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#pretty-printing +# https://gitlab.com/bgstack15/vooblystats/-/blob/master/vooblystats.py +# https://bgstack15.wordpress.com/2020/02/16/python3-convert-relative-date-to-utc-timestamp/ +# Improve: +# this is hardcoded to work when the pages are shown in EDT. 
+from bs4 import BeautifulSoup
+from datetime import timedelta
+from parsedatetime import Calendar
+from pytz import timezone
+
+def fix_timestamps(page_text):
+    """Rewrite the text of every <time> tag as a UTC timestamp.
+
+    page_text: HTML of one saved issue page, as a string.
+
+    Returns the parsed BeautifulSoup document in which each <time>
+    element's text has been replaced by its data-original-title value,
+    converted to UTC and formatted as YYYY-MM-DDTHH:MMZ.
+
+    Titles ending in EDT or EST are read as US/Eastern wall-clock time;
+    any other title is read as UTC. A tag with no data-original-title,
+    or with a title in which no date/time is recognised, is left as is.
+    """
+    soup = BeautifulSoup(page_text,"html.parser")
+    cal = Calendar()
+    utc = timezone("UTC")
+    eastern = timezone("US/Eastern")
+    for tag in soup.find_all(name='time'):
+        title = tag.attrs.get("data-original-title")
+        if not title:
+            continue
+        # Parse in the zone the title names. Previously the zone was chosen
+        # here but US/Eastern was always the one handed to the parser.
+        source_tz = eastern if title[-3:] in ("EDT", "EST") else utc
+        dto, parsed = cal.parseDT(datetimeString=title,tzinfo=source_tz)
+        if not parsed:
+            # zero status: nothing in the title was recognised as a date or
+            # time, so do not overwrite the tag with a made-up value
+            continue
+        # dto is timezone-aware, so let datetime do the offset arithmetic.
+        # second precision %S is not needed for this use case.
+        tag.string = dto.astimezone(utc).strftime('%Y-%m-%dT%H:%MZ')
+    return soup
+
+with open("output/files-for-timestamps.txt") as f:
+    # skip blank lines so open() is never called with an empty name
+    lines = [line.rstrip() for line in f if line.strip()]
+
+for thisfile in lines:
+    print("Fixing timestamps in file",thisfile)
+    # read and write with the same explicit encoding
+    with open(thisfile,encoding='utf-8') as tf:
+        output=fix_timestamps(tf.read())
+    with open(thisfile,"w",encoding='utf-8') as tf:
+        tf.write(output.prettify())
diff --git a/fix-without-systemd-links.sed b/fix-without-systemd-links.sed
new file mode 100644
index 0000000..d2b25c6
--- /dev/null
+++ b/fix-without-systemd-links.sed
@@ -0,0 +1,5 @@
+/without-systemd\.org/{
+ /archive\.org/!{
+ s@(http://without-systemd\.org)@https://web.archive.org/web/20190208013412/\1@g;
+ }
+}
diff --git a/flow-part2.sh b/flow-part2.sh
new file mode 100755
index 0000000..db27d98
--- /dev/null
+++ b/flow-part2.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+# Filename: flow-part2.sh
+# Startdate: 2020-05-30 16:46
+# Purpose: automate the post-download components of the issue-saving process
+
+OUTDIR=/mnt/public/www/issues
+
+# every helper below is invoked by relative path; stop if the work directory is missing
+cd /mnt/public/work/devuan || exit 1
+
+sed -i -r -e 's/\\n/\n/g;' "${OUTDIR}"/*.html
+
+ls -1 "${OUTDIR}"/*.html > output/files-for-timestamps.txt
+./fix-timestamps.py
+
+./fetch-images.sh
+sed -i -f fix-images-in-html.sed "${OUTDIR}"/*.html
+
+mkdir -p /mnt/public/www/issues/css
+./fetch-css.sh
+sed -i -f fix-css-in-html.sed "${OUTDIR}"/*.html
+
+sed 
-i -f remove-useless.sed "${OUTDIR}"/*.html + +./remove-useless.py + +sed -i -r -f fix-without-systemd-links.sed "${OUTDIR}"/*.html diff --git a/flow.md b/flow.md new file mode 100644 index 0000000..5c81d5e --- /dev/null +++ b/flow.md @@ -0,0 +1,57 @@ +#### Metadata +Startdate: 2020-05-30 15:51 +References: +Everything on this page, for jq filtering. https://stedolan.github.io/jq/manual/#Basicfilters + + +# Flow + +1. Use gitlablib to list all issue web urls, and then remove all the "build", "buildmodify" and similar CI/CD issues. + + . gitlablib.sh + list_all_issues | tee output/issues.all + output/issues.all.web_url + + Manually munge the data to put the devuan/devuan-project/issues/20 on top. + +2. Use fetch-issue-webpages.py to fetch all those webpages + + ln -s issues.all.web_url output/files-to-fetch.txt + ./fetch-issue-webpages.py + +3. munge the downloaded html + All of the following is performed by `flow-part2.sh` + + * fix newlines + + sed -i -r -e 's/\\n/\n/g;' /mnt/public/www/issues/*.html + + * find data-original-titles and replace the