Improvements to obsmirror

bgstack15

2022-04-07 08:58

Previous posts on the topic of mirroring the OBS apt repos locally:

I have improved the shell script that is the main obsmirror logic. I also split out the rebuild-apt-repo logic into its own script so I could call it separately.

	`#!/bin/sh`
	`# File: /etc/installed/obsmirror.sh`
	`# Location: https://gitlab.com/bgstack15/former-gists/tree/master/obsmirror.sh`
	`# Author: bgstack15`
	`# Startdate: 2020-03-03 08:43`
	`# SPDX-License-Identifier: CC-BY-SA-4.0`
	`# Title: Script that scrapes down OBS site to serve a copy to intranet`
	`# Purpose: save down my OBS site so I can serve it locally`
	`# History:`
	`# 2020-01-05 v1: begin which used httrack`
	`# 2020-02-28 v2: complete rewrite to exclude httrack`
	`# 2020-03-03 v3: complete rewrite to get explicit files and loop through their contents, and rebuild apt repo`
	`# 2020-03-13 add on-prompt notifications`
	`# 2022-04-01 22:33`
	`# Usage:`
	`# in a cron job: /etc/cron.d/mirror.cron`
	`# 50 12 * * * root /etc/installed/obsmirror.sh 1>/dev/null 2>&1`
	`# Reference:`
	`# https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync`
	`# /mnt/public/www/internal/repo/devuan-deb/update-devuan-deb.sh`
	`# https://medium.com/sqooba/create-your-own-custom-and-authenticated-apt-repository-1e4a4cf0b864`
	`# https://unix.stackexchange.com/questions/113898/how-to-merge-two-files-based-on-the-matching-of-two-columns/113903#113903`
	`# sed\|sed to get line numbers printed https://stackoverflow.com/questions/52882594/insert-line-numbers-into-file-with-sed/52884598#52884598`
	`# Improve:`
	`# Documentation:`
	`# Download the release key and trust it.`
	`# curl -s http://repo.example.com/mirror/obs/Release.key \| apt-key add -`
	`# Use a sources.list.d/ file with contents:`
	`# deb https://repo.example.com/mirror/obs/ /`
	`# Dependencies:`
	`# binaries: wget sed awk`
	`# user: obsmirror`
	# Everything this run creates is group-writable.
	umask 0002
	# Keep a duplicate of the script's original stdin on fd 8.
	exec 8>&0

	# Optional site configuration: dot-source the file named by OBSMIRROR_CONF.
	if test -n "${OBSMIRROR_CONF}" ; then
		. "${OBSMIRROR_CONF}"
	fi

	# Fall back to built-in defaults for anything still empty.
	if test -z "${logfile}" ; then
		logfile="/tmp/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
	fi
	inurl="${inurl:-http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable}"
	workdir="${workdir:-/tmp/obs}"
	thisuser="${thisuser:-obsmirror}"
	if test -z "${tempdir}" ; then
		tempdir="$( TMPDIR="${TMPDIR:-/tmp}" mktemp -d )"
	fi
	# Other variables consulted later: include_sources resign_repo gpg_passfile gpg_keyfile DEBUG

	# fd 3 is a copy of the stdout this script started with; show and
	# reset_show write their progress text there.
	exec 3>&1

	# show TEXT...
	# Write the arguments, joined into a single string, to fd 3 without a
	# trailing newline.
	show() {
		printf '%s' "$*" >&3
	}

	# reset_show [TEXT...]
	# Write a carriage return followed by the joined arguments to fd 3, with no
	# trailing newline, so the next output starts at the beginning of the line.
	reset_show() {
		printf '\r%s' "$*" >&3
	}

	# Print the md5 hex digest of file $1, or nothing when it cannot be read.
	___md5_of() {
		md5sum "${1}" 2>/dev/null | awk '{print $1}'
	}

	# get_file URL [OLD_MD5 [NEW_MD5]]
	# Mirror one upstream file into ${workdir}, skipping the download when the
	# local copy already has the expected checksum.
	# Arguments:
	#   $1  full URL of the file; the ${inurl} prefix is stripped to obtain the
	#       path below ${workdir}
	#   $2  md5sum from the previous run's Packages file (kept in ___sum1 but
	#       deliberately not consulted, see the note in the body)
	#   $3  md5sum from the current Packages file; when empty the file is
	#       always fetched
	# Globals read:    inurl workdir DRYRUN VERBOSE
	# Globals written: tn tf td gotten, plus ___-prefixed scratch variables
	#                  (this is a /bin/sh script, so there is no "local")
	# Outputs: with VERBOSE set, a progress note on fd 3 via show/reset_show and
	#          one "<status> <url> -> <file>" line on stdout.
	# Result:  ${gotten} is "skipped ", "DOWNLOADED" or "x FAILED ".
	get_file() {
		# call: get_file "${tu}" "${md5sum_old}" "${md5sum_new}"
		___tu="${1}"
		___sum1="${2}" # nominally from the locally-generated Packages from previous run
		___sum2="${3}" # from current set
		# Quote the prefix so it is stripped literally, never as a glob pattern.
		tn=${___tu##"${inurl}"}
		tf="$( readlink -m "${workdir}/${tn}" )"
		td="$( dirname "${tf}" )"
		test -d "${td}" || mkdir -p "${td}"
		gotten="skipped "
		if test -z "${DRYRUN}" ; then
			# The sum1 check is disabled. If upstream obs rebuilds a package under
			# the exact same name+version, accepting the old checksum would mean
			# the new build is never downloaded.
			___md5sum_file="$( ___md5_of "${tf}" )"
			if test -z "${___sum2}" || test "${___md5sum_file}" != "${___sum2}" ; then
				# Either there is no checksum to trust, or the local file does not match it.
				# Use the Link headers because the default mirror presents a lot of 404s.
				# If no Link header exists, the list is just the original URL.
				___links="$( printf '%s\n%s\n' "${___tu}" "$( curl --head "${___tu}" --silent | awk '/Link:/ && !/type=/{print $2}' | tr -d '<>;' )" | sed -e '/^\s*$/d' )"
				test -n "${VERBOSE}" && show "retrieving ${___tu}" 2>/dev/null || :
				___valid=0
				___x=0
				# The candidate list does not change inside the loop; count it once.
				___nlinks=$(( $( printf '%s\n' "${___links}" | wc -l ) ))
				while test "${___valid}" -eq 0 && test "${___x}" -lt "${___nlinks}" ; do
					___x=$(( ___x + 1 ))
					___tl="$( printf '%s\n' "${___links}" | sed -n "${___x}p" 2>/dev/null )"
					test -n "${___tl}" && wget --content-disposition --no-verbose --quiet --output-document "${tf}" "${___tl}" && ___valid=1
					# A saved error page or an empty file is not a successful download.
					grep -qiE '404 Not Found' "${tf}" 2>/dev/null && ___valid=0
					test -s "${tf}" || ___valid=0
					# A file matching the expected checksum is good regardless of the
					# checks above. Only compare when a checksum was supplied, so that
					# an unreadable file and an empty checksum never "match".
					___md5sum_file="$( ___md5_of "${tf}" )"
					if test -n "${___sum2}" && test "${___md5sum_file}" = "${___sum2}" ; then
						___valid=1
					fi
				done
				if test "${___valid}" -eq 1 ; then
					gotten="DOWNLOADED"
				else
					gotten='x FAILED '
				fi
			fi
		fi
		test -n "${VERBOSE}" && reset_show 2>/dev/null || :
		test -n "${VERBOSE}" && echo "${gotten} ${___tu} -> ${tf}"
		#echo "PAUSED: " ; read -u8 foo
	}

	# wget_verbose is cleared in verbose mode.
	# NOTE(review): nothing in the visible script reads wget_verbose; get_file passes --quiet itself.
	wget_verbose=--quiet
	test -n "${VERBOSE}" && unset wget_verbose

	# tee cannot create missing parent directories, so make sure the log directory exists.
	mkdir -p "$( dirname "${logfile}" )"

	# Main run. Everything inside the braces has stdout and stderr copied to ${logfile}.
	{
		test "${DEBUG:-NONE}" = "FULL" && set -x
		echo "logfile=${logfile}"

		# These files define an apt repo.
		# First archive the existing Packages file, which might have been generated
		# locally by rebuild-apt-repo.sh on the previous run. On a first run there is
		# none, so start from an empty list instead of failing.
		if test -f "${workdir}/Packages" ; then
			/bin/cp -pf "${workdir}/Packages" "${tempdir}/Packages.$$"
		else
			: > "${tempdir}/Packages.$$"
		fi
		for word in InRelease Packages Packages.gz Release Release.gpg Release.key Sources Sources.gz ;
		do
			get_file "${inurl}/${word}"
		done

		# Loop through named packages and download them.
		# Extract the Filename and MD5sum fields from both the old and the new
		# Packages files, two per line. sed|sed prefixes each line with its line
		# number so the original order can be restored after the join. Sort on the
		# filename field only, which is the field join matches on.
		awk '/Filename:|MD5/{print $2}' "${tempdir}/Packages.$$" | xargs -n2 | sed '=' | sed 'N; s/\n/ /' | sort -k2,2 > "${tempdir}/old_list"
		awk '/Filename:|MD5/{print $2}' "${workdir}/Packages" | xargs -n2 | sed '=' | sed 'N; s/\n/ /' | sort -k2,2 > "${tempdir}/new_list"
		# Output columns: lineno filename old_sum new_sum. -a 2 keeps files that are
		# only in the new list; -e '-' fills their missing old_sum so the columns do
		# not shift when read splits on whitespace. Then sort numerically by the
		# original line number and drop that column.
		join -j 2 -a 2 -e '-' -o 2.1,2.2,1.3,2.3 "${tempdir}/old_list" "${tempdir}/new_list" | sort -n -k1,1 | awk '{$1="";print}' | while read -r word sum_old sum_new
		do
			# Turn the join placeholder back into "no checksum".
			test "${sum_old}" = "-" && sum_old=""
			test "${sum_new}" = "-" && sum_new=""
			# Filename entries start with "." which stands for the repo root.
			get_file "$( echo "${word}" | sed -r -e "s@^\.@${inurl}@;" )" "${sum_old}" "${sum_new}"
		done

		# Loop through dsc, orig.tar.gz, and debian.tar.xz files.
		if test -n "${include_sources}" ; then
			for word in $( sed -n -r -e '/Files:/,/^\s*$/{/^ /p;}' "${workdir}/Sources" | awk '{print $NF}' ) ;
			do
				get_file "${inurl}/${word}"
			done
		fi

		# Optionally regenerate and re-sign the repo metadata.
		test -n "${resign_repo}" && . /etc/installed/rebuild-apt-repo.sh

		# Hand the mirror to ${thisuser} and the first group that id -G lists for it.
		chown -R "${thisuser}:$( id -G "${thisuser}" | awk '{print $1}' )" "${workdir}"
	} 2>&1 | tee -a "${logfile}"

	# Remove the scratch directory unless NO_CLEAN is set.
	test -z "${NO_CLEAN}" && rm -rf "${tempdir:-NOTHINGTODEL}"

Observe the ___sum1 lines that are commented out. I wrote a lot of logic to support reading the old (previous run's locally-generated) Packages file and accepting the .deb checksums from it. I then realized that, because I sometimes rebuild the exact same version of a package (yes, I'm a terrible person and I know it), accepting the old Packages file's checksum for a file means the script would never download the newer build of the exact same version-release number. So I disabled accepting checksums from the old Packages file, which entirely defeats the purpose of my byzantine logic that joins the two Packages files' entries together and compares each file against both checksums.

Here is the rebuild-apt-repo.sh script.

	`#!/bin/sh`
	`# Startdate: 2022-04-02 19:51`
	`# Purpose: To rebuild apt repo, primarily for obsmirror operations`
	`# Usage: Can be called by itself, with appropriate env vars, or from obsmirror.sh`
	`# ( . /etc/installed/obsmirror-cdemu.conf ; . /etc/installed/rebuild-apt-repo.sh ; )`
	`# Dependencies:`
	`# environment vars: workdir, gpg_passfile, gpg_keyfile`
	`# On CentOS7, gnupg2 package that supports --pinentry-mode loopback, 2.2.18-2.el7 from @copr:bgstack15:el7-gnupg2-debmirror`
	`# /usr/bin/dpkg-scanpackages`
	`# References:`
	`# obsmirror.sh`
	`# https://medium.com/sqooba/create-your-own-custom-and-authenticated-apt-repository-1e4a4cf0b864`

	# Rebuild the release files: regenerate Packages, Packages.gz and Release in
	# ${workdir}, then re-sign them when a gpg passphrase file is available.
	echo "Rebuild apt repo in ${workdir}"

	repodir="${workdir}"
	# Refuse to continue if we cannot enter the repo directory; otherwise the
	# metadata would be written into whatever directory we happen to be in.
	# "return" covers the dot-sourced case, "exit" covers direct execution.
	if test -z "${repodir}" || ! cd "${repodir}" ; then
		echo "rebuild-apt-repo: cannot enter repo directory '${repodir}'" 1>&2
		return 1 2>/dev/null || exit 1
	fi

	dpkg-scanpackages -m . > Packages
	gzip -9c < Packages > Packages.gz

	# Create the Release file.
	# wc -c prints "<size> <name>", which is the tail of each checksum line.
	PKGS="$( wc -c Packages )"
	PKGS_GZ="$( wc -c Packages.gz )"
	# Carry selected headers over from the existing Release file. Capture them
	# before Release is overwritten below.
	old_headers1="$( grep -E '^(Archive|Codename|Origin|Label|Architectures):' Release )"
	old_headers2="$( grep -E '^(Description):' Release )"
	{
		# Only print preserved headers that exist: a blank line would end the
		# paragraph and cut the checksum fields off from the headers.
		test -n "${old_headers1}" && printf '%s\n' "${old_headers1}"
		printf 'Date: %s\n' "$( date -u '+%a, %d %b %Y %T %Z' )"
		test -n "${old_headers2}" && printf '%s\n' "${old_headers2}"
		# The entries under each checksum field are continuation lines, so every
		# one of them must start with a space.
		printf 'MD5Sum:\n'
		printf ' %s %s\n' "$( md5sum Packages | cut -d" " -f1 )" "${PKGS}"
		printf ' %s %s\n' "$( md5sum Packages.gz | cut -d" " -f1 )" "${PKGS_GZ}"
		printf 'SHA1:\n'
		printf ' %s %s\n' "$( sha1sum Packages | cut -d" " -f1 )" "${PKGS}"
		printf ' %s %s\n' "$( sha1sum Packages.gz | cut -d" " -f1 )" "${PKGS_GZ}"
		printf 'SHA256:\n'
		printf ' %s %s\n' "$( sha256sum Packages | cut -d" " -f1 )" "${PKGS}"
		printf ' %s %s\n' "$( sha256sum Packages.gz | cut -d" " -f1 )" "${PKGS_GZ}"
	} > Release

	# Sign only when the passphrase file exists: a detached armored signature in
	# Release.gpg and a clearsigned copy in InRelease.
	if test -e "${gpg_passfile}" ; then
		gpg --batch --yes --passphrase-file "${gpg_passfile}" --pinentry-mode loopback -abs -o Release.gpg Release
		gpg --batch --yes --passphrase-file "${gpg_passfile}" --pinentry-mode loopback --clearsign -o InRelease Release
	fi
	# and because we are resigning it, replace Release.key with the one we used
	test -e "${gpg_keyfile}" && /bin/cp -pf "${gpg_keyfile}" Release.key

To run this by itself, I recommend you run it in a subshell, because you need to dot-source the conf file first and the subshell keeps those variables out of your interactive session.

( . /etc/installed/obsmirror-cdemu.conf ; . /etc/installed/rebuild-apt-repo.sh ; )

And a reminder, the conf file looks like this:

# vim: syntax=sh
# Sample configuration, dot-sourced by obsmirror.sh when OBSMIRROR_CONF points
# at it, and by hand before running rebuild-apt-repo.sh on its own.
# Log file that obsmirror.sh appends its combined stdout/stderr to (via tee -a).
logfile="/var/server1/shares/public/Support/Systems/server1/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
# Base URL of the upstream repo; this prefix is stripped from each file URL to
# get the path below workdir.
inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
# Local directory the files are downloaded into; rebuild-apt-repo.sh also
# regenerates Packages and Release here.
workdir=/var/server1/shares/public/www/mirror/obs
# Non-empty: also download the files listed in the Sources file. Empty: skip them.
include_sources=
# Non-empty: obsmirror.sh dot-sources rebuild-apt-repo.sh after downloading.
resign_repo=yes
# Passed to gpg --passphrase-file; signing is skipped when this file does not exist.
gpg_passfile=/root/.gnupg/linuxadmin
# When this file exists it is copied over Release.key after re-signing.
gpg_keyfile=/var/server1/shares/public/www/internal/repo/deb/internaldeb.gpg
# workdir is chowned recursively to this user and the first group id -G lists for it.
thisuser=obsmirror
# Non-empty: print a progress note and one status line per file.
VERBOSE=1

Knowledge Base

Comments