Knowledge Base

Preserving for the future: Shell scripts, AoC, and more

Improvements to obsmirror

Previous posts on the topic of mirroring the OBS apt repos locally:

  1. Mirror an OBS deb repository locally
  2. Mirror an OBS repository locally -- update 1

I have improved the shell script that is the main obsmirror logic. I also split out the rebuild-apt-repo logic into its own script so I could call it separately.

#!/bin/sh
# File: /etc/installed/obsmirror.sh
# Location: https://gitlab.com/bgstack15/former-gists/tree/master/obsmirror.sh
# Author: bgstack15
# Startdate: 2020-03-03 08:43
# SPDX-License-Identifier: CC-BY-SA-4.0
# Title: Script that scrapes down OBS site to serve a copy to intranet
# Purpose: save down my OBS site so I can serve it locally
# History:
#    2020-01-05 v1: begin which used httrack
#    2020-02-28 v2: complete rewrite to exclude httrack
#    2020-03-03 v3: complete rewrite to get explicit files and loop through their contents, and rebuild apt repo
#    2020-03-13 add on-prompt notifications
#    2022-04-01 22:33
# Usage:
#    in a cron job: /etc/cron.d/mirror.cron
#       50  12  *   *   *   root    /etc/installed/obsmirror.sh 1>/dev/null 2>&1
# Reference:
#    https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync
#    /mnt/public/www/internal/repo/devuan-deb/update-devuan-deb.sh
#    https://medium.com/sqooba/create-your-own-custom-and-authenticated-apt-repository-1e4a4cf0b864
#    https://unix.stackexchange.com/questions/113898/how-to-merge-two-files-based-on-the-matching-of-two-columns/113903#113903
#    sed|sed to get line numbers printed https://stackoverflow.com/questions/52882594/insert-line-numbers-into-file-with-sed/52884598#52884598
# Improve:
# Documentation:
#    Download the release key and trust it.
#       curl -s http://repo.example.com/mirror/obs/Release.key | apt-key add -
#    Use a sources.list.d/ file with contents:
#       deb https://repo.example.com/mirror/obs/ /
# Dependencies:
#    binaries: wget sed awk
#    user: obsmirror
# Files created by this script are group-writable.
umask 0002
# Keep a copy of the original stdin on fd 8.
exec 8>&0
# Optional site configuration: dot-source it when OBSMIRROR_CONF names a file.
if test -n "${OBSMIRROR_CONF}" ;
then
   . "${OBSMIRROR_CONF}"
fi
# Anything the conf file (or the environment) left empty gets a default here.
if test -z "${logfile}" ;
then
   logfile="/tmp/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
fi
: "${inurl:=http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable}"
: "${workdir:=/tmp/obs}"
: "${thisuser:=obsmirror}"
if test -z "${tempdir}" ;
then
   tempdir="$( TMPDIR="${TMPDIR:-/tmp}" mktemp -d )"
fi
# also use include_sources resign_repo gpg_passfile gpg_keyfile DEBUG
# Keep a copy of the original stdout on fd 3; show() and reset_show() write to it.
exec 3>&1
show() {
    # Write all arguments, joined into one string, to fd 3 with no trailing newline.
    printf '%s' "$*" >&3
}
reset_show() {
    # Emit a carriage return followed by the joined arguments on fd 3,
    # with no trailing newline.
    printf '\r%s' "$*" >&3
}
get_file() {
   # Fetch one upstream file into the matching path under ${workdir}, unless
   # the local copy already has the expected md5sum.
   # call: get_file "${tu}" "${sum_old}" "${sum_new}"
   #   $1  full upstream URL; the ${inurl} prefix is stripped to get the local path
   #   $2  md5sum from the previous run's Packages file (accepted but ignored, see below)
   #   $3  md5sum from the current Packages file; empty means "always download"
   # Reads globals: inurl workdir DRYRUN VERBOSE
   # Sets globals:  tn tf td gotten ("skipped   ", "DOWNLOADED" or "x FAILED   ")
   # Uses show() and reset_show() for progress output when VERBOSE is set.
   ___tu="${1}"
   ___sum1="${2}" # nominally from the locally-generated Packages from previous run
   ___sum2="${3}" # from current set
   # quote the prefix so it is removed literally and not treated as a glob pattern
   tn="${___tu##"${inurl}"}"
   tf="${workdir}/${tn}" ; tf="$( readlink -m "${tf}" )"
   td="$( dirname "${tf}" )"
   test -d "${td}" || mkdir -p "${td}"
   gotten="skipped   "
   if test -z "${DRYRUN}" ;
   then
      # determine if file is good enough
      ___matches_either=0
      ___md5sum_file="$( md5sum "${tf}" 2>/dev/null | awk '{print $1}' )"
      # sum1 check is disabled. If upstream obs rebuilds a package, we would never download the new package of the exact same name+version! So all my fancy sum1 logic is useless.
      #test "${___md5sum_file}" = "${___sum1}" && ___matches_either=1
      # only a real (non-empty) expected checksum can vouch for the local copy
      test -n "${___sum2}" && test "${___md5sum_file}" = "${___sum2}" && ___matches_either=1
      #test -n "${VERBOSE}" && printf "%s matches:%s\n" "${tn}" "${___matches_either}"
      if test -z "${___sum2}" || test "${___matches_either}" = "0" ;
      then
         # so the checksum is empty, or the given checksum does not match the existing downloaded file
         # use the Link headers because provo-mirror sucks and presents a lot of 404s.
         # if Links header does not exist then this list will be short.
         ___links="$( printf '%s\n%s\n' "${___tu}" "$( curl --head "${___tu}" --silent | awk '/Link:/ && !/type=/{print $2}' | tr -d '<>;' )" | sed -e '/^\s*$/d' )"
         if test -n "${VERBOSE}" ;
         then
            show "retrieving ${___tu}" 2>/dev/null || :
         fi
         ___valid=0
         ___x=0
         # the candidate list does not change inside the loop, so count it once
         ___count="$( printf '%s\n' "${___links}" | awk 'END{print NR}' )"
         while test "${___valid}" -eq 0 && test "${___x}" -lt "${___count}" ;
         do
            ___x=$((___x+1))
            ___tl="$( printf '%s\n' "${___links}" | sed -n "${___x}p" 2>/dev/null )"
            test -n "${___tl}" && wget --content-disposition --no-verbose --quiet --output-document "${tf}" "${___tl}" && ___valid=1
            # a mirror may answer with an error page; treat that, or an empty file, as a miss
            grep -qiE '404 Not Found' "${tf}" 2>/dev/null && ___valid=0
            test -s "${tf}" || ___valid=0
            # A matching checksum overrides the heuristics above. Skip the
            # comparison when no checksum was supplied: otherwise an empty
            # result would "match" the empty expectation and bless a bad file.
            if test -n "${___sum2}" ;
            then
               ___md5sum_file="$( md5sum "${tf}" 2>/dev/null | awk '{print $1}' )"
               #test "${___md5sum_file}" = "${___sum1}" && ___valid=1
               test "${___md5sum_file}" = "${___sum2}" && ___valid=1
            fi
         done
         if test "${___valid:-0}" -eq 1 ;
         then
            gotten="DOWNLOADED"
         else
            gotten='x FAILED   '
         fi
      fi
   fi
   if test -n "${VERBOSE}" ;
   then
      reset_show 2>/dev/null || :
      echo "${gotten} ${___tu} -> ${tf}"
   fi
   #echo "PAUSED: " ; read -u8 foo
}
# wget_verbose is kept for compatibility; the visible get_file() passes --quiet itself.
wget_verbose=--quiet
test -n "${VERBOSE}" && unset wget_verbose
# tee cannot create missing parent directories, so make sure the log directory exists
mkdir -p "$( dirname "${logfile}" )" 2>/dev/null || :
{
   test "${DEBUG:-NONE}" = "FULL" && set -x
   echo "logfile=${logfile}"
   # These files define an apt repo
   # archive the Packages file, which might be generated locally by rebuild-apt-repo.sh the previous time, and might have more useful md5sums of the packages
   if test -f "${workdir}/Packages" ;
   then
      /bin/cp -pf "${workdir}/Packages" "${tempdir}/Packages.$$"
   else
      # first run: there is no previous Packages file, so use an empty one
      : > "${tempdir}/Packages.$$"
   fi
   for word in InRelease Packages Packages.gz Release Release.gpg Release.key Sources Sources.gz ;
   do
      get_file "${inurl}/${word}"
   done
   # loop through named packages and download them
   # extract these 2 fields, from both old and new Packages files. Combine them into one line. Sed|sed adds the line numbers so we can put the lines back in original order just because, and then sort.
   # The sort key is limited to field 2 because that is the only field join compares.
   awk '/Filename:|MD5/{print $2}' "${tempdir}/Packages.$$" | xargs -n2 | sed '=' | sed 'N; s/\n/ /' | sort -k2,2 > "${tempdir}/old_list"
   awk '/Filename:|MD5/{print $2}' "${workdir}/Packages"    | xargs -n2 | sed '=' | sed 'N; s/\n/ /' | sort -k2,2 > "${tempdir}/new_list"
   #awk '/Filename:|MD5/{print $2}' "${workdir}/Packages" | xargs -n2 | while read word sum
   # The sort above was so join will work. Print these columns, sort by original line number, and then remove the lineno column with awk. Then split into these var names and process.
   # -e - fills the old-checksum column for packages missing from the old list; without it
   # the empty field collapses and the new checksum would be read into sum_old.
   # sort -n restores the numeric (not lexical) line order.
   join -j 2 -a 2 -e - -o 2.1,2.2,1.3,2.3 "${tempdir}/old_list" "${tempdir}/new_list" | sort -n -k1,1 | awk '{$1="";print}' | while read -r word sum_old sum_new
   do
      get_file "$( echo "${word}" | sed -r -e "s@^\.@${inurl}@;" )" "${sum_old}" "${sum_new}"
      #echo "a=${a}   b=${b}"
   done
   # loop through dsc, orig.tar.gz, and debian.tar.xz files
   test -n "${include_sources}" && {
      for word in $( sed -n -r -e '/Files:/,/^\s*$/{/^ /p;}' "${workdir}/Sources" | awk '{print $NF}' ) ;
      do
         get_file "${inurl}/${word}"
      done
   }
   test -n "${resign_repo}" && . /etc/installed/rebuild-apt-repo.sh
   # hand the tree to the mirror user, using the first group id reported by id -G
   chown -R "${thisuser}:$( id -G "${thisuser}" | awk '{print $1}' )" "${workdir}"
} 2>&1 | tee -a "${logfile}"
# remove the scratch directory unless NO_CLEAN is set
if test -z "${NO_CLEAN}" ;
then
   rm -rf "${tempdir:-NOTHINGTODEL}"
fi

Observe the ___sum1 lines that are commented out. I wrote a lot of logic to support reading the old (previous run's locally-generated) Packages file and accepting the .deb checksums from it. I then realized that, because I sometimes rebuild the exact same version of a package (yes, I'm a terrible person and I know it), accepting the old Packages file's checksum for a file means the script would never download a newer build with the exact same version-release number. So I disabled accepting checksums from the old Packages file, which entirely defeats the purpose of my byzantine logic that joins the two Packages files' entries together and compares a given file against both checksums.

Here is the rebuild-apt-repo.sh script.

#!/bin/sh
# Startdate: 2022-04-02 19:51
# Purpose: To rebuild apt repo, primarily for obsmirror operations
# Usage: Can be called by itself, with appropriate env vars, or from obsmirror.sh
#    ( . /etc/installed/obsmirror-cdemu.conf ; . /etc/installed/rebuild-apt-repo.sh ; )
# Dependencies:
#    environment vars: workdir, gpg_passfile, gpg_keyfile
#    On CentOS7, gnupg2 package that supports --pinentry-mode loopback, 2.2.18-2.el7 from @copr:bgstack15:el7-gnupg2-debmirror
#    /usr/bin/dpkg-scanpackages
# References:
#    obsmirror.sh
#    https://medium.com/sqooba/create-your-own-custom-and-authenticated-apt-repository-1e4a4cf0b864
# rebuild release files
# Rebuild Packages, Packages.gz and Release in ${workdir}, then re-sign the
# Release file when ${gpg_passfile} exists.
echo "Rebuild apt repo in ${workdir}"
repodir="${workdir}"
# Stop if the repo directory is unset or unreachable; otherwise the files below
# would be written into whatever directory we happen to be in.
# "return" covers the dot-sourced case, "exit" the stand-alone case.
test -n "${repodir}" && cd "${repodir}" || {
   echo "Cannot change to repo directory \"${repodir}\". Aborted." 1>&2
   return 1 2>/dev/null || exit 1
}
dpkg-scanpackages -m . > Packages
gzip -9c < Packages > Packages.gz
# create the Release file
# "<size> <name>" pairs; read via stdin so wc prints only the number, and strip any padding
PKGS="$( wc -c < Packages | tr -d ' ' ) Packages"
PKGS_GZ="$( wc -c < Packages.gz | tr -d ' ' ) Packages.gz"
# carry these headers over from the existing Release file before it is overwritten
old_headers1="$( grep -E '^(Archive|Codename|Origin|Label|Architectures):' Release )"
old_headers2="$( grep -E '^(Description):' Release )"
{
   # skip empty header groups: a blank line would end the control paragraph
   test -n "${old_headers1}" && printf '%s\n' "${old_headers1}"
   # LC_ALL=C keeps the day and month names in English
   printf 'Date: %s\n' "$( LC_ALL=C date -u '+%a, %d %b %Y %T %Z' )"
   test -n "${old_headers2}" && printf '%s\n' "${old_headers2}"
   # each checksum entry is a continuation line and so must start with a space
   printf 'MD5Sum:\n'
   printf ' %s %s\n' "$( md5sum Packages | cut -d" " -f1 )" "${PKGS}"
   printf ' %s %s\n' "$( md5sum Packages.gz | cut -d" " -f1 )" "${PKGS_GZ}"
   printf 'SHA1:\n'
   printf ' %s %s\n' "$( sha1sum Packages | cut -d" " -f1 )" "${PKGS}"
   printf ' %s %s\n' "$( sha1sum Packages.gz | cut -d" " -f1 )" "${PKGS_GZ}"
   printf 'SHA256:\n'
   printf ' %s %s\n' "$( sha256sum Packages | cut -d" " -f1 )" "${PKGS}"
   printf ' %s %s\n' "$( sha256sum Packages.gz | cut -d" " -f1 )" "${PKGS_GZ}"
} > Release
# sign only when a passphrase file is available: detached signature and inline-signed copy
if test -e "${gpg_passfile}" ;
then
   gpg --batch --yes --passphrase-file "${gpg_passfile}" --pinentry-mode loopback -abs -o Release.gpg Release
   gpg --batch --yes --passphrase-file "${gpg_passfile}" --pinentry-mode loopback --clearsign -o InRelease Release
fi
# and because we are resigning it, replace Release.key with the one we used
if test -e "${gpg_keyfile}" ;
then
   /bin/cp -pf "${gpg_keyfile}" Release.key
fi

To run this by itself, I recommend you run it in a subshell, because you need to dot-source the conf file and the subshell keeps those variables out of your interactive shell.

( . /etc/installed/obsmirror-cdemu.conf ; . /etc/installed/rebuild-apt-repo.sh ; )

And a reminder, the conf file looks like this:

# vim: syntax=sh
# Where to mirror from, and where the local copy lives
inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
workdir="/var/server1/shares/public/www/mirror/obs"
thisuser="obsmirror"

# Logging: one timestamped log file per run; VERBOSE turns on progress messages
logfile="/var/server1/shares/public/Support/Systems/server1/var/log/obsmirror/obsmirror.$( date '+%FT%H%M%S' ).log"
VERBOSE="1"

# Leave include_sources empty to skip the files listed in Sources
include_sources=""

# Rebuild and re-sign the repo after mirroring
resign_repo="yes"
gpg_passfile="/root/.gnupg/linuxadmin"
gpg_keyfile="/var/server1/shares/public/www/internal/repo/deb/internaldeb.gpg"

Comments