3 files changed, 90 insertions, 132 deletions
diff --git a/obsmirror.sh/obsmirror.conf.example b/obsmirror.sh/obsmirror.conf.example
new file mode 100644
index 0000000..9c47ca9
--- /dev/null
+++ b/obsmirror.sh/obsmirror.conf.example
@@ -0,0 +1,9 @@
+# vim: syntax=sh
+logfile="/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log" # the run's output is appended here via tee
+inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable" # repo URL to mirror
+workdir=/var/tmp/obs # local directory that receives the mirrored files
+include_sources= # any non-empty value also fetches the files listed in Sources
+resign_repo=yes # any non-empty value (even "no") rebuilds and re-signs the repo metadata; leave empty to disable
+gpg_passfile=/root/.gnupg/filename # passphrase file handed to gpg; signing is skipped when this file does not exist
+gpg_keyfile=/var/www/deb/public.gpg # copied over Release.key after re-signing, when this file exists
+thisuser=obsmirror # workdir is chowned to this user at the end of the run
diff --git a/obsmirror.sh/obsmirror.sh b/obsmirror.sh/obsmirror.sh
index 8cafb2a..2b6522c 100755
--- a/obsmirror.sh/obsmirror.sh
+++ b/obsmirror.sh/obsmirror.sh
@@ -1,17 +1,22 @@
 #!/bin/sh
 # File: /etc/installed/obsmirror.sh
-# License: CC-BY-SA 4.0
+# Location: https://gitlab.com/bgstack15/former-gists/tree/master/obsmirror.sh
 # Author: bgstack15
-# Startdate: 2020-01-05 18:01
+# Startdate: 2020-03-03 08:43
+# SPDX-License-Identifier: CC-BY-SA-4.0
 # Title: Script that scrapes down OBS site to serve a copy to intranet
 # Purpose: save down my OBS site so I can serve it locally
 # History:
+#    2020-01-05 v1: begin which used httrack
+#    2020-02-28 v2: complete rewrite to exclude httrack
+#    2020-03-03 v3: complete rewrite to get explicit files and loop through their contents, and rebuild apt repo
 # Usage:
 #    in a cron job: /etc/cron.d/mirror.cron
 #       50	12	*	*	*	root	/etc/installed/obsmirror.sh 1>/dev/null 2>&1
 # Reference:
-#    https://unix.stackexchange.com/questions/114044/how-to-make-wget-download-recursive-combining-accept-with-exclude-directorie?rq=1
 #    https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync
+#    /mnt/public/www/smith122/repo/devuan-deb/update-devuan-deb.sh
+#    https://medium.com/sqooba/create-your-own-custom-and-authenticated-apt-repository-1e4a4cf0b864
 # Improve:
 # Documentation:
 #    Download the release key and trust it.
@@ -19,76 +24,91 @@
 #    Use a sources.list.d/ file with contents:
 #       deb https://repo.example.com/mirror/obs/ /
 # Dependencies:
-#    binaries: curl wget grep sed awk chmod chown rm
+#    binaries: wget sed awk md5sum readlink xargs gzip dpkg-scanpackages gpg sha1sum sha256sum
 #    user: obsmirror
-
-parse_obs_dl_page() {
-   # simply wget the ${inurl} and play around with this master string. Goal is to remove all links that are not dpkg, gzip, repo files, or subdirs.
-   grep -oE 'href="[^"]+">' | awk '!x[$0]++' | sed -r -e 's/^href="//;' -e 's/">$//;' | grep -viE 'https?:\/\/[A-Za-z0-9\.]+\.[A-Za-z]+|mirrorlist|orig.*z$|^\/(debug|distribution|factory|ports|repositories|source|tumbleweed|update)\/$|^\?[A-Z]=[A-Z]|^\/|\.dsc$'
-}
-
-parse_obs_page_and_subdirs() {
-   # call: curl -s -L "${inurl}" | parse_obs_page_and_subdirs "${inurl}" "${tmpfile}"
-   # return to stdout: all the wanted files from this page and its associated subdirs
-   ___input="$( parse_obs_dl_page )"
-   ___inurl="${1}"
-   ___tmpfile="${2}"
-   {
-      echo "${___input}" | grep -vE '\/$' | sed -r -e "s@^@${___inurl}\/@"
-      # iterate over all listed subdirs parse out their files
-      for subdir in $( echo "${___input}" | grep -E "\/$" ) ;
-      do
-         #echo "${___inurl}/${subdir}"
-         curl -s -L "${___inurl}/${subdir}" | parse_obs_dl_page | sed -r -e "s@^@${___inurl}/${subdir}@"
-      done
-   } > "${___tmpfile}"
-}
+umask 0002
 
 test -n "${OBSMIRROR_CONF}" && . "${OBSMIRROR_CONF}"
 test -z "${logfile}" && logfile="/tmp/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
-test -z "${tmpfile}" && tmpfile="$( mktemp )"
 test -z "${inurl}" && inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
 test -z "${workdir}" && workdir=/tmp/obs
-test -z "${outdir}" && outdir=/tmp/var/www/mirror/obs
 test -z "${thisuser}" && thisuser=obsmirror
+# also use include_sources resign_repo gpg_passfile gpg_keyfile DEBUG VERBOSE DRYRUN
+
+get_file() {
+   # call: get_file "${tu}" "${md5sum}"
+   # Downloads url ${1} to the matching path under ${workdir}; if md5sum ${2} is given and the local copy already matches it, the download is skipped.
+   ___tu="${1}"
+   ___sum="${2}"
+   tn="${___tu##"${inurl}"}" # strip the repo URL prefix; quoted so it is matched literally, not as a glob
+   tf="${workdir}/${tn}" ; tf="$( readlink -m "${tf}" )"
+   td="$( dirname "${tf}" )"
+   test -d "${td}" || mkdir -p "${td}"
+   gotten="skipped   "
+   if test -z "${DRYRUN}" ; then
+      if test -z "${___sum}" || test "$( md5sum "${tf}" 2>/dev/null | awk '{print $1}' )" != "${___sum}" ; then
+         # wget_verbose is --quiet unless VERBOSE is set; left unquoted on purpose so an unset value adds no argument
+         wget --content-disposition --no-verbose ${wget_verbose} -O "${tf}" "${___tu}" && gotten=DOWNLOADED
+      fi
+   fi
+   test -n "${VERBOSE}" && echo "${gotten} ${___tu} -> ${tf}"
+}
 
+wget_verbose=--quiet
+test -n "${VERBOSE}" && unset wget_verbose
 {
    test "${DEBUG:-NONE}" = "FULL" && set -x
-      echo "logfile=${logfile}"
-
-   mkdir -p "${workdir}" ; chmod "0755" "${workdir}" ; chown "${thisuser}:$( id -G "${thisuser}" | awk '{print $1}' )" "${workdir}"
-   cd "${workdir}"
-   test "${use_top_result}" = "yes" && {
-      # get mirrorlist of Packages.gz file and find the one that lists the most packages
-      step1="$( curl -s -L "${inurl}/Packages.gz.mirrorlist" )"
-      options="$( echo "${step1}" | grep -oE 'href="[^"]+">' | awk '!x[$0]++' | sed -r -e 's/^href="//;' -e 's/">$//;' | grep -iE '^(ht|f)tps?:\/\/.*Packages\.gz$' )"
-      echo "${options}" 1>&2
-      results="$(
-      for entry in ${options} ; do
-         # use package count
-         #curl -s -L "${entry}" | zgrep -cE '^Package:' | sed -r -e "s@\$@ ${entry}@;"
-         # use last modified timestamp of Packages.gz file
-         wget --content-disposition --quiet "${entry}" -O tmpfile.$$ ; find tmpfile.$$ -printf "%T@ ${entry}\n"
-      done ; rm tmpfile.$$ )"
-      topresult_line="$( echo "${results}" | sort -nr | head -n1 | sed -r -e 's/\/Packages\.gz$//;' )"
-      topresult_packagecount="$( echo "${topresult_line}" | awk '{print $1}' )"
-      topresult="$( echo "${topresult_line}" | awk '{print $2}' )"
-      echo "USING ${topresult} with ${topresult_packagecount} packages" 1>&2
-      inurl="${topresult}"
-   }
+   echo "logfile=${logfile}"
 
-   step1="$( curl -s -L "${inurl}" )"
-   echo "${step1}" | parse_obs_page_and_subdirs "${inurl}" "${tmpfile}"
-   # loop over all entries and download them
-   for thisurl in $( cat "${tmpfile}" ) ;
+   # These files define an apt repo
+   for word in InRelease Packages Packages.gz Release Release.gpg Release.key Sources Sources.gz ;
    do
-      thisfile="$( echo "${thisurl}" | sed -r -e "s@${inurl}@${workdir}@" -e 's/%2B/+/g;' )"
-      thisdir="$( dirname "${thisfile}" )"
-      test -d "${thisdir}" || mkdir -p "${thisdir}"
-      test -n "${VERBOSE}" && echo "FROM ${thisurl} TO ${thisfile}"
-      test -z "${DRYRUN}" && wget --no-verbose -O "${thisfile}" "${thisurl}" &
+      get_file "${inurl}/${word}"
    done
 
-} 2>&1 | tee -a "${logfile}"
+   # loop through named packages and download them, passing each stanza's MD5sum so get_file can skip files that are already current
+   # Filename and MD5sum are paired per blank-line-separated stanza rather than by line order, so a stray line containing "MD5" or a stanza without MD5sum cannot shift the pairs.
+   awk '/^Filename:/{f=$2} /^MD5sum:/{s=$2} /^$/{if(f!="")print f,s;f="";s=""} END{if(f!="")print f,s}' "${workdir}/Packages" | while read -r word sum
+   do
+      # Filename entries are relative to the repo root (e.g. "./foo.deb"); drop any leading "./" and prefix the repo URL
+      get_file "${inurl}/${word#./}" "${sum}"
+   done
 
-rm "${tmpfile:-NOTHINGTODEL}"
+   # loop through dsc, orig.tar.gz, and debian.tar.xz files; only runs when include_sources is non-empty
+   test -n "${include_sources}" && {
+      # Take names only from the indented lines directly under "Files:"; any other indented field later in the stanza would otherwise yield duplicate or bogus names.
+      awk '/^Files:/{infiles=1;next} !/^[ \t]/{infiles=0} infiles&&NF{print $NF}' "${workdir}/Sources" | while read -r word
+      do
+         get_file "${inurl}/${word}"
+      done
+   }
+
+   test -n "${resign_repo}" && {
+      # rebuild release files: regenerate the package index from the mirrored files, write a fresh Release, then sign it
+      repodir="${workdir}"
+      cd "${repodir}" || exit 1 # abort rather than overwrite Packages/Release in whatever directory we started in
+      dpkg-scanpackages -m . > Packages
+      gzip -9c < Packages > Packages.gz
+      # create the Release file; wc -c prints "<size> <name>", which completes each "<hash> <size> <name>" line. LC_ALL=C keeps the Date day/month names in English.
+      PKGS="$(wc -c Packages)"
+      PKGS_GZ="$(wc -c Packages.gz)"
+      cat <<EOF > Release
+Architectures: all
+Date: $(LC_ALL=C date -u '+%a, %d %b %Y %T %Z')
+MD5Sum:
+ $(md5sum Packages  | cut -d" " -f1) $PKGS
+ $(md5sum Packages.gz  | cut -d" " -f1) $PKGS_GZ
+SHA1:
+ $(sha1sum Packages  | cut -d" " -f1) $PKGS
+ $(sha1sum Packages.gz  | cut -d" " -f1) $PKGS_GZ
+SHA256:
+ $(sha256sum Packages | cut -d" " -f1) $PKGS
+ $(sha256sum Packages.gz | cut -d" " -f1) $PKGS_GZ
+EOF
+      test -e "${gpg_passfile}" && gpg --batch --yes --passphrase-file "${gpg_passfile}" -abs -o Release.gpg Release
+      test -e "${gpg_passfile}" && gpg --batch --yes --passphrase-file "${gpg_passfile}" --clearsign -o InRelease Release
+      # and because we are resigning it, replace Release.key with the one we used (only when gpg_keyfile exists)
+      test -e "${gpg_keyfile}" && cp -p "${gpg_keyfile}" Release.key
+   }
+
+   chown -R "${thisuser}:$( id -G "${thisuser}" | awk '{print $1}' )" "${workdir}"
+} 2>&1 | tee -a "${logfile}"
diff --git a/obsmirror.sh/obsmirror2.sh b/obsmirror.sh/obsmirror2.sh
deleted file mode 100755
index 71284dd..0000000
--- a/obsmirror.sh/obsmirror2.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/bin/sh
-# File: /etc/installed/obsmirror.sh
-# Author: bgstack15
-# SPDX-License-Identifier: CC-BY-SA-4.0
-# Startdate: 2020-03-03 08:43
-# Title: Script that scrapes down OBS site to serve a copy to intranet
-# Purpose: save down my OBS site so I can serve it locally
-# History:
-#    2020-01-05 v1: begin which used httrack
-#    2020-02-28 v2: complete rewrite to exclude httrack
-#    2020-03-03 v3: complete rewrite to get explicit files and loop through their contents
-# Usage:
-#    in a cron job: /etc/cron.d/mirror.cron
-#       50	12	*	*	*	root	/etc/installed/obsmirror.sh 1>/dev/null 2>&1
-# Reference:
-#    https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync
-# Improve:
-# Documentation:
-#    Download the release key and trust it.
-#       curl -s http://repo.example.com/mirror/obs/Release.key | apt-key add -
-#    Use a sources.list.d/ file with contents:
-#       deb https://repo.example.com/mirror/obs/ /
-# Dependencies:
-#    binaries: wget sed awk
-#    user: obsmirror
-umask 0002
-
-test -n "${OBSMIRROR_CONF}" && . "${OBSMIRROR_CONF}"
-test -z "${logfile}" && logfile="/tmp/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
-test -z "${inurl}" && inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
-test -z "${workdir}" && workdir=/tmp/obs
-# also use include_sources DEBUG
-
-get_file() {
-   # call: get_file "${tu}" "${md5sum}"
-   ___tu="${1}"
-   tn="$( basename "${___tu}" )"
-   tf="${workdir}/${tn}" ; tf="$( readlink -m "${tf}" )"
-   td="$( dirname "${tf}" )"
-   test -d "${td}" || mkdir -p "${td}"
-   test -n "${DRYRUN}" && test -n "${VERBOSE}" && echo "${___tu} -> ${tf}"
-   test -z "${DRYRUN}" && wget --content-disposition --no-verbose ${wget_verbose} -O "${tf}" "${___tu}"
-}
-
-wget_verbose=--quiet
-test -n "${VERBOSE}" && unset wget_verbose
-{
-   test "${DEBUG:-NONE}" = "FULL" && set -x
-   echo "logfile=${logfile}"
-
-   # These files define an apt repo
-   for word in InRelease Packages Packages.gz Release Release.gpg Release.key Sources Sources.gz ;
-   do
-      get_file "${inurl}/${word}"
-   done
-
-   # loop through named packages and download them
-   for word in $( awk '/Filename:/{print $2}' "${workdir}/Packages" ) ;
-   do
-      get_file "$( echo "${word}" | sed -r -e "s@^\.@${inurl}@;" )"
-   done
-
-   # loop through dsc, orig.tar.gz, and debian.tar.xz files
-   test -n "${include_sources}" && {
-      for word in $( sed -n -r -e '/Files:/,/^\s*$/{/^ /p;}' ${workdir}/Sources | awk '{print $NF}' ) ;
-      do
-         get_file "${inurl}/${word}"
-      done
-   }
-
-} 2>&1 | tee -a "${logfile}"