From 878629a91a0c7e7613bfea350eb0b8af5e0053c5 Mon Sep 17 00:00:00 2001 From: B Stack Date: Tue, 3 Mar 2020 11:34:42 -0500 Subject: rewrite obsmirror again Now we are parsing Packages and Sources files directly, and rebuilding and re-signing the repo. --- obsmirror.sh/obsmirror.conf.example | 9 +++ obsmirror.sh/obsmirror.sh | 142 ++++++++++++++++++++---------------- obsmirror.sh/obsmirror2.sh | 71 ------------------ 3 files changed, 90 insertions(+), 132 deletions(-) create mode 100644 obsmirror.sh/obsmirror.conf.example delete mode 100755 obsmirror.sh/obsmirror2.sh diff --git a/obsmirror.sh/obsmirror.conf.example b/obsmirror.sh/obsmirror.conf.example new file mode 100644 index 0000000..9c47ca9 --- /dev/null +++ b/obsmirror.sh/obsmirror.conf.example @@ -0,0 +1,9 @@ +# vim: syntax=sh +logfile="/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log" +inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable" +workdir=/var/tmp/obs +include_sources= +resign_repo=yes +gpg_passfile=/root/.gnupg/filename +gpg_keyfile=/var/www/deb/public.gpg +thisuser=obsmirror diff --git a/obsmirror.sh/obsmirror.sh b/obsmirror.sh/obsmirror.sh index 8cafb2a..2b6522c 100755 --- a/obsmirror.sh/obsmirror.sh +++ b/obsmirror.sh/obsmirror.sh @@ -1,17 +1,22 @@ #!/bin/sh # File: /etc/installed/obsmirror.sh -# License: CC-BY-SA 4.0 +# Location: https://gitlab.com/bgstack15/former-gists/tree/master/obsmirror.sh # Author: bgstack15 -# Startdate: 2020-01-05 18:01 +# Startdate: 2020-03-03 08:43 +# SPDX-License-Identifier: CC-BY-SA-4.0 # Title: Script that scrapes down OBS site to serve a copy to intranet # Purpose: save down my OBS site so I can serve it locally # History: +# 2020-01-05 v1: begin which used httrack +# 2020-02-28 v2: complete rewrite to exclude httrack +# 2020-03-03 v3: complete rewrite to get explicit files and loop through their contents, and rebuild apt repo # Usage: # in a cron job: /etc/cron.d/mirror.cron # 50 12 * * * root /etc/installed/obsmirror.sh 
1>/dev/null 2>&1 # Reference: -# https://unix.stackexchange.com/questions/114044/how-to-make-wget-download-recursive-combining-accept-with-exclude-directorie?rq=1 # https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync +# /mnt/public/www/smith122/repo/devuan-deb/update-devuan-deb.sh +# https://medium.com/sqooba/create-your-own-custom-and-authenticated-apt-repository-1e4a4cf0b864 # Improve: # Documentation: # Download the release key and trust it. @@ -19,76 +24,91 @@ # Use a sources.list.d/ file with contents: # deb https://repo.example.com/mirror/obs/ / # Dependencies: -# binaries: curl wget grep sed awk chmod chown rm +# binaries: wget sed awk # user: obsmirror - -parse_obs_dl_page() { - # simply wget the ${inurl} and play around with this master string. Goal is to remove all links that are not dpkg, gzip, repo files, or subdirs. - grep -oE 'href="[^"]+">' | awk '!x[$0]++' | sed -r -e 's/^href="//;' -e 's/">$//;' | grep -viE 'https?:\/\/[A-Za-z0-9\.]+\.[A-Za-z]+|mirrorlist|orig.*z$|^\/(debug|distribution|factory|ports|repositories|source|tumbleweed|update)\/$|^\?[A-Z]=[A-Z]|^\/|\.dsc$' -} - -parse_obs_page_and_subdirs() { - # call: curl -s -L "${inurl}" | parse_obs_page_and_subdirs "${inurl}" "${tmpfile}" - # return to stdout: all the wanted files from this page and its associated subdirs - ___input="$( parse_obs_dl_page )" - ___inurl="${1}" - ___tmpfile="${2}" - { - echo "${___input}" | grep -vE '\/$' | sed -r -e "s@^@${___inurl}\/@" - # iterate over all listed subdirs parse out their files - for subdir in $( echo "${___input}" | grep -E "\/$" ) ; - do - #echo "${___inurl}/${subdir}" - curl -s -L "${___inurl}/${subdir}" | parse_obs_dl_page | sed -r -e "s@^@${___inurl}/${subdir}@" - done - } > "${___tmpfile}" -} +umask 0002 test -n "${OBSMIRROR_CONF}" && . 
"${OBSMIRROR_CONF}" test -z "${logfile}" && logfile="/tmp/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log" -test -z "${tmpfile}" && tmpfile="$( mktemp )" test -z "${inurl}" && inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable" test -z "${workdir}" && workdir=/tmp/obs -test -z "${outdir}" && outdir=/tmp/var/www/mirror/obs test -z "${thisuser}" && thisuser=obsmirror +# also use include_sources resign_repo gpg_passfile gpg_keyfile DEBUG + +get_file() { + # call: get_file "${tu}" "${md5sum}" + ___tu="${1}" + ___sum="${2}" + tn="${___tu##${inurl}}" + tf="${workdir}/${tn}" ; tf="$( readlink -m "${tf}" )" + td="$( dirname "${tf}" )" + test -d "${td}" || mkdir -p "${td}" + gotten="skipped " + if test -z "${DRYRUN}" ; + then + if test -z "${___sum}" || test "$( md5sum "${tf}" 2>/dev/null | awk '{print $1}' )" != "${___sum}" ; + then + wget --content-disposition --no-verbose --quiet -O "${tf}" "${___tu}" && gotten=DOWNLOADED + fi + fi + test -n "${VERBOSE}" && echo "${gotten} ${___tu} -> ${tf}" +} +wget_verbose=--quiet +test -n "${VERBOSE}" && unset wget_verbose { test "${DEBUG:-NONE}" = "FULL" && set -x - echo "logfile=${logfile}" - - mkdir -p "${workdir}" ; chmod "0755" "${workdir}" ; chown "${thisuser}:$( id -G "${thisuser}" | awk '{print $1}' )" "${workdir}" - cd "${workdir}" - test "${use_top_result}" = "yes" && { - # get mirrorlist of Packages.gz file and find the one that lists the most packages - step1="$( curl -s -L "${inurl}/Packages.gz.mirrorlist" )" - options="$( echo "${step1}" | grep -oE 'href="[^"]+">' | awk '!x[$0]++' | sed -r -e 's/^href="//;' -e 's/">$//;' | grep -iE '^(ht|f)tps?:\/\/.*Packages\.gz$' )" - echo "${options}" 1>&2 - results="$( - for entry in ${options} ; do - # use package count - #curl -s -L "${entry}" | zgrep -cE '^Package:' | sed -r -e "s@\$@ ${entry}@;" - # use last modified timestamp of Packages.gz file - wget --content-disposition --quiet "${entry}" -O tmpfile.$$ ; find tmpfile.$$ -printf "%T@ 
${entry}\n" - done ; rm tmpfile.$$ )" - topresult_line="$( echo "${results}" | sort -nr | head -n1 | sed -r -e 's/\/Packages\.gz$//;' )" - topresult_packagecount="$( echo "${topresult_line}" | awk '{print $1}' )" - topresult="$( echo "${topresult_line}" | awk '{print $2}' )" - echo "USING ${topresult} with ${topresult_packagecount} packages" 1>&2 - inurl="${topresult}" - } + echo "logfile=${logfile}" - step1="$( curl -s -L "${inurl}" )" - echo "${step1}" | parse_obs_page_and_subdirs "${inurl}" "${tmpfile}" - # loop over all entries and download them - for thisurl in $( cat "${tmpfile}" ) ; + # These files define an apt repo + for word in InRelease Packages Packages.gz Release Release.gpg Release.key Sources Sources.gz ; do - thisfile="$( echo "${thisurl}" | sed -r -e "s@${inurl}@${workdir}@" -e 's/%2B/+/g;' )" - thisdir="$( dirname "${thisfile}" )" - test -d "${thisdir}" || mkdir -p "${thisdir}" - test -n "${VERBOSE}" && echo "FROM ${thisurl} TO ${thisfile}" - test -z "${DRYRUN}" && wget --no-verbose -O "${thisfile}" "${thisurl}" & + get_file "${inurl}/${word}" done -} 2>&1 | tee -a "${logfile}" + # loop through named packages and download them + #for word in $( awk '/Filename:/{print $2}' "${workdir}/Packages" ) ; + awk '/Filename:|MD5/{print $2}' "${workdir}/Packages" | xargs -n2 | while read word sum + do + get_file "$( echo "${word}" | sed -r -e "s@^\.@${inurl}@;" )" "${sum}" + #echo "a=${a} b=${b}" + done -rm "${tmpfile:-NOTHINGTODEL}" + # loop through dsc, orig.tar.gz, and debian.tar.xz files + test -n "${include_sources}" && { + for word in $( sed -n -r -e '/Files:/,/^\s*$/{/^ /p;}' ${workdir}/Sources | awk '{print $NF}' ) ; + do + get_file "${inurl}/${word}" + done + } + + test -n "${resign_repo}" && { + # rebuild release files + repodir="${workdir}" + cd "${repodir}" + dpkg-scanpackages -m . 
> Packages + gzip -9c < Packages > Packages.gz + # create the Release file + PKGS="$(wc -c Packages)" + PKGS_GZ="$(wc -c Packages.gz)" + cat <<EOF > Release +Architectures: all +Date: $(date -u '+%a, %d %b %Y %T %Z') +MD5Sum: + $(md5sum Packages | cut -d" " -f1) $PKGS + $(md5sum Packages.gz | cut -d" " -f1) $PKGS_GZ +SHA1: + $(sha1sum Packages | cut -d" " -f1) $PKGS + $(sha1sum Packages.gz | cut -d" " -f1) $PKGS_GZ +SHA256: + $(sha256sum Packages | cut -d" " -f1) $PKGS + $(sha256sum Packages.gz | cut -d" " -f1) $PKGS_GZ +EOF + test -e "${gpg_passfile}" && gpg --batch --yes --passphrase-file "${gpg_passfile}" -abs -o Release.gpg Release + test -e "${gpg_passfile}" && gpg --batch --yes --passphrase-file "${gpg_passfile}" --clearsign -o InRelease Release + # and because we are resigning it, replace Release.key with the one we used + test -e "${gpg_keyfile}" && cp -p "${gpg_keyfile}" Release.key + } + + chown -R "${thisuser}:$( id -G "${thisuser}" | awk '{print $1}' )" "${workdir}" +} 2>&1 | tee -a "${logfile}" diff --git a/obsmirror.sh/obsmirror2.sh b/obsmirror.sh/obsmirror2.sh deleted file mode 100755 index 71284dd..0000000 --- a/obsmirror.sh/obsmirror2.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/sh -# File: /etc/installed/obsmirror.sh -# Author: bgstack15 -# SPDX-License-Identifier: CC-BY-SA-4.0 -# Startdate: 2020-03-03 08:43 -# Title: Script that scrapes down OBS site to serve a copy to intranet -# Purpose: save down my OBS site so I can serve it locally -# History: -# 2020-01-05 v1: begin which used httrack -# 2020-02-28 v2: complete rewrite to exclude httrack -# 2020-03-03 v3: complete rewrite to get explicit files and loop through their contents -# Usage: -# in a cron job: /etc/cron.d/mirror.cron -# 50 12 * * * root /etc/installed/obsmirror.sh 1>/dev/null 2>&1 -# Reference: -# https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync -# Improve: -# Documentation: -# Download the release key and trust it. 
-# curl -s http://repo.example.com/mirror/obs/Release.key | apt-key add - -# Use a sources.list.d/ file with contents: -# deb https://repo.example.com/mirror/obs/ / -# Dependencies: -# binaries: wget sed awk -# user: obsmirror -umask 0002 - -test -n "${OBSMIRROR_CONF}" && . "${OBSMIRROR_CONF}" -test -z "${logfile}" && logfile="/tmp/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log" -test -z "${inurl}" && inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable" -test -z "${workdir}" && workdir=/tmp/obs -# also use include_sources DEBUG - -get_file() { - # call: get_file "${tu}" "${md5sum}" - ___tu="${1}" - tn="$( basename "${___tu}" )" - tf="${workdir}/${tn}" ; tf="$( readlink -m "${tf}" )" - td="$( dirname "${tf}" )" - test -d "${td}" || mkdir -p "${td}" - test -n "${DRYRUN}" && test -n "${VERBOSE}" && echo "${___tu} -> ${tf}" - test -z "${DRYRUN}" && wget --content-disposition --no-verbose ${wget_verbose} -O "${tf}" "${___tu}" -} - -wget_verbose=--quiet -test -n "${VERBOSE}" && unset wget_verbose -{ - test "${DEBUG:-NONE}" = "FULL" && set -x - echo "logfile=${logfile}" - - # These files define an apt repo - for word in InRelease Packages Packages.gz Release Release.gpg Release.key Sources Sources.gz ; - do - get_file "${inurl}/${word}" - done - - # loop through named packages and download them - for word in $( awk '/Filename:/{print $2}' "${workdir}/Packages" ) ; - do - get_file "$( echo "${word}" | sed -r -e "s@^\.@${inurl}@;" )" - done - - # loop through dsc, orig.tar.gz, and debian.tar.xz files - test -n "${include_sources}" && { - for word in $( sed -n -r -e '/Files:/,/^\s*$/{/^ /p;}' ${workdir}/Sources | awk '{print $NF}' ) ; - do - get_file "${inurl}/${word}" - done - } - -} 2>&1 | tee -a "${logfile}" -- cgit