From 8cd32e6330d6f86d46479c3ee58e6b9c3e832c5f Mon Sep 17 00:00:00 2001
From: B Stack
Date: Tue, 7 Jan 2020 21:46:07 -0500
Subject: obsmirror: add logging

---
 obsmirror.sh/obsmirror.sh | 106 +++++++++++++++++++++++++---------------------
 1 file changed, 58 insertions(+), 48 deletions(-)

diff --git a/obsmirror.sh/obsmirror.sh b/obsmirror.sh/obsmirror.sh
index 3f5539c..fc69b4a 100755
--- a/obsmirror.sh/obsmirror.sh
+++ b/obsmirror.sh/obsmirror.sh
@@ -24,57 +24,67 @@
 # binaries: curl httrack grep head tr sed awk chmod chown find rm ln
 # user: obsmirror
 
-# learn site
-inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
-workdir=/tmp/obs-stage
-outdir=/var/www/mirror/obs
-thisuser=obsmirror
+logfile="/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
+{
+   test "${DEBUG:-NONE}" = "FULL" && set -x
+   inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
+   workdir=/tmp/obs-stage
+   outdir=/var/www/mirror/obs
+   thisuser=obsmirror
+   echo "logfile=${logfile}"
 
-mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -Gn obsmirror )" "${workdir}"
-cd "${workdir}"
-# get page contents
-step1="$( curl -s -L "${inurl}/all" )"
-# get first listed package
-step2="$( echo "${step1}" | grep --color=always -oE 'href="[a-zA-Z0-9_.+\-]+\.deb"' | head -n1 | grep -oE '".*"' | tr -d '"' )"
-# get full url to a package
-step3="$( curl -s -I "${inurl}/all/${step2}" | awk '/Location:/ {print $2}' )"
-# get directory of the mirror to save down
-step4="$( echo "${step3}" | sed -r -e "s/all\/${step2}//;" -e 's/\s*$//;' )"
-echo "TARGET URL: ${step4}"
-test -z "${DRYRUN}" && {
-   # have to skip the orig.tar.gz files because they are large and slow down the sync process significantly.
-   echo su "${thisuser}" -c "httrack \"${step4}\" -*.orig.t* --mirror --update -s0 -r3 -%e0 \"${workdir}\""
-   time su "${thisuser}" -c "httrack ${step4} -*.orig.t* --mirror --update -s0 -r3 -%e0 ${workdir}"
-}
-# -s0 ignore robots.txt
-# -r3 only go down 3 links
-# -%e0 follow 0 links to external sites
+   mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -Gn obsmirror )" "${workdir}"
+   cd "${workdir}"
+   # get page contents
+   step1="$( curl -s -L "${inurl}/all" )"
+   # get first listed package
+   step2="$( echo "${step1}" | grep --color=always -oE 'href="[a-zA-Z0-9_.+\-]+\.deb"' | head -n1 | grep -oE '".*"' | tr -d '"' )"
+   # get full url to a package
+   step3="$( curl -s -I "${inurl}/all/${step2}" | awk '/Location:/ {print $2}' )"
+   # get directory of the mirror to save down
+   step4="$( echo "${step3}" | sed -r -e "s/all\/${step2}//;" -e 's/\s*$//;' )"
+   # get domain of full url
+   domainname="$( echo "${step3}" | grep -oE '(ht|f)tps?:\/\/[^\/]+\/' | cut -d'/' -f3 )"
+   echo "TARGET URL: ${step4}"
+   test -z "${DRYRUN}" && {
+      # clean workdir of specific domain name in use right now.
+      echo su "${thisuser}" -c "rm -rf \"${workdir:-SOMETHING}/${domainname:-SOMETHING}\""
+      su "${thisuser}" -c "rm -rf \"${workdir:-SOMETHING}/${domainname:-SOMETHING}\"*"
+      # have to skip the orig.tar.gz files because they are large and slow down the sync process significantly.
+      echo su "${thisuser}" -c "httrack \"${step4}\" -*.orig.t* -v --mirror --update -s0 -r3 -%e0 \"${workdir}\""
+      time su "${thisuser}" -c "httrack ${step4} -*.orig.t* -v --mirror --update -s0 -r3 -%e0 ${workdir}"
+   }
+   # -s0 ignore robots.txt
+   # -r3 only go down 3 links
+   # -%e0 follow 0 links to external sites
 
-# find most recent directory of that level
-levelcount="$(( $( printf "%s" "${inurl}" | tr -dc '/' | wc -c ) - 1 ))"
-subdir="$( find "${workdir}" -mindepth "${levelcount}" -maxdepth "${levelcount}" -type d -name 'Debian_Unstable' -printf '%T@ %p\n' | sort -n -k1 | head -n1 | awk '{print $2}' )"
+   # find most recent directory of that level
+   levelcount="$(( $( printf "%s" "${inurl}" | tr -dc '/' | wc -c ) - 1 ))"
+   subdir="$( find "${workdir}" -mindepth "${levelcount}" -maxdepth "${levelcount}" -type d -name 'Debian_Unstable' -printf '%T@ %p\n' | sort -n -k1 | head -n1 | awk '{print $2}' )"
 
-# if the work directory actually synced
-if test -n "${subdir}" ;
-then
-   printf "%s " "DIRECTORY SIZE:"
-   du -sxBM "${subdir:-.}"
-   mkdir -p "$( dirname "${outdir}" )"
-   # get current target of symlink
-   current_target="$( find "${outdir}" -maxdepth 0 -type l -printf '%l\n' )"
-
-   # if the current link is pointing to a different directory than this subdir
-   if test "${current_target}" != "${subdir}" ;
+   # if the work directory actually synced
+   if test -n "${subdir}" ;
    then
-      # then replace it with a link to this one
-      test -L "${outdir}" && unlink "${outdir}"
-      echo ln -sf "${subdir}" "${outdir}"
-      ln -sf "${subdir}" "${outdir}"
+
+      printf "%s " "DIRECTORY SIZE:"
+      du -sxBM "${subdir:-.}"
+      mkdir -p "$( dirname "${outdir}" )"
+      # get current target of symlink
+      current_target="$( find "${outdir}" -maxdepth 0 -type l -printf '%l\n' )"
+
+      # if the current link is pointing to a different directory than this subdir
+      if test "${current_target}" != "${subdir}" ;
+      then
+         # then replace it with a link to this one
+         test -L "${outdir}" && unlink "${outdir}"
+         echo ln -sf "${subdir}" "${outdir}"
+         ln -sf "${subdir}" "${outdir}"
+      fi
+
+   else
+      echo "ERROR: No subdir found, so cannot update the symlink."
    fi
-
-else
-   echo "ERROR: No subdir found, so cannot update the symlink."
-fi
-# disable the index.html with all the httrack comments and original site links
-find "${workdir}" -iname '*index.html' -exec rm {} +
+   # disable the index.html with all the httrack comments and original site links
+   find "${workdir}" -iname '*index.html' -exec rm {} +
+} 2>&1 | tee -a "${logfile}"
 
--
cgit
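The substance of this commit is the shell logging pattern: the whole script body moves into a brace group, stderr is merged into stdout, and the combined stream is piped through tee -a into a timestamped logfile, so output reaches both the console and the log. A minimal standalone sketch of that pattern follows; the /tmp/demo logfile path and the echo commands inside the braces are illustrative placeholders, not part of the commit:

#!/bin/sh
# Timestamped logfile, following the same date-based naming scheme as the patch.
logfile="/tmp/demo.$( date "+%FT%H%M%S" ).log"
{
   # DEBUG=FULL turns on shell tracing, as in the patch.
   test "${DEBUG:-NONE}" = "FULL" && set -x
   echo "logfile=${logfile}"
   # placeholder for the real work done inside the brace group
   echo "doing the actual work here"
} 2>&1 | tee -a "${logfile}"

One side effect of this construction: in plain POSIX sh the script's exit status is that of tee, the last command in the pipeline, so a failure inside the brace group is not visible in $? without extra handling.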