author | B Stack <bgstack15@gmail.com> | 2020-02-25 16:46:33 -0500 |
---|---|---|
committer | B Stack <bgstack15@gmail.com> | 2020-02-25 16:46:33 -0500 |
commit | c8427b45cd3019cf79c5ca342ab1530bb42697ec (patch) | |
tree | 3184a2ddb3d2ea357d61e245d3106a75ad55f2ec | |
parent | obsmirror: add logging (diff) | |
rewrite obsmirror to be much simpler
Drop httrack, which just sucked. Rewrite the script to list all files
from the main page, without navigating to a mirror site first. This
vastly simplifies the process, ensures that we always pull down
every package, and no longer needs a temp dir.
-rwxr-xr-x | obsmirror.sh/obsmirror.sh | 91 |
1 file changed, 35 insertions, 56 deletions
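The idea in miniature, before the full diff: scrape the hrefs off the repository index page and fetch each listed file directly. The following is a minimal standalone sketch of that approach, not the script itself; it handles only top-level files and uses a much cruder filter, while the real `parse_obs_dl_page` below also descends one level into subdirectories and excludes mirror links, sort links, parent directories, `.dsc` files, and orig tarballs. The `inurl` and `workdir` values mirror those in the script.

```sh
#!/bin/sh
# Sketch only: list hrefs on the OBS repo index page and fetch each file.
inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
workdir=/tmp/obs-stage
mkdir -p "${workdir}"
curl -s -L "${inurl}" \
   | grep -oE 'href="[^"]+"' \
   | sed -e 's/^href="//' -e 's/"$//' \
   | grep -vE '^(\?|/|https?:)|/$' \
   | while read -r f ; do
      # same wget invocation the script uses: resumable, quiet, explicit target
      wget --continue --no-verbose -O "${workdir}/${f}" "${inurl}/${f}"
   done
```

Each file lands under `${workdir}` with the name it has on the index page; the real script additionally rewrites each URL into a matching local directory tree and backgrounds each wget.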
diff --git a/obsmirror.sh/obsmirror.sh b/obsmirror.sh/obsmirror.sh
index fc69b4a..e0c21b2 100755
--- a/obsmirror.sh/obsmirror.sh
+++ b/obsmirror.sh/obsmirror.sh
@@ -11,80 +11,59 @@
 # 50 12 * * * root /etc/installed/obsmirror.sh 1>/dev/null 2>&1
 # Reference:
 #    https://unix.stackexchange.com/questions/114044/how-to-make-wget-download-recursive-combining-accept-with-exclude-directorie?rq=1
-#    man 1 httrack
 #    https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync
 # Improve:
-#    use some text file as a list of recently-synced URLs, and if today's URL matches a recent one, then run the httrack with the --update flag. Probably keep a running list forever.
 # Documentation:
 #    Download the release key and trust it.
 #       curl -s http://repo.example.com/mirror/obs/Release.key | apt-key add -
 #    Use a sources.list.d/ file with contents:
 #       deb https://repo.example.com/mirror/obs/ /
 # Dependencies:
-#    binaries: curl httrack grep head tr sed awk chmod chown find rm ln
+#    binaries: curl wget grep sed awk chmod chown rm
 #    user: obsmirror
 
-logfile="/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
+parse_obs_dl_page() {
+   # simply wget the ${inurl} and play around with this master string. Goal is to remove all links that are not dpkg, gzip, repo files, or subdirs.
+   grep -oE 'href="[^"]+">' | awk '!x[$0]++' | sed -r -e 's/^href="//;' -e 's/">$//;' | grep -viE 'https?:\/\/[A-Za-z0-9\.]+\.[A-Za-z]+|mirrorlist|orig.*z$|^\/(debug|distribution|factory|ports|repositories|source|tumbleweed|update)\/$|^\?[A-Z]=[A-Z]|^\/|\.dsc$'
+}
+
+logfile="/tmp/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
+tmpfile="$( mktemp )"
+
 {
+   test "${DEBUG:-NONE}" = "FULL" && set -x
    inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
    workdir=/tmp/obs-stage
-   outdir=/var/www/mirror/obs
-   thisuser=obsmirror
+   outdir=/tmp/var/www/mirror/obs
+   thisuser=$USER
    echo "logfile=${logfile}"
-   mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -Gn obsmirror )" "${workdir}"
+   mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -G "${thisuser}" | awk '{print $1}' )" "${workdir}"
    cd "${workdir}"
    # get page contents
-   step1="$( curl -s -L "${inurl}/all" )"
-   # get first listed package
-   step2="$( echo "${step1}" | grep --color=always -oE 'href="[a-zA-Z0-9_.+\-]+\.deb"' | head -n1 | grep -oE '".*"' | tr -d '"' )"
-   # get full url to a package
-   step3="$( curl -s -I "${inurl}/all/${step2}" | awk '/Location:/ {print $2}' )"
-   # get directory of the mirror to save down
-   step4="$( echo "${step3}" | sed -r -e "s/all\/${step2}//;" -e 's/\s*$//;' )"
-   # get domain of full url
-   domainname="$( echo "${step3}" | grep -oE '(ht|f)tps?:\/\/[^\/]+\/' | cut -d'/' -f3 )"
-   echo "TARGET URL: ${step4}"
-   test -z "${DRYRUN}" && {
-      # clean workdir of specific domain name in use right now.
-      echo su "${thisuser}" -c "rm -rf \"${workdir:-SOMETHING}/${domainname:-SOMETHING}\""
-      su "${thisuser}" -c "rm -rf \"${workdir:-SOMETHING}/${domainname:-SOMETHING}\"*"
-      # have to skip the orig.tar.gz files because they are large and slow down the sync process significantly.
-      echo su "${thisuser}" -c "httrack \"${step4}\" -*.orig.t* -v --mirror --update -s0 -r3 -%e0 \"${workdir}\""
-      time su "${thisuser}" -c "httrack ${step4} -*.orig.t* -v --mirror --update -s0 -r3 -%e0 ${workdir}"
-   }
-   # -s0 ignore robots.txt
-   # -r3 only go down 3 links
-   # -%e0 follow 0 links to external sites
-
-   # find most recent directory of that level
-   levelcount="$(( $( printf "%s" "${inurl}" | tr -dc '/' | wc -c ) - 1 ))"
-   subdir="$( find "${workdir}" -mindepth "${levelcount}" -maxdepth "${levelcount}" -type d -name 'Debian_Unstable' -printf '%T@ %p\n' | sort -n -k1 | head -n1 | awk '{print $2}' )"
+   step1="$( curl -s -L "${inurl}" )"
+   step2="$( echo "${step1}" | parse_obs_dl_page )"
+   {
+      echo "${step2}" | grep -vE '\/$' | sed -r -e "s@^@${inurl}\/@"
+      # iterate over all listed subdirs parse out their files
+      for subdir in $( echo "${step2}" | grep -E "\/$" ) ;
+      do
+         #echo "${inurl}/${subdir}"
+         curl -s -L "${inurl}/${subdir}" | parse_obs_dl_page | sed -r -e "s@^@${inurl}/${subdir}@"
+      done
+   } > "${tmpfile}"
 
-   # if the work directory actually synced
-   if test -n "${subdir}" ;
-   then
+   # loop over all entries and download them
+   for thisurl in $( cat "${tmpfile}" ) ;
+   do
+      thisfile="$( echo "${thisurl}" | sed -r -e "s@${inurl}@${workdir}@" )"
+      thisdir="$( dirname "${thisfile}" )"
+      test -d "${thisdir}" || mkdir -p "${thisdir}"
+      test -n "${VERBOSE}" && echo "FROM ${thisurl} TO ${thisfile}"
+      test -z "${DRYRUN}" && wget --continue --no-verbose -O "${thisfile}" "${thisurl}" &
+   done
 
-      printf "%s " "DIRECTORY SIZE:"
-      du -sxBM "${subdir:-.}"
-      mkdir -p "$( dirname "${outdir}" )"
-      # get current target of symlink
-      current_target="$( find "${outdir}" -maxdepth 0 -type l -printf '%l\n' )"
-
-      # if the current link is pointing to a different directory than this subdir
-      if test "${current_target}" != "${subdir}" ;
-      then
-         # then replace it with a link to this one
-         test -L "${outdir}" && unlink "${outdir}"
-         echo ln -sf "${subdir}" "${outdir}"
-         ln -sf "${subdir}" "${outdir}"
-      fi
-
-   else
-      echo "ERROR: No subdir found, so cannot update the symlink."
-   fi
-
-   # disable the index.html with all the httrack comments and original site links
-   find "${workdir}" -iname '*index.html' -exec rm {} +
 } 2>&1 | tee -a "${logfile}"
+
+rm "${tmpfile:-NOTHINGTODEL}"
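The rewritten script also gains three environment toggles visible in the diff: `DEBUG=FULL` turns on shell tracing via `set -x`, `VERBOSE` echoes each FROM/TO pair, and `DRYRUN` suppresses the actual wget calls. A plausible trial invocation; the `/etc/installed/obsmirror.sh` path comes from the cron comment in the script header, so adjust it to wherever the script actually lives:

```sh
# Dry run: build the URL list and print FROM/TO pairs, but download nothing.
DRYRUN=1 VERBOSE=1 /etc/installed/obsmirror.sh

# Full run with shell tracing, enabled by the DEBUG check at the top of the block.
DEBUG=FULL /etc/installed/obsmirror.sh
```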