author | B Stack <bgstack15@gmail.com> | 2020-02-25 16:46:33 -0500 |
---|---|---|
committer | B Stack <bgstack15@gmail.com> | 2020-02-25 16:46:33 -0500 |
commit | c8427b45cd3019cf79c5ca342ab1530bb42697ec (patch) | |
tree | 3184a2ddb3d2ea357d61e245d3106a75ad55f2ec | |
parent | obsmirror: add logging (diff) | |
rewrite obsmirror to be much simpler
Drop httrack, which just sucked. Rewrite the script to list all files
from the main page, without navigating to a mirror site first. This
vastly simplifies the process, ensures that we always pull down
every package, and no longer needs a temp dir.
-rwxr-xr-x | obsmirror.sh/obsmirror.sh | 91 |
1 file changed, 35 insertions, 56 deletions
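The idea in miniature, before the full diff: scrape the hrefs off the repository index page and fetch each listed file directly. The following is a minimal standalone sketch of that approach, not the script itself; it handles only top-level files and uses a much cruder filter, while the real `parse_obs_dl_page` below also descends one level into subdirectories and excludes mirror links, sort links, parent directories, `.dsc` files, and orig tarballs. The `inurl` and `workdir` values mirror those in the script.

```sh
#!/bin/sh
# Sketch only: list hrefs on the OBS repo index page and fetch each file.
inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
workdir=/tmp/obs-stage
mkdir -p "${workdir}"
curl -s -L "${inurl}" \
   | grep -oE 'href="[^"]+"' \
   | sed -e 's/^href="//' -e 's/"$//' \
   | grep -vE '^(\?|/|https?:)|/$' \
   | while read -r f ; do
      # same wget invocation the script uses: resumable, quiet, explicit target
      wget --continue --no-verbose -O "${workdir}/${f}" "${inurl}/${f}"
   done
```

Each file lands under `${workdir}` with the name it has on the index page; the real script additionally rewrites each URL into a matching local directory tree and backgrounds each wget.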
diff --git a/obsmirror.sh/obsmirror.sh b/obsmirror.sh/obsmirror.sh
index fc69b4a..e0c21b2 100755
--- a/obsmirror.sh/obsmirror.sh
+++ b/obsmirror.sh/obsmirror.sh
@@ -11,80 +11,59 @@
 # 50 12 * * * root /etc/installed/obsmirror.sh 1>/dev/null 2>&1
 # Reference:
 #    https://unix.stackexchange.com/questions/114044/how-to-make-wget-download-recursive-combining-accept-with-exclude-directorie?rq=1
-#    man 1 httrack
 #    https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync
 # Improve:
-#    use some text file as a list of recently-synced URLs, and if today's URL matches a recent one, then run the httrack with the --update flag. Probably keep a running list forever.
 # Documentation:
 #    Download the release key and trust it.
 #       curl -s http://repo.example.com/mirror/obs/Release.key | apt-key add -
 #    Use a sources.list.d/ file with contents:
 #       deb https://repo.example.com/mirror/obs/ /
 # Dependencies:
-#    binaries: curl httrack grep head tr sed awk chmod chown find rm ln
+#    binaries: curl wget grep sed awk chmod chown rm
 #    user: obsmirror
 
-logfile="/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
+parse_obs_dl_page() {
+   # simply wget the ${inurl} and play around with this master string. Goal is to remove all links that are not dpkg, gzip, repo files, or subdirs.
+   grep -oE 'href="[^"]+">' | awk '!x[$0]++' | sed -r -e 's/^href="//;' -e 's/">$//;' | grep -viE 'https?:\/\/[A-Za-z0-9\.]+\.[A-Za-z]+|mirrorlist|orig.*z$|^\/(debug|distribution|factory|ports|repositories|source|tumbleweed|update)\/$|^\?[A-Z]=[A-Z]|^\/|\.dsc$'
+}
+
+logfile="/tmp/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
+tmpfile="$( mktemp )"
+
 {
+   test "${DEBUG:-NONE}" = "FULL" && set -x
    inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
    workdir=/tmp/obs-stage
-   outdir=/var/www/mirror/obs
-   thisuser=obsmirror
+   outdir=/tmp/var/www/mirror/obs
+   thisuser=$USER
    echo "logfile=${logfile}"
-   mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -Gn obsmirror )" "${workdir}"
+   mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -G "${thisuser}" | awk '{print $1}' )" "${workdir}"
    cd "${workdir}"
    # get page contents
-   step1="$( curl -s -L "${inurl}/all" )"
-   # get first listed package
-   step2="$( echo "${step1}" | grep --color=always -oE 'href="[a-zA-Z0-9_.+\-]+\.deb"' | head -n1 | grep -oE '".*"' | tr -d '"' )"
-   # get full url to a package
-   step3="$( curl -s -I "${inurl}/all/${step2}" | awk '/Location:/ {print $2}' )"
-   # get directory of the mirror to save down
-   step4="$( echo "${step3}" | sed -r -e "s/all\/${step2}//;" -e 's/\s*$//;' )"
-   # get domain of full url
-   domainname="$( echo "${step3}" | grep -oE '(ht|f)tps?:\/\/[^\/]+\/' | cut -d'/' -f3 )"
-   echo "TARGET URL: ${step4}"
-   test -z "${DRYRUN}" && {
-      # clean workdir of specific domain name in use right now.
-      echo su "${thisuser}" -c "rm -rf \"${workdir:-SOMETHING}/${domainname:-SOMETHING}\""
-      su "${thisuser}" -c "rm -rf \"${workdir:-SOMETHING}/${domainname:-SOMETHING}\"*"
-      # have to skip the orig.tar.gz files because they are large and slow down the sync process significantly.
-      echo su "${thisuser}" -c "httrack \"${step4}\" -*.orig.t* -v --mirror --update -s0 -r3 -%e0 \"${workdir}\""
-      time su "${thisuser}" -c "httrack ${step4} -*.orig.t* -v --mirror --update -s0 -r3 -%e0 ${workdir}"
-   }
-   # -s0 ignore robots.txt
-   # -r3 only go down 3 links
-   # -%e0 follow 0 links to external sites
-
-   # find most recent directory of that level
-   levelcount="$(( $( printf "%s" "${inurl}" | tr -dc '/' | wc -c ) - 1 ))"
-   subdir="$( find "${workdir}" -mindepth "${levelcount}" -maxdepth "${levelcount}" -type d -name 'Debian_Unstable' -printf '%T@ %p\n' | sort -n -k1 | head -n1 | awk '{print $2}' )"
+   step1="$( curl -s -L "${inurl}" )"
+   step2="$( echo "${step1}" | parse_obs_dl_page )"
+   {
+      echo "${step2}" | grep -vE '\/$' | sed -r -e "s@^@${inurl}\/@"
+      # iterate over all listed subdirs parse out their files
+      for subdir in $( echo "${step2}" | grep -E "\/$" ) ;
+      do
+         #echo "${inurl}/${subdir}"
+         curl -s -L "${inurl}/${subdir}" | parse_obs_dl_page | sed -r -e "s@^@${inurl}/${subdir}@"
+      done
+   } > "${tmpfile}"
 
-   # if the work directory actually synced
-   if test -n "${subdir}" ;
-   then
+   # loop over all entries and download them
+   for thisurl in $( cat "${tmpfile}" ) ;
+   do
+      thisfile="$( echo "${thisurl}" | sed -r -e "s@${inurl}@${workdir}@" )"
+      thisdir="$( dirname "${thisfile}" )"
+      test -d "${thisdir}" || mkdir -p "${thisdir}"
+      test -n "${VERBOSE}" && echo "FROM ${thisurl} TO ${thisfile}"
+      test -z "${DRYRUN}" && wget --continue --no-verbose -O "${thisfile}" "${thisurl}" &
+   done
 
-      printf "%s " "DIRECTORY SIZE:"
-      du -sxBM "${subdir:-.}"
-      mkdir -p "$( dirname "${outdir}" )"
-      # get current target of symlink
-      current_target="$( find "${outdir}" -maxdepth 0 -type l -printf '%l\n' )"
-
-      # if the current link is pointing to a different directory than this subdir
-      if test "${current_target}" != "${subdir}" ;
-      then
-         # then replace it with a link to this one
-         test -L "${outdir}" && unlink "${outdir}"
-         echo ln -sf "${subdir}" "${outdir}"
-         ln -sf "${subdir}" "${outdir}"
-      fi
-
-   else
-      echo "ERROR: No subdir found, so cannot update the symlink."
-   fi
-
-   # disable the index.html with all the httrack comments and original site links
-   find "${workdir}" -iname '*index.html' -exec rm {} +
 } 2>&1 | tee -a "${logfile}"
+
+rm "${tmpfile:-NOTHINGTODEL}"
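The rewritten script also gains three environment toggles visible in the diff: `DEBUG=FULL` turns on shell tracing via `set -x`, `VERBOSE` echoes each FROM/TO pair, and `DRYRUN` suppresses the actual wget calls. A plausible trial invocation; the `/etc/installed/obsmirror.sh` path comes from the cron comment in the script header, so adjust it to wherever the script actually lives:

```sh
# Dry run: build the URL list and print FROM/TO pairs, but download nothing.
DRYRUN=1 VERBOSE=1 /etc/installed/obsmirror.sh

# Full run with shell tracing, enabled by the DEBUG check at the top of the block.
DEBUG=FULL /etc/installed/obsmirror.sh
```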