summary refs log tree commit diff
diff options
context:
space:
mode:
author B Stack <bgstack15@gmail.com> 2020-02-25 16:46:33 -0500
committer B Stack <bgstack15@gmail.com> 2020-02-25 16:46:33 -0500
commit c8427b45cd3019cf79c5ca342ab1530bb42697ec (patch)
tree 3184a2ddb3d2ea357d61e245d3106a75ad55f2ec
parent obsmirror: add logging (diff)
download former-gists-c8427b45cd3019cf79c5ca342ab1530bb42697ec.tar.gz
former-gists-c8427b45cd3019cf79c5ca342ab1530bb42697ec.tar.bz2
former-gists-c8427b45cd3019cf79c5ca342ab1530bb42697ec.zip
rewrite obsmirror to be much simpler
Drop httrack which just sucked. Rewrote script to list all files from main page, without navigating to a mirror site first. This vastly simplifies the process and ensures that we always pull down every package, and does not need a temp dir.
-rwxr-xr-x obsmirror.sh/obsmirror.sh 91
1 files changed, 35 insertions, 56 deletions
diff --git a/obsmirror.sh/obsmirror.sh b/obsmirror.sh/obsmirror.sh
index fc69b4a..e0c21b2 100755
--- a/obsmirror.sh/obsmirror.sh
+++ b/obsmirror.sh/obsmirror.sh
@@ -11,80 +11,59 @@
# 50 12 * * * root /etc/installed/obsmirror.sh 1>/dev/null 2>&1
# Reference:
# https://unix.stackexchange.com/questions/114044/how-to-make-wget-download-recursive-combining-accept-with-exclude-directorie?rq=1
-# man 1 httrack
# https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync
# Improve:
-# use some text file as a list of recently-synced URLs, and if today's URL matches a recent one, then run the httrack with the --update flag. Probably keep a running list forever.
# Documentation:
# Download the release key and trust it.
# curl -s http://repo.example.com/mirror/obs/Release.key | apt-key add -
# Use a sources.list.d/ file with contents:
# deb https://repo.example.com/mirror/obs/ /
# Dependencies:
-# binaries: curl httrack grep head tr sed awk chmod chown find rm ln
+# binaries: curl wget grep sed awk chmod chown rm
# user: obsmirror
-logfile="/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
+parse_obs_dl_page() {
+ # Filter, not a downloader: reads an HTML index page on stdin and prints each distinct href target once, in first-seen order. Drops absolute http(s) URLs, mirrorlist links, names matching orig.*z, anything starting with a slash, query links such as ?C=N (presumably the index column-sort links), and .dsc files. Goal is to keep only the dpkg, gzip, repo files, and subdirs.
+ grep -oE 'href="[^"]+">' | awk '!x[$0]++' | sed -r -e 's/^href="//;' -e 's/">$//;' | grep -viE 'https?:\/\/[A-Za-z0-9\.]+\.[A-Za-z]+|mirrorlist|orig.*z$|^\/(debug|distribution|factory|ports|repositories|source|tumbleweed|update)\/$|^\?[A-Z]=[A-Z]|^\/|\.dsc$'
+}
+
+logfile="/tmp/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
+tmpfile="$( mktemp )"
+
{
+
test "${DEBUG:-NONE}" = "FULL" && set -x
inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
workdir=/tmp/obs-stage
- outdir=/var/www/mirror/obs
- thisuser=obsmirror
+ outdir=/tmp/var/www/mirror/obs
+ thisuser=$USER
echo "logfile=${logfile}"
- mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -Gn obsmirror )" "${workdir}"
+ mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -G "${thisuser}" | awk '{print $1}' )" "${workdir}"
cd "${workdir}"
# get page contents
- step1="$( curl -s -L "${inurl}/all" )"
- # get first listed package
- step2="$( echo "${step1}" | grep --color=always -oE 'href="[a-zA-Z0-9_.+\-]+\.deb"' | head -n1 | grep -oE '".*"' | tr -d '"' )"
- # get full url to a package
- step3="$( curl -s -I "${inurl}/all/${step2}" | awk '/Location:/ {print $2}' )"
- # get directory of the mirror to save down
- step4="$( echo "${step3}" | sed -r -e "s/all\/${step2}//;" -e 's/\s*$//;' )"
- # get domain of full url
- domainname="$( echo "${step3}" | grep -oE '(ht|f)tps?:\/\/[^\/]+\/' | cut -d'/' -f3 )"
- echo "TARGET URL: ${step4}"
- test -z "${DRYRUN}" && {
- # clean workdir of specific domain name in use right now.
- echo su "${thisuser}" -c "rm -rf \"${workdir:-SOMETHING}/${domainname:-SOMETHING}\""
- su "${thisuser}" -c "rm -rf \"${workdir:-SOMETHING}/${domainname:-SOMETHING}\"*"
- # have to skip the orig.tar.gz files because they are large and slow down the sync process significantly.
- echo su "${thisuser}" -c "httrack \"${step4}\" -*.orig.t* -v --mirror --update -s0 -r3 -%e0 \"${workdir}\""
- time su "${thisuser}" -c "httrack ${step4} -*.orig.t* -v --mirror --update -s0 -r3 -%e0 ${workdir}"
- }
- # -s0 ignore robots.txt
- # -r3 only go down 3 links
- # -%e0 follow 0 links to external sites
-
- # find most recent directory of that level
- levelcount="$(( $( printf "%s" "${inurl}" | tr -dc '/' | wc -c ) - 1 ))"
- subdir="$( find "${workdir}" -mindepth "${levelcount}" -maxdepth "${levelcount}" -type d -name 'Debian_Unstable' -printf '%T@ %p\n' | sort -n -k1 | head -n1 | awk '{print $2}' )"
+ step1="$( curl -s -L "${inurl}" )"
+ step2="$( echo "${step1}" | parse_obs_dl_page )"
+ {
+ echo "${step2}" | grep -vE '\/$' | sed -r -e "s@^@${inurl}\/@"
+ # iterate over all listed subdirs parse out their files
+ for subdir in $( echo "${step2}" | grep -E "\/$" ) ;
+ do
+ #echo "${inurl}/${subdir}"
+ curl -s -L "${inurl}/${subdir}" | parse_obs_dl_page | sed -r -e "s@^@${inurl}/${subdir}@"
+ done
+ } > "${tmpfile}"
- # if the work directory actually synced
- if test -n "${subdir}" ;
- then
+ # loop over all entries and download them
+ for thisurl in $( cat "${tmpfile}" ) ;
+ do
+ thisfile="$( echo "${thisurl}" | sed -r -e "s@${inurl}@${workdir}@" )"
+ thisdir="$( dirname "${thisfile}" )"
+ test -d "${thisdir}" || mkdir -p "${thisdir}"
+ test -n "${VERBOSE}" && echo "FROM ${thisurl} TO ${thisfile}"
+ test -z "${DRYRUN}" && wget --continue --no-verbose -O "${thisfile}" "${thisurl}" &
+ done
- printf "%s " "DIRECTORY SIZE:"
- du -sxBM "${subdir:-.}"
- mkdir -p "$( dirname "${outdir}" )"
- # get current target of symlink
- current_target="$( find "${outdir}" -maxdepth 0 -type l -printf '%l\n' )"
-
- # if the current link is pointing to a different directory than this subdir
- if test "${current_target}" != "${subdir}" ;
- then
- # then replace it with a link to this one
- test -L "${outdir}" && unlink "${outdir}"
- echo ln -sf "${subdir}" "${outdir}"
- ln -sf "${subdir}" "${outdir}"
- fi
-
- else
- echo "ERROR: No subdir found, so cannot update the symlink."
- fi
-
- # disable the index.html with all the httrack comments and original site links
- find "${workdir}" -iname '*index.html' -exec rm {} +
} 2>&1 | tee -a "${logfile}"
+
+rm "${tmpfile:-NOTHINGTODEL}"
bgstack15