summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x obsmirror.sh/obsmirror.sh 106
1 files changed, 58 insertions, 48 deletions
diff --git a/obsmirror.sh/obsmirror.sh b/obsmirror.sh/obsmirror.sh
index 3f5539c..fc69b4a 100755
--- a/obsmirror.sh/obsmirror.sh
+++ b/obsmirror.sh/obsmirror.sh
@@ -24,57 +24,67 @@
# binaries: curl httrack grep head tr sed awk chmod chown find rm ln
# user: obsmirror
-# learn site
-inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
-workdir=/tmp/obs-stage
-outdir=/var/www/mirror/obs
-thisuser=obsmirror
+logfile="/var/log/obsmirror/obsmirror.$( date "+%FT%H%M%S" ).log"
+{
+ test "${DEBUG:-NONE}" = "FULL" && set -x
+ inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
+ workdir=/tmp/obs-stage
+ outdir=/var/www/mirror/obs
+ thisuser=obsmirror
+ echo "logfile=${logfile}"
-mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -Gn obsmirror )" "${workdir}"
-cd "${workdir}"
-# get page contents
-step1="$( curl -s -L "${inurl}/all" )"
-# get first listed package
-step2="$( echo "${step1}" | grep --color=always -oE 'href="[a-zA-Z0-9_.+\-]+\.deb"' | head -n1 | grep -oE '".*"' | tr -d '"' )"
-# get full url to a package
-step3="$( curl -s -I "${inurl}/all/${step2}" | awk '/Location:/ {print $2}' )"
-# get directory of the mirror to save down
-step4="$( echo "${step3}" | sed -r -e "s/all\/${step2}//;" -e 's/\s*$//;' )"
-echo "TARGET URL: ${step4}"
-test -z "${DRYRUN}" && {
- # have to skip the orig.tar.gz files because they are large and slow down the sync process significantly.
- echo su "${thisuser}" -c "httrack \"${step4}\" -*.orig.t* --mirror --update -s0 -r3 -%e0 \"${workdir}\""
- time su "${thisuser}" -c "httrack ${step4} -*.orig.t* --mirror --update -s0 -r3 -%e0 ${workdir}"
-}
-# -s0 ignore robots.txt
-# -r3 only go down 3 links
-# -%e0 follow 0 links to external sites
+ mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -Gn obsmirror )" "${workdir}"
+ cd "${workdir}"
+ # get page contents
+ step1="$( curl -s -L "${inurl}/all" )"
+ # get first listed package
+ step2="$( echo "${step1}" | grep --color=always -oE 'href="[a-zA-Z0-9_.+\-]+\.deb"' | head -n1 | grep -oE '".*"' | tr -d '"' )"
+ # get full url to a package
+ step3="$( curl -s -I "${inurl}/all/${step2}" | awk '/Location:/ {print $2}' )"
+ # get directory of the mirror to save down
+ step4="$( echo "${step3}" | sed -r -e "s/all\/${step2}//;" -e 's/\s*$//;' )"
+ # get domain of full url
+ domainname="$( echo "${step3}" | grep -oE '(ht|f)tps?:\/\/[^\/]+\/' | cut -d'/' -f3 )"
+ echo "TARGET URL: ${step4}"
+ test -z "${DRYRUN}" && {
+ # clean workdir of specific domain name in use right now.
+ echo su "${thisuser}" -c "rm -rf \"${workdir:-SOMETHING}/${domainname:-SOMETHING}\""
+ su "${thisuser}" -c "rm -rf \"${workdir:-SOMETHING}/${domainname:-SOMETHING}\"*"
+ # have to skip the orig.tar.gz files because they are large and slow down the sync process significantly.
+ echo su "${thisuser}" -c "httrack \"${step4}\" -*.orig.t* -v --mirror --update -s0 -r3 -%e0 \"${workdir}\""
+ time su "${thisuser}" -c "httrack ${step4} -*.orig.t* -v --mirror --update -s0 -r3 -%e0 ${workdir}"
+ }
+ # -s0 ignore robots.txt
+ # -r3 only go down 3 links
+ # -%e0 follow 0 links to external sites
-# find most recent directory of that level
-levelcount="$(( $( printf "%s" "${inurl}" | tr -dc '/' | wc -c ) - 1 ))"
-subdir="$( find "${workdir}" -mindepth "${levelcount}" -maxdepth "${levelcount}" -type d -name 'Debian_Unstable' -printf '%T@ %p\n' | sort -n -k1 | head -n1 | awk '{print $2}' )"
+ # find most recent directory of that level
+ levelcount="$(( $( printf "%s" "${inurl}" | tr -dc '/' | wc -c ) - 1 ))"
+ subdir="$( find "${workdir}" -mindepth "${levelcount}" -maxdepth "${levelcount}" -type d -name 'Debian_Unstable' -printf '%T@ %p\n' | sort -n -k1 | head -n1 | awk '{print $2}' )"
-# if the work directory actually synced
-if test -n "${subdir}" ;
-then
- printf "%s " "DIRECTORY SIZE:"
- du -sxBM "${subdir:-.}"
- mkdir -p "$( dirname "${outdir}" )"
- # get current target of symlink
- current_target="$( find "${outdir}" -maxdepth 0 -type l -printf '%l\n' )"
-
- # if the current link is pointing to a different directory than this subdir
- if test "${current_target}" != "${subdir}" ;
+ # if the work directory actually synced
+ if test -n "${subdir}" ;
then
- # then replace it with a link to this one
- test -L "${outdir}" && unlink "${outdir}"
- echo ln -sf "${subdir}" "${outdir}"
- ln -sf "${subdir}" "${outdir}"
+
+ printf "%s " "DIRECTORY SIZE:"
+ du -sxBM "${subdir:-.}"
+ mkdir -p "$( dirname "${outdir}" )"
+ # get current target of symlink
+ current_target="$( find "${outdir}" -maxdepth 0 -type l -printf '%l\n' )"
+
+ # if the current link is pointing to a different directory than this subdir
+ if test "${current_target}" != "${subdir}" ;
+ then
+ # then replace it with a link to this one
+ test -L "${outdir}" && unlink "${outdir}"
+ echo ln -sf "${subdir}" "${outdir}"
+ ln -sf "${subdir}" "${outdir}"
+ fi
+
+ else
+ echo "ERROR: No subdir found, so cannot update the symlink."
fi
-
-else
- echo "ERROR: No subdir found, so cannot update the symlink."
-fi
-# disable the index.html with all the httrack comments and original site links
-find "${workdir}" -iname '*index.html' -exec rm {} +
+ # disable the index.html with all the httrack comments and original site links
+ find "${workdir}" -iname '*index.html' -exec rm {} +
+} 2>&1 | tee -a "${logfile}"
bgstack15