-rw-r--r--  obsmirror.sh/description  |  1 +
-rwxr-xr-x  obsmirror.sh/obsmirror.sh | 80 ++++++++++++++++++++++++++++++++++++
2 files changed, 81 insertions(+), 0 deletions(-)
diff --git a/obsmirror.sh/description b/obsmirror.sh/description
new file mode 100644
index 0000000..4cc4c7a
--- /dev/null
+++ b/obsmirror.sh/description
@@ -0,0 +1 @@
+Mirror an OBS repository
diff --git a/obsmirror.sh/obsmirror.sh b/obsmirror.sh/obsmirror.sh
new file mode 100755
index 0000000..eaf752e
--- /dev/null
+++ b/obsmirror.sh/obsmirror.sh
@@ -0,0 +1,80 @@
+#!/bin/sh
+# File: /etc/installed/obsmirror.sh
+# License: CC-BY-SA 4.0
+# Author: bgstack15
+# Startdate: 2020-01-05 18:01
+# Title: Script that scrapes down the OBS site to serve a copy to the intranet
+# Purpose: save down my OBS site so I can serve it locally
+# History:
+# Usage:
+#    in a cron job: /etc/cron.d/mirror.cron
+#       50  12  *  *  *  root  /etc/installed/obsmirror.sh 1>/dev/null 2>&1
+# Reference:
+#    https://unix.stackexchange.com/questions/114044/how-to-make-wget-download-recursive-combining-accept-with-exclude-directorie?rq=1
+#    man 1 httrack
+#    https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync
+# Improve:
+#    use some text file as a list of recently-synced URLs, and if today's URL matches a recent one, then run httrack with the --update flag. Probably keep a running list forever.
+# Documentation:
+#    Download the release key and trust it.
+#       curl -s http://albion320.no-ip.biz/mirror/obs/Release.key | apt-key add -
+#    Use a sources.list.d/ file with contents:
+#       deb https://albion320.no-ip.biz/mirror/obs/ /
+# Dependencies:
+#    binaries: curl httrack grep head tr sed awk chmod chown find rm ln
+#    user: obsmirror
+
+# learn site
+inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
+workdir=/tmp/obs-stage
+outdir=/var/www/mirror/obs
+thisuser=obsmirror
+
+mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -gn obsmirror )" "${workdir}"
+cd "${workdir}"
+# get page contents
+step1="$( curl -s -L "${inurl}/all" )"
+# get first listed package
+step2="$( echo "${step1}" | grep -oE 'href="[a-zA-Z0-9_.+\-]+\.deb"' | head -n1 | grep -oE '".*"' | tr -d '"' )"
+# get full url to a package
+step3="$( curl -s -I "${inurl}/all/${step2}" | awk '/Location:/ {print $2}' )"
+# get directory of the mirror to save down
+step4="$( echo "${step3}" | sed -r -e "s/all\/${step2}//;" -e 's/\s*$//;' )"
+echo "TARGET URL: ${step4}"
+test -z "${DRYRUN}" && {
+   # have to skip the orig.tar.gz files because they are large and slow down the sync process significantly.
+   echo su "${thisuser}" -c "httrack \"${step4}\" -*.orig.t* --mirror --update -s0 -r3 -%e0 \"${workdir}\""
+   time su "${thisuser}" -c "httrack ${step4} -*.orig.t* --mirror --update -s0 -r3 -%e0 ${workdir}"
+}
+# -s0 ignore robots.txt
+# -r3 only go down 3 links
+# -%e0 follow 0 links to external sites
+
+# find most recent directory of that level
+levelcount="$(( $( printf "%s" "${inurl}" | tr -dc '/' | wc -c ) - 1 ))"
+subdir="$( find "${workdir}" -mindepth "${levelcount}" -maxdepth "${levelcount}" -type d -name 'Debian_Unstable' -printf '%T@ %p\n' | sort -n -k1 | tail -n1 | awk '{print $2}' )"
+
+# if the work directory actually synced
+if test -n "${subdir}" ;
+then
+   printf "%s " "DIRECTORY SIZE:"
+   du -sxBM "${subdir:-.}"
+   mkdir -p "$( dirname "${outdir}" )"
+   # get current target of symlink
+   current_target="$( find "${outdir}" -maxdepth 0 -type l -printf '%l\n' )"
+
+   # if the current link is pointing to a different directory than this subdir
+   if test "${current_target}" != "${subdir}" ;
+   then
+      # then replace it with a link to this one
+      test -L "${outdir}" && unlink "${outdir}"
+      echo ln -sf "${subdir}" "${outdir}"
+      ln -sf "${subdir}" "${outdir}"
+   fi
+
+else
+   echo "ERROR: No subdir found, so cannot update the symlink."
+fi
+
+# remove the index.html files that contain the httrack comments and original site links
+find "${workdir}" -iname '*index.html' -exec rm {} +
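
The httrack call is wrapped in test -z "${DRYRUN}", so running the script with any non-empty DRYRUN in the environment skips the sync while still printing the resolved target. A quick way to check what the script would mirror:

    DRYRUN=1 /etc/installed/obsmirror.sh

This prints the TARGET URL line and then runs the symlink bookkeeping against whatever was synced previously.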
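The Improve note in the header (keep a list of recently-synced URLs and pass --update only on repeat syncs) could look roughly like the following. This is a minimal sketch, assuming a hypothetical state file /var/cache/obsmirror/synced-urls.txt; the commit itself does not implement it and passes --update unconditionally:

    # hypothetical state file tracking every URL ever synced
    urllist=/var/cache/obsmirror/synced-urls.txt
    mkdir -p "$( dirname "${urllist}" )" ; touch "${urllist}"
    # add --update only when today's URL was synced before
    update_flag=
    if grep -qxF "${step4}" "${urllist}" ; then update_flag="--update" ;
    else printf '%s\n' "${step4}" >> "${urllist}" ; fi
    su "${thisuser}" -c "httrack ${step4} -*.orig.t* --mirror ${update_flag} -s0 -r3 -%e0 ${workdir}"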
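One design note on the link swap: unlink followed by ln -sf leaves a brief window in which ${outdir} does not exist, so a client hitting the web server at exactly that moment would see a missing path. If that matters, a common alternative (a sketch using GNU coreutils ln -sfn and mv -T, not what this commit does) is to create the link under a temporary name and rename it into place, which rename(2) performs atomically:

    ln -sfn "${subdir}" "${outdir}.new"
    mv -T "${outdir}.new" "${outdir}"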