summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- obsmirror.sh/description 1
-rwxr-xr-x obsmirror.sh/obsmirror.sh 80
2 files changed, 81 insertions, 0 deletions
diff --git a/obsmirror.sh/description b/obsmirror.sh/description
new file mode 100644
index 0000000..4cc4c7a
--- /dev/null
+++ b/obsmirror.sh/description
@@ -0,0 +1 @@
+Mirror an OBS repository
diff --git a/obsmirror.sh/obsmirror.sh b/obsmirror.sh/obsmirror.sh
new file mode 100755
index 0000000..eaf752e
--- /dev/null
+++ b/obsmirror.sh/obsmirror.sh
@@ -0,0 +1,80 @@
+#!/bin/sh
+# File: /etc/installed/obsmirror.sh
+# License: CC-BY-SA 4.0
+# Author: bgstack15
+# Startdate: 2020-01-05 18:01
+# Title: Script that scrapes down OBS site to serve a copy to intranet
+# Purpose: save down my OBS site so I can serve it locally
+# History:
+# Usage:
+# in a cron job: /etc/cron.d/mirror.cron
+# 50 12 * * * root /etc/installed/obsmirror.sh 1>/dev/null 2>&1
+# Reference:
+# https://unix.stackexchange.com/questions/114044/how-to-make-wget-download-recursive-combining-accept-with-exclude-directorie?rq=1
+# man 1 httrack
+# https://software.opensuse.org//download.html?project=home%3Abgstack15&package=freefilesync
+# Improve:
+# use some text file as a list of recently-synced URLs, and if today's URL matches a recent one, then run the httrack with the --update flag. Probably keep a running list forever.
+# Documentation:
+# Download the release key and trust it.
+# curl -s http://albion320.no-ip.biz/mirror/obs/Release.key | apt-key add -
+# Use a sources.list.d/ file with contents:
+# deb https://albion320.no-ip.biz/mirror/obs/ /
+# Dependencies:
+# binaries: curl httrack grep head tr sed awk chmod chown find rm ln su id sort wc du mkdir dirname unlink
+# user: obsmirror
+
+# learn site
+inurl="http://download.opensuse.org/repositories/home:/bgstack15/Debian_Unstable"
+workdir=/tmp/obs-stage
+outdir=/var/www/mirror/obs
+thisuser=obsmirror
+
+mkdir -p "${workdir}" ; chmod "0711" "${workdir}" ; chown "${thisuser}:$( id -Gn obsmirror )" "${workdir}"
+cd "${workdir}"
+# get page contents
+step1="$( curl -s -L "${inurl}/all" )"
+# get first listed package
+step2="$( echo "${step1}" | grep --color=always -oE 'href="[a-zA-Z0-9_.+\-]+\.deb"' | head -n1 | grep -oE '".*"' | tr -d '"' )"
+# get full url to a package
+step3="$( curl -s -I "${inurl}/all/${step2}" | awk '/Location:/ {print $2}' )"
+# get directory of the mirror to save down
+step4="$( echo "${step3}" | sed -r -e "s/all\/${step2}//;" -e 's/\s*$//;' )"
+echo "TARGET URL: ${step4}"
+# httrack flags: -s0 ignore robots.txt, -r3 only go down 3 links, -%e0 follow 0 links to external sites
+# the '-*.orig.t*' filter skips the orig.tar.gz files because they are large and slow down the sync process significantly
+# the URL, filter and path are quoted inside the command string so the shell started by su cannot glob or split them
+httrack_cmd="httrack \"${step4}\" '-*.orig.t*' --mirror --update -s0 -r3 -%e0 \"${workdir}\""
+test -z "${DRYRUN}" && {
+ echo su "${thisuser}" -c "${httrack_cmd}"
+ time su "${thisuser}" -c "${httrack_cmd}"
+}
+
+# find most recent directory of that level
+levelcount="$(( $( printf "%s" "${inurl}" | tr -dc '/' | wc -c ) - 1 ))"
+subdir="$( find "${workdir}" -mindepth "${levelcount}" -maxdepth "${levelcount}" -type d -name 'Debian_Unstable' -printf '%T@ %p\n' | sort -n -k1 | head -n1 | awk '{print $2}' )"
+
+# if the work directory actually synced
+if test -n "${subdir}" ;
+then
+ printf "%s " "DIRECTORY SIZE:"
+ du -sxBM "${subdir:-.}"
+ mkdir -p "$( dirname "${outdir}" )"
+ # get current target of symlink
+ current_target="$( find "${outdir}" -maxdepth 0 -type l -printf '%l\n' )"
+
+ # if the current link is pointing to a different directory than this subdir
+ if test "${current_target}" != "${subdir}" ;
+ then
+ # then replace it with a link to this one
+ test -L "${outdir}" && unlink "${outdir}"
+ echo ln -sf "${subdir}" "${outdir}"
+ ln -sf "${subdir}" "${outdir}"
+ fi
+
+else
+ echo "ERROR: No subdir found, so cannot update the symlink."
+fi
+
+# disable the index.html with all the httrack comments and original site links
+find "${workdir}" -iname '*index.html' -exec rm {} +
bgstack15