From b26ccc2019921aa399d801a5241dbdf21b8350c0 Mon Sep 17 00:00:00 2001 From: Niklas Meyer Date: Tue, 13 Aug 2024 15:59:57 +0200 Subject: [PATCH] unbound: fix healthcheck logging + added fail tolerance to checks (#6004) * unbound: fix healthcheck logging to stdout + rewrote healthcheck logic * compose: bump unbound tag * unbound: fixed healthcheck logic --- data/Dockerfiles/unbound/Dockerfile | 15 ++- data/Dockerfiles/unbound/healthcheck.sh | 128 ++++++++++++-------- data/Dockerfiles/unbound/stop-supervisor.sh | 10 ++ data/Dockerfiles/unbound/supervisord.conf | 32 +++++ data/Dockerfiles/unbound/syslog-ng.conf | 21 ++++ docker-compose.yml | 2 +- 6 files changed, 152 insertions(+), 56 deletions(-) create mode 100755 data/Dockerfiles/unbound/stop-supervisor.sh create mode 100644 data/Dockerfiles/unbound/supervisord.conf create mode 100644 data/Dockerfiles/unbound/syslog-ng.conf diff --git a/data/Dockerfiles/unbound/Dockerfile b/data/Dockerfiles/unbound/Dockerfile index 958d24e5f..fc7b14817 100644 --- a/data/Dockerfiles/unbound/Dockerfile +++ b/data/Dockerfiles/unbound/Dockerfile @@ -5,14 +5,17 @@ LABEL maintainer = "The Infrastructure Company GmbH " RUN apk add --update --no-cache \ curl \ bind-tools \ + coreutils \ unbound \ bash \ openssl \ drill \ tzdata \ + syslog-ng \ + supervisor \ && curl -o /etc/unbound/root.hints https://www.internic.net/domain/named.cache \ && chown root:unbound /etc/unbound \ - && adduser unbound tty \ + && adduser unbound tty \ && chmod 775 /etc/unbound EXPOSE 53/udp 53/tcp @@ -21,9 +24,13 @@ COPY docker-entrypoint.sh /docker-entrypoint.sh # healthcheck (dig, ping) COPY healthcheck.sh /healthcheck.sh +COPY syslog-ng.conf /etc/syslog-ng/syslog-ng.conf +COPY supervisord.conf /etc/supervisor/supervisord.conf +COPY stop-supervisor.sh /usr/local/sbin/stop-supervisor.sh + RUN chmod +x /healthcheck.sh -HEALTHCHECK --interval=30s --timeout=30s CMD [ "/healthcheck.sh" ] +HEALTHCHECK --interval=30s --timeout=10s \ + CMD sh -c '[ -f /tmp/healthcheck_status ] && [ "$(cat /tmp/healthcheck_status)" -eq 0 ] || exit 1' ENTRYPOINT ["/docker-entrypoint.sh"] - -CMD ["/usr/sbin/unbound"] +CMD exec /usr/bin/supervisord -c /etc/supervisor/supervisord.conf diff --git a/data/Dockerfiles/unbound/healthcheck.sh b/data/Dockerfiles/unbound/healthcheck.sh index 3da98e245..7d9181127 100644 --- a/data/Dockerfiles/unbound/healthcheck.sh +++ b/data/Dockerfiles/unbound/healthcheck.sh @@ -1,76 +1,102 @@ #!/bin/bash -# Skip Unbound (DNS Resolver) Healthchecks (NOT Recommended!) -if [[ "${SKIP_UNBOUND_HEALTHCHECK}" =~ ^([yY][eE][sS]|[yY])+$ ]]; then - SKIP_UNBOUND_HEALTHCHECK=y -fi +STATUS_FILE="/tmp/healthcheck_status" +RUNS=0 -# Reset logfile -echo "$(date +"%Y-%m-%d %H:%M:%S"): Starting health check - logs can be found in /var/log/healthcheck.log" -echo "$(date +"%Y-%m-%d %H:%M:%S"): Starting health check" > /var/log/healthcheck.log - -# Declare log function for logfile inside container -function log_to_file() { - echo "$(date +"%Y-%m-%d %H:%M:%S"): $1" >> /var/log/healthcheck.log +# Declare log function for logfile to stdout +function log_to_stdout() { +echo "$(date +"%Y-%m-%d %H:%M:%S"): $1" } # General Ping function to check general pingability function check_ping() { - declare -a ipstoping=("1.1.1.1" "8.8.8.8" "9.9.9.9") +declare -a ipstoping=("1.1.1.1" "8.8.8.8" "9.9.9.9") +local fail_tolerance=1 +local failures=0 - for ip in "${ipstoping[@]}" ; do - ping -q -c 3 -w 5 "$ip" - if [ $? -ne 0 ]; then - log_to_file "Healthcheck: Couldn't ping $ip for 5 seconds... Gave up!" - log_to_file "Please check your internet connection or firewall rules to fix this error, because a simple ping test should always go through from the unbound container!" - return 1 - fi +for ip in "${ipstoping[@]}" ; do + success=false + for ((i=1; i<=3; i++)); do + ping -q -c 3 -w 5 "$ip" > /dev/null + if [ $? -eq 0 ]; then + success=true + break + else + log_to_stdout "Healthcheck: Failed to ping $ip on attempt $i. Trying again..." + fi done + + if [ "$success" = false ]; then + log_to_stdout "Healthcheck: Couldn't ping $ip after 3 attempts. Marking this IP as failed." + ((failures++)) + fi +done + +if [ $failures -gt $fail_tolerance ]; then + log_to_stdout "Healthcheck: Too many ping failures ($fail_tolerance failures allowed, you got $failures failures), marking Healthcheck as unhealthy..." + return 1 +fi + +return 0 - log_to_file "Healthcheck: Ping Checks WORKING properly!" - return 0 } # General DNS Resolve Check against Unbound Resolver himself function check_dns() { - declare -a domains=("mailcow.email" "github.com" "hub.docker.com") +declare -a domains=("fuzzy.mailcow.email" "github.com" "hub.docker.com") +local fail_tolerance=1 +local failures=0 - for domain in "${domains[@]}" ; do - for ((i=1; i<=3; i++)); do - dig +short +timeout=2 +tries=1 "$domain" @127.0.0.1 > /dev/null - if [ $? -ne 0 ]; then - log_to_file "Healthcheck: DNS Resolution Failed on $i attempt! Trying again..." - if [ $i -eq 3 ]; then - log_to_file "Healthcheck: DNS Resolution not possible after $i attempts... Gave up!" - log_to_file "Maybe check your outbound firewall, as it needs to resolve DNS over TCP AND UDP!" - return 1 - fi +for domain in "${domains[@]}" ; do + success=false + for ((i=1; i<=3; i++)); do + dig_output=$(dig +short +timeout=2 +tries=1 "$domain" @127.0.0.1 2>/dev/null) + dig_rc=$? + + if [ $dig_rc -ne 0 ] || [ -z "$dig_output" ]; then + log_to_stdout "Healthcheck: DNS Resolution Failed on attempt $i for $domain! Trying again..." + else + success=true + break fi - done done - - log_to_file "Healthcheck: DNS Resolver WORKING properly!" - return 0 + if [ "$success" = false ]; then + log_to_stdout "Healthcheck: DNS Resolution not possible after 3 attempts for $domain... Gave up!" + ((failures++)) + fi +done + +if [ $failures -gt $fail_tolerance ]; then + log_to_stdout "Healthcheck: Too many DNS failures ($fail_tolerance failures allowed, you got $failures failures), marking Healthcheck as unhealthy..." + return 1 +fi + +return 0 } -if [[ ${SKIP_UNBOUND_HEALTHCHECK} == "y" ]]; then - log_to_file "Healthcheck: ALL CHECKS WERE SKIPPED! Unbound is healthy!" - exit 0 -fi +while true; do -# run checks, if check is not returning 0 (return value if check is ok), healthcheck will exit with 1 (marked in docker as unhealthy) -check_ping + if [[ ${SKIP_UNBOUND_HEALTHCHECK} == "y" ]]; then + log_to_stdout "Healthcheck: ALL CHECKS WERE SKIPPED! Unbound is healthy!" + echo "0" > $STATUS_FILE + sleep 365d + fi -if [ $? -ne 0 ]; then - exit 1 -fi + # run checks, if check is not returning 0 (return value if check is ok), healthcheck will exit with 1 (marked in docker as unhealthy) + check_ping + PING_STATUS=$? -check_dns + check_dns + DNS_STATUS=$? -if [ $? -ne 0 ]; then - exit 1 -fi + if [ $PING_STATUS -ne 0 ] || [ $DNS_STATUS -ne 0 ]; then + echo "1" > $STATUS_FILE -log_to_file "Healthcheck: ALL CHECKS WERE SUCCESSFUL! Unbound is healthy!" -exit 0 \ No newline at end of file + else + echo "0" > $STATUS_FILE + fi + + sleep 30 + +done \ No newline at end of file diff --git a/data/Dockerfiles/unbound/stop-supervisor.sh b/data/Dockerfiles/unbound/stop-supervisor.sh new file mode 100755 index 000000000..acd402738 --- /dev/null +++ b/data/Dockerfiles/unbound/stop-supervisor.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +printf "READY\n"; + +while read line; do + echo "Processing Event: $line" >&2; + kill -3 $(cat "/var/run/supervisord.pid") +done < /dev/stdin + +rm -rf /tmp/healthcheck_status \ No newline at end of file diff --git a/data/Dockerfiles/unbound/supervisord.conf b/data/Dockerfiles/unbound/supervisord.conf new file mode 100644 index 000000000..b47c8b11b --- /dev/null +++ b/data/Dockerfiles/unbound/supervisord.conf @@ -0,0 +1,32 @@ +[supervisord] +nodaemon=true +user=root +pidfile=/var/run/supervisord.pid + +[program:syslog-ng] +command=/usr/sbin/syslog-ng --foreground --no-caps +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +autostart=true + +[program:unbound] +command=/usr/sbin/unbound +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +autorestart=true + +[program:unbound-healthcheck] +command=/bin/bash /healthcheck.sh +stdout_logfile=/dev/stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr +stderr_logfile_maxbytes=0 +autorestart=true + +[eventlistener:processes] +command=/usr/local/sbin/stop-supervisor.sh +events=PROCESS_STATE_STOPPED, PROCESS_STATE_EXITED, PROCESS_STATE_FATAL diff --git a/data/Dockerfiles/unbound/syslog-ng.conf b/data/Dockerfiles/unbound/syslog-ng.conf new file mode 100644 index 000000000..de858f9e0 --- /dev/null +++ b/data/Dockerfiles/unbound/syslog-ng.conf @@ -0,0 +1,21 @@ +@version: 4.5 +@include "scl.conf" +options { + chain_hostnames(off); + flush_lines(0); + use_dns(no); + use_fqdn(no); + owner("root"); group("adm"); perm(0640); + stats(freq(0)); + keep_timestamp(no); + bad_hostname("^gconfd$"); +}; +source s_dgram { + unix-dgram("/dev/log"); + internal(); +}; +destination d_stdout { pipe("/dev/stdout"); }; +log { + source(s_dgram); + destination(d_stdout); +}; diff --git a/docker-compose.yml b/docker-compose.yml index 1df07ea15..59f417856 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,7 @@ services: unbound-mailcow: - image: mailcow/unbound:1.22 + image: mailcow/unbound:1.23 environment: - TZ=${TZ} - SKIP_UNBOUND_HEALTHCHECK=${SKIP_UNBOUND_HEALTHCHECK:-n}