#! /bin/bash

# chkconfig: 2345 01 99
# description: configures the crash dump kernel

# Distribution init-script helper library (presumably the source of the
# success/failure/action helpers used further down -- they are not defined
# in this file).
. /etc/init.d/functions

# Site-specific settings, when present, take precedence over the defaults
# assigned below.
if [[ -e /etc/sysconfig/kdump ]]; then
    . /etc/sysconfig/kdump
fi

# Path of the external time(1) binary, or empty when there is none on PATH.
# "type -P" forces a PATH search, so the bash "time" keyword is never
# returned and no separate which(1) utility is required.
TIME=$(type -P time)

# Defaults, overridden by /etc/sysconfig/kdump
KEXEC=${KEXEC:-"/usr/sbin/kexec"}
XCA=${XCA:-"/usr/lib64/xen/bin/xen-crashdump-analyser"}
XCA_CMDLINE_EXTRA=${XCA_CMDLINE_EXTRA:-"-v"}
# The kdump kernel is named like the running kernel, with "xen" replaced by
# "kdump" in the release string.
KDUMP_KERNEL_VERSION=${KDUMP_KERNEL_VERSION:-"$(uname -r | sed 's/xen/kdump/')"}
# Start from the command line the running kernel was booted with.
KDUMP_KERNEL_CMDLINE=${KDUMP_KERNEL_CMDLINE:-"$(< /proc/cmdline)"}
KDUMP_KERNEL_CMDLINE_EXTRA=${KDUMP_KERNEL_CMDLINE_EXTRA:-"udev.children-max=1 irqpoll maxcpus=1 reset_devices no-hlt panic=1 iommu=off nointremap"}
CRASH_LOG_DIR=${CRASH_LOG_DIR:-"/var/crash"}
CRASH_LOGIN=${CRASH_LOGIN:-"no"}
CRASH_REBOOT=${CRASH_REBOOT:-"yes"}
CRASH_MAX_LOGS=${CRASH_MAX_LOGS:-"4"}
# File whose only purpose is to be deleted at crash time to free disk space.
CRASH_SPACE=${CRASH_SPACE:-"$CRASH_LOG_DIR/.sacrificial-space-for-logs"}
CRASH_SPACE_SIZE_MB=${CRASH_SPACE_SIZE_MB:-64}

# Delete the oldest crash log directories so that, once the log about to be
# generated has been written, at most $CRASH_MAX_LOGS remain.  Does nothing
# when $CRASH_LOG_DIR is missing or $CRASH_MAX_LOGS is not a positive number.
#
# Globals read: CRASH_LOG_DIR, CRASH_MAX_LOGS
_kdump_prune_old_logs()
{
    [[ -d "$CRASH_LOG_DIR" && "$CRASH_MAX_LOGS" -gt 0 ]] || return 0

    local -a logs=()
    local entry excess

    # Log directories are named after their creation time (see crash_dump),
    # so a byte-wise sort lists them oldest first.  Reading line by line into
    # an array keeps each path intact instead of re-splitting it on
    # whitespace.
    while IFS= read -r entry; do
        logs+=("$entry")
    done < <(find "$CRASH_LOG_DIR/" -regextype emacs -maxdepth 1 -type d \
                 -regex "$CRASH_LOG_DIR/[0-9]+.*" | LC_ALL=C sort)

    # Number of directories to remove, leaving room for the soon-to-be
    # generated log.
    excess=$((1 + ${#logs[@]} - $CRASH_MAX_LOGS))

    if ((excess > 0)); then
        echo -n $"Removing older log directories: "
        if rm -rf -- "${logs[@]:0:excess}"; then
            success
        else
            failure
        fi
        echo
    fi
}

# Collect diagnostics from inside the kdump (crash) kernel, then optionally
# offer a single-user login and/or reboot.
#
# Globals read: CRASH_LOG_DIR, CRASH_MAX_LOGS, CRASH_SPACE, CRASH_LOGIN,
#               CRASH_REBOOT, XCA, XCA_CMDLINE_EXTRA, TIME
# Arguments:    none
# Outputs:      a new timestamped directory under $CRASH_LOG_DIR holding the
#               readelf, analyser, dmesg and lspci output; progress messages
#               on stdout
crash_dump()
{
    local xen_version="" linux_version=""
    local -a cmdline_words=()
    local word dir old_core_pattern=""

    # Recover the versions of the crashed Xen and dom0 kernel, which
    # setup_kdump appended to the kdump kernel command line.
    read -r -a cmdline_words < /proc/cmdline
    for word in "${cmdline_words[@]}"; do
        case "$word" in
            kdump-xenversion=*)
                xen_version=${word#kdump-xenversion=}
                ;;
            kdump-linuxversion=*)
                linux_version=${word#kdump-linuxversion=}
                ;;
        esac
    done

    # Clean up /var/lock/subsys.  We have crashed, so none of the rc3 programs
    # are actually running.  This will prevent rc6 from trying to shut them down.
    rm -f /var/lock/subsys/*

    # Do we need to consider removing some logs?
    _kdump_prune_old_logs

    # Delete the sacrificial space.  Experimentally, a crash which fills
    # the filesystem causes many mistruths to be given by stat &
    # friends.  To make things easy, unconditionally delete the file
    # here to be sure we have at least $CRASH_SPACE_SIZE_MB free.  -f keeps
    # this quiet when the file was never created.
    rm -f -- "$CRASH_SPACE"

    dir=$CRASH_LOG_DIR/$(date +%Y%m%d-%H%M%S-%Z)

    # Try to set the core pattern sensibly, so that anything dumping core
    # while we work leaves its core file next to the other logs.
    if [[ -f /proc/sys/kernel/core_pattern ]]; then
        old_core_pattern=$(< /proc/sys/kernel/core_pattern)
        echo "$dir/core.%e.%p" > /proc/sys/kernel/core_pattern
        ulimit -c 16384 # 16M
    fi

    # Collect some initial information
    echo -n $"Collecting initial information: "
    mkdir -p "$dir"
    readelf -Wl /proc/vmcore &> "$dir/readelf-Wl.out"
    readelf -Wn /proc/vmcore &> "$dir/readelf-Wn.out"
    sync
    success; echo
    echo s > /proc/sysrq-trigger    # sysrq "s": emergency sync

    # Run xen-crashdump-analyser
    if [[ -e "$XCA" ]]; then
        echo -n $"Running xen-crashdump-analyser: "

        # An array keeps every path a single argument.  XCA_CMDLINE_EXTRA is
        # deliberately unquoted: it may carry several options.
        local -a xca_cmd=("$XCA" --outdir "$dir"
                          --xen-symtab "/boot/xen-$xen_version.map"
                          --dom0-symtab "/boot/System.map-$linux_version"
                          $XCA_CMDLINE_EXTRA)

        # Do we have gnu time utility
        if [[ -n "$TIME" ]]; then
            xca_cmd=("$TIME" -v -o "$dir/time-v.out" "${xca_cmd[@]}")
        fi

        # Explicit if/else: with "cmd && success || failure" a failing
        # "success" would also run "failure".
        if "${xca_cmd[@]}"; then
            success
        else
            failure
        fi
        echo
        sync
        echo s > /proc/sysrq-trigger
    else
        echo -n $"Xen Crashdump Analyser not found"; failure; echo
    fi

    # Collect some subsequent information, syncing after every step so that
    # a later hang loses as little as possible.
    echo -n $"Collecting subsequent information: "
    dmesg &> "$dir/dmesg.kexec.log"
    sync
    lspci -tv &> "$dir/lspci-tv.out"
    sync
    lspci -vv &> "$dir/lspci-vv.out"
    sync
    lspci -vvxxxx &> "$dir/lspci-vvxxxx.out"
    sync
    success; echo
    echo s > /proc/sysrq-trigger

    # Return core pattern to previous
    if [[ -f /proc/sys/kernel/core_pattern ]]; then
        echo "$old_core_pattern" > /proc/sys/kernel/core_pattern
    fi

    # If this shell is not interactive, consider sulogin or reboot following
    # analysis.  $- is a list of option letters, so look for "i" anywhere in
    # it rather than comparing the whole string.
    if [[ "$-" != *i* ]]; then
        if [[ "$CRASH_LOGIN" == "yes" ]]; then
            sulogin
        fi

        if [[ "$CRASH_REBOOT" == "yes" ]]; then
            echo "Rebooting"
            reboot
            # Still running after reboot returned: fall back to a forced
            # reboot after unmounting and syncing what we can.
            echo "Clean shutdown failed, forcing reboot"
            umount -ar
            sync
            echo s > /proc/sysrq-trigger
            reboot -f
        fi
    fi
}

# Prepare and load the kdump crash kernel with kexec.
#
# Works out the running Xen and dom0 kernel versions, translates Xen's serial
# console settings into Linux console parameters, reserves the sacrificial
# crash space and finally loads the crash kernel.
#
# Globals read:    KDUMP_KERNEL_VERSION, KDUMP_KERNEL_CMDLINE_EXTRA, KEXEC,
#                  CRASH_SPACE, CRASH_SPACE_SIZE_MB
# Globals written: KDUMP_KERNEL_CMDLINE, KEXEC_SERIAL_OPTIONS, KERNEL, INITRD
# Arguments:       none
# Returns:         0 when the crash kernel was loaded; 1 when the kernel or
#                  initrd is missing, the crash space is not a regular file,
#                  or kexec fails
setup_kdump()
{
    local xen_version="" xen_major xen_minor xen_extra xen_symfile
    local dom0_kernel_version dom0_symfile

    logger -t kdump $"Setting up crash kernel:"

    # Find the current hypervisor version
    if [[ -d "/sys/hypervisor/version/" ]]; then
        xen_major=$(< /sys/hypervisor/version/major)
        xen_minor=$(< /sys/hypervisor/version/minor)
        xen_extra=$(< /sys/hypervisor/version/extra)
        xen_version="${xen_major}.${xen_minor}${xen_extra}"
    else
        # Fall back to reading the boot symlink for a version.  "sed -n ... p"
        # prints nothing unless the link target really is named
        # xen-<version>.gz, so a missing or oddly named link cannot leak a
        # raw path into the version string.
        xen_version=$(readlink -f /boot/xen.gz | sed -n 's!^.*/xen-\(.*\)\.gz$!\1!p')
        if [[ -h /boot/xen.gz && -n "$xen_version" ]]; then
            logger -t kdump $"Warning: Falling back to symlink version of xen '$xen_version'"
        else
            xen_version=""
            logger -t kdump $"Warning: can't find Xen version"
        fi
    fi

    # Check for xen symbol file
    xen_symfile="/boot/xen-$xen_version.map"
    [[ -e "$xen_symfile" ]] \
        || logger -t kdump $"Warning: can't find Xen symbol file '$xen_symfile'"

    # Find the current dom0 kernel version
    dom0_kernel_version=$(uname -r)

    # Check for the dom0 symbol file
    dom0_symfile="/boot/System.map-$dom0_kernel_version"
    [[ -e "$dom0_symfile" ]] \
        || logger -t kdump $"Warning: can't find Dom0 symbol file '$dom0_symfile'"

    # Convert xen serial console parameters to kdump serial parameters
    if [[ -n "$KDUMP_KERNEL_CMDLINE" ]]; then
        local serialcon="" com1="" com2="" xen_console="" xsc=""
        local port="" xenparam="" baud="" data="" par="" details="" ioport=""
        local xen_line word
        local -a xen_words=()
        # Regexes are kept in variables so they are used unquoted by =~.
        local re_com='.*(com[12])'            # last comN named by console=
        local re_baud='^([0-9]+)'             # leading baud rate
        local re_frame='^[^,]*,([0-9])(.?)'   # data bits, then parity

        # XXX HACK  xl currently bails if it can't stat /var/run/xenstored.pid, even
        # though it doesn't use xenstored
        touch /var/run/xenstored.pid
        xen_line=$(xl info | grep xen_commandline)
        read -r -a xen_words <<< "$xen_line"

        # First gather every option of interest, so that the result does not
        # depend on the order in which they appear.  The first two words are
        # the "xen_commandline" label and ":".
        for word in "${xen_words[@]:2}"; do
            case "$word" in
                com1=*)
                    com1=${word#com1=}
                    ;;
                com2=*)
                    com2=${word#com2=}
                    ;;
                console=*)
                    # As before, only the first console= option is used.
                    [[ -n "$xen_console" ]] || xen_console=$word
                    ;;
            esac
        done

        # Then resolve the serial port the console option refers to.
        if [[ "$xen_console" =~ $re_com ]]; then
            xsc=${BASH_REMATCH[1]}
            case "$xsc" in
                com1)
                    port=ttyS0
                    xenparam=$com1
                    ;;
                com2)
                    port=ttyS1
                    xenparam=$com2
                    ;;
            esac

            if [[ "$xenparam" =~ $re_baud ]]; then
                baud=${BASH_REMATCH[1]}
            fi
            if [[ "$xenparam" =~ $re_frame ]]; then
                data=${BASH_REMATCH[1]}
                par=${BASH_REMATCH[2]}
            fi

            # Xen writes <baud>,<data><parity>...; Linux wants
            # <baud><parity><data>.
            details="$port,$baud$par$data"
            serialcon="earlyprintk=serial,$details console=$details"

            # I/O port of the chosen UART, as listed in sysfs (empty when
            # the resources file is missing).
            ioport=$(sed -n '/^io/ s/io \(0x[0-9a-f]\+\).*/\1/ p' \
                "/sys/class/tty/$port/device/resources" 2>/dev/null)
        fi

        # Only a command line that carries xencons= is rewritten: drop that
        # option and swap the hvc0 console for the serial one.
        if [[ "$KDUMP_KERNEL_CMDLINE" == *xencons=* ]]; then
            KDUMP_KERNEL_CMDLINE=$(printf '%s\n' "$KDUMP_KERNEL_CMDLINE" \
                | sed -e "s/xencons=[^ ]*//; s/console=hvc0/$serialcon/; s/  / /g")
        fi

        # If we have an IO port and baud rate, enable serial in purgatory
        if [[ -n "$ioport" && -n "$baud" ]]; then
            KEXEC_SERIAL_OPTIONS="--console-serial --serial=$ioport --serial-baud=$baud"
        fi
    fi

    # Append xen and dom0 version to kdump command line; crash_dump reads
    # them back from /proc/cmdline.
    if [[ -n "$xen_version" ]]; then
        KDUMP_KERNEL_CMDLINE="$KDUMP_KERNEL_CMDLINE kdump-xenversion=$xen_version"
    fi
    if [[ -n "$dom0_kernel_version" ]]; then
        KDUMP_KERNEL_CMDLINE="$KDUMP_KERNEL_CMDLINE kdump-linuxversion=$dom0_kernel_version"
    fi

    # Append extra arguments
    KDUMP_KERNEL_CMDLINE="$KDUMP_KERNEL_CMDLINE $KDUMP_KERNEL_CMDLINE_EXTRA"
    KERNEL="/boot/vmlinuz-${KDUMP_KERNEL_VERSION}"
    INITRD="/boot/initrd-${KDUMP_KERNEL_VERSION}.img"

    # Check we have a kernel
    if [[ ! -f "$KERNEL" ]]; then
        echo -n $"No kdump kernel image found"; failure; echo
        logger -t kdump $"Error: Unable to locate kdump kernel '$KERNEL'"
        return 1
    fi

    # Check we have an initrd
    if [[ ! -f "$INITRD" ]]; then
        echo -n $"No kdump initrd found"; failure; echo
        logger -t kdump $"Error: Unable to locate kdump initrd '$INITRD'"
        return 1
    fi

    # Log some information
    logger -t kdump $"Crash kernel: $KERNEL"
    logger -t kdump $"Crash ramdisk: $INITRD"
    logger -t kdump $"Crash kernel command line: $KDUMP_KERNEL_CMDLINE"

    # Set up sacrificial crash space.
    if [[ ! -e "$CRASH_SPACE" ]]; then
        # If file does not exist, create it.  On error, such as a full
        # filesystem, remove the crash space to give breathing room for
        # services like ovs to start back up.
        dd if=/dev/zero of="$CRASH_SPACE" bs=1M count="$CRASH_SPACE_SIZE_MB" &> /dev/null \
            || { rm -f "$CRASH_SPACE"; touch "$CRASH_SPACE.failed"; }
    elif [[ ! -f "$CRASH_SPACE" ]]; then
        # If file does exist, verify it is regular
        echo -n $"Crash space exists, but is not regular file"; failure; echo
        return 1
    fi

    # Actually try to load the crash kernel.  KEXEC_SERIAL_OPTIONS is
    # deliberately unquoted: it holds several options, or nothing at all.
    if action $"Loading crash kernel:" \
        "$KEXEC" -p --elf64-core-headers $KEXEC_SERIAL_OPTIONS \
        --append="$KDUMP_KERNEL_CMDLINE" --ramdisk="$INITRD" "$KERNEL"; then
        logger -t kdump $"Loaded crash kernel"
        return 0
    fi

    logger -t kdump $"Error: Failed to load crash kernel"
    return 1
}

# Dispatch on the requested action.  The presence of /proc/vmcore is used to
# tell whether we are running inside the crash kernel.
case "$1" in
    start)
        if [[ ! -f /proc/vmcore ]]; then
            setup_kdump
        else
            echo -n "kdump start: already in crash"; failure; echo
        fi
        ;;
    crash_dump)
        if [[ -f /proc/vmcore ]]; then
            crash_dump
        else
            echo -n "kdump crash_dump: cannot find /proc/vmcore"; failure; echo
        fi
        ;;
    stop)
        # Quoted like the load command in setup_kdump, so that both treat
        # $KEXEC as a single path.
        action $"Unloading crash kernel: " "$KEXEC" -p -u
        ;;
    status)
        # Nothing is reported for this action.
        ;;
    restart)
        # Re-run this script for each step; quoting keeps a script path
        # containing whitespace intact.
        "$0" stop
        "$0" start
        ;;
    *)
        echo $"Usage: $0 {start|stop|status|restart}"
        exit 1
        ;;
esac

# Propagate the status of the last command run by the selected action.
exit $?
