Kdump之kdump分析

都说Kdump是基于kexec机制工作的,但关于Kdump到底是怎么实现的,
比如第二个内核是怎么加载到预留的具体位置的,第一个内核crash后怎么把需要的elfcorehdr和memmap参数传给第二个内核,另外第二个内核是怎么调用makedumpfile来过滤、压缩内存页的,网上一些资料给的都太概括了,还没找到相关的分析。看了下代码,有了个大概,可能部分理解有误,欢迎拍砖和探讨.

先看一张图,这个是网上找到的Vivek Goyal的PPT中两幅图,这里合成一张了

KEXEC的设计是用新内核去覆盖原内核位置;而KDUMP是预留一块内存来加载第二个内核(和相关数据),Crash后第二个内核就在这块预留内存中原地运行,而不去覆盖第一个内核(不然就达不到相关目的了),并收集第一个内核的相关内存信息。在KDUMP中kexec算是一个引导器,类似GRUB(2)。真正的实现是在kexec-tools中,对于RH系列,相关的kexec-tools RPM包中除了封装相关程序外,还有个/etc/rc.d/init.d/kdump shell脚本来负责将相关工具粘在一起

下面来说下大致流程:
1).第一个内核以crashkernel参数启动后,内核解析此crashkernel命令行选项,将选项值存放到crashk_res中,并预留相关内存区域

/* crashkernel=size@addr specifies the location to reserve for
 * a crash kernel. By reserving this memory we guarantee
 * that linux never sets it up as a DMA target.
 * Useful for holding code to do something appropriate
 * after a kernel panic.
 */

/* Location of the reserved area for the crash kernel */

struct resource crashk_res = {// start/end are 0 here; parse_crashkernel() stores the crashkernel= range into them

    .name  = "Crash kernel",
    .start = 0,
    .end   = 0,
    .flags = IORESOURCE_BUSY | IORESOURCE_MEM
};// Presumably this ends up holding the range given on the command line and shown in /proc/iomem, e.g. "0x1000000-0x7ffffff : Crash kernel"

/*
 * Handler for the "crashkernel=" boot option.
 *
 * The option text is read with memparse() as a size which may be followed
 * by '@' and a base address.  The range is recorded in crashk_res only when
 * the '@' part is present; a bare size is parsed and then ignored.
 *
 * Always returns 0.
 */
static int __init parse_crashkernel(char *arg)
{
    unsigned long len, start;

    len = memparse(arg, &arg);
    if (*arg != '@')
        return 0;

    /* FIXME (kept from the original): the range is not sanity-checked. */
    start = memparse(arg + 1, &arg);

    /* The resource end is inclusive, hence the -1. */
    crashk_res.start = start;
    crashk_res.end = start + len - 1;
    return 0;
}
/* Hooks parse_crashkernel up to "crashkernel"; per the article's note the entry is placed in .init.setup. */
early_param("crashkernel", parse_crashkernel);


/etc/init.d/kdump start启动时(只摘录部分相关的)


# Copy /proc/vmcore into a time-stamped directory.
#
# The target directory is "<path>/<YYYY-MM-DD-HH:MM>", where <path> is taken
# from the "path" line of $KDUMP_CONFIG_FILE, or /var/crash when there is no
# such line.  The dump is first written as "vmcore-incomplete" and renamed to
# "vmcore" only after cp succeeded.
#
# Sets the globals $coredir and $exitcode.  Returns the exit status of cp.
function save_core()
{
    local kdump_path
    local stamp

    # Quote the config file name: an empty value must not make grep read stdin.
    kdump_path=$(grep ^path "$KDUMP_CONFIG_FILE" | cut -d' ' -f2-)
    stamp=$(date +"%Y-%m-%d-%H:%M")
    if [ -z "$kdump_path" ]; then
        coredir="/var/crash/${stamp}"
    else
        coredir="${kdump_path}/${stamp}"
    fi

    # $coredir is quoted everywhere so a configured path containing
    # whitespace is not split into several arguments.
    mkdir -p "$coredir"
    cp --sparse=always /proc/vmcore "$coredir/vmcore-incomplete"
    exitcode=$?
    if [ "$exitcode" -eq 0 ]; then
        mv "$coredir/vmcore-incomplete" "$coredir/vmcore"
        # $LOGGER is left unquoted on purpose: it is expanded as a command line.
        $LOGGER "saved a vmcore to $coredir"
    else
        $LOGGER "failed to save a vmcore to $coredir"
    fi
    return $exitcode
}
# Build the command line for the capture kernel and load that kernel with
# $KEXEC.
#
# Reads:  KDUMP_COMMANDLINE, KDUMP_COMMANDLINE_APPEND,
#         KDUMP_IDE_NOPROBE_COMMANDLINE, KEXEC, KEXEC_ARGS,
#         standard_kexec_args, kdump_initrd, kdump_kernel, LOGGER.
# Writes: KDUMP_COMMANDLINE, KEXEC_ARGS, ARCH, MEM_RESERVED, KEXEC_OUTPUT.
# Returns 0 when $KEXEC succeeded, 1 when no crash memory is reserved or
# $KEXEC failed.
function load_kdump()
{

    # Fall back to the running kernel's command line when none is configured.
    if [ -z "$KDUMP_COMMANDLINE" ]
    then
        KDUMP_COMMANDLINE=`cat /proc/cmdline`
    fi

    # Check that memory was reserved for the crash kernel: on ppc64 by looking
    # for crashkernel=<size>@<offset> in /proc/cmdline, elsewhere by looking
    # for a "Crash kernel" entry in /proc/iomem whose range is not all zeroes.
    ARCH=`uname -m`
    if [ "$ARCH" == "ppc64" ]
    then
        MEM_RESERVED=`grep "crashkernel=[0-9]\+[MmKkGg]@[0-9]\+[MmGgKk]" /proc/cmdline`
    else
        MEM_RESERVED=`grep "Crash kernel" /proc/iomem | grep -v "00000000-00000000"`
    fi
    if [ -z "$MEM_RESERVED" ]
    then
        $LOGGER "No crashkernel parameter specified for running kernel"
        return 1
    fi

    # On 32-bit x86 pick the ELF core header width.  need_64bit_headers is
    # defined outside this excerpt; a return value of 1 selects elf64.
    # An explicit choice already present in KEXEC_ARGS is left alone (with a
    # warning when it is elf32 while elf64 was detected).
    if [ "$ARCH" == "i686" -o "$ARCH" == "i386" ]
    then

        need_64bit_headers
        if [ $? == 1 ]
        then
            FOUND_ELF_ARGS=`echo $KEXEC_ARGS | grep elf32-core-headers`
            if [ -n "$FOUND_ELF_ARGS" ]
            then
                echo -n "Warning: elf32-core-headers overrides correct elf64 setting"
                warning
                echo
            else
                KEXEC_ARGS="$KEXEC_ARGS --elf64-core-headers"
            fi
        else
            FOUND_ELF_ARGS=`echo $KEXEC_ARGS | grep elf64-core-headers`
            if [ -z "$FOUND_ELF_ARGS" ]
            then
                KEXEC_ARGS="$KEXEC_ARGS --elf32-core-headers"
            fi
        fi
    fi

    # Strip options from the command line: crashkernel=, mem=, hugepages=
    # and hugepagesz=.
    # NOTE(review): the first pattern only matches crashkernel=<size>@<offset>;
    # other spellings of crashkernel= would be left in place - confirm.
    KDUMP_COMMANDLINE=`echo $KDUMP_COMMANDLINE | sed -e 's/crashkernel=[0-9]\+[MmKkGg]@[0-9]\+[MmGgKk]//'`
    KDUMP_COMMANDLINE=`echo $KDUMP_COMMANDLINE | sed -e's/mem=[0-9]\+[GMKgmk]* *//'`
    KDUMP_COMMANDLINE=`echo $KDUMP_COMMANDLINE | sed -e's/hugepages=[0-9]\+ */ /g' -e's/hugepagesz=[0-9]\+[kKmMgG]* */ /g'`

    # Append the configured extra options, then the IDE no-probe options
    # (presumably computed by avoid_cdrom_drive - verify in the full script).
    KDUMP_COMMANDLINE="${KDUMP_COMMANDLINE} ${KDUMP_COMMANDLINE_APPEND}"
    avoid_cdrom_drive
    KDUMP_COMMANDLINE="${KDUMP_COMMANDLINE} ${KDUMP_IDE_NOPROBE_COMMANDLINE}"

# This is the most important part: the actual kexec invocation that loads
# the capture kernel and its initrd.

    KEXEC_OUTPUT=`$KEXEC $KEXEC_ARGS $standard_kexec_args \
        --command-line="$KDUMP_COMMANDLINE" \
        --initrd=$kdump_initrd $kdump_kernel 2>&1`
    if [ $? == 0 ]; then
        $LOGGER "kexec: loaded kdump kernel"
        return 0
    else
        $LOGGER $KEXEC_OUTPUT
        $LOGGER "kexec: failed to load kdump kernel"
        return 1
    fi
}

# Start the kdump service in the running kernel.
#
# Calls status first: a result of 2 is reported as "not supported" (return 1)
# and a result of 0 as "already running" (return 0).  Otherwise the
# configuration is checked and the capture kernel is loaded; either step
# failing is reported and logged, and 1 is returned.
# Sets the global $rc to the result of status.
function start()
{
    #TODO check raw partition for core dump image

    status
    rc=$?
    case "$rc" in
        2)
            echo -n "Kdump is not supported on this kernel"; failure; echo
            return 1
            ;;
        0)
            echo -n "Kdump already running"; success; echo
            return 0
            ;;
    esac

    if ! check_config; then
        echo -n "Starting kdump:"; failure; echo
        $LOGGER "failed to start up, config file incorrect"
        return 1
    fi

    if ! load_kdump; then
        echo -n "Starting kdump:"; failure; echo
        $LOGGER "failed to start up"
        return 1
    fi

    echo -n "Starting kdump:"; success; echo
    $LOGGER "started up"
}

# Dispatch on the init-script action (only the "start" branch is excerpted).
case "$1" in
  start)
    if [ -s /proc/vmcore ]; then # Taken once the second (capture) kernel has booted: /proc/vmcore is non-empty

        run_kdump_pre
        save_core
        run_kdump_post $?
        do_final_action
    else # Taken initially, in the first kernel: load the capture kernel

        start
    fi
    ;;


最后是调用如下形式

kexec --args-linux --elf32(64)-core-headers -p --command-line="$KDUMP_COMMANDLINE" --initrd=$kdump_initrd $kdump_kernel

其中commandline是在配置文件中手动设置的或者从/proc/cmdline得到


这个就到了上次分析kexec的代码了,注意此处是以-p来调用的


/*
 * Load an x86 ELF kernel image and prepare its boot arguments.
 *
 * argc/argv: command-line options, parsed here with getopt_long() for
 *            --command-line/--append, --reuse-cmdline, --initrd/--ramdisk
 *            and --args-elf/--args-linux/--args-none.
 * buf/len:   passed to elf_exec_build_load() as the ELF executable to load.
 * info:      kexec state; info->kexec_flags selects the crash-dump handling,
 *            info->rhdr receives the purgatory image and info->entry is set
 *            directly only for --args-none.
 *
 * Returns 0 on success, -1 after usage() for an unrecognised option or when
 * load_crashdump_segments() fails.  Other fatal conditions go through die().
 */
int elf_x86_load(int argc, char **argv, const char *buf, off_t len,
    struct kexec_info *info)

{
    struct mem_ehdr ehdr;
    const char *command_line;
    char *modified_cmdline;
    int command_line_len;
    int modified_cmdline_len;
    const char *ramdisk;
    unsigned long entry, max_addr;
    int arg_style;
#define ARG_STYLE_ELF 0
#define ARG_STYLE_LINUX 1
#define ARG_STYLE_NONE 2
    int opt;
#define OPT_APPEND        (OPT_ARCH_MAX+0)
#define OPT_REUSE_CMDLINE    (OPT_ARCH_MAX+1)
#define OPT_RAMDISK        (OPT_ARCH_MAX+2)
#define OPT_ARGS_ELF     (OPT_ARCH_MAX+3)
#define OPT_ARGS_LINUX     (OPT_ARCH_MAX+4)
#define OPT_ARGS_NONE     (OPT_ARCH_MAX+5)

    /* Long-option table for getopt_long(); see http://xuwenzhang.org/blog/tag/getopt_long/ (noted by peter.guo) */
    static const struct option options[] = {

        KEXEC_ARCH_OPTIONS
        { "command-line",    1, NULL, OPT_APPEND },
        { "append",        1, NULL, OPT_APPEND },
        /* NOTE(review): has_arg is 1 here, yet the OPT_REUSE_CMDLINE handler
         * below never reads optarg - confirm whether an argument is intended. */
        { "reuse-cmdline",    1, NULL, OPT_REUSE_CMDLINE },
        { "initrd",        1, NULL, OPT_RAMDISK },
        { "ramdisk",        1, NULL, OPT_RAMDISK },
        { "args-elf",        0, NULL, OPT_ARGS_ELF },
        { "args-linux",        0, NULL, OPT_ARGS_LINUX },
        { "args-none",        0, NULL, OPT_ARGS_NONE },
        { 0,             0, NULL, 0 },
    };

    static const char short_options[] = KEXEC_OPT_STR "";

    /*
     * Parse the command line arguments
     */

    arg_style = ARG_STYLE_ELF;
    command_line = 0;
    modified_cmdline = 0;
    modified_cmdline_len = 0;
    ramdisk = 0;
    while((opt = getopt_long(argc, argv, short_options, options, 0)) != -1) {
/* getopt_long comes from the GNU family of option parsers */
        switch(opt) {
        default:
            /* Ignore core options */
            if (opt < OPT_ARCH_MAX) {
                break;
            }
            /* Otherwise falls through to the usage error below
             * (NOTE(review): looks intentional - confirm). */
        case '?':
            usage();
            return -1;
        case OPT_APPEND:/* taken in the kdump case: the init script passes --command-line */
            command_line = optarg;
            break;
        case OPT_REUSE_CMDLINE:
            command_line = get_command_line();
            break;
        case OPT_RAMDISK: /* taken in the kdump case: the init script passes --initrd */

            ramdisk = optarg;
            break;
        case OPT_ARGS_ELF:
            arg_style = ARG_STYLE_ELF;
            break;
        case OPT_ARGS_LINUX:/* taken in the kdump case: the article's invocation passes --args-linux */

            arg_style = ARG_STYLE_LINUX;
            break;
        case OPT_ARGS_NONE:
#ifdef __i386__
            arg_style = ARG_STYLE_NONE;
#else
            die("--args-none only works on arch i386\n");
#endif
            break;
        }
    }
    /* command_line_len counts the terminating NUL; it stays 0 without a command line. */
    command_line_len = 0;
    if (command_line) {
        command_line_len = strlen(command_line) +1;
    }

    /* Need to append some command line parameters internally in case of
     * taking crash dumps.
     */

    if (info->kexec_flags & (KEXEC_ON_CRASH|KEXEC_PRESERVE_CONTEXT)) {
        modified_cmdline = xmalloc(COMMAND_LINE_SIZE);/* allocate a fresh buffer to hold the command line */

        memset((void *)modified_cmdline, 0, COMMAND_LINE_SIZE);
        if (command_line) {
            /* strncpy may leave the buffer unterminated, so the last byte is forced to NUL. */
            strncpy(modified_cmdline, command_line,
                        COMMAND_LINE_SIZE);
            modified_cmdline[COMMAND_LINE_SIZE - 1] = '\0';
        }
        modified_cmdline_len = strlen(modified_cmdline);
    }

    /* Load the ELF executable */
    elf_exec_build_load(info, &ehdr, buf, len, 0);


    entry = ehdr.e_entry;
    max_addr = elf_max_addr(&ehdr);

    /* Do we want arguments? */
    if (arg_style != ARG_STYLE_NONE) {

        /* Load the setup code (purgatory) - the article flags this as worth a closer look */

        elf_rel_build_load(info, &info->rhdr, (char *) purgatory, purgatory_size,
            0, ULONG_MAX, 1, 0);
    }
    if (arg_style == ARG_STYLE_NONE) {
        /* No arguments: enter the image directly at its ELF entry point. */
        info->entry = (void *)entry;

    }
    else if (arg_style == ARG_STYLE_ELF) {
        unsigned long note_base;
        struct entry32_regs regs;
        uint32_t arg1, arg2;

        /* Setup the ELF boot notes */
        note_base = elf_boot_notes(info, max_addr,
            (unsigned char *) command_line, command_line_len);

        /* Initialize the stack arguments */
        arg2 = 0; /* No return address */
        arg1 = note_base;
        elf_rel_set_symbol(&info->rhdr, "stack_arg32_1", &arg1, sizeof(arg1));
        elf_rel_set_symbol(&info->rhdr, "stack_arg32_2", &arg2, sizeof(arg2));

        /* Initialize the registers */
        elf_rel_get_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));
        regs.eip = entry; /* The entry point */
        regs.esp = elf_rel_get_addr(&info->rhdr, "stack_arg32_2");
        elf_rel_set_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));

        if (ramdisk) {
            die("Ramdisks not supported with generic elf arguments");
        }
    }
    else if (arg_style == ARG_STYLE_LINUX) {/* the branch followed in this walkthrough */

        struct x86_linux_faked_param_header *hdr;
        unsigned long param_base;
        const unsigned char *ramdisk_buf;
        off_t ramdisk_length;
        struct entry32_regs regs;
        int rc = 0;

        /* Get the linux parameter header */
        hdr = xmalloc(sizeof(*hdr));

        /* Hack: With some ld versions, vmlinux program headers show
         * a gap of two pages between bss segment and data segment
         * but effectively kernel considers it as bss segment and
         * overwrites the any data placed there. Hence bloat the
         * memsz of parameter segment to 16K to avoid being placed
         * in such gaps.
         * This is a makeshift solution until it is fixed in kernel
         */

        param_base = add_buffer(info, hdr, sizeof(*hdr), 16*1024,
            16, 0, max_addr, 1);

        /* Initialize the parameter header */
        memset(hdr, 0, sizeof(*hdr));
        init_linux_parameters(&hdr->hdr);

        /* Add a ramdisk to the current image */
        ramdisk_buf = NULL;
        ramdisk_length = 0;
        if (ramdisk) {
            ramdisk_buf = (unsigned char *) slurp_file(ramdisk, &ramdisk_length);
        }

        /* If panic kernel is being loaded, additional segments need
         * to be created. */

        if (info->kexec_flags & (KEXEC_ON_CRASH|KEXEC_PRESERVE_CONTEXT)) {


/*
 * Example of a resulting second-kernel command line (from the article):
 *
 * Command line: ro root=LABEL=/ rhgb quiet irqpoll maxcpus=1 reset_devices memmap=exactmap  memmap=640K@0K  memmap=5264K@16384K
 * memmap=125152K@22288K  elfcorehdr=147440K (0x8ffc000)
 * memmap=56K#1834688K  memmap=136K#1834744K memmap=128K#1834880K memmap=1024K$4193280K
 *
 * In the last line '#' marks a range as ACPI data and '$' marks a range as
 * reserved; the author could not find where those entries are generated
 * in this code.
 *
 * The call below obtains the memmap= and elfcorehdr= parameters and stores
 * them in the new command line.
 */

            rc = load_crashdump_segments(info, modified_cmdline,
                        max_addr, 0);
         if (rc < 0)
                return -1;
            /* Use new command line. */
            command_line = modified_cmdline;
            command_line_len = strlen(modified_cmdline) + 1;
        }

        /* Tell the kernel what is going on */
        setup_linux_bootloader_parameters(info, &hdr->hdr, param_base,
            offsetof(struct x86_linux_faked_param_header, command_line),
            command_line, command_line_len,
            ramdisk_buf, ramdisk_length);/* the (possibly modified) command line is handed over here */


        /* Fill in the information bios calls would usually provide */
        setup_linux_system_parameters(&hdr->hdr, info->kexec_flags);

        /* Initialize the registers */
        elf_rel_get_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));
        regs.ebx = 0;        /* Bootstrap processor */
        regs.esi = param_base;    /* Pointer to the parameters */
        regs.eip = entry;    /* The entry point */
        regs.esp = elf_rel_get_addr(&info->rhdr, "stack_end"); /* Stack, unused */
        elf_rel_set_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));
    }
    else {
        die("Unknown argument style\n");
    }
    return 0;
}



/* Loads additional segments in case of a panic kernel is being loaded.
 * One segment for backup region, another segment for storing elf headers
 * for crash memory image.
 *
 * info:        kexec state; kexec_flags is consulted and backup_start is set.
 * mod_cmdline: command-line buffer handed to cmdline_add_memmap() and
 *              cmdline_add_elfcorehdr().
 * max_addr:    upper bound passed to add_buffer() for both segments.
 * min_base:    lower bound passed to add_buffer() for the elf header segment.
 *
 * Returns 0 on success, -1 when a helper reports failure.
 */

int load_crashdump_segments(struct kexec_info *info, char* mod_cmdline,
                unsigned long max_addr, unsigned long min_base)

{
    void *tmp;
    unsigned long sz, elfcorehdr;
    int nr_ranges, align = 1024;
    struct memory_range *mem_range, *memmap_p;

    if (get_crash_memory_ranges(&mem_range, &nr_ranges,
                 info->kexec_flags) < 0)
        return -1;

    /*
     * if the core type has not been set on command line, set it here
     * automatically
     */

    if (arch_options.core_header_type == CORE_TYPE_UNDEF) {
        arch_options.core_header_type =
            get_core_type(info, mem_range, nr_ranges);
    }

    /* 1.Memory regions which panic kernel can safely use to boot into */
    sz = (sizeof(struct memory_range) * (KEXEC_MAX_SEGMENTS + 1));
    memmap_p = xmalloc(sz);
    memset(memmap_p, 0, sz);
    add_memmap(memmap_p, BACKUP_SRC_START, BACKUP_SRC_SIZE);/* first region: the backup source area */

    sz = crash_reserved_mem.end - crash_reserved_mem.start +1;
    add_memmap(memmap_p, crash_reserved_mem.start, sz);/* second region: the reserved crash memory */


    /* 2.Create a backup region segment to store backup data*/
    if (!(info->kexec_flags & KEXEC_PRESERVE_CONTEXT)) {
        /* Round the backup size up to a multiple of align. */
        sz = (BACKUP_SRC_SIZE + align - 1) & ~(align - 1);
        tmp = xmalloc(sz);
        memset(tmp, 0, sz);
        info->backup_start = add_buffer(info, tmp, sz, sz, align,
                        0, max_addr, -1);
        dbgprintf("Created backup segment at 0x%lx\n",
             info->backup_start);
        /* The backup segment is removed from the memory map built above. */
        if (delete_memmap(memmap_p, info->backup_start, sz) < 0)
            return -1;
    }

    /* 3.Create elf header segment and store crash image data.
     * (The article's author was unsure whether this is the 1st or the 2nd
     * kernel's image; the headers are built from crash_memory_range.)
     * NOTE(review): the ranges fetched into mem_range above are not passed
     * here; crash_memory_range is used instead - presumably the same array,
     * confirm. tmp and sz are overwritten by the call. */
    if (arch_options.core_header_type == CORE_TYPE_ELF64) {
        if (crash_create_elf64_headers(info, &elf_info64,
                     crash_memory_range, nr_ranges,
                     &tmp, &sz,
                     ELF_CORE_HEADER_ALIGN) < 0)
            return -1;
    }
    else {
        if (crash_create_elf32_headers(info, &elf_info32,
                     crash_memory_range, nr_ranges,
                     &tmp, &sz,
                     ELF_CORE_HEADER_ALIGN) < 0)/* where is this defined? (noted by peter.guo) */

            return -1;
    }

    /* Hack: With some ld versions (GNU ld version 2.14.90.0.4 20030523),
     * vmlinux program headers show a gap of two pages between bss segment
     * and data segment but effectively kernel considers it as bss segment
     * and overwrites the any data placed there. Hence bloat (i.e. inflate) the memsz of
     * elf core header segment to 16K to avoid being placed in such gaps.
     * This is a makeshift solution until it is fixed in kernel.
     */

    elfcorehdr = add_buffer(info, tmp, sz, 16*1024, align, min_base,
                            max_addr, -1);
    dbgprintf("Created elf header segment at 0x%lx\n", elfcorehdr);
    /* NOTE(review): the segment was added with a memsz of 16*1024 but only
     * sz bytes are removed from the memory map - confirm this is intended. */
    if (delete_memmap(memmap_p, elfcorehdr, sz) < 0)
        return -1;
    cmdline_add_memmap(mod_cmdline, memmap_p);

    cmdline_add_elfcorehdr(mod_cmdline, elfcorehdr);

    /* Open question from the article: why are no memmap entries of the
     * "K#" and "K$" form produced here? */

    return 0;
}


你可能感兴趣的:(Kdump之kdump分析)