jvm开发笔记

笔者最近对java虚拟机产生了浓厚的兴趣, 想了解下最简单的jvm是如何写出来的,于是看起了《java虚拟机规范》,这个规范如同intel开发手册一样,是每个jvm开发人员必须掌握的。 要想翻译执行java byte code, 首先得从java class文件中把Code属性解析出来才行。 在笔者看来, java的class文件结构着实比elf文件结构复杂很多,不过再复杂的结构, 只要耐心对照着手册中的结构一一解析即可。 经过几天的努力, 用c实现了一个class文件解析器,目前它只能解析手册中规定的jvm最基本的要解析出来的一些属性:Code, StackMapTable, LineNumberTable。当然, 随着开发的深入, 它会不断地健壮起来。

下面说说我在解析java class文件格式中碰到的几个问题, 帮助后面也要自己动手写解析器的朋友少走一点弯路:

1、为了提高解析性能, 使用了mmap将class文件全部映射到内存中, 而不是每次解析都要使用read读磁盘文件。

/*
 * mmap_class_file - map a class file read-only into memory.
 * @class_file: path of the .class file to open.
 *
 * On success the descriptor is stored in class_fd, the file size in
 * class_file_len and the mapping address in class_start_mem.
 *
 * Returns 0 on success, -1 on failure (the descriptor is closed again).
 */
int mmap_class_file(const char *class_file)
{
        struct stat f_stat;

        class_fd = open(class_file, O_RDONLY);
        if (class_fd == -1) {
                perror("open");
                return -1;
        }

        /*
         * Query the descriptor we just opened rather than the path, so the
         * size always belongs to the file that is going to be mapped.
         */
        if (fstat(class_fd, &f_stat) == -1) {
                perror("fstat");
                close(class_fd);
                return -1;
        }

        class_file_len = f_stat.st_size;
        printf("%s file len: %d\n", class_file, class_file_len);

        class_start_mem = mmap(NULL, class_file_len, PROT_READ, MAP_PRIVATE, class_fd, 0);
        /* mmap() reports failure with MAP_FAILED, not with a NULL pointer. */
        if (class_start_mem == MAP_FAILED) {
                perror("mmap");
                class_start_mem = NULL;
                close(class_fd);
                return -1;
        }
        printf("mmap %s at %p\n", class_file, class_start_mem);

        return 0;
}

2、java class使用的是big-endian字节序,x86使用的是little-endian字节序, 所以要转换一下,就是移位操作而已。

 

/*
 * Byte-order helpers: class files are big-endian, so a multi-byte value
 * loaded on a little-endian host has to be byte-swapped.
 *
 *   s - lvalue that receives the swapped value
 *   p - the raw value as loaded from memory (evaluated exactly once)
 *
 * The expansions keep their trailing ';' because the call sites in this
 * article invoke the macros without one.
 */
#define CLASS_READ_U4(s, p)                                     \
        do {                                                    \
                u4 class_raw_ = (p);                            \
                (s) = (((class_raw_ >> 24) & 0x000000ff) |      \
                        ((class_raw_ >> 8) & 0x0000ff00) |      \
                        ((class_raw_ << 24) & 0xff000000) |     \
                        ((class_raw_ << 8) & 0x00ff0000));      \
        } while (0);

#define CLASS_READ_U2(s, p)                                     \
        do {                                                    \
                u2 class_raw_ = (p);                            \
                (s) = (((class_raw_ >> 8) & 0x00ff) |           \
                        ((class_raw_ << 8) & 0xff00));          \
        } while (0);

/* A single byte has no byte order; plain copy. */
#define CLASS_READ_U1(s, p)                                     \
        do {                                                    \
                (s) = (p);                                      \
        } while (0);

 

 

例如读一个4字节内容:

 

        u4 class_magic;

        /* read class magic number. */
        CLASS_READ_U4(class_magic, (*(u4 *)p_mem))
        p_mem = 4;

        printf("magic: 0x%x\n", class_magic);

////////////////////////////////////////////////////////////////////////////////////////////

下面是全部的源码:

 

jvm.h

#ifndef JVM_H
#define JVM_H

#include <string.h>     /* memcpy(), used by CLASS_READ_STRING */

/* Fixed-width unsigned types named after the class-file specification. */
typedef unsigned int u4;
typedef unsigned short u2;
typedef unsigned char u1;

/* Magic number expected in the first four bytes of a class file. */
#define JVM_CLASS_MAGIC                                 0xcafebabe

/*
 * Byte-order helpers: class files are big-endian, so a multi-byte value
 * loaded on a little-endian host has to be byte-swapped.
 *
 *   s - lvalue that receives the result
 *   p - the raw value as loaded from memory (evaluated exactly once)
 *
 * The expansions keep their trailing ';' because several call sites in
 * classreader.c invoke the macros without one.
 */
#define CLASS_READ_U4(s, p)                                     \
        do {                                                    \
                u4 class_raw_ = (p);                            \
                (s) = (((class_raw_ >> 24) & 0x000000ff) |      \
                        ((class_raw_ >> 8) & 0x0000ff00) |      \
                        ((class_raw_ << 24) & 0xff000000) |     \
                        ((class_raw_ << 8) & 0x00ff0000));      \
        } while (0);

#define CLASS_READ_U2(s, p)                                     \
        do {                                                    \
                u2 class_raw_ = (p);                            \
                (s) = (((class_raw_ >> 8) & 0x00ff) |           \
                        ((class_raw_ << 8) & 0xff00));          \
        } while (0);

/* A single byte has no byte order; plain copy. */
#define CLASS_READ_U1(s, p)                                     \
        do {                                                    \
                (s) = (p);                                      \
        } while (0);

/* Copy len raw bytes from p to s; no terminator is appended. */
#define CLASS_READ_STRING(s, p, len)                            \
        do {                                                    \
                memcpy((s), (p), (len));                        \
        } while (0);

/* Constant pool entry tags (the leading tag byte of each cp_info). */
#define CONSTANT_Class                                  7
#define CONSTANT_Fieldref                               9
#define CONSTANT_Methodref                              10
#define CONSTANT_InterfaceMethodref                     11
#define CONSTANT_String                                 8
#define CONSTANT_Integer                                3
#define CONSTANT_Float                                  4
#define CONSTANT_Long                                   5
#define CONSTANT_Double                                 6
#define CONSTANT_NameAndType                            12
#define CONSTANT_Utf8                                   1
#define CONSTANT_MethodHandle                           15
#define CONSTANT_MethodType                             16
#define CONSTANT_InvokeDynamic                          18

/* Access flag bits without a prefix. */
#define ACC_PUBLIC                                      0x0001
#define ACC_FINAL                                       0x0010
#define ACC_SUPER                                       0x0020
#define ACC_INTERFACE                                   0x0200
#define ACC_ABSTRACT                                    0x0400
#define ACC_SYNTHETIC                                   0x1000
#define ACC_ANNOTATION                                  0x2000
#define ACC_ENUM                                        0x4000

/* Access flag bits with the METHOD_ prefix. */
#define METHOD_ACC_PUBLIC                               0x0001
#define METHOD_ACC_PRIVATE                              0x0002
#define METHOD_ACC_PROTECTED                            0x0004
#define METHOD_ACC_STATIC                               0x0008
#define METHOD_ACC_FINAL                                0x0010
#define METHOD_ACC_SYNCHRONIZED                         0x0020
/* Misspelled historical name, kept so existing users keep compiling. */
#define METHOD_ACC_SYNCHRONIED                          METHOD_ACC_SYNCHRONIZED
#define METHOD_ACC_BRIDGE                               0x0040
#define METHOD_ACC_VARARGS                              0x0080
#define METHOD_ACC_NATIVE                               0x0100
#define METHOD_ACC_ABSTRACT                             0x0400
#define METHOD_ACC_STRICT                               0x0800
#define METHOD_ACC_SYNTHETIC                            0x1000

/* Tags of verification_type_info items inside StackMapTable frames. */
#define ITEM_Top                                        0
#define ITEM_Integer                                    1
#define ITEM_Float                                      2
#define ITEM_Double                                     3
#define ITEM_Long                                       4
#define ITEM_Null                                       5
#define ITEM_UninitializedThis                          6
#define ITEM_Object                                     7
#define ITEM_Uninitialized                              8

/*
 * In-memory record the parser keeps per constant pool slot: the slot
 * index and a pointer to the data stored for it.
 * NOTE(review): this is not a file-format structure, so the packed
 * attribute only misaligns the pointer member - consider dropping it.
 */
struct constant_info_st {
        u2 index;
        u1 *base;
}__attribute__ ((packed));

/* Generic constant pool entry: tag byte followed by tag-specific data. */
struct cp_info {
        u1 tag;
        u1 info[];
}__attribute__ ((packed));

/*
 * Payload layouts of the individual constant pool entries.  The tag byte
 * is not a member of these structures (see struct cp_info for the tag).
 */
struct CONSTANT_Class_info {
        u2 name_index;
}__attribute__ ((packed));

struct CONSTANT_Fieldref_info {
        u2 class_index;
        u2 name_and_type_index;
}__attribute__ ((packed));

struct CONSTANT_Methodref_info {
        u2 class_index;
        u2 name_and_type_index;
}__attribute__ ((packed));

struct CONSTANT_InterfaceMethodref_info {
        u2 class_index;
        /* NOTE(review): member name is misspelled ("inex"); kept as-is
         * because renaming it would break any existing user. */
        u2 name_and_type_inex;
}__attribute__ ((packed));

struct CONSTANT_String_info {
        u2 string_index;
}__attribute__ ((packed));

struct CONSTANT_Integer_info {
        u4 bytes;
}__attribute__ ((packed));

struct CONSTANT_Float_info {
        u4 bytes;
}__attribute__ ((packed));

struct CONSTANT_Long_info {
        u4 high_bytes;
        u4 low_bytes;
}__attribute__ ((packed));

struct CONSTANT_Double_info {
        u4 high_bytes;
        u4 low_bytes;
}__attribute__ ((packed));

struct CONSTANT_NameAndType_info {
        u2 name_index;
        u2 descriptor_index;
}__attribute__ ((packed));

/* length bytes of string data follow the length field. */
struct CONSTANT_Utf8_info {
        u2 length;
        u1 bytes[];
}__attribute__ ((packed));

struct CONSTANT_MethodHandle_info {
        u1 reference_kind;
        u2 reference_index;
}__attribute__ ((packed));

struct CONSTANT_MethodType_info {
        u2 descriptor_index;
}__attribute__ ((packed));

struct CONSTANT_InvokeDynamic_info {
        u2 bootstrap_method_attr_index;
        u2 name_and_type_index;
}__attribute__ ((packed));

#endif

////////////////////////////////////////////////////////////////////////

classreader.c:

/*
 * classreader.c - jvm class file parser.
 *
 * (c) wzt 2012         http://www.cloud-sec.org
 *
 */

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>

#include "jvm.h"

/* Parser state shared by every function in this file. */

/* Descriptor and size of the class file currently being parsed. */
static int class_fd;
static int class_file_len;

/* Start of the mapping and the read cursor that walks through it. */
static void *class_start_mem;
static char *p_mem;

/* Per-slot constant pool records, allocated by parse_class_constant(). */
static struct constant_info_st *constant_info;

/*
 * mmap_class_file - map a class file read-only into memory.
 * @class_file: path of the .class file to open.
 *
 * On success the descriptor is stored in class_fd, the file size in
 * class_file_len and the mapping address in class_start_mem.
 *
 * Returns 0 on success, -1 on failure (the descriptor is closed again).
 */
int mmap_class_file(const char *class_file)
{
        struct stat f_stat;

        class_fd = open(class_file, O_RDONLY);
        if (class_fd == -1) {
                perror("open");
                return -1;
        }

        /*
         * Query the descriptor we just opened rather than the path, so the
         * size always belongs to the file that is going to be mapped.
         */
        if (fstat(class_fd, &f_stat) == -1) {
                perror("fstat");
                close(class_fd);
                return -1;
        }

        class_file_len = f_stat.st_size;
        printf("%s file len: %d\n", class_file, class_file_len);

        class_start_mem = mmap(NULL, class_file_len, PROT_READ, MAP_PRIVATE, class_fd, 0);
        /* mmap() reports failure with MAP_FAILED, not with a NULL pointer. */
        if (class_start_mem == MAP_FAILED) {
                perror("mmap");
                class_start_mem = NULL;
                close(class_fd);
                return -1;
        }
        printf("mmap %s at %p\n", class_file, class_start_mem);

        return 0;
}

/*
 * mmap_exit - release the class file mapping and its descriptor.
 *
 * The descriptor is closed even when munmap() fails, so it is not leaked
 * on that path.
 *
 * Returns 0 on success, -1 if munmap() failed.
 */
int mmap_exit(void)
{
        int rc = 0;

        if (munmap(class_start_mem, class_file_len) == -1) {
                perror("munmap");
                rc = -1;
        }

        close(class_fd);
        return rc;
}

int parse_class_magic(void)
{
        u4 class_magic;

        /* read class magic number. */
        CLASS_READ_U4(class_magic, (*(u4 *)p_mem))
        p_mem = 4;

        printf("magic: 0x%x\n", class_magic);
        if (class_magic != JVM_CLASS_MAGIC) {
                printf("jvm class magic not match.\n");
                return -1;
        }
        printf("jvm class magic match: 0x%x\n", class_magic);
        return 0;
}

int parse_class_version(void)
{
        u2 minor_version, major_version;
        u2 constant_pool_count;

        /* read class minor_version. */
        CLASS_READ_U2(minor_version, (*(u2 *)p_mem))
        p_mem = 2;
        printf("jvm class minor_version: %d\n", minor_version);

        /* read class major_version. */
        CLASS_READ_U2(major_version, (*(u2 *)p_mem))
        p_mem = 2;
        printf("jvm class major_version: %d\n", major_version);

        return 0;
}

int parse_class_constant(void)
{
        u2 constant_pool_count;
        u1 constant_tag;
        u2 idx;

        printf("\n-----------parse contant pool count----------------------:\n\n");
        /* read constant_pool_count */
        CLASS_READ_U2(constant_pool_count, (*(u2 *)p_mem))
        p_mem = 2;
        printf("jvm constant_pool_count: %d\n", constant_pool_count);

        constant_info = (struct constant_info_st *)
                        malloc(sizeof(struct constant_info_st) *
                                constant_pool_count);
        if (!constant_info) {
                printf("Malloc failed.\n");
                return -1;
        }

        for (idx = 1; idx <= constant_pool_count - 1; idx ) {
                CLASS_READ_U1(constant_tag, (*(u1 *)p_mem))
                p_mem = 1;
                printf("- idx: - constant tag: %d\t", idx, (int)constant_tag);
                switch (constant_tag) {
                case CONSTANT_Fieldref:
                case CONSTANT_Methodref:
                case CONSTANT_InterfaceMethodref:
                {
                        struct CONSTANT_Methodref_info methodref_info;

                        CLASS_READ_U2(methodref_info.class_index, (*(u2 *)p_mem));
                        p_mem = 2;
                        assert(methodref_info.class_index > 0 &&
                                methodref_info.class_index < constant_pool_count);

                        CLASS_READ_U2(methodref_info.name_and_type_index, (*(u2 *)p_mem));
                        p_mem = 2;
                        assert(methodref_info.class_index > 0 &&
                                methodref_info.class_index < constant_pool_count);

                        printf("class_index: %d, name_and_type_index: %d\n",
                                methodref_info.class_index,
                                methodref_info.name_and_type_index);
                        break;
                }
                case CONSTANT_Class:
                {
                        struct CONSTANT_Class_info class_info;

                        CLASS_READ_U2(class_info.name_index, (*(u2 *)p_mem));
                        p_mem = 2;
                        assert(class_info.name_index > 0 &&
                                class_info.name_index < constant_pool_count);
                        printf("name_index: %d\n", class_info.name_index);
                        break;
                }
                case CONSTANT_String:
                {
                        struct CONSTANT_String_info string_info;

                        CLASS_READ_U2(string_info.string_index, (*(u2 *)p_mem));
                        p_mem = 2;
                        assert(string_info.string_index > 0 &&
                                string_info.string_index < constant_pool_count);
                        printf("string index: %d\n", string_info.string_index);
                        break;
                }
                case CONSTANT_Long:
                {
                        struct CONSTANT_Long_info long_info;

                        CLASS_READ_U2(long_info.high_bytes, (*(u2 *)p_mem));
                        p_mem = 2;

                        CLASS_READ_U2(long_info.low_bytes, (*(u2 *)p_mem));
                        p_mem = 2;

                        printf("high bytes: %d, low bytes: %d\n",
                                long_info.high_bytes, long_info.low_bytes);
                        break;
                }
                case CONSTANT_Integer:
                {
                        struct CONSTANT_Integer_info integer_info;

                        CLASS_READ_U4(integer_info.bytes, (*(u4 *)p_mem));
                        p_mem = 4;
                        printf("bytes: %d\n", integer_info.bytes);
                        break;
                }
                case CONSTANT_Float:
                {
                        struct CONSTANT_Float_info float_info;

                        CLASS_READ_U4(float_info.bytes, (*(u4 *)p_mem));
                        p_mem = 4;
                        printf("bytes: %d\n", float_info.bytes);
                        break;
                }
                case CONSTANT_Double:
                {
                        struct CONSTANT_Double_info double_info;

                        CLASS_READ_U4(double_info.high_bytes, (*(u4 *)p_mem));
                        p_mem = 4;

                        CLASS_READ_U4(double_info.low_bytes, (*(u4 *)p_mem));
                        p_mem = 4;
                        printf("high_bytes: %d, low_bytes: %d\n",
                                double_info.high_bytes, double_info.low_bytes);
                        break;
                }
                case CONSTANT_NameAndType:
                {
                        struct CONSTANT_NameAndType_info name_type_info;

                        CLASS_READ_U2(name_type_info.name_index, (*(u2 *)p_mem));
                        p_mem = 2;

                        CLASS_READ_U2(name_type_info.descriptor_index, (*(u2 *)p_mem));
                        p_mem = 2;

                        printf("name_index: %d, descriptor_index: %d\n",
                                name_type_info.name_index, name_type_info.descriptor_index);
                        break;
                }
                case CONSTANT_MethodHandle:
                {
                        struct CONSTANT_MethodHandle_info method_handle_info;

                        CLASS_READ_U1(method_handle_info.reference_kind, (*(u1 *)p_mem));
                        p_mem = 1;

                        CLASS_READ_U2(method_handle_info.reference_index, (*(u2 *)p_mem));
                        p_mem = 2;

                        printf("reference_kind: %d, reference_index: %d\n",
                                method_handle_info.reference_kind,
                                method_handle_info.reference_index);
                        break;
                }
                case CONSTANT_MethodType:
                {
                        struct CONSTANT_MethodType_info method_type_info;

                        CLASS_READ_U2(method_type_info.descriptor_index, (*(u2 *)p_mem));
                        p_mem = 2;

                        printf("descriptor_index %d\n", method_type_info.descriptor_index);
                        break;
                }
                case CONSTANT_InvokeDynamic:
                {
                        struct CONSTANT_InvokeDynamic_info invoke_dyc_info;

                        CLASS_READ_U2(invoke_dyc_info.bootstrap_method_attr_index, (*(u2 *)p_mem));
                        p_mem = 2;

                        CLASS_READ_U2(invoke_dyc_info.name_and_type_index, (*(u2 *)p_mem));
                        p_mem = 2;

                        printf("bootstrap_method_attr_index: %d, name_and_type_index: %d\n",
                                invoke_dyc_info.bootstrap_method_attr_index,
                                invoke_dyc_info.name_and_type_index);
                        break;
                }
                case CONSTANT_Utf8:
                {
                        u2 len;
                        char *buf;

                        CLASS_READ_U2(len, (*(u2 *)p_mem));
                        p_mem = 2;

                        buf = malloc(len 1);
                        buf[len] = '\0';
                        assert(buf != NULL);

                        memcpy(buf, p_mem, len);
                        printf("len: %d\t%s\n", len, buf);
                        p_mem = len;

                        constant_info[idx].index = idx;
                        constant_info[idx].base = buf;
                        break;
                }
                default:
                        ;
                }
        }
        printf("\n");
/*
        for (idx = 1; idx <= constant_pool_count - 1; idx )
                printf("%d: %s\n", constant_info[idx].index, constant_info[idx].base);
*/
        return 0;

out:
        mmap_exit();
        return -1;
}

int parse_class_access_flag(void)
{
        u2 access_flag;

        /* read class access flag. */
        CLASS_READ_U2(access_flag, (*(u2 *)p_mem))
        p_mem = 2;

        printf("access_flag: 0x%x\n", access_flag);
        return 0;
}
int parse_class_this_super(void)
{
        u2 this_class;
        u2 super_class;

        CLASS_READ_U2(this_class, (*(u2 *)p_mem))
        p_mem = 2;

        CLASS_READ_U2(super_class, (*(u2 *)p_mem))
        p_mem = 2;

        printf("this_class: %d\tsuper_class: %d\n\n", this_class, super_class);
        return 0;
}

/*
 * parse_class_interface - read interfaces_count and print each entry.
 *
 * Consumes one u2 count followed by that many u2 indices.  Returns 0.
 */
int parse_class_interface(void)
{
        u2 interfaces_count;
        u2 idx, index;

        CLASS_READ_U2(interfaces_count, (*(u2 *)p_mem));
        p_mem += 2;
        printf("interfaces_count: %d\n", interfaces_count);

        for (idx = 0; idx < interfaces_count; idx++) {
                CLASS_READ_U2(index, (*(u2 *)p_mem));
                p_mem += 2;
                printf("index: %d\n", index);
        }

        return 0;
}

int parse_class_filed(void)
{
        u2 fileds_count;
        u2 idx;

        CLASS_READ_U2(fileds_count, (*(u2 *)p_mem))
        p_mem = 2;
        printf("filed_count: %d\n", fileds_count);

        return 0;
}
int __parse_exception_table(int len)
{
        u2 start_pc, end_pc;
        u2 handler_pc, catch_type;
        u2 idx;

        for (idx = 0; idx < len; idx ) {
                CLASS_READ_U2(start_pc, (*(u2 *)p_mem))
                p_mem = 2;
                printf("start_pc: %d\n", start_pc);

                CLASS_READ_U2(end_pc, (*(u2 *)p_mem))
                p_mem = 2;
                printf("end_pc: %d\n", end_pc);

                CLASS_READ_U2(handler_pc, (*(u2 *)p_mem))
                p_mem = 2;
                printf("handler_pc: %d\n", handler_pc);

                CLASS_READ_U2(catch_type, (*(u2 *)p_mem))
                p_mem = 2;
                printf("catch_type: %d\n", catch_type);
        }

        return 0;
}

int __parse_line_number_table(void)
{
        u4 attribute_length;
        u2 line_number_table_length;
        u2 start_pc, line_number;
        u2 idx;

        CLASS_READ_U4(attribute_length, (*(u4 *)p_mem))
        p_mem = 4;
        printf("\t\tattribute_length: %d\n", attribute_length);

        CLASS_READ_U2(line_number_table_length, (*(u2 *)p_mem))
        p_mem = 2;
        printf("\t\tline_number_table_length: %d\n", line_number_table_length);

        for (idx = 0; idx < line_number_table_length; idx ) {
                CLASS_READ_U2(start_pc, (*(u2 *)p_mem))
                p_mem = 2;
                printf("\t\tstart_pc: %d\n", start_pc);

                CLASS_READ_U2(line_number, (*(u2 *)p_mem))
                p_mem = 2;
                printf("\t\tline_number: %d\n", line_number);
        }

        return 0;
}

/*
 * __parse_verification_type_info - print `number` verification type items.
 * @number: how many items to read at p_mem.
 *
 * Every item starts with a u1 tag; ITEM_Object and ITEM_Uninitialized
 * carry one extra u2 (cpool_index and offset respectively).
 *
 * Returns 0 on success, -1 as soon as an unknown tag is read.
 */
int __parse_verification_type_info(u1 number)
{
        u1 idx, tag;

        for (idx = 0; idx < number; idx++) {
                CLASS_READ_U1(tag, (*(u1 *)p_mem));
                p_mem += 1;
                printf("\t\ttag: %d\n", tag);
                switch (tag) {
                case ITEM_Top:
                        printf("\t\tITEM_Top.\n");
                        break;
                case ITEM_Integer:
                        printf("\t\tITEM_Integer.\n");
                        break;
                case ITEM_Float:
                        printf("\t\tITEM_float.\n");
                        break;
                case ITEM_Double:
                        printf("\t\tITEM_Double.\n");
                        break;
                case ITEM_Long:
                        printf("\t\tITEM_Long.\n");
                        break;
                case ITEM_Null:
                        /* report the tag that was actually read */
                        printf("\t\tITEM_Null.\n");
                        break;
                case ITEM_UninitializedThis:
                        printf("\t\tITEM_UninitializedThis.\n");
                        break;
                case ITEM_Object:
                {
                        u2 cpool_index;

                        printf("\t\tITEM_Object.\n");
                        CLASS_READ_U2(cpool_index, (*(u2 *)p_mem));
                        p_mem += 2;
                        printf("\t\tcpool_index: %d\n", cpool_index);
                        break;
                }
                case ITEM_Uninitialized:
                {
                        u2 offset;

                        printf("\t\tITEM_Uninitialized.\n");
                        CLASS_READ_U2(offset, (*(u2 *)p_mem));
                        p_mem += 2;
                        printf("\t\toffset: %d\n", offset);
                        break;
                }
                default:
                        return -1;
                }
        }

        return 0;
}

/*
 * __parse_stack_map_frame - print `number` StackMapTable frames.
 * @number: how many stack_map_frame entries to read at p_mem.
 *
 * The frame kind is selected by the leading u1 frame_type:
 *   0-63     same_frame
 *   64-127   same_locals_1_stack_item_frame (one verification item)
 *   247      same_locals_1_stack_item_frame_extended
 *   248-250  chop_frame
 *   251      same_frame_extended
 *   252-254  append_frame (frame_type - 251 verification items)
 *   255      full_frame (u2-counted locals, then u2-counted stack items)
 * Values 128-246 are not handled and are treated as an error.
 *
 * Returns 0 on success, -1 on an unhandled frame type or verification tag.
 */
int __parse_stack_map_frame(u2 number)
{
        u1 frame_type;
        /* u2: the extended frame kinds store a 16-bit delta */
        u2 offset_delta;
        u2 idx;

        for (idx = 0; idx < number; idx++) {
                CLASS_READ_U1(frame_type, (*(u1 *)p_mem));
                p_mem += 1;
                printf("\t\tframe_type: %d\n", frame_type);

                if (frame_type <= 63) {
                        offset_delta = frame_type;
                        printf("\t\tsame_frame\toffset_delta: %d\n", offset_delta);
                } else if (frame_type <= 127) {
                        offset_delta = frame_type - 64;
                        printf("\t\tsame_locals_l_stack_item_frame\toffset_delta: %d\n",
                                offset_delta);
                        if (__parse_verification_type_info(1) == -1)
                                return -1;
                } else if (frame_type == 247) {
                        CLASS_READ_U2(offset_delta, (*(u2 *)p_mem));
                        p_mem += 2;
                        printf("\t\tsame_locals_l_stack_item_frame_extendedn\toffset_delta: %d\n",
                                offset_delta);
                        if (__parse_verification_type_info(1) == -1)
                                return -1;
                } else if (frame_type >= 248 && frame_type <= 250) {
                        CLASS_READ_U2(offset_delta, (*(u2 *)p_mem));
                        p_mem += 2;
                        printf("\t\tchop_frame\toffset_delta: %d\n", offset_delta);
                } else if (frame_type == 251) {
                        CLASS_READ_U2(offset_delta, (*(u2 *)p_mem));
                        p_mem += 2;
                        printf("\t\tsame_frame_extended\toffset_delta: %d\n", offset_delta);
                } else if (frame_type >= 252 && frame_type <= 254) {
                        u1 locals_num;

                        CLASS_READ_U2(offset_delta, (*(u2 *)p_mem));
                        p_mem += 2;
                        printf("\t\tappend_frame\toffset_delta: %d\n", offset_delta);

                        locals_num = frame_type - 251;
                        printf("\t\tlocals_num: %d\n", locals_num);

                        if (__parse_verification_type_info(locals_num) == -1)
                                return -1;
                } else if (frame_type == 255) {
                        u2 count, item;

                        CLASS_READ_U2(offset_delta, (*(u2 *)p_mem));
                        p_mem += 2;
                        printf("\t\tfull_frame\toffset_delta: %d\n", offset_delta);

                        CLASS_READ_U2(count, (*(u2 *)p_mem));
                        p_mem += 2;
                        printf("\t\tnumber_of_locals: %d\n", count);
                        /* one item per call: the helper's count is only a u1 */
                        for (item = 0; item < count; item++) {
                                if (__parse_verification_type_info(1) == -1)
                                        return -1;
                        }

                        CLASS_READ_U2(count, (*(u2 *)p_mem));
                        p_mem += 2;
                        printf("\t\tnumber_of_stack_items: %d\n", count);
                        for (item = 0; item < count; item++) {
                                if (__parse_verification_type_info(1) == -1)
                                        return -1;
                        }
                } else {
                        printf("\t\tunsupported frame_type.\n");
                        return -1;
                }
        }

        return 0;
}
int __parse_stack_map_table(void)
{
        u4 attribute_length;
        u2 number_of_entries;
        u2 idx;

        CLASS_READ_U4(attribute_length, (*(u4 *)p_mem))
        p_mem = 4;
        printf("\t\tattribute_length: %d\n", attribute_length);

        CLASS_READ_U2(number_of_entries, (*(u2 *)p_mem))
        p_mem = 2;
        printf("\t\tnumber_of_entries: %d\n", number_of_entries);

        __parse_stack_map_frame(number_of_entries);

        return 0;
}
/* attribute_name_index has been parsed before. */
int parse_code_attribute(void)
{
        u2 attribute_name_index;
        u4 attribute_length;
        u2 max_stack;
        u2 max_locals;
        u4 code_length;
        u1 *code;
        u2 exception_table_length;
        u2 attributes_count;
        u2 idx;

        CLASS_READ_U4(attribute_length, (*(u4 *)p_mem))
        p_mem = 4;
        printf("\tattribute_length: %d\n", attribute_length);

        CLASS_READ_U2(max_stack, (*(u2 *)p_mem))
        p_mem = 2;
        printf("\tmax_stack: %d\n", max_stack);

        CLASS_READ_U2(max_locals, (*(u2 *)p_mem))
        p_mem = 2;
        printf("\tmax_locals: %d\n", max_locals);

        CLASS_READ_U4(code_length, (*(u4 *)p_mem))
        p_mem = 4;
        printf("\tcode_length: %d\n", code_length);

        code = (u1 *)malloc(code_length 1);
        if (!code) {
                printf("Malloc failed.\n");
                return -1;
        }
        memcpy(code, p_mem, code_length);
        code[code_length] = '\0';
        p_mem = code_length;

        CLASS_READ_U2(exception_table_length, (*(u2 *)p_mem))
        p_mem = 2;
        printf("\texception_table_length: %d\n", exception_table_length);

        __parse_exception_table(exception_table_length);

        CLASS_READ_U2(attributes_count, (*(u2 *)p_mem))
        p_mem = 2;
        printf("\tattributes_count: %d\n", attributes_count);

        /* parse attributes */
        for (idx = 0; idx < attributes_count; idx ) {
                CLASS_READ_U2(attribute_name_index, (*(u2 *)p_mem))
                p_mem = 2;
                printf("\tidx: %d attribute_name_index: %d", idx 1, attribute_name_index);

                if (!strcmp(constant_info[attribute_name_index].base, "LineNumberTable")) {
                        printf("\n\tparse LineNumberTable:\n");
                        __parse_line_number_table();
                }
                if (!strcmp(constant_info[attribute_name_index].base, "StackMapTable")) {
                        printf("\n\tparse StackMapTable:\n");
                        __parse_stack_map_table();
                }
                if (!strcmp(constant_info[attribute_name_index].base, "LocalVariableTable")) {
                        ;
                }
                if (!strcmp(constant_info[attribute_name_index].base, "LocalVariableTypeTable")) {
                        ;
                }
                if (!strcmp(constant_info[attribute_name_index].base, "StackMapTable")) {
                        ;
                }
        }

        return 0;
}

int parse_class_method(void)
{
        u2 method_count;
        u2 access_flags, name_index;
        u2 descriptor_index, attributes_count;
        u2 idx;

        printf("\n---------------parse class method-------------------------:\n\n");
        CLASS_READ_U2(method_count, (*(u2 *)p_mem))
        p_mem = 2;
        printf("method_count: %d\n", method_count);

        for (idx = 0; idx < method_count; idx ) {
                CLASS_READ_U2(access_flags, (*(u2 *)p_mem))
                p_mem = 2;
                printf("access_flags: 0x%x\n", access_flags);

                CLASS_READ_U2(name_index, (*(u2 *)p_mem))
                p_mem = 2;
                printf("name_index: %d\n", name_index);

                CLASS_READ_U2(descriptor_index, (*(u2 *)p_mem))
                p_mem = 2;
                printf("descriptor_index: %d\n", descriptor_index);

                CLASS_READ_U2(attributes_count, (*(u2 *)p_mem))
                p_mem = 2;
                printf("attributes_count: %d\n\n", attributes_count);

                /* parse attributes */
                CLASS_READ_U2(name_index, (*(u2 *)p_mem))
                p_mem = 2;
                printf("attritbutes name_index: %d\n", name_index);

                if (!strcmp(constant_info[name_index].base, "Code")) {
                        printf("parse code attribute:\n");
                        parse_code_attribute();
                }
                if (!strcmp(constant_info[name_index].base, "Exceptions")) {
                        ;
                }
                if (!strcmp(constant_info[name_index].base, "Signature")) {
                        ;
                }
        }

        return 0;
}

/*
 * jvm_parse_class_file - map a class file and run every parsing stage.
 * @class_file: path of the class file; must not be NULL.
 *
 * The stages run in file order and stop at the first one that reports
 * -1.  The mapping is released before returning in either case.
 *
 * Returns 0 when every stage succeeded, -1 otherwise.
 */
int jvm_parse_class_file(const char *class_file)
{
        int rc = -1;

        assert(class_file != NULL);
        if (mmap_class_file(class_file) == -1)
                return -1;

        /* start reading at the beginning of the mapping */
        p_mem = class_start_mem;

        /* && short-circuits, so a failing stage skips all later ones */
        if (parse_class_magic() != -1 &&
            parse_class_version() != -1 &&
            parse_class_constant() != -1 &&
            parse_class_access_flag() != -1 &&
            parse_class_this_super() != -1 &&
            parse_class_interface() != -1 &&
            parse_class_filed() != -1 &&
            parse_class_method() != -1)
                rc = 0;

        mmap_exit();
        return rc;
}

/*
 * jvm_usage - print a one-line usage message to stdout.
 * @proc: program name to show (callers pass argv[0]).
 */
void jvm_usage(const char *proc)
{
        /* name the one argument the program expects */
        fprintf(stdout, "usage: %s <class_file>\n", proc);
}

/*
 * main - parse the class file named by the first command line argument.
 *
 * Prints the usage line and exits with 0 when no argument is given;
 * otherwise exits with 0 if parsing succeeded and 1 if it failed.
 */
int main(int argc, char **argv)
{
        if (argc == 1) {
                jvm_usage(argv[0]);
                return 0;
        }

        /* let the shell see whether the class file could be parsed */
        if (jvm_parse_class_file(argv[1]) == -1)
                return 1;

        return 0;
}

 

 

////////////////////////////////////////////////////////////////////////////

[email protected] # gcc -o classreader classreader.c -w
[email protected] # ./classreader test.class
test.class file len: 462
mmap test.class at 0x2b0b78fa5000
magic: 0xcafebabe
jvm class magic match: 0xcafebabe
jvm class minor_version: 0
jvm class major_version: 50

-----------parse contant pool count----------------------:

jvm constant_pool_count: 30
- idx:  1 constant tag: 10      class_index: 6, name_and_type_index: 16
- idx:  2 constant tag: 9       class_index: 17, name_and_type_index: 18
- idx:  3 constant tag: 8       string index: 19
- idx:  4 constant tag: 10      class_index: 20, name_and_type_index: 21
- idx:  5 constant tag: 7       name_index: 22
- idx:  6 constant tag: 7       name_index: 23
- idx:  7 constant tag: 1       len: 6  
- idx:  8 constant tag: 1       len: 3  ()V
- idx:  9 constant tag: 1       len: 4  Code
- idx: 10 constant tag: 1       len: 15 LineNumberTable
- idx: 11 constant tag: 1       len: 4  main
- idx: 12 constant tag: 1       len: 22 ([Ljava/lang/String;)V
- idx: 13 constant tag: 1       len: 13 StackMapTable
- idx: 14 constant tag: 1       len: 10 SourceFile
- idx: 15 constant tag: 1       len: 9  test.java
- idx: 16 constant tag: 12      name_index: 7, descriptor_index: 8
- idx: 17 constant tag: 7       name_index: 24
- idx: 18 constant tag: 12      name_index: 25, descriptor_index: 26
- idx: 19 constant tag: 1       len: 4  hehe
- idx: 20 constant tag: 7       name_index: 27
- idx: 21 constant tag: 12      name_index: 28, descriptor_index: 29
- idx: 22 constant tag: 1       len: 4  test
- idx: 23 constant tag: 1       len: 16 java/lang/Object
- idx: 24 constant tag: 1       len: 16 java/lang/System
- idx: 25 constant tag: 1       len: 3  out
- idx: 26 constant tag: 1       len: 21 Ljava/io/PrintStream;
- idx: 27 constant tag: 1       len: 19 java/io/PrintStream
- idx: 28 constant tag: 1       len: 7  println
- idx: 29 constant tag: 1       len: 21 (Ljava/lang/String;)V

access_flag: 0x21
this_class: 5   super_class: 6

interfaces_count: 0
filed_count: 0

---------------parse class method-------------------------:

method_count: 2
access_flags: 0x1
name_index: 7
descriptor_index: 8
attributes_count: 1

attritbutes name_index: 9
parse code attribute:
        attribute_length: 29
        max_stack: 1
        max_locals: 1
        code_length: 5
        exception_table_length: 0
        attributes_count: 1
        idx: 1 attribute_name_index: 10
        parse LineNumberTable:
                attribute_length: 6
                line_number_table_length: 1
                start_pc: 0
                line_number: 5
access_flags: 0x9
name_index: 11
descriptor_index: 12
attributes_count: 1

attritbutes name_index: 9
parse code attribute:
        attribute_length: 77
        max_stack: 2
        max_locals: 2
        code_length: 24
        exception_table_length: 0
        attributes_count: 2
        idx: 1 attribute_name_index: 10
        parse LineNumberTable:
                attribute_length: 22
                line_number_table_length: 5
                start_pc: 0
                line_number: 7
                start_pc: 2
                line_number: 9
                start_pc: 9
                line_number: 10
                start_pc: 17
                line_number: 9
                start_pc: 23
                line_number: 11
        idx: 2 attribute_name_index: 13
        parse StackMapTable:
                attribute_length: 7
                number_of_entries: 2
                frame_type: 252
                append_frame    offset_delta: 4
                locals_num: 1
                tag: 1
                ITEM_Integer.
                frame_type: 18
                same_frame      offset_delta: 18
[email protected] #
 
  
 
  
 
  

这两天在class文件解析器的基础上, 加上了java反汇编的功能, 反汇编器是指令解释器的基础,通过编写反汇编器可以熟悉jvm的指令系统, 不过jvm的指令一共有201个,反汇编过程基本就是个体力活。在《java虚拟机规范》中对每一条指令都有了详细的描述,下面说说我是如何解析bytecode的:

一个java文件经过javac编译后会生成class格式文件, 在class格式中method字段里会有Code属性,Code属性包含了java的指令码和长度。 首先用class解析器将指令码提取出来, 举个例子:

test.java

// Helper class with a single int field initialised to 6; the test class below
// does not reference it.
class aa {
        int a = 6;
};

// Sample input for the class-file parser and the disassembler.
public class test {
        // Prints "hehe" five times (i runs from 0 to 4).
        public static void main(String args[]) {
                int i = 0;

                for (i = 0; i < 5; i++)
                        System.out.println("hehe");
        }
}

我们用class文件解析器把test对应的bytecode打印出来:
len: 5
0x2a0xb70x00x10xb1

这一串bytecode为:0x2a0xb70x00x10xb1, 长度是5个字节。

对照《java虚拟机规范》我们来一步步手工解析:

0x2a代表aload_0指令, 它将本地局部变量中的第一个变量压入到堆栈里。这个指令本身长度就是一个字节,没有参数, 因此0x2a的解析就非常简单, 直接在屏幕打印出aload_0即可:

printf("%s\n", symbol);

0xb7代表invokespecial 它用来调用超类构造方法,实例初始化方法, 私有方法。它的用法如下:
invokespecial indexbyte1 indexbyte2,indexbyte1和indexbyte2各占一个字节,用(indexbyte1 << 8) | indexbyte2来构建一个常量池中的索引。每个jvm指令本身都占用一个字节,加上它的两个参数, invokespecial语句它将占用3个字节空间。 所以它的解析算法如下:

        u2 index;

        index = ((*(u1 *)(base + 1)) << 8) | (*(u1 *)(base + 2));
        printf("%s #%x\n", symbol, index);

注意0xb7解析完后,我们要跳过3个字节的地址,那么就是0xb1了, 它是return指令,没有参数,因此它的解析方法跟aload_0一样:
printf("%s\n", symbol);

以上是我们手工解析的过程, 但是jvm有201条指令, 我们需要建立一个合适的数据结构:

/*
 * Per-instruction callback. It receives the instruction's total length, its
 * mnemonic and a pointer to the opcode byte inside the code array.
 */
typedef int (*interp_func)(u2 opcode_len, char *symbol, void *base);

/* One entry of the opcode table. */
typedef struct bytecode_st {
        u2 opcode;                  // JVM opcode value
        u2 opcode_len;            // total instruction length, operands included
        char symbol[OPCODE_SYMBOL_LEN];    // mnemonic of the instruction
        interp_func func;         // callback that decodes this instruction
}BYTECODE;

我们可以直接建立一个大的BYTECODE数组:

/*
 * Opcode table, indexed by opcode value: jvm_byte_code[op] describes opcode op.
 *
 * opcode_len is the total instruction length in bytes (opcode plus operands)
 * as given by the instruction formats in the JVM specification. The
 * disassembler advances by this value, so an entry that is too short makes it
 * decode operand bytes as opcodes.
 *
 * tableswitch, lookupswitch and wide have no fixed length (padding, a variable
 * number of entries, or a length that depends on the widened opcode). They are
 * kept at 1 here; their handlers must work out the real length themselves.
 */
BYTECODE jvm_byte_code[OPCODE_LEN] = {
                {0x00,  1,      "nop",                  jvm_interp_nop},
                {0x01,  1,      "aconst_null",          jvm_interp_aconst_null},
                {0x02,  1,      "iconst_m1",            jvm_interp_iconst_m1},
                {0x03,  1,      "iconst_0",             jvm_interp_iconst_0},
                {0x04,  1,      "iconst_1",             jvm_interp_iconst_1},
                {0x05,  1,      "iconst_2",             jvm_interp_iconst_2},
                {0x06,  1,      "iconst_3",             jvm_interp_iconst_3},
                {0x07,  1,      "iconst_4",             jvm_interp_iconst_4},
                {0x08,  1,      "iconst_5",             jvm_interp_iconst_5},
                {0x09,  1,      "lconst_0",             jvm_interp_lconst_0},
                {0x0a,  1,      "lconst_1",             jvm_interp_lconst_1},
                {0x0b,  1,      "fconst_0",             jvm_interp_fconst_0},
                {0x0c,  1,      "fconst_1",             jvm_interp_fconst_1},
                {0x0d,  1,      "fconst_2",             jvm_interp_fconst_2},
                {0x0e,  1,      "dconst_0",             jvm_interp_dconst_0},
                {0x0f,  1,      "dconst_1",             jvm_interp_dconst_1},
                {0x10,  2,      "bipush",               jvm_interp_bipush},     /* byte */
                {0x11,  3,      "sipush",               jvm_interp_sipush},     /* byte1 byte2 */
                {0x12,  2,      "ldc",                  jvm_interp_ldc},        /* index */
                {0x13,  3,      "ldc_w",                jvm_interp_ldc_w},      /* indexbyte1 indexbyte2 */
                {0x14,  3,      "ldc2_w",               jvm_interp_ldc2_w},     /* indexbyte1 indexbyte2 */
                {0x15,  2,      "iload",                jvm_interp_iload},      /* index */
                {0x16,  2,      "lload",                jvm_interp_lload},      /* index */
                {0x17,  2,      "fload",                jvm_interp_fload},      /* index */
                {0x18,  2,      "dload",                jvm_interp_dload},      /* index */
                {0x19,  2,      "aload",                jvm_interp_aload},      /* index */
                {0x1a,  1,      "iload_0",              jvm_interp_iload_0},
                {0x1b,  1,      "iload_1",              jvm_interp_iload_1},
                {0x1c,  1,      "iload_2",              jvm_interp_iload_2},
                {0x1d,  1,      "iload_3",              jvm_interp_iload_3},
                {0x1e,  1,      "lload_0",              jvm_interp_lload_0},
                {0x1f,  1,      "lload_1",              jvm_interp_lload_1},
                {0x20,  1,      "lload_2",              jvm_interp_lload_2},
                {0x21,  1,      "lload_3",              jvm_interp_lload_3},
                {0x22,  1,      "fload_0",              jvm_interp_fload_0},
                {0x23,  1,      "fload_1",              jvm_interp_fload_1},
                {0x24,  1,      "fload_2",              jvm_interp_fload_2},
                {0x25,  1,      "fload_3",              jvm_interp_fload_3},
                {0x26,  1,      "dload_0",              jvm_interp_dload_0},
                {0x27,  1,      "dload_1",              jvm_interp_dload_1},
                {0x28,  1,      "dload_2",              jvm_interp_dload_2},
                {0x29,  1,      "dload_3",              jvm_interp_dload_3},
                {0x2a,  1,      "aload_0",              jvm_interp_aload_0},
                {0x2b,  1,      "aload_1",              jvm_interp_aload_1},
                {0x2c,  1,      "aload_2",              jvm_interp_aload_2},
                {0x2d,  1,      "aload_3",              jvm_interp_aload_3},
                {0x2e,  1,      "iaload",               jvm_interp_iaload},
                {0x2f,  1,      "laload",               jvm_interp_laload},
                {0x30,  1,      "faload",               jvm_interp_faload},
                {0x31,  1,      "daload",               jvm_interp_daload},
                {0x32,  1,      "aaload",               jvm_interp_aaload},
                {0x33,  1,      "baload",               jvm_interp_baload},
                {0x34,  1,      "caload",               jvm_interp_caload},
                {0x35,  1,      "saload",               jvm_interp_saload},
                {0x36,  2,      "istore",               jvm_interp_istore},     /* index */
                {0x37,  2,      "lstore",               jvm_interp_lstore},     /* index */
                {0x38,  2,      "fstore",               jvm_interp_fstore},     /* index */
                {0x39,  2,      "dstore",               jvm_interp_dstore},     /* index */
                {0x3a,  2,      "astore",               jvm_interp_astore},     /* index */
                {0x3b,  1,      "istore_0",             jvm_interp_istore_0},
                {0x3c,  1,      "istore_1",             jvm_interp_istore_1},
                {0x3d,  1,      "istore_2",             jvm_interp_istore_2},
                {0x3e,  1,      "istore_3",             jvm_interp_istore_3},
                {0x3f,  1,      "lstore_0",             jvm_interp_lstore_0},
                {0x40,  1,      "lstore_1",             jvm_interp_lstore_1},
                {0x41,  1,      "lstore_2",             jvm_interp_lstore_2},
                {0x42,  1,      "lstore_3",             jvm_interp_lstore_3},
                {0x43,  1,      "fstore_0",             jvm_interp_fstore_0},
                {0x44,  1,      "fstore_1",             jvm_interp_fstore_1},
                {0x45,  1,      "fstore_2",             jvm_interp_fstore_2},
                {0x46,  1,      "fstore_3",             jvm_interp_fstore_3},
                {0x47,  1,      "dstore_0",             jvm_interp_dstore_0},
                {0x48,  1,      "dstore_1",             jvm_interp_dstore_1},
                {0x49,  1,      "dstore_2",             jvm_interp_dstore_2},
                {0x4a,  1,      "dstore_3",             jvm_interp_dstore_3},
                {0x4b,  1,      "astore_0",             jvm_interp_astore_0},
                {0x4c,  1,      "astore_1",             jvm_interp_astore_1},
                {0x4d,  1,      "astore_2",             jvm_interp_astore_2},
                {0x4e,  1,      "astore_3",             jvm_interp_astore_3},
                {0x4f,  1,      "iastore",              jvm_interp_iastore},
                {0x50,  1,      "lastore",              jvm_interp_lastore},
                {0x51,  1,      "fastore",              jvm_interp_fastore},
                {0x52,  1,      "dastore",              jvm_interp_dastore},
                {0x53,  1,      "aastore",              jvm_interp_aastore},
                {0x54,  1,      "bastore",              jvm_interp_bastore},
                {0x55,  1,      "castore",              jvm_interp_castore},
                {0x56,  1,      "sastore",              jvm_interp_sastore},
                {0x57,  1,      "pop",                  jvm_interp_pop},
                {0x58,  1,      "pop2",                 jvm_interp_pop2},
                {0x59,  1,      "dup",                  jvm_interp_dup},
                {0x5a,  1,      "dup_x1",               jvm_interp_dup_x1},
                {0x5b,  1,      "dup_x2",               jvm_interp_dup_x2},
                {0x5c,  1,      "dup2",                 jvm_interp_dup2},
                {0x5d,  1,      "dup2_x1",              jvm_interp_dup2_x1},
                {0x5e,  1,      "dup2_x2",              jvm_interp_dup2_x2},
                {0x5f,  1,      "swap",                 jvm_interp_swap},
                {0x60,  1,      "iadd",                 jvm_interp_iadd},
                {0x61,  1,      "ladd",                 jvm_interp_ladd},
                {0x62,  1,      "fadd",                 jvm_interp_fadd},
                {0x63,  1,      "dadd",                 jvm_interp_dadd},
                {0x64,  1,      "isub",                 jvm_interp_isub},
                {0x65,  1,      "lsub",                 jvm_interp_lsub},
                {0x66,  1,      "fsub",                 jvm_interp_fsub},
                {0x67,  1,      "dsub",                 jvm_interp_dsub},
                {0x68,  1,      "imul",                 jvm_interp_imul},
                {0x69,  1,      "lmul",                 jvm_interp_lmul},
                {0x6a,  1,      "fmul",                 jvm_interp_fmul},
                {0x6b,  1,      "dmul",                 jvm_interp_dmul},
                {0x6c,  1,      "idiv",                 jvm_interp_idiv},
                {0x6d,  1,      "ldiv",                 jvm_interp_ldiv},
                {0x6e,  1,      "fdiv",                 jvm_interp_fdiv},
                {0x6f,  1,      "ddiv",                 jvm_interp_ddiv},
                {0x70,  1,      "irem",                 jvm_interp_irem},
                {0x71,  1,      "lrem",                 jvm_interp_lrem},
                {0x72,  1,      "frem",                 jvm_interp_frem},
                {0x73,  1,      "drem",                 jvm_interp_drem},
                {0x74,  1,      "ineg",                 jvm_interp_ineg},
                {0x75,  1,      "lneg",                 jvm_interp_lneg},
                {0x76,  1,      "fneg",                 jvm_interp_fneg},
                {0x77,  1,      "dneg",                 jvm_interp_dneg},
                {0x78,  1,      "ishl",                 jvm_interp_ishl},
                {0x79,  1,      "lshl",                 jvm_interp_lshl},
                {0x7a,  1,      "ishr",                 jvm_interp_ishr},
                {0x7b,  1,      "lshr",                 jvm_interp_lshr},
                {0x7c,  1,      "iushr",                jvm_interp_iushr},
                {0x7d,  1,      "lushr",                jvm_interp_lushr},
                {0x7e,  1,      "iand",                 jvm_interp_iand},
                {0x7f,  1,      "land",                 jvm_interp_land},
                {0x80,  1,      "ior",                  jvm_interp_ior},
                {0x81,  1,      "lor",                  jvm_interp_lor},
                {0x82,  1,      "ixor",                 jvm_interp_ixor},
                {0x83,  1,      "lxor",                 jvm_interp_lxor},
                {0x84,  3,      "iinc",                 jvm_interp_iinc},       /* index const */
                {0x85,  1,      "i2l",                  jvm_interp_i2l},
                {0x86,  1,      "i2f",                  jvm_interp_i2f},
                {0x87,  1,      "i2d",                  jvm_interp_i2d},
                {0x88,  1,      "l2i",                  jvm_interp_l2i},
                {0x89,  1,      "l2f",                  jvm_interp_l2f},
                {0x8a,  1,      "l2d",                  jvm_interp_l2d},
                {0x8b,  1,      "f2i",                  jvm_interp_f2i},
                {0x8c,  1,      "f2l",                  jvm_interp_f2l},
                {0x8d,  1,      "f2d",                  jvm_interp_f2d},
                {0x8e,  1,      "d2i",                  jvm_interp_d2i},
                {0x8f,  1,      "d2l",                  jvm_interp_d2l},
                {0x90,  1,      "d2f",                  jvm_interp_d2f},
                {0x91,  1,      "i2b",                  jvm_interp_i2b},
                {0x92,  1,      "i2c",                  jvm_interp_i2c},
                {0x93,  1,      "i2s",                  jvm_interp_i2s},
                {0x94,  1,      "lcmp",                 jvm_interp_lcmp},
                {0x95,  1,      "fcmpl",                jvm_interp_fcmpl},
                {0x96,  1,      "fcmpg",                jvm_interp_fcmpg},
                {0x97,  1,      "dcmpl",                jvm_interp_dcmpl},
                {0x98,  1,      "dcmpg",                jvm_interp_dcmpg},
                /* every conditional branch carries a 2-byte branch offset */
                {0x99,  3,      "ifeq",                 jvm_interp_ifeq},
                {0x9a,  3,      "ifne",                 jvm_interp_ifne},
                {0x9b,  3,      "iflt",                 jvm_interp_iflt},
                {0x9c,  3,      "ifge",                 jvm_interp_ifge},
                {0x9d,  3,      "ifgt",                 jvm_interp_ifgt},
                {0x9e,  3,      "ifle",                 jvm_interp_ifle},
                {0x9f,  3,      "if_icmpeq",            jvm_interp_if_icmpeq},
                {0xa0,  3,      "if_icmpne",            jvm_interp_if_icmpne},
                {0xa1,  3,      "if_icmplt",            jvm_interp_if_icmplt},
                {0xa2,  3,      "if_icmpge",            jvm_interp_if_icmpge},
                {0xa3,  3,      "if_icmpgt",            jvm_interp_if_icmpgt},
                {0xa4,  3,      "if_icmple",            jvm_interp_if_icmple},
                {0xa5,  3,      "if_acmpeq",            jvm_interp_if_acmpeq},
                {0xa6,  3,      "if_acmpne",            jvm_interp_if_acmpne},
                {0xa7,  3,      "goto",                 jvm_interp_goto},
                {0xa8,  3,      "jsr",                  jvm_interp_jsr},        /* branchbyte1 branchbyte2 */
                {0xa9,  2,      "ret",                  jvm_interp_ret},        /* index */
                {0xaa,  1,      "tableswitch",          jvm_interp_tableswitch},        /* variable length */
                {0xab,  1,      "lookupswitch",         jvm_interp_lookupswitch},       /* variable length */
                {0xac,  1,      "ireturn",              jvm_interp_ireturn},
                {0xad,  1,      "lreturn",              jvm_interp_lreturn},
                {0xae,  1,      "freturn",              jvm_interp_freturn},
                {0xaf,  1,      "dreturn",              jvm_interp_dreturn},
                {0xb0,  1,      "areturn",              jvm_interp_areturn},
                {0xb1,  1,      "return",               jvm_interp_return},
                /* field access and invoke* take a 2-byte constant-pool index */
                {0xb2,  3,      "getstatic",            jvm_interp_getstatic},
                {0xb3,  3,      "putstatic",            jvm_interp_putstatic},
                {0xb4,  3,      "getfield",             jvm_interp_getfield},
                {0xb5,  3,      "putfield",             jvm_interp_putfield},
                {0xb6,  3,      "invokevirtual",        jvm_interp_invokevirtual},
                {0xb7,  3,      "invokespecial",        jvm_interp_invokespecial},
                {0xb8,  3,      "invokestatic",         jvm_interp_invokestatic},
                {0xb9,  5,      "invokeinterface",      jvm_interp_invokeinterface},    /* index(2) count 0 */
                {0xba,  5,      "invokedynamic",        jvm_interp_invokedynamic},      /* index(2) 0 0 */
                {0xbb,  3,      "new",                  jvm_interp_new},
                {0xbc,  2,      "newarray",             jvm_interp_newarray},   /* atype */
                {0xbd,  3,      "anewarray",            jvm_interp_anewarray},
                {0xbe,  1,      "arraylength",          jvm_interp_arraylength},
                {0xbf,  1,      "athrow",               jvm_interp_athrow},
                {0xc0,  3,      "checkcast",            jvm_interp_checkcast},
                {0xc1,  3,      "instanceof",           jvm_interp_instanceof},
                {0xc2,  1,      "monitorenter",         jvm_interp_monitorenter},
                {0xc3,  1,      "monitorexit",          jvm_interp_monitorexit},
                {0xc4,  1,      "wide",                 jvm_interp_wide},       /* variable length */
                {0xc5,  4,      "multianewarray",       jvm_interp_multianewarray},     /* index(2) dimensions */
                {0xc6,  3,      "ifnull",               jvm_interp_ifnull},
                {0xc7,  3,      "ifnonnull",            jvm_interp_ifnonnull},
                {0xc8,  5,      "goto_w",               jvm_interp_goto_w},     /* 4-byte branch offset */
                {0xc9,  5,      "jsr_w",                jvm_interp_jsr_w},      /* 4-byte branch offset */
                };

每个jvm指令的指令码就是数组的索引, 这样就能找到指令对应的BYTECODE结构,通过调用其回调函数, 就可以进入具体的解析过程了。 这样做的好处就是不用switch case一大堆分支了。

 

 

/*
 * Disassemble invokespecial: print the mnemonic followed by the constant-pool
 * index, in hex, built from the two operand bytes after the opcode as
 * (indexbyte1 << 8) | indexbyte2.
 *
 * len:    total instruction length from the opcode table (not used here)
 * symbol: mnemonic to print
 * base:   address of the opcode byte; the two bytes after it are read
 *
 * Returns 0.
 */
int jvm_interp_invokespecial(u2 len, char *symbol, void *base)
{
        /* index through a byte pointer: arithmetic on void * is a GNU extension */
        const u1 *insn = base;
        u2 index;

        (void)len;
        index = (u2)((insn[1] << 8) | insn[2]);
        printf("%s #%x\n", symbol, index);

        return 0;
}

/*
 * Disassemble aload_0: the instruction has no operands, so only the mnemonic
 * is printed. len and base are unused. Returns 0.
 */
int jvm_interp_aload_0(u2 len, char *symbol, void *base)
{
        (void)len;
        (void)base;
        printf("%s\n", symbol);

        return 0;
}

/*
 * Disassemble return: the instruction has no operands, so only the mnemonic
 * is printed. len and base are unused. Returns 0.
 */
int jvm_interp_return(u2 len, char *symbol, void *base)
{
        (void)len;
        (void)base;
        printf("%s\n", symbol);

        return 0;
}

/*
 * Disassemble a code array by dispatching every instruction to its handler in
 * jvm_byte_code and advancing by the table's instruction length.
 *
 * base: first byte of the code array
 * len:  number of bytes in the code array
 *
 * Returns 0 when the whole array was decoded, -1 when an opcode has no usable
 * table entry or an instruction would run past the end of the array.
 */
int __disass_bytecode(u1 *base, u2 len)
{
        /*
         * Wider than len on purpose: a u1 offset wraps after 255 bytes and the
         * loop would then never reach len for longer methods.
         */
        unsigned int idx = 0;

        while (idx < len) {
                u1 opcode = base[idx];
                BYTECODE *insn;

                if (opcode >= OPCODE_LEN) {
                        fprintf(stderr, "disass: unknown opcode 0x%x at offset %u\n",
                                opcode, idx);
                        return -1;
                }

                insn = &jvm_byte_code[opcode];
                /* a zero length would never advance idx; a NULL handler would crash */
                if (insn->func == NULL || insn->opcode_len == 0) {
                        fprintf(stderr, "disass: unsupported opcode 0x%x at offset %u\n",
                                opcode, idx);
                        return -1;
                }

                if (idx + insn->opcode_len > len) {
                        fprintf(stderr, "disass: truncated instruction at offset %u\n",
                                idx);
                        return -1;
                }

                insn->func(insn->opcode_len, insn->symbol, base + idx);
                idx += insn->opcode_len;
        }

        return 0;
}

目前这个反汇编器只能解析一小部分指令, 随着开发的深入, 会慢慢补全的, 下面是反汇编test.class的结果:

diassember bytecode:

aload_0
invokespecial #1
return

-----------------------------
iconst_0
istore_1
iconst_0
istore_1
iload_1
iconst_5
if_icmpge 17
getstatic #2
ldc #3
invokevirtual #4
iinc 1 1
goto 0xfff0
return

java工具集中提供了javap, 可以反汇编java指令,本来是想山寨一个javap的, 但是现在对jvm整体结构还是不清晰,数据结构还不能很好的设计出来, 但是随着对jvm的了解深入, 反汇编器会越来越成熟。


 
  
 
  

一、背景

笔者希望通过自己动手编写一个简单的jvm来了解java虚拟机内部的工作细节,毕竟hotspot以及android的dalvik都有几十万行c代码的级别。在前面的2篇开发笔记中已经实现了一个class文件解析器和一个java反汇编器,在这基础上,java虚拟机的雏形也已经写好。它还没有内存管理功能,也没有线程支持。它能解释执行的指令取决于我掌握的java语法范围:在这之前我对java一无所知,通过写这个jvm顺便也把java学会了。

它现在的功能如下

1、java反汇编器 山寨了javap的部分功能。 2、能解释执行如下jvm指令

iload_n, istore_n, aload_n, astore_n, iadd, isub, bipush, invokespecial, invokestatic, invokevirtual, goto, return, ireturn, if_icmpge, putfield, new, dup

 

源码地址 http://www.cloud-sec.org/jvm.tgz 举2个测试例子

test.java =========

// Helper class used by test.main below.
class aa {
        int a = 6;

        // Returns a + b. The parameter a shadows the field of the same name.
        int debug(int a, int b)
        {
                int sum;

                sum = a + b;

                return sum;
        }
}

// Sample program: creates an aa object and calls an instance method on it.
public class test {
        public static void main(String args[]) {
                int a;

                aa bb = new aa();
                // debug(1, 2) returns 3; the value is stored in a and not used afterwards
                a = bb.debug(1, 2);
        }
}

test7.java
==========

// Sample program built from nested static calls: main -> add -> sub.
public class test7 {
        // Returns value - 1. The local a is assigned but never read.
        static int sub(int value)
        {
                int a = 1;

                return value - 1;
        }

        // Returns sub(a + b), i.e. a + b - 1.
        static int add(int a, int b)
        {
                int sum = 0;
                int c;

                sum = a + b;

                c = sub(sum);

                return c;
        }

        // Calls add(1, 2); the result (2) is stored in ret and not used afterwards.
        public static void main(String args[]) {
                int a = 1, b = 2;
                int ret;

                ret = add(a, b);
                return ;
        }
}

二、JVM架构

2个核心文件:

classloader.c   – 从硬盘加载class文件并解析。
interp_engine.c – bytecode解释器。

运行时数据区

————————————————————–
| 方法区(method) | 堆栈(stack) | 程序计数器(pc) |
————————————————————–

注意这里缺少了heap, native stack 因为我们现在还不支持这些功能。
每个method都有自己对应的栈帧stack frame 在class文件解析的时候就已经创建好。

/* Per-method stack frame; frames are chained through prev_stack. */
typedef struct jvm_stack_frame {
        u1 *local_var_table;        // pointer to the local variable table
        u1 *operand_stack;          // pointer to the operand stack
        u4 *method;
        u1 *return_addr;            // return address saved when the method calls another function
        u4 offset;                  // offset into the operand stack
        u2 max_stack;               // NOTE(review): the original comment said "number of variables in the local variable table"; by name this looks like the operand stack depth instead — confirm
        u2 max_locals;              // NOTE(review): the original comment said "number of variables on the operand stack"; by name this looks like the local variable count instead — confirm
        struct jvm_stack_frame *prev_stack;    // points to the previous stack frame
}JVM_STACK_FRAME;

定义了一个叫curr_jvm_stack的全局变量 它用来保存当前解释器使用的栈帧结构 在jvm初始化的时候进行设置

/*
 * Allocate the initial stack frame and make it the current one.
 *
 * curr_jvm_stack is pointed at a freshly allocated, zero-filled
 * JVM_STACK_FRAME and jvm_stack_depth is reset to 0.
 *
 * Returns 0 on success, -1 when the allocation fails.
 */
int jvm_stack_init(void)
{
        curr_jvm_stack = malloc(sizeof(*curr_jvm_stack));
        if (!curr_jvm_stack) {
                __error("malloc failed.");
                return -1;
        }
        /* the fill value must be 0; an empty character constant '' does not compile */
        memset(curr_jvm_stack, 0, sizeof(*curr_jvm_stack));

        jvm_stack_depth = 0;

        return 0;
}

三、实现细节

1、 虚拟机执行过程

初始化jvm_init()
从磁盘加载class文件并解析在内存建立方法区数据结构 初始化内存堆栈 初始化jvm运行环境。

解释器运行 jvm_run()
初始化程序计数器pc, 从方法区中查找main函数开始解释执行。

退出 jvm_exit()
释放所有数据结构

2、class文件加载与解析

对于每一个class文件使用CLASS数据结构表示

/*
 * In-memory description of one class file. The scalar fields are named after
 * the corresponding items of the class file structure.
 */
typedef struct jvm_class {
        u4 class_magic;                
        u2 access_flag;                
        u2 this_class;
        u2 super_class;
        u2 minor_version;
        u2 major_version;
        u2 constant_pool_count;
        u2 interfaces_count;
        u2 fileds_count;
        u2 method_count;
        char class_file[1024];                  /* name of the class file on disk */
        struct constant_info_st *constant_info; /* constant pool contents */
        struct list_head interface_list_head;   /* list head for the interfaces */
        struct list_head filed_list_head;       /* list head for the fields */
        struct list_head method_list_head;      /* list head for the methods */
        struct list_head list;                  /* presumably links this CLASS into a list of classes — TODO confirm */
}CLASS;

CLASS结构的前部分是按java虚拟机规范中对class文件结构的描述设置的。class_file保存的是这个CLASS结构对应的磁盘class文件名。constant_info保存的是class文件常量池的字符串(utf8)。interface_list_head、filed_list_head、method_list_head分别是接口、字段、方法的链表头。

在解析class文件的时候 只解析了java虚拟机规范中规定的一个jvm最起码能解析的属性。 这个部分没什么好说的大家直接看源码 在对照java虚拟机规范就能看懂了。

3、解释器设计

java虚拟机规范中一共涉及了201条指令。没有使用switch case这种常用的算法。而是为每个jvm指令设计了一个数据结构

/*
 * Per-instruction callback. It receives the instruction's length, its
 * mnemonic and a pointer to the opcode byte.
 */
typedef int (*interp_func)(u2 opcode_len, char *symbol, void *base);

/* One entry of the opcode table. */
typedef struct bytecode_st {
        u2 opcode;                              /* JVM opcode value */
        u2 opcode_len;                          /* length of the instruction */
        char symbol[OPCODE_SYMBOL_LEN];         /* mnemonic of the instruction */
        interp_func func;                       /* function that interprets the instruction */
}BYTECODE;

opcode是jvm指令的机器码 opcode_len是这条jvm指令的长度symbol指令的助记符func是具体的这条指令解释函数。事先建立了一个BYTECODE数组

BYTECODE jvm_byte_code[OPCODE_LEN] = {
                {0x00,  1,      "nop",                  jvm_interp_nop},
                {0x01,  1,      "aconst_null",          jvm_interp_aconst_null},
                {0x02,  1,      "iconst_m1",            jvm_interp_iconst_m1},
                {0x03,  1,      "iconst_0",             jvm_interp_iconst_0},
                {0x04,  1,      "iconst_1",             jvm_interp_iconst_1},
                {0x05,  1,      "iconst_2",             jvm_interp_iconst_2},
                {0x06,  1,      "iconst_3",             jvm_interp_iconst_3},
                {0x07,  1,      "iconst_4",             jvm_interp_iconst_4},
                {0x08,  1,      "iconst_5",             jvm_interp_iconst_5},
                {0x09,  1,      "lconst_0",             jvm_interp_lconst_0},
                {0x0a,  1,      "lconst_1",             jvm_interp_lconst_1},
                {0x0b,  1,      "fconst_0",             jvm_interp_fconst_0},
         ...
                {0xc5,  1,      "multianewarray",       jvm_interp_multianewarray},
                {0xc6,  1,      "ifnull",               jvm_interp_ifnull},
                {0xc7,  1,      "ifnonnull",            jvm_interp_ifnonnull},
                {0xc8,  1,      "goto_w",               jvm_interp_goto_w},
                {0xc9,  1,      "jsr_w",                jvm_interp_jsr_w},
                };

/*
 * Print invokespecial together with the constant-pool index, in hex, built
 * from the two operand bytes after the opcode as
 * (indexbyte1 << 8) | indexbyte2.
 *
 * len:    total instruction length from the opcode table (not used here)
 * symbol: mnemonic to print
 * base:   address of the opcode byte; the two bytes after it are read
 *
 * Returns 0.
 */
int jvm_interp_invokespecial(u2 len, char *symbol, void *base)
{
        /* index through a byte pointer: arithmetic on void * is a GNU extension */
        const u1 *insn = base;
        u2 index;

        (void)len;
        index = (u2)((insn[1] << 8) | insn[2]);
        /* the format string had lost its backslash and printed a literal 'n' */
        printf("%s #%x\n", symbol, index);

        return 0;
}

/*
 * Print the mnemonic of aload_0, which has no operands. len and base are
 * unused. Returns 0.
 */
int jvm_interp_aload_0(u2 len, char *symbol, void *base)
{
        (void)len;
        (void)base;
        /* the format string had lost its backslash and printed a literal 'n' */
        printf("%s\n", symbol);

        return 0;
}

/*
 * Print the mnemonic of return, which has no operands. len and base are
 * unused. Returns 0.
 */
int jvm_interp_return(u2 len, char *symbol, void *base)
{
        (void)len;
        (void)base;
        /* the format string had lost its backslash and printed a literal 'n' */
        printf("%s\n", symbol);

        return 0;
}

对于一段bytecode:0x2a0xb70x00x10xb1, 手工解析如下:

0x2a代表aload_0指令 它将本地局部变量中的第一个变量压入到堆栈里。这个指令本身长度就是一个字节没有参数 因此0x2a的解析就非常简单 直接在屏幕打印出aload_0即可

printf("%s\n", symbol);

0xb7代表invokespecial 它用来调用超类构造方法实例初始化方法 私有方法。它的用法如下
invokespecial indexbyte1 indexbyte2,indexbyte1和indexbyte2各占一个字节,用(indexbyte1 << 8) | indexbyte2来构建一个常量池中的索引。每个jvm指令本身都占用一个字节,加上它的两个参数, invokespecial语句将占用3个字节空间。 所以它的解析算法如下:

        u2 index;

        index = ((*(u1 *)(base + 1)) << 8) | (*(u1 *)(base + 2));
        printf("%s #%x\n", symbol, index);

注意0xb7解析完后我们要跳过3个字节的地址那么就是0xb1了 它是return指令没有参数因此它的解析方法跟aload_0一样
printf("%s\n", symbol);

用程序代码实现是

/*
 * Interpret bytecode, starting in the context of the given method.
 *
 * The method's stack frame becomes the current frame and its class's constant
 * pool becomes the current interpreter environment. The loop then fetches the
 * opcode at jvm_pc.pc and dispatches it through jvm_byte_code until
 * jvm_stack_depth is 0. This function never advances jvm_pc.pc itself; that
 * is left to the handlers.
 *
 * Returns 0 when interpretation finished, -1 when an opcode has no handler.
 */
int interp_bytecode(CLASS_METHOD *method)
{
        u1 index;       /* opcode being dispatched; it had no declaration before */

        jvm_stack_depth++;                                      /* one more active call */
        curr_jvm_stack = &method->code_attr->stack_frame;       /* switch to this method's frame */

        /* set up the current run environment */
        curr_jvm_interp_env->constant_info = method->class->constant_info;
        curr_jvm_interp_env->prev_env = NULL;

        for (;;) {
                /* depth 0 means every function has finished executing */
                if (jvm_stack_depth == 0) {
                        printf("interpret bytecode done.\n");
                        break;
                }

                index = *(u1 *)jvm_pc.pc;       /* fetch the opcode at the program counter */
                if (index >= OPCODE_LEN || jvm_byte_code[index].func == NULL) {
                        fprintf(stderr, "interp: unsupported opcode 0x%x\n", index);
                        return -1;
                }

                jvm_byte_code[index].func(jvm_byte_code[index].opcode_len,
                        jvm_byte_code[index].symbol, jvm_pc.pc);
                /* NOTE(review): one-second pause per instruction, presumably to make the demo output readable */
                sleep(1);
        }

        return 0;
}

举个例子

/*
 * Interpret iadd: pop two int values from the operand stack, push their sum
 * and advance the program counter past the instruction.
 *
 * len:    instruction length; added to jvm_pc.pc
 * symbol: mnemonic, printed as a trace line
 * base:   address of the opcode byte (not used here)
 *
 * Returns 0.
 */
int jvm_interp_iadd(u2 len, char *symbol, void *base)
{
        u4 tmp1, tmp2;

        (void)base;
        /* the format string had lost its backslash and printed a literal 'n' */
        printf("%s\n", symbol);

        pop_operand_stack(int, tmp1)
        pop_operand_stack(int, tmp2)

        push_operand_stack(int, (tmp1 + tmp2))
        jvm_pc.pc += len;

        return 0;
}

jvm_interp_iadd用于解释执行iadd指令 首先从操作数栈中弹出2个int型变量tmp1, tmp2。
把tmp1 + tmp2相加后在压入到操作数栈里。

下面是test7.java的执行演示

// Sample program used for the execution demo: main -> add -> sub.
public class test7 {
        // Returns value - 1. The local a is assigned but never read.
        static int sub(int value)
        {
                int a = 1;

                return value - 1;
        }

        // Returns sub(a + b), i.e. a + b - 1.
        static int add(int a, int b)
        {
                int sum = 0;
                int c;

                sum = a + b;

                c = sub(sum);

                return c;
        }

        // Calls add(1, 2); the result (2) is stored in ret and not used afterwards.
        public static void main(String args[]) {
                int a = 1, b = 2;
                int ret;

                ret = add(a, b);
                return ;
        }
}

 


 
  
 
  

ajvm是一个笔者正在开发中的java虚拟机, 用c和少量汇编语言编写, 目的在于探究一个可运行的java虚拟机是如何实现的, 目前整个jvm的source code代码量在5000行左右, 预计控制在1w行以内,只要能运行简单的java代码即可。笔者希望ajvm能变成一个教学用的简单java虚拟机实现, 帮助java程序员在陷入庞大的hotspot vm源码之前, 能对jvm的结构有个清晰的认识。 ajvm是笔者利用业余时间编写的, 每次完成一个重要功能都会以笔记的形式发布到ata, 和大家共同学习和探讨。

 

git repo:  https://github.com/cloudsec/ajvm
git clone [email protected]:cloudsec/ajvm.git

 

最近笔者给ajvm增加了stack calltrace的功能, 用于帮助和调试jvm crash后的信息。 大家知道oracle的hotspot jvm在crash后会给出大量的crash信息, 这些信息能帮助jvm开发人员快速定位问题。同样, ajvm也增加了类似的功能:

 

1、calltrace(),  打印函数调用栈。

2、截获SIGSEGV信号, jvm segfault后, 打印离堆栈指针rsp最近的16字节信息;打印cpu寄存器信息;打印函数调用栈。

 

首先看如何打印函数调用栈:

笔者在《理解堆栈及其利用方法 》: http://blog.aliyun.com/964?spm=0.0.0.0.BykR2E

这篇paper中详细讲述了intel x86和x86_64下进程堆栈的结构, 关于堆栈的基础知识请大家参考此paper。

下面举一个简单的例子:

 

#include 

#include "trace.h"
#include "log.h"

/*
 * Innermost function of the demo call chain: prints the current call trace
 * and then writes through address 0. The write looks deliberate, presumably
 * to provoke the segfault that the SIGSEGV handler is meant to report.
 */
void test2()
{
        calltrace();
        *(int *)0 = 0;
}

/* Middle link of the demo call chain; it only calls test2(). */
void test1()
{
        test2();
}

/* First link of the demo call chain; it only calls test1(). */
void test()
{
        test1();
}

/*
 * Demo driver: initialises logging, records main's frame pointer in top_rbp,
 * initialises the calltrace support and then enters the
 * test() -> test1() -> test2() chain.
 */
int main(void)
{
        log_init();
        /* top_rbp is the value calltrace() compares against to stop walking frames */
        GET_BP(top_rbp);
        calltrace_init();
        test();

        return 0;
}

 

在test2函数中调用了calltrace()函数, 用来打印它的函数调用栈, 我们知道它的函数调用栈是这样的: main->test->test1->test2->calltrace。我们想让calltrace的输出信息类似如下:

test2
test1
test
main

 

要完成此功能, 我们要利用gcc编译器的一个特点, 注意在-O2或-fomit-frame-pointer参数下, 这个方法就无效了。 反汇编这个程序后, 会发现每个函数调用的开头总会有这么几句汇编指令:

 

0000000000401138 :
  401138:       55                         push   %rbp
  401139:       48 89 e5                   mov    %rsp,%rbp

000000000040114e :
  40114e:       55                         push   %rbp
  40114f:       48 89 e5                   mov    %rsp,%rbp

000000000040115e :
  40115e:       55                         push   %rbp
  40115f:       48 89 e5                   mov    %rsp,%rbp

000000000040116e :
  40116e:       55                         push   %rbp
  40116f:       48 89 e5                   mov    %rsp,%rbp

 

大家想起来了吧, rbp在intel处理器中代表的是一个堆栈中栈帧开始的地址, rsp代表当前堆栈栈顶的地址。在c语言中一个函数的调用过程是这样的:

 

test()
{
       test1();
}

 

在test函数中调用test1()的时候,  cpu会先自动把test1函数后面的指令地址压入test1函数的栈帧里, 然后在执行push rbp; mov rsp, rbp指令。 我们画一下,从main函数到calltrace函数的整个堆栈栈帧结构:

 

        |...|
        |rbp|<--|   push rbp; mov rsp, rbp
ctrace->|rip|   |   call calltrace + 1
        |...|   |
        |rbp|<--|   push rbp; mov rsp, rbp
test2-> |rip|   |   call test2 + 1
        |...|   |
        |rbp|<--|   push rbp; mov rsp, rbp
test1-> |rip|   |   call test1 + 1
        |...|   |
        |rbp|<--|   push rbp; mov rsp, rbp
test->  |rip|   |   call test + 1
        |...|   |
        |rbp|<--|   push rbp; mov rsp, rbp
main->  |rip|   |   call main + 1
        |...|   |
glibc   |...|<--|   rbp->unkonwn

所以在正常情况下堆栈的栈帧中每个rbp后面,保存的都是上一个函数的返回地址, calltrace的实现其实就很简单了, 首先得到rbp的地址,然后rbp后面的地址就是ret rip的地址, 通过这个地址,我们可以解析出栈帧对应的符号信息, 因为ajvm通过自己解析elf文件, 来获得符号表信息。 calltrace的大致实现如下:

/*
 * Print the chain of callers of the current function, innermost first.
 *
 * Walks the saved frame pointers starting at this function's own frame and
 * stops when the walk reaches top_rbp. For every frame the return address is
 * turned into a function address via compute_real_func_addr() and resolved
 * with search_symbol_by_addr(); a failed lookup terminates the process.
 */
void calltrace(void)
{
        CALL_TRACE callee, current;
        uint64_t *frame, ret_addr, func_addr;
        int have_callee = 0;

        printf("Call trace:\n\n");
        GET_BP(frame)
        while (frame != top_rbp) {
                /* Slot above the saved rbp holds the return address; read it before moving up. */
                ret_addr = frame[1];
                frame = (uint64_t *)frame[0];
                func_addr = compute_real_func_addr(ret_addr);

                if (!have_callee) {
                        /* First frame: only remember it, nothing is printed yet. */
                        if (search_symbol_by_addr(func_addr, &callee) == -1) {
                                __error("calltrace: search symbol failed.");
                                exit(-1);
                        }
                        /* presumably 5 is the length of the call instruction -- TODO confirm */
                        callee.rip = ret_addr - 5;
                        have_callee = 1;
                        continue;
                }

                if (search_symbol_by_addr(func_addr, &current) == -1) {
                        __error("calltrace: search symbol failed.");
                        exit(-1);
                }

                current.rip = ret_addr - 5;
                /* Offset of the previously recorded rip inside the function just resolved. */
                current.offset = callee.rip - current.symbol_addr;
                show_calltrace(&current);

                callee = current;
        }
        printf("\n");
}

 

我们刚才讲ajvm还截获了进程的SIGSEGV信号处理流程, 在jvm初始化的时候,通过signal_init()来实现:

 

int signal_init(void)
{
        struct sigaction sa;

        sa.sa_flags = SA_SIGINFO;
        sa.sa_sigaction = signal_handler;
        sigemptyset(&sa.sa_mask);

        if (sigaction(SIGSEGV, &sa, NULL) == -1) {
                perror("sigaction");
                return -1;
        }

        return 0;
}

 

当jvm crash后, signal_handler()函数接管了信号的处理流程, 注意此时整个jvm进程的堆栈结构跟calltrace结构有一点不一样:

 

        |...|
        |rbp|<--|   push rbp; mov rsp, rbp
do_sig->|rip|   |   unknown
        |...|<----- segfault
        |...|
        |rbp|<--|   push rbp; mov rsp, rbp
test2-> |rip|   |   call test2 + 1
        |...|   |
        |rbp|<--|   push rbp; mov rsp, rbp
test1-> |rip|   |   call test1 + 1
        |...|   |
        |rbp|<--|   push rbp; mov rsp, rbp
test->  |rip|   |   call test + 1
        |...|   |
        |rbp|<--|   push rbp; mov rsp, rbp
main->  |rip|   |   call main + 1
        |...|   |
glibc   |...|<--|   rbp->unknown

test2并没有调用do_sig函数(即下面的signal_handler), 这是因为test2函数里有一个空指针引用的操作, 操作系统内核在处理这个缺页异常的时候, 向进程发送了SIGSEGV信号。 通常情况下, 这会直接杀死进程, 但是这个信号被do_sig函数接管了, 我们要在这个函数里打印充足的调试信息后, 再退出进程。

 

/*
 * SIGSEGV handler (three-argument SA_SIGINFO form): reports the fault, dumps
 * the stack and registers, prints the call trace by walking saved frame
 * pointers up to top_rbp, then terminates the process.
 *
 * sig_num  - delivered signal number (not used in the body)
 * sig_info - must be non-NULL; si_addr, si_signo and si_errno are printed
 * ptr      - third handler argument (not used in the body)
 *
 * NOTE(review): printf() and exit() are not async-signal-safe; tolerated here
 * because the process is terminating anyway -- confirm this is intended.
 */
void signal_handler(int sig_num, siginfo_t *sig_info, void *ptr)
{
        CALL_TRACE trace, prev_trace;
        uint64_t *rbp, rip, real_rip;
        int flag = 0, first_bp = 0;

        assert(sig_info != NULL);
        /* si_addr is a pointer: print it through a 64-bit conversion, not %x. */
        printf("\nPid: %d segfault at addr: 0x%016lx\tsi_signo: %d\tsi_errno: %d\n\n",
                getpid(), (unsigned long)sig_info->si_addr,
                sig_info->si_signo, sig_info->si_errno);

        show_stack();
        show_registers();

        printf("Call trace:\n\n");
        GET_BP(rbp)
        while (rbp != top_rbp) {
                /* Read the return address above the saved rbp before moving up one frame. */
                rip = *(uint64_t *)(rbp + 1);
                rbp = (uint64_t *)*rbp;
                real_rip = compute_real_func_addr(rip);

                if (flag == 1) {
                        if (search_symbol_by_addr(real_rip, &prev_trace) == -1) {
                                __error("calltrace: search symbol failed.");
                                exit(-1);
                        }

                        prev_trace.rip = rip - 5;
                        if (first_bp == 0) {
                                /* First printed frame: no usable inner call site, so report offset 0. */
                                first_bp = 1;
                                prev_trace.offset = 0;
                        }
                        else {
                                prev_trace.offset = trace.rip - prev_trace.symbol_addr;
                        }
                        show_calltrace(&prev_trace);

                        trace = prev_trace;
                }
                else {
                        /* We are inside a signal handler: the last call frame is unknown,
                         * so the rip address cannot be located and the lookup result is
                         * deliberately not checked. */
                        search_symbol_by_addr(real_rip, &trace);
                        trace.rip = rip - 5;
                        flag = 1;
                }
        }
        printf("\n");

        exit(0);
}

至于show_stack()和show_registers()函数就很简单了:

/*
 * Register snapshot helpers (GCC inline asm, x86-64 AT&T syntax).
 * Each macro copies one 64-bit general-purpose register into the lvalue x.
 * The expansion already ends with ';', so call sites may omit their own
 * semicolon.
 */
#define GET_BP(x)               asm("movq %%rbp, %0":"=r"(x));
#define GET_SP(x)               asm("movq %%rsp, %0":"=r"(x));
#define GET_AX(x)               asm("movq %%rax, %0":"=r"(x));
#define GET_BX(x)               asm("movq %%rbx, %0":"=r"(x));
#define GET_CX(x)               asm("movq %%rcx, %0":"=r"(x));
#define GET_DX(x)               asm("movq %%rdx, %0":"=r"(x));
#define GET_SI(x)               asm("movq %%rsi, %0":"=r"(x));
#define GET_DI(x)               asm("movq %%rdi, %0":"=r"(x));
#define GET_R8(x)               asm("movq %%r8, %0":"=r"(x));
#define GET_R9(x)               asm("movq %%r9, %0":"=r"(x));
#define GET_R10(x)              asm("movq %%r10, %0":"=r"(x));
#define GET_R11(x)              asm("movq %%r11, %0":"=r"(x));
#define GET_R12(x)              asm("movq %%r12, %0":"=r"(x));
#define GET_R13(x)              asm("movq %%r13, %0":"=r"(x));
#define GET_R14(x)              asm("movq %%r14, %0":"=r"(x));
#define GET_R15(x)              asm("movq %%r15, %0":"=r"(x));

/*
 * Print the current stack pointer and frame pointer, followed by the first
 * 16 bytes found at the stack pointer.
 *
 * The pointers are printed as full 64-bit values; the previous "%016x"
 * conversion did not match a pointer argument and dropped the upper 32 bits.
 */
void show_stack(void)
{
        int i;
        uint64_t *rsp, *rbp;

        GET_SP(rsp);
        GET_BP(rbp);
        printf("Stack:\t\t\nrsp: 0x%016lx\t\trbp: 0x%016lx\n",
                (unsigned long)rsp, (unsigned long)rbp);
        /* Raw dump, one byte at a time, starting at the stack pointer. */
        for (i = 0; i < 16; i++) {
                printf("0x%02x ", *((unsigned char *)rsp + i));
        }
        printf("\n\n");
}

/*
 * Print the general-purpose registers rax..rdi and r8..r15.
 *
 * Fixes over the previous version:
 *  - r8 is now actually read and passed; before, the format string had 14
 *    conversions but only 13 arguments, so every value from "r8" onwards was
 *    shifted by one and the last conversion read a missing argument.
 *  - values are printed with a 64-bit conversion instead of "%x".
 *
 * NOTE(review): the registers are sampled inside this function, so they may
 * no longer hold the values they had at the point of interest -- confirm
 * whether the saved signal context should be used instead.
 */
void show_registers(void)
{
        /* unsigned long long is 64 bits wide here and matches "%llx" without casts. */
        unsigned long long rax, rbx, rcx, rdx, rsi, rdi;
        unsigned long long r8, r9, r10, r11, r12, r13, r14, r15;

        GET_AX(rax)
        GET_BX(rbx)
        GET_CX(rcx)
        GET_DX(rdx)
        GET_SI(rsi)
        GET_DI(rdi)
        GET_R8(r8)
        GET_R9(r9)
        GET_R10(r10)
        GET_R11(r11)
        GET_R12(r12)
        GET_R13(r13)
        GET_R14(r14)
        GET_R15(r15)
        printf("Registers:\n");
        printf("rax = 0x%016llx, rbx = 0x%016llx, rcx = 0x%016llx, rdx = 0x%016llx\n"
                "rsi = 0x%016llx, rdi = 0x%016llx, r8 = 0x%016llx, r9 = 0x%016llx\n"
                "r10 = 0x%016llx, r11 = 0x%016llx, r12 = 0x%016llx, r13 = 0x%016llx\n"
                "r14 = 0x%016llx, r15 = 0x%016llx\n\n",
                rax, rbx, rcx, rdx, rsi, rdi,
                r8, r9, r10, r11, r12, r13, r14, r15);
}

最后演示一下ajvm在crash后的出错信息:

 

Pid: 8739 segfault at addr: 0x0000000000000000  si_signo: 11    si_errno: 0

Stack:
rsp: 0x00000000caa88680         rbp: 0x00000000caa886a0
0x90 0x87 0xa8 0xca 0xff 0x7f 0x00 0x00 0x58 0xd3 0xe4 0x3d 0x0c 0x00 0x00 0x00

Registers:
rax = 0x000000003de6c144, rbx = 0x000000003e151780, rcx = 0x0000000000000001, rdx = 0x0000000000000001
rsi = 0x000000003de6317a, rdi = 0x0000000000000000, r8 = 0x00000000caa886a0, r9 = 0x0000000000000000
r10 = 0x000000000040accf, r11 = 0x00000000caa88790, r12 = 0x000000003de4d358, r13 = 0x00000000caa88680
r14 = 0x00000000caa886a0, r15 = 0x000000000000000b

Call trace:

[<0x401457>] jvm_pc_init + 0x0/0x42
[<0x4015dc>] jvm_run + 0x4b/0x7d

 

利用这个crash信息, 可以帮助程序员快速定位ajvm的bug。

 

一、 前言
ajvm是笔者正在开发中的一个java虚拟机, 想通过编写这个jvm帮助程序员了解jvm的具体实现细节, 它是国内第一个开源的java虚拟机项目:https://github.com/cloudsec/ajvm, 同时笔者把它的开发笔记也分享到了ata上。 在前面4篇笔记中, 已经实现了class文件加载器, 反汇编器,jvm的crash信息处理, 同时它已经能运行简单的java代码了。 在今天的这篇笔记中, 将开始分享ajvm的内存管理模块是如何编写的。

二、内存分配

看下面一段java代码:

// Sample program: its compiled bytecode is what the article disassembles and interprets.
public class test6 {
        public static void main(String args[]) {
                int[] data, data1;
                int i;
                int num = 0;

                // Two-element int array; every element is written in the loop below.
                data = new int[2];
                for (i = 0; i < 2; i++) {
                        data[i] = i;
                }

                // Allocated but never read or written afterwards.
                data1 = new int[3];
        }
}

首先用javac编译下, 然后用ajvm的反汇编器查看bytecode:

$./wvm -d test/test6.class
Diassember bytecode:

<init>    ()V
stack: 1    local: 1

    0: aload_0
    1: invokespecial #1
    4: return

main    ([Ljava/lang/String;)V
stack: 3    local: 5

    0: iconst_0
    1: istore 4
    3: iconst_2
    4: newarray 10
    6: astore_1
    7: iconst_0
    8: istore_3
    9: iload_3
   10: iconst_2
   11: if_icmpge 13
   14: aload_1
   15: iload_3
   16: iload_3
   17: iastore
   18: iinc 3 1
   21: goto 0xfffffff4
   24: iconst_3
   25: newarray 10
   27: astore_2
   28: return

源码中data = new int[2];对应的汇编指令为:

    4: newarray 10

根据jvm虚拟机规范的描述, newarray指令的作用是: 从操作数堆栈中取出data数组的元素个数, 然后根据newarray后面的type计算要申请的内存大小。 type的值在虚拟机规范中定义如下:

/*
 * Element type codes carried as the operand of the newarray instruction
 * (values as given by the JVM specification).
 */
#define T_BOOLEAN                               4
#define T_CHAR                                  5
#define T_FLOAT                                 6
#define T_DOUBLE                                7
#define T_BYTE                                  8
#define T_SHORT                                 9
#define T_INT                                   10
#define T_LONG                                  11

所以10代表这是一个int类型的数组, 接下来就要给data这个数组从heap中分配内存了。

/*
 * Allocate backing storage for a primitive array created by newarray.
 *
 * atype - element type code (T_BOOLEAN .. T_LONG)
 * count - number of elements; must not be negative
 *
 * Returns the address handed back by slab_alloc(), or NULL when atype is not
 * a known code or count is negative.
 */
void *alloc_newarray_memroy(u1 atype, int count)
{
        void *addr = NULL;

        /* A negative count would turn into a huge unsigned size below. */
        if (count < 0) {
                error("negative array size.\n");
                return NULL;
        }

        switch (atype) {
        case T_BOOLEAN:
        case T_BYTE:
                addr = (void *)slab_alloc(jvm_thread_mem, count * sizeof(char));
                break;
        case T_CHAR:    /* a Java char is a 16-bit value, not a one-byte C char */
        case T_SHORT:
                addr = (void *)slab_alloc(jvm_thread_mem, count * sizeof(short));
                break;
        case T_INT:
        case T_FLOAT:
                addr = (void *)slab_alloc(jvm_thread_mem, count * sizeof(int));
                break;
        case T_LONG:
        case T_DOUBLE:
                addr = (void *)slab_alloc(jvm_thread_mem, count * sizeof(long long));
                break;
        default:
                error("bad atype value.\n");
                return NULL;
        }

        return addr;
}

ajvm的内存堆用的是slab算法, slab的内存结构如下:

   -------     ------     ------    ------
   |cache|-->  |slab| --> |slab| -->|slab|
   -------     ------     ------    ------
   |cache|
   -----
   |cache| ...
   -----      ------     ------    ------
   |cache|--> |slab| --> |slab| -->|slab|
   -----      ------     -----     ------
   |cache| ...
   -------    
   |cache|
   -------
   |cache|-->|slab|-->|slab| -->|slab|
   -------   ------   ------    ------

源码中的slab.c是它完整的实现, 不熟悉slab的同学请自行google。

三、垃圾回收

gc是java程序员普遍关心的问题, 当内存不够时, 将会触发jvm的垃圾回收机制。
ajvm使用最原始的引用计数法, 需要建立一个新的数据结构:

/*
 * Bookkeeping record for one heap allocation tracked by the
 * reference-counting garbage collector.
 */
typedef struct jvm_object {
        int ref_count;                  /* number of recorded references; start_gc() frees records where this is 0 */
        CLASS *class;                   /* NOTE(review): presumably the owning class; not set in the code shown here */
        void *addr;                     /* start address of the allocated storage */
        int size;                       /* size of the array; create_new_obj() is passed the element count -- TODO confirm unit */
        struct list_head list;          /* links the record into the object list walked by start_gc() */
}JVM_OBJECT;

当数组申请完内存后, 将会建立一个新的JVM_OBJECT与其对应, ref_count被初始化为0, addr指向数组的首地址, size表示数组的大小。 JVM_OBJECT将会被加入到jvm_obj_list_head链表中, 在将来的垃圾回收时将会用到。

/*
 * Interpreter routine for newarray (excerpt: the "..." lines are elided in
 * the article). The visible part allocates the array storage and then creates
 * the JVM_OBJECT record for it; either failure returns -1.
 *
 * NOTE(review): when create_new_obj() fails, addr is not released in the
 * visible code -- confirm whether the elided part or the caller frees it.
 * NOTE(review): the trailing "n" in the format strings looks like a newline
 * escape whose backslash was lost, and addr is a pointer printed with %x --
 * verify against the real source.
 */
int jvm_interp_newarray(u2 len, char *symbol, void *base)
{
    ...
        addr = (void *)alloc_newarray_memroy(atype, count);
        if (!addr) {
                error("slab alloc failed.n");
                return -1;
        }
        printf("addr: 0x%xn", addr);

        new_obj = create_new_obj(addr, count);
        if (!new_obj) {
                error("create new obj failed.n");
                return -1;
        }
    ...
}

当数组被引用时, 我们根据数组的地址在JVM_OBJECT链表中找到它, 并且把ref_count加1, 表示这个数组正在被引用。 比如上面的:

      17: iastore

这条指令就会对data数组进行引用, 我们只要在iastore的解释代码里, 对data对应的ref_count加1即可:

/*
 * Interpreter routine for iastore: pops value, index and array reference from
 * the operand stack, stores value into the array and bumps the reference
 * count of the object that owns the array.
 *
 * len    - encoded length of the instruction; added to jvm_pc.pc on success
 * symbol - mnemonic printed in disassembly mode
 * base   - not used in the body
 *
 * Returns 0 on success (and in disassembly mode, where nothing is executed),
 * -1 when the array reference is null or the reference count cannot be
 * updated.
 */
int jvm_interp_iastore(u2 len, char *symbol, void *base)
{
        int *addr, index, value;

        if (jvm_arg->disass_class) {
                printf("%s\n", symbol);
                return 0;
        }

        /* Operands come off the stack in reverse push order. */
        pop_operand_stack(int, value)
        pop_operand_stack(int, index)
        pop_operand_stack(int, addr)

        /* addr is a pointer: print it through a long conversion, not %x. */
        printf("addr: 0x%lx\tindex: %d\t%d\n", (unsigned long)addr, index, value);

        /* Storing through a null array reference would crash the interpreter itself. */
        if (!addr) {
                jvm_error(VM_ERROR_INTERP, "iastore: null array reference.\n");
                return -1;
        }
        *(int *)(addr + index) = value;

        if (inc_obj_ref(addr, (&jvm_obj_list_head)) == -1) {
                jvm_error(VM_ERROR_INTERP, "inc jvm obj ref failed.\n");
                return -1;
        }

        jvm_pc.pc += len;
        return 0;
}

对于数组data1, 同样进行了内存分配, 但是始终没有被引用到, 所以data1将会是gc回收时要释放的对象。

 

/*
 * Sweep the object list and release every record whose reference count is 0.
 *
 * list_head - head of the JVM_OBJECT list to sweep
 *
 * The "safe" iterator is used because entries are unlinked while the list is
 * being walked.
 */
void start_gc(struct list_head *list_head)
{
        JVM_OBJECT *s;
        struct list_head *p, *q;

        list_for_each_safe(p, q, list_head) {
                s = list_entry(p, JVM_OBJECT, list);
                if (s && s->ref_count == 0) {
                        /* s->addr is a pointer: print it through a long conversion, not %x. */
                        printf("free addr: 0x%lx\tsize: %d\tref_count: %d\n",
                                (unsigned long)s->addr, s->size, s->ref_count);
                        list_del(p);
                        free_jvm_obj(s);
                }
        }
}

这是ajvm最简单的gc算法了, 后续将会对其进行优化。

四、演示执行

下面是ajvm对上述java代码的解释和执行过程:

$./wvm -c test test6
jvm pc init at: 0x630510

main    ([Ljava/lang/String;)V
stack: 3    local : 5
code:
0x3 0x36 0x4 0x5 0xbc 0xa 0x4c 0x3 0x3e 0x1d 0x5 0xa2 0x0 0xd 0x2b 0x1d
0x1d 0x4f 0x84 0x3 0x1 0xa7 0xff 0xf4 0x6 0xbc 0xa 0x4d 0xb1
#local at: 0x630540    #stack at: 0x630554

[    1] iconst_0    pc: 0x630510 -> 0x3
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
[    2] istore    pc: 0x630511 -> 0x36
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
[    3] iconst_2    pc: 0x630513 -> 0x5
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x2 0x0 0x0
[    4] newarray    pc: 0x630514 -> 0xbc
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x2 0x0 0x0
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
[    5] astore_1    pc: 0x630516 -> 0x4c
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x0 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
[    6] iconst_0    pc: 0x630517 -> 0x3
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
[    7] istore_3    pc: 0x630518 -> 0x3e
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
[    8] iload_3    pc: 0x630519 -> 0x1d
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
[    9] iconst_2    pc: 0x63051a -> 0x5
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x2 0x0
[   10] if_icmpge    pc: 0x63051b -> 0xa2
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x2 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
[   11] aload_1    pc: 0x63051e -> 0x2b
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
[   12] iload_3    pc: 0x63051f -> 0x1d
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
[   13] iload_3    pc: 0x630520 -> 0x1d
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
[   14] iastore    pc: 0x630521 -> 0x4f
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
[   15] iinc    pc: 0x630522 -> 0x84
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x0 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
[   16] goto    pc: 0x630525 -> 0xa7
[   17] iload_3    pc: 0x630519 -> 0x1d
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x1 0x0 0x0
[   18] iconst_2    pc: 0x63051a -> 0x5
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x1 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x1 0x2 0x0
[   19] if_icmpge    pc: 0x63051b -> 0xa2
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x1 0x2 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x1 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x1 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
[   20] aload_1    pc: 0x63051e -> 0x2b
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x0 0x0
[   21] iload_3    pc: 0x63051f -> 0x1d
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x1 0x0
[   22] iload_3    pc: 0x630520 -> 0x1d
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x1 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x1 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x1 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x1 0x1
[   23] iastore    pc: 0x630521 -> 0x4f
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x1 0x1
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x1 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x1 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x627c20 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
[   24] iinc    pc: 0x630522 -> 0x84
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x1 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x0 0x0 0x0
[   25] goto    pc: 0x630525 -> 0xa7
[   26] iload_3    pc: 0x630519 -> 0x1d
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x2 0x0 0x0
[   27] iconst_2    pc: 0x63051a -> 0x5
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x2 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x2 0x2 0x0
[   28] if_icmpge    pc: 0x63051b -> 0xa2
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x2 0x2 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x2 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x2 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x0 0x0 0x0
[   29] iconst_3    pc: 0x630528 -> 0x6
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x3 0x0 0x0
[   30] newarray    pc: 0x630529 -> 0xbc
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x3 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x627c80 0x0 0x0
[   31] astore_2    pc: 0x63052b -> 0x4d
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x627c80 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x0 0x2 0x0     #stack: 0x0 0x0 0x0
#local: 0x0 0x627c20 0x627c80 0x2 0x0     #stack: 0x0 0x0 0x0
[   32] return    pc: 0x63052c -> 0xb1
#local: 0x0 0x627c20 0x627c80 0x2 0x0     #stack: 0x0 0x0 0x0
jvm stack depth is zero.
interpret bytecode done.

你可能感兴趣的:(jvm开发笔记)