学习destor(二)

继续do_delete.c

  1. 结构体GCHashEntry
/*
 * One garbage-collection work item: a container ID together with the
 * fingerprints of the chunks collected for that container.
 * Entries are stored as values in global_gc_HashTable.
 */
struct GCHashEntry{
    uint64_t cid;        /* container ID; the address of this field is used as the hash-table key */
    Queue* chunk_queue;  /* queue of heap-allocated fingerprint copies, one per collected chunk */
};
  1. 函数void destructor(gpointer ptr)
    gpointer是GLib库中的无类型指针(即void*)。destructor函数的作用是释放GCHashEntry中分配的空间(chunk队列及其中的指纹),并释放GCHashEntry本身。
/*
 * Value-destroy callback for global_gc_HashTable.
 * Releases the entry's fingerprint queue (every element is passed to free)
 * and then the GCHashEntry itself.
 */
void destructor(gpointer ptr){
    struct GCHashEntry* entry = ptr;

    queue_free(entry->chunk_queue, free);
    free(entry);
}
  1. 函数 static void* gether_fingerprint_for_deletion(void *arg)
    注意:global_gc_HashTable中存的是结构体GCHashEntry(GCHashEntry是value,chunkPointer中的id是key)
    首先创建一个哈希表global_gc_HashTable,然后不断从delete_recipe_queue中pop出chunkPointer c。然后在global_gc_HashTable中查找是否有c对应id的GCHashEntry。若没有,则新建一个GCHashEntry,对这个entry中的id赋值,把chunk对应的指纹压入chunk队列,最后将这个entry插入到global_gc_HashTable中。若有,则直接将chunk对应的指纹插入entry中的chunk队列中。
    这样就将delete_recipe_queue队列中的chunkPointer中对应的指纹全部按照id插入到对应的chunk队列中了。
/*
 * Thread routine: groups the fingerprints of the chunks being deleted by
 * container ID.
 *
 * Creates global_gc_HashTable (key: pointer to a container ID, value:
 * struct GCHashEntry, released through destructor) and then drains
 * delete_recipe_queue. CHUNK_FILE_START markers are skipped, a
 * CHUNK_FILE_END marker stops the loop, and for every other chunk a private
 * copy of its fingerprint is appended to the queue of the entry that
 * matches the chunk's container ID (the entry is created on first use).
 *
 * endFlag is raised on every exit path, including the case where the queue
 * returns NULL before a CHUNK_FILE_END marker is seen, so the thread that
 * polls endFlag cannot be left waiting forever.
 *
 * arg is unused. Always returns NULL.
 */
static void* gether_fingerprint_for_deletion(void *arg) {
    struct chunk* c;
    global_gc_HashTable = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, destructor);

    while ((c = sync_queue_pop(delete_recipe_queue)) != NULL) {
        if (CHECK_CHUNK(c, CHUNK_FILE_START)) {
            free_chunk(c);
            continue;
        }
        if (CHECK_CHUNK(c, CHUNK_FILE_END)) {
            free_chunk(c);
            break;
        }

        struct GCHashEntry* entry = g_hash_table_lookup(global_gc_HashTable, &c->id);
        if (entry == NULL) {
            /* First chunk seen for this container: create its entry.
             * The key points into the entry, so it lives as long as the value. */
            entry = malloc(sizeof(struct GCHashEntry));
            entry->cid = c->id;
            entry->chunk_queue = queue_new();
            g_hash_table_insert(global_gc_HashTable, &entry->cid, entry);
        }

        /* The chunk is freed below, so the queue keeps its own copy of the fingerprint. */
        fingerprint* fp = malloc(sizeof(fingerprint));
        memcpy(fp, &c->fp, sizeof(fingerprint));
        queue_push(entry->chunk_queue, fp);

        free_chunk(c);
    }

    endFlag = true;
    return NULL;
}
  1. 结构体metaEntry
    off表示chunk中的数据的偏移量
    len表示chunk的大小,fp表示指纹。
/*
 * Per-chunk record kept in a container's metadata map.
 */
struct metaEntry {
    int32_t off;     /* offset of the chunk's bytes, added to the container's data pointer when the chunk is read */
    int32_t len;     /* number of bytes the chunk occupies */
    fingerprint fp;  /* chunk fingerprint; its address is also used as the key in containerMeta.map */
};
  1. 函数 void chunk_filter(void* item, void* user_data)
    这个函数的作用是:若user_data中包含item,则将user_data中的item删除掉
/*
 * Queue-iteration callback: removes the fingerprint 'item' from the hash
 * table passed as 'user_data'.
 *
 * item      - key to drop (a fingerprint pointer).
 * user_data - the GHashTable to remove it from.
 *
 * A key that is not present is silently ignored.
 */
void chunk_filter(void* item, void* user_data){
    GHashTable* table = user_data;

    /* g_hash_table_remove() simply returns FALSE for an absent key, so a
     * separate g_hash_table_contains() lookup is not needed. */
    g_hash_table_remove(table, item);
}
  1. 函数 void chunk_migrate(gpointer key, gpointer value, gpointer user_data)
    我们首先需要看一下结构体container,其中包含containerMeta和data。而containerMeta中包含id、data_size、chunk_num,以及指纹到chunk偏移量的映射。new_chunk函数的作用是分配一个大小为size的chunk。
    函数的作用是:key对应指纹,value对应metaEntry,user_data对应container。将指纹和container中对应的chunk数据拷贝到新的chunk c中,并将c压入migrate_data_queue队列中。
/*
 * Metadata describing the contents of one container.
 */
struct containerMeta {
	/* Container ID. */
	containerid id;
	/* Size of the stored chunk data; container_overflow() checks it against
	 * CONTAINER_SIZE - CONTAINER_META_SIZE. */
	int32_t data_size;
	/* Number of chunks recorded in this container. */
	int32_t chunk_num;

	/* Map fingerprints to chunk offsets.
	 * Keys are fingerprint pointers, values are struct metaEntry records. */
	GHashTable *map;
};

/*
 * A container: its metadata plus a raw byte buffer.
 */
struct container {
	/* Identity, sizes and the fingerprint map. */
	struct containerMeta meta;
	/* Raw container bytes; chunk payloads are addressed as data + metaEntry.off.
	 * retrieve_container_by_id() leaves this 0 in the simulation modes it checks for. */
	unsigned char *data;
};
/*
 * Allocate and initialise a chunk.
 *
 * The new chunk is flagged CHUNK_UNIQUE, carries TEMPORARY_ID, has an
 * all-zero fingerprint and records 'size' as its size. A payload buffer of
 * 'size' bytes is allocated only when 'size' is positive; otherwise the
 * data pointer is NULL.
 *
 * Returns the newly allocated chunk.
 */
struct chunk* new_chunk(int32_t size) {
	struct chunk* fresh = (struct chunk*) malloc(sizeof(struct chunk));

	fresh->flag = CHUNK_UNIQUE;
	fresh->id = TEMPORARY_ID;
	fresh->size = size;
	memset(&fresh->fp, 0x0, sizeof(fingerprint));

	/* Marker chunks (size <= 0) carry no payload. */
	fresh->data = (size > 0) ? malloc(size) : NULL;

	return fresh;
}
/*
 * Hash-table iteration callback: copies one chunk out of a container and
 * queues it for migration.
 *
 * key       - fingerprint of the chunk.
 * value     - struct metaEntry giving the chunk's offset and length.
 * user_data - struct container whose data buffer holds the chunk bytes.
 *
 * Builds a new chunk carrying the fingerprint and a copy of the bytes,
 * updates migrate_counter / migrate_size, and pushes the chunk onto
 * migrate_data_queue.
 */
void chunk_migrate(gpointer key, gpointer value, gpointer user_data){
    struct metaEntry* me = (struct metaEntry*)value;
    struct container* src = (struct container*)user_data;

    struct chunk* moved = new_chunk(me->len);
    moved->size = me->len;
    memcpy(&moved->fp, key, sizeof(fingerprint));
    memcpy(moved->data, src->data + me->off, me->len);

    migrate_counter++;
    migrate_size += moved->size;

    sync_queue_push(migrate_data_queue, moved);
}
  1. 函数 void read_container_filter(gpointer key, gpointer value, gpointer user_data)
    这个函数的作用:把value指向的entry找到,然后找到entry所在的container,然后利用函数container_meta_foreach,删除htable中container中meta中map的指纹所对应的container id(删除的是htable中指纹对应的container id,是container中meta中map的指纹对应的container id)。
    然后利用函数queue_foreach,在container中meta中map的指纹中,删除entry中指向的指纹队列中的指纹。(删除container中的指纹,但是只有指纹队列中有的指纹才删除。)
    疑问:为什么container id删除的时候是container中meta中map的指纹,但是删除指纹的时候,却是entry中指向的指纹队列中的指纹
    然后利用函数 g_hash_table_foreach来将container中剩下的meta中map的指纹合并为新的chunk,并将这些chunk放入migrate_data_queue中,然后删除这个container。

    所以,这函数的主要目的就是,找到包含value的container,然后先在htable中删除container的id,然后删除value中包含的指纹,然后将container中剩下的指纹对应的chunk放入migrate_data_queue队列中,最后删除这个container。

    先看函数retrieve_container_by_id。这个函数的作用是根据container的id找到对应的container。方法是。。。太复杂了明天再看。

/*
 * Load the container with the given ID from the container file 'fp' and
 * rebuild its metadata map.
 *
 * Depending on destor.simulation_level either only the metadata region or
 * the whole container is read. The metadata is then deserialized in this
 * order: id (int64), chunk_num (int32), data_size (int32), followed by
 * chunk_num records of fingerprint, len (int32) and off (int32). Each record
 * becomes a heap-allocated struct metaEntry inserted into c->meta.map, keyed
 * by the address of its own fingerprint.
 *
 * Returns a newly allocated container. In the metadata-only mode the
 * temporary buffer is released and c->data is 0 on return.
 *
 * NOTE(review): the fread() results are not checked, so a short read goes
 * unnoticed until the ID comparison below.
 */
struct container* retrieve_container_by_id(containerid id) {
	struct container *c = (struct container*) malloc(sizeof(struct container));

	init_container_meta(&c->meta);

	/* Will point at the start of the serialized metadata. */
	unsigned char *cur = 0;
	if (destor.simulation_level >= SIMULATION_RESTORE) {
		/* Metadata-only mode: read just CONTAINER_META_SIZE bytes. */
		c->data = malloc(CONTAINER_META_SIZE);

		/* 'fp' is shared, so seek + read happen under the mutex. */
		pthread_mutex_lock(&mutex);

		/* The "+ 8" presumably skips a fixed header at the start of the file — TODO confirm. */
		if (destor.simulation_level >= SIMULATION_APPEND)
			/* Only metadata blocks are laid out back to back in this mode. */
			fseek(fp, id * CONTAINER_META_SIZE + 8, SEEK_SET);
		else
			/* Metadata sits in the last CONTAINER_META_SIZE bytes of the container slot. */
			fseek(fp, (id + 1) * CONTAINER_SIZE - CONTAINER_META_SIZE + 8,
			SEEK_SET);

		fread(c->data, CONTAINER_META_SIZE, 1, fp);

		pthread_mutex_unlock(&mutex);

		cur = c->data;
	} else {
		/* Full mode: read the entire container slot. */
		c->data = malloc(CONTAINER_SIZE);

		pthread_mutex_lock(&mutex);

		fseek(fp, id * CONTAINER_SIZE + 8, SEEK_SET);
		fread(c->data, CONTAINER_SIZE, 1, fp);

		pthread_mutex_unlock(&mutex);

		/* Metadata occupies the tail of the container buffer. */
		cur = &c->data[CONTAINER_SIZE - CONTAINER_META_SIZE];
	}

	unser_declare;
	unser_begin(cur, CONTAINER_META_SIZE);

	/* Fixed header: id, chunk count, data size. */
	unser_int64(c->meta.id);
	unser_int32(c->meta.chunk_num);
	unser_int32(c->meta.data_size);

	/* Sanity check: the stored ID must match the requested one. */
	if(c->meta.id != id){
		/* NOTE(review): "%lld" assumes containerid is long long-compatible — confirm. */
		WARNING("expect %lld, but read %lld", id, c->meta.id);
		assert(c->meta.id == id);
	}

	int i;
	for (i = 0; i < c->meta.chunk_num; i++) {
		struct metaEntry* me = (struct metaEntry*) malloc(
				sizeof(struct metaEntry));
		/* Per-chunk record order: fingerprint, length, offset. */
		unser_bytes(&me->fp, sizeof(fingerprint));
		unser_bytes(&me->len, sizeof(int32_t));
		unser_bytes(&me->off, sizeof(int32_t));
		/* The key points into the entry itself. */
		g_hash_table_insert(c->meta.map, &me->fp, me);
	}

	unser_end(cur, CONTAINER_META_SIZE);

	if (destor.simulation_level >= SIMULATION_RESTORE) {
		/* The buffer only held metadata; drop it so the caller sees no data. */
		free(c->data);
		c->data = 0;
	}

	return c;
}

函数 void container_meta_foreach(struct containerMeta* cm, void (*func)(fingerprint*, void*), void* data)
遍历container中的指纹,在每个指纹上应用函数func。

/*
 * Visit every fingerprint recorded in the container metadata.
 * 'func' is called once per fingerprint key of cm->map; 'data' is forwarded
 * unchanged as its second argument.
 */
void container_meta_foreach(struct containerMeta* cm, void (*func)(fingerprint*, void*), void* data){
	GHashTableIter it;
	gpointer fp_key, entry;

	for (g_hash_table_iter_init(&it, cm->map);
			g_hash_table_iter_next(&it, &fp_key, &entry); ) {
		func(fp_key, data);
	}
}


/*
 * Call 'func' on every element of 'queue', from the first element onward.
 * Each call receives the element's data pointer and 'user_data'.
 * A queue whose elem_num is 0 is left untouched.
 */
void queue_foreach(Queue *queue, void (*func)(void *data, void *user_data),
		void *user_data) {
	if (queue->elem_num == 0)
		return;

	for (queue_ele_t *node = queue->first; node; node = node->next)
		func(node->data, user_data);
}
/*
 * Hash-table iteration callback run once per GCHashEntry.
 *
 * key       - unused.
 * value     - the struct GCHashEntry (container ID + fingerprints to drop).
 * user_data - unused.
 *
 * Steps, in order:
 *   1. load the container named by the entry and report its statistics;
 *   2. apply delete_an_entry to every fingerprint in the container's
 *      metadata, passing the container ID as context;
 *   3. remove the entry's queued fingerprints from the container's map;
 *   4. hand every fingerprint still in the map to chunk_migrate;
 *   5. free the container.
 */
void read_container_filter(gpointer key, gpointer value, gpointer user_data){
    struct GCHashEntry* entry = (struct GCHashEntry*)value;
    struct container* loaded = retrieve_container_by_id(entry->cid);

    printf("container %lu, total chunk %d, drop chunk %d\n", entry->cid, loaded->meta.chunk_num, entry->chunk_queue->elem_num);
    con_counter++;

    /* Every fingerprint of this container is passed to delete_an_entry. */
    container_meta_foreach(&loaded->meta, delete_an_entry, &entry->cid);

    /* Drop the deleted chunks from the map; what remains is migrated. */
    queue_foreach(entry->chunk_queue, chunk_filter, loaded->meta.map);
    g_hash_table_foreach(loaded->meta.map, chunk_migrate, loaded);

    free_container(loaded);
}
  1. 函数 static void* load_container_for_deletion(void *arg)
    函数的作用:
    global_gc_HashTable里面是gcHashEntry
    就是这个样子的
/* Value type stored in global_gc_HashTable: one entry per container. */
struct GCHashEntry{
    uint64_t cid;        /* ID of the container this entry refers to */
    Queue* chunk_queue;  /* fingerprints of the chunks to drop from that container */
};

函数首先创建一个起始的chunk放入migrate_data_queue中,然后遍历 global_gc_hashTable,对每一个GCHashEntry都使用函数read_container_filter。
相当于是删除了包含有GCHashEntry中包含的指纹的container,然后将container中剩下的指纹对应的chunk放入migrate_data_queue队列中。
最后放入一个结束的chunk放入migrate_data_queue中。

/*
 * Thread routine: feeds migrate_data_queue with the chunks that survive
 * the deletion.
 *
 * Pushes a CHUNK_FILE_START marker, resets the con/migrate counters, runs
 * read_container_filter over every entry of global_gc_HashTable, prints a
 * summary line, and finally pushes a CHUNK_FILE_END marker.
 *
 * arg is unused. Always returns NULL.
 */
static void* load_container_for_deletion(void *arg) {
    struct chunk* marker = new_chunk(0);
    SET_CHUNK(marker, CHUNK_FILE_START);
    sync_queue_push(migrate_data_queue, marker);

    con_counter = 0;
    migrate_counter = 0;
    migrate_size = 0;

    g_hash_table_foreach(global_gc_HashTable, read_container_filter, NULL);
    printf("%lu containers involved, %lu chunks (%lu bytes) migrated\n", con_counter, migrate_counter, migrate_size);

    marker = new_chunk(0);
    SET_CHUNK(marker, CHUNK_FILE_END);
    sync_queue_push(migrate_data_queue, marker);

    return NULL;
}
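  1. 函数 static void* write_container_for_deletion(void *arg)
    函数的作用:从migrate_data_queue中pop出chunk。当该chunk是CHUNK_FILE_START时,创建新的container和gseq;当该chunk是CHUNK_FILE_END时,把当前container写出并更新索引,然后结束;否则把chunk加入当前container(container放不下时先写出当前container,再新建一个)。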
    gseq是什么?也可以理解为容器,每次append进chunk
    feature是什么?
    sampling函数是干什么的?
    函数 int container_overflow(struct container* c, int32_t size) 可以判断container中是否还能放下size大小的chunk。
/*
 * Register a set of features under a container/segment ID.
 * With physical locality this is invoked per written container; with
 * logical locality, per written segment.
 *
 * Every key of 'features' is sent to kvstore_update together with 'id', and
 * index_overhead.update_requests is bumped once per key.
 */
void index_update(GHashTable *features, int64_t id){
    VERBOSE("Filter phase: update %d features", g_hash_table_size(features));

    GHashTableIter it;
    gpointer feature;

    /* Only the keys matter here, so no value slot is requested. */
    for (g_hash_table_iter_init(&it, features);
            g_hash_table_iter_next(&it, &feature, NULL); ) {
        index_overhead.update_requests++;
        kvstore_update(feature, id);
    }
}

/*
 * Report whether adding a chunk of 'size' bytes would overflow container 'c'.
 *
 * Returns 1 when either the data region or the metadata region would be
 * exceeded, 0 when the chunk still fits.
 */
int container_overflow(struct container* c, int32_t size) {
	/* Data region: everything except the trailing metadata area. */
	const int data_full =
			c->meta.data_size + size > CONTAINER_SIZE - CONTAINER_META_SIZE;
	if (data_full)
		return 1;

	/*
	 * Metadata region: 28 is the size of metaEntry; the extra 16 is
	 * presumably the fixed metadata header — TODO confirm.
	 */
	return ((c->meta.chunk_num + 1) * 28 + 16 > CONTAINER_META_SIZE) ? 1 : 0;
}
/*
 * Queue container 'c' for writing by pushing it onto container_buffer.
 *
 * An empty container is refused instead: container_count is decremented,
 * a message is logged, and nothing is queued.
 * The chunk count must agree with the size of the metadata map (asserted).
 */
void write_container_async(struct container* c) {
	assert(c->meta.chunk_num == g_hash_table_size(c->meta.map));

	if (!container_empty(c)) {
		sync_queue_push(container_buffer, c);
		return;
	}

	/* An empty container
	 * It possibly occurs in the end of backup */
	container_count--;
	VERBOSE("Append phase: Deny writing an empty container %lld",
			c->meta.id);
}
/*
 * Hand a filled container to the writer and publish its chunks to the index.
 *
 * con       - container to queue via write_container_async().
 * gseq      - the chunks that were added to 'con'; freed here.
 * seq_count - number of chunks in 'gseq'.
 *
 * The feature table returned by sampling() is extended with a private copy
 * of every chunk fingerprint, passed to index_update(), and then destroyed.
 */
static void flush_migrated_container(struct container* con, GSequence* gseq, int32_t seq_count) {
    write_container_async(con);

    GHashTable* features = sampling(gseq, seq_count);
    GSequenceIter* iter;
    for (iter = g_sequence_get_begin_iter(gseq);
            !g_sequence_iter_is_end(iter);
            iter = g_sequence_iter_next(iter)) {
        struct chunk* ck = g_sequence_get(iter);
        fingerprint *ft = malloc(sizeof(fingerprint));
        memcpy(ft, &ck->fp, sizeof(fingerprint));
        g_hash_table_insert(features, ft, NULL);
    }

    /* NOTE(review): the chunk count is passed where index_update() expects a
     * container/segment ID; kept as in the original — confirm this is intended. */
    index_update(features, seq_count);

    g_sequence_free(gseq);
    g_hash_table_destroy(features);
}

/*
 * Thread routine: packs migrated chunks into new containers.
 *
 * Drains migrate_data_queue:
 *   - CHUNK_FILE_START: open a fresh container and chunk sequence;
 *   - CHUNK_FILE_END:   flush the current container if it holds any chunk,
 *                       sync the container store, and stop;
 *   - any other chunk:  flush and reopen first if the chunk would overflow
 *                       the current container, then add the chunk.
 *
 * endFlag is raised on every exit path, including a NULL pop from the queue,
 * so the thread that polls endFlag cannot be left waiting forever.
 *
 * arg is unused. Always returns NULL.
 */
static void* write_container_for_deletion(void *arg) {
    struct chunk* c;
    struct container* con = NULL;
    GSequence* gseq = NULL;
    int32_t seq_count = 0;

    while ((c = sync_queue_pop(migrate_data_queue)) != NULL) {
        if (CHECK_CHUNK(c, CHUNK_FILE_START)) {
            con = create_container();
            gseq = g_sequence_new(free_chunk);
            seq_count = 0;

            free_chunk(c);
            continue;
        }
        if (CHECK_CHUNK(c, CHUNK_FILE_END)) {
            free_chunk(c);
            if (seq_count > 0) {
                flush_migrated_container(con, gseq, seq_count);
                container_store_sync();
            } else if (gseq != NULL) {
                /* Nothing was migrated: release the empty sequence.
                 * NOTE(review): the unused container 'con' is left as in the
                 * original; decide whether it should be released here. */
                g_sequence_free(gseq);
            }
            break;
        }

        if (container_overflow(con, c->size)) {
            flush_migrated_container(con, gseq, seq_count);

            seq_count = 0;
            gseq = g_sequence_new(free_chunk);
            con = create_container();
        }

        add_chunk_to_container(con, c);
        g_sequence_append(gseq, c);
        seq_count++;
    }

    endFlag = true;
    return NULL;
}

endFlag用来通知do_delete当前阶段已完成:第一阶段由gether_fingerprint_for_deletion置位,第二阶段由write_container_for_deletion置位。

/*
 * Delete backup job 'jobid' and reclaim the space it occupied.
 *
 * We assume a FIFO order of deleting backup, namely the oldest backup is deleted first.
 *
 * Phases:
 *   1. read the job's recipe and group the fingerprints to delete by
 *      container (read_recipe_for_deletion + gether_fingerprint_for_deletion);
 *   2. load the affected containers and rewrite the surviving chunks
 *      (load_container_for_deletion + write_container_for_deletion);
 *   3. remove the now-invalid entries from the key-value store, according
 *      to destor.index_category[1];
 *   4. append a line to delete.log and record the invalid container IDs in
 *      recipes/delete_<jobid>.id under the working directory.
 *
 * Exits the process with status 1 when the index category is unknown.
 *
 * NOTE(review): endFlag is polled without synchronization and the worker
 * threads are never joined; 'backupVersion' and 'didfilepath' are not
 * released. These are left as in the original.
 */
void do_delete(int jobid) {

	invalid_containers = trunc_manifest(jobid);

	init_index();
	init_recipe_store();
	init_container_store();

	struct backupVersion* backupVersion = open_backup_version(jobid);

	/* Phase 1: collect the fingerprints to delete. */
	delete_recipe_queue = sync_queue_new(100);
	pthread_t read_t, build_t, load_t, write_t;
	endFlag = false;
	pthread_create(&read_t, NULL, read_recipe_for_deletion, backupVersion);
	pthread_create(&build_t, NULL, gether_fingerprint_for_deletion, NULL);
	do{
		usleep(100);
	}while(!endFlag);

	/* Phase 2: migrate the surviving chunks into new containers. */
	endFlag = false;
	migrate_data_queue = sync_queue_new(100);
	pthread_create(&load_t, NULL, load_container_for_deletion, NULL);
	pthread_create(&write_t, NULL, write_container_for_deletion, NULL);
	do{
		usleep(100);
	}while(!endFlag);

	/* Delete the invalid entries in the key-value store */
	if(destor.index_category[1] == INDEX_CATEGORY_PHYSICAL_LOCALITY){

		struct backupVersion* bv = open_backup_version(jobid);

		/* The entries pointing to Invalid Containers are invalid. */
		GHashTableIter iter;
		gpointer key, value;
		g_hash_table_iter_init(&iter, invalid_containers);
		while(g_hash_table_iter_next(&iter, &key, &value)){
			containerid id = *(containerid*)key;
			NOTICE("Reclaim container %lld", id);
			struct containerMeta* cm = retrieve_container_meta_by_id(id);

			container_meta_foreach(cm, delete_an_entry, &id);

			free_container_meta(cm);
		}

		bv->deleted = 1;
		update_backup_version(bv);
		free_backup_version(bv);

	}else if(destor.index_category[1] == INDEX_CATEGORY_LOGICAL_LOCALITY){
		/* Ideally, the entries pointing to segments in backup versions of a 'bv_num' less than 'jobid' are invalid. */
		/* (For simplicity) Since a FIFO order is given, we only need to remove the IDs exactly matched 'bv_num'. */
		struct backupVersion* bv = open_backup_version(jobid);

		struct segmentRecipe* sr;
		while((sr=read_next_segment(bv))){
			segment_recipe_foreach(sr, delete_an_entry, &sr->id);
		}

		bv->deleted = 1;
		update_backup_version(bv);
		free_backup_version(bv);

	}else{
		WARNING("Invalid index type");
		exit(1);
	}

	close_container_store();
	close_recipe_store();
	close_index();

	char logfile[] = "delete.log";
	FILE *fp = fopen(logfile, "a");
	if(fp){
		/*
		 * ID of the job we delete,
		 * number of live containers,
		 * memory footprint
		 */
		fprintf(fp, "%d %d %d\n",
				jobid,
				destor.live_container_num,
				destor.index_memory_footprint);

		fclose(fp);
	}else{
		/* The log is best-effort: report the failure instead of writing through a NULL stream. */
		WARNING("Can not open %s", logfile);
	}

	/* record the IDs of invalid containers */
	sds didfilepath = sdsdup(destor.working_directory);
	char s[128];
	snprintf(s, sizeof(s), "recipes/delete_%d.id", jobid);
	didfilepath = sdscat(didfilepath, s);

	FILE*  didfile = fopen(didfilepath, "w");
	if(didfile){
		GHashTableIter iter;
		gpointer key, value;
		g_hash_table_iter_init(&iter, invalid_containers);
		while(g_hash_table_iter_next(&iter, &key, &value)){
			containerid id = *(containerid*)key;
			fprintf(didfile, "%lld\n", id);
		}

		fclose(didfile);
	}


	g_hash_table_destroy(invalid_containers);
}

你可能感兴趣的:(数据去重)