1. 任务:
编写csim.c,模拟cache的命中、不命中与牺牲行。需要牺牲行时,用LRU替换策略进行替换(LRU:最近最少使用)
2. 检测方法:
name | use |
---|---|
I | 指令加载,前无空格 |
L | 数据加载,有空格 |
S | 数据存储,有空格 |
M | 数据修改(数据存储之后的 加载)有空格 |
地址字段指定一个32位的十六进制存储器地址。大小字段指定操作访问的字节数。
对于‘I’指定地操作,我们不需要考虑。
对于‘L’以及‘S’指定的操作,我们简单地可以认为这两个操作都是对某一个地址寄存器进行访问(读取或者存入数据)
对于‘M’指定的操作,可以看作是对于同一地址连续进行‘L‘和’S‘操作。(两次访问)
3. 参数意义:
name | use |
---|---|
-h | 输出帮助信息的选项 |
-v | 输出详细运行过程信息的选项; |
-s | 组索引的位数(意味着组数S= 2 s 2^s 2s) |
-E | 每一组包含的行数; |
-b | 偏移位的宽度(意味着块的大小为B= 2 b 2^b 2b) |
-t | 输入数据文件的路径(测试数据从该文件里面读取) |
4. 编写程序
typedef struct{
int valid;//有效位
long tag;//标记位
long time_tamp;//时间戳
}cache_line;
typedef cache_line* cache;
static void allocate_all_cache_line(cache* cache_p, int s, int e, int b) {
//首先分配出s组出来,每个组也就是
// bug 1 索引位数应该换算成实际组数
int bigs = 1 << s;
*cache_p = (cache_line*)(malloc(sizeof(cache_line) * bigs * e));
cache cache = *cache_p;
for(int i = 0; i < bigs * e;++i){
cache[i].valid = 0;
cache[i].tag = 0;
cache[i].time_tamp = 0;
}
}
static void free_cache(cache* cache_p, int e, int s, int b){
if(*cache_p == NULL)
return;
free(*cache_p);
}
/*
判断是否命中,首先提取出组索引,然后在组内寻找符合的标记位,同时检查有效位
*/
static void handle(cache* cache_p,char op, long addr, int size, int e, int s ,int b, int* hit_p, int* miss_p, int* evic_p, int current_time_tamp, int vflag){
//首先,先新建一个局部变量来引用cache
cache cache = *cache_p;
//解析下addr,分离出组索引,标记位
//也就是前面学习的位级操作
long co = ((0x1 << b) - 1) & addr;//块偏移
long ci = ((0x1 << s) - 1 ) & (addr >> b);//组索引
long ct = ((~((0x1 << (b + s)) - 1)) & addr) >> (b + s);//标记位
//test
printf("ct:%lx\tci:%lx\tco:%lx\t\n",ct, ci, co);
//将co,ci,ct转换成数组索引的形式
int sindex = ci * e;
int i;
//在组内遍历寻找有对应标记的缓存行
for(i = sindex;i < sindex + e; i++){
if(cache[i].tag == ct && cache[i].valid == 1){
break;
}
}
//根据索引判断是否命中,并且分情况处理
if(i != sindex + e){
//命中情况
//题意中指出不会出现这种情况,保险起见我还是加上了
if(co + size > (1 << b)){
printf("这里!");
}
//judge size
++(*hit_p);
if(vflag)
printf("hit ");
}
else{
//不命中
if(vflag)
printf("miss ");
++(*miss_p);
cache_line* empty_line;
cache_line* replace_line;
//寻找有无空行
for(i = sindex;i < sindex + e; i++){
if(cache[i].valid == 0){
empty_line = &(cache[i]);
break;
}
}
//判断是否有空行
if(i != sindex + e){
replace_line = empty_line;
}
else{
for(replace_line = &(cache[sindex]),i = sindex;i < sindex + e;++i){
if(cache[i].time_tamp < (replace_line->time_tamp)){
replace_line = &(cache[i]);
}
}
++(*evic_p);
if(vflag)
printf("eviction ");
}
replace_line->valid = 1;
replace_line->tag = ct;
replace_line->time_tamp = current_time_tamp;
}
//test
printf("cnt:%d\n",current_time_tamp);
for(int j = 0;j < e*(1 << s); ++j){
printf("%d",j);
printf("\tvalid:%d \ttag:%lx \ttime_tamp:%ld\n",cache[j].valid, cache[j].tag, cache[j].time_tamp);
}
//printf("\thandle return!");
}
static void get_opt(int argc, char** argv, char* const optstr, int* ep, int* sp, int* bp, char** file, int* vp){
int c;
opterr = 0;
while((c = getopt(argc, argv, optstr)) != -1){
switch(c){
case 'v':
*vp = 1;
break;
case 's':
//注意这里返回的是字符串,所以需要atoi函数完成转换
*sp = atoi(optarg);
//printf("S = %d\n",*sp);
break;
case 'E':
*ep = atoi(optarg);
//printf("E = %d\n",*ep);
break;
case 'b':
*bp = atoi(optarg);
//printf("b = %d\n",*bp);
break;
case 't':
*file = optarg;
//printf("file:%s\n", *file);
break;
default:
abort();
}
}
}
//逐行读取文件,根据高速缓冲原理,完成计数
static void read_file_and_excute(cache* cache_p, char* file, int* hitp, int* missp, int* evicp, int e, int s ,int b, int vflag){
//初始化变量
cache cache = *cache_p;
char op;
long addr;
int size;
FILE* my_stream;
//第一部分,使用io函数,打开文件
my_stream = fopen(file, "r");
if(my_stream == NULL){
printf("找不到文件:%s\n",file);
fflush(stdout);
exit(0);
}
long cnt = 1;
//fscanf函数返回读到文件结尾会返回null
while(!feof(my_stream)){
int tr = fscanf(my_stream, "%c %lx,%x", &op, &addr, &size);
if(tr != 3){
continue;
}
else{
if(op != 'I'){
if(vflag)
printf("%c %lx,%x ",op,addr,size);
switch(op){
case 'L':
handle(&cache, op, addr, size, e ,s ,b, hitp, missp, evicp, cnt++, vflag);
break;
case 'S':
handle(&cache, op, addr, size, e ,s ,b, hitp, missp, evicp, cnt++, vflag);
break;
case 'M':
//need twice
handle(&cache, op, addr, size, e ,s ,b, hitp, missp, evicp, cnt++, vflag);
handle(&cache, op, addr, size, e ,s ,b, hitp, missp, evicp, cnt++, vflag);
break;
}
if(vflag){
printf("\n");
fflush(stdout);
}
}
}
}
fclose(my_stream);
}
int main(int argc, char** argv)
{
//初始化变量
int hit = 0;
int miss = 0;
int evictions = 0;
char* const optstr = "hvs:E:b:t:";
int e = -1;
int s = -1;
int b = -1;
int vflag = 0;
char* file = NULL;
cache cache = NULL;
get_opt(argc, argv, optstr, &e, &s, &b, &file, &vflag);
allocate_all_cache_line(&cache, s, e, b);
read_file_and_excute(&cache, file, &hit, &miss, &evictions, e ,s ,b, vflag);
free_cache(&cache, e, s, b);
printSummary(hit, miss, evictions);
return 0;
}
优化一个矩阵转置函数,使得cache miss尽可能少。cache规模为:32组,直接映射,每行32字节数据。
1. 任务
编写一个实现矩阵转置的函数。并且使函数调用过程中对cache的不命中数miss尽可能少。在如下函数里面编写最终代码:
char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N]);
32×32:要求miss次数在300以下。
应用矩阵分块的思想。
冲突不命中实际上就是在访问同一个块中的两个元素的时候,由于中间访问了其它的块,导致已经加载的块被驱逐,进而第二次访问时不命中。基于这个原因,我们可以一次性访问同一个块中的多个元素,访问完以后便不再需要访问这个块了,从而可以大大地减少冲突不命中的数目。这里前8个元素在同一个块中,我们可以直接将这8个元素取出来,然后这8个元素所在的块便不再需要访问了。
考虑8x8分块:
void trans_1(int M, int N, int A[N][M], int B[M][N])
{
int ii, jj, i, val1, val2, val3, val4, val5, val6, val7, val0;
for(jj = 0; jj < 32; jj += 8)
for(ii = 0; ii < 32; ii += 8)
{
for(i = ii; i < ii + 8; i++)
{
val0 = A[i][jj];
val1 = A[i][jj + 1];
val2 = A[i][jj + 2];
val3 = A[i][jj + 3];
val4 = A[i][jj + 4];
val5 = A[i][jj + 5];
val6 = A[i][jj + 6];
val7 = A[i][jj + 7];
B[jj][i] = val0;
B[jj + 1][i] = val1;
B[jj + 2][i] = val2;
B[jj + 3][i] = val3;
B[jj + 4][i] = val4;
B[jj + 5][i] = val5;
B[jj + 6][i] = val6;
B[jj + 7][i] = val7;
}
}
}
64x64
8x8分块并且在b中先做变换
void trans_2(int M, int N, int A[N][M], int B[M][N])
{
int i, j, ii, jj, val0, val1, val2, val3, val4, val5, val6, val7;
for(ii = 0; ii < N; ii += 8)
{
for(jj = 0; jj < M; jj += 8)
{
//For each row in the 8*4 block
for(i = 0; i < 4; i++)
{
val0 = A[ii + i][jj + 0];
val1 = A[ii + i][jj + 1];
val2 = A[ii + i][jj + 2];
val3 = A[ii + i][jj + 3];
val4 = A[ii + i][jj + 4];
val5 = A[ii + i][jj + 5];
val6 = A[ii + i][jj + 6];
val7 = A[ii + i][jj + 7];
B[jj + 0][ii + i] = val0;
B[jj + 1][ii + i] = val1;
B[jj + 2][ii + i] = val2;
B[jj + 3][ii + i] = val3;
B[jj + 0][ii + 4 + i] = val4;
B[jj + 1][ii + 4 + i] = val5;
B[jj + 2][ii + 4 + i] = val6;
B[jj + 3][ii + 4 + i] = val7;
}
//First copy the first 4 rows
for(i = 0; i < 4; i++)//Do the fantastic transformation!
{
//get this row of the right-upper 4*4 block
val0 = B[jj + i][ii + 4];
val1 = B[jj + i][ii + 5];
val2 = B[jj + i][ii + 6];
val3 = B[jj + i][ii + 7];
//update this row to its correct value
val4 = A[ii + 4][jj + i];
val5 = A[ii + 5][jj + i];
val6 = A[ii + 6][jj + i];
val7 = A[ii + 7][jj + i];
B[jj + i][ii + 4] = val4;
B[jj + i][ii + 5] = val5;
B[jj + i][ii + 6] = val6;
B[jj + i][ii + 7] = val7;
//update the left lower 4*4 block of B
B[jj + 4 + i][ii + 0] = val0;
B[jj + 4 + i][ii + 1] = val1;
B[jj + 4 + i][ii + 2] = val2;
B[jj + 4 + i][ii + 3] = val3;
}
//update the right lower 4*4 block
for(i = 4; i < 8; i++)
for(j = 4; j < 8; j++)
B[jj + j][ii + i] = A[ii + i][jj + j];
}
}
}
61x67
16x16分块
void trans_3(int M, int N, int A[N][M], int B[M][N])
{
int ii, jj, i, j, val0, val1, val2, val3, val4, val5, val6, val7;
for(ii = 0; ii + 16 < N; ii += 16)
for(jj = 0; jj + 16 < M; jj += 16)
{
for(i = ii; i < ii + 16; i++)
{
val0 = A[i][jj + 0];
val1 = A[i][jj + 1];
val2 = A[i][jj + 2];
val3 = A[i][jj + 3];
val4 = A[i][jj + 4];
val5 = A[i][jj + 5];
val6 = A[i][jj + 6];
val7 = A[i][jj + 7];
B[jj + 0][i] = val0;
B[jj + 1][i] = val1;
B[jj + 2][i] = val2;
B[jj + 3][i] = val3;
B[jj + 4][i] = val4;
B[jj + 5][i] = val5;
B[jj + 6][i] = val6;
B[jj + 7][i] = val7;
val0 = A[i][jj + 8];
val1 = A[i][jj + 9];
val2 = A[i][jj + 10];
val3 = A[i][jj + 11];
val4 = A[i][jj + 12];
val5 = A[i][jj + 13];
val6 = A[i][jj + 14];
val7 = A[i][jj + 15];
B[jj + 8][i] = val0;
B[jj + 9][i] = val1;
B[jj + 10][i] = val2;
B[jj + 11][i] = val3;
B[jj + 12][i] = val4;
B[jj + 13][i] = val5;
B[jj + 14][i] = val6;
B[jj + 15][i] = val7;
}
}
for(i = ii; i < N; i++)
for(j = 0; j < M; j++)
B[j][i] = A[i][j];
for(i = 0; i < ii; i++)
for(j = jj; j < M; j++)
B[j][i] = A[i][j];
}