// One line of the simulated cache. Only the bookkeeping fields are kept;
// no block data is stored.
struct cache_line
{
int tag; // tag bits of the block held in this line
int LRU_counter; // time tick of the most recent access to this line
};
struct cache_line **cache; // indexed as cache[set][line]
int hit_count, miss_count, eviction_count; // totals handed to printSummary
int *cache_index; // per set: how many lines have been filled so far
int s, E, b, S; // values of -s, -E, -b; S is computed as 2^s
int count = 0; // time tick, advanced once per simulated access
int opt; // option character returned by getopt
char *trace_name; // path of the trace file
// Read the command-line options: -s, -E, -b take integers, -t takes the trace path.
while ((opt = getopt(argc, argv, "s:E:b:t:")) != -1)
{
switch (opt)
{
case 's':
s = atoi(optarg); // read s
break;
case 'E':
E = atoi(optarg); // read E
break;
case 'b':
b = atoi(optarg); // read b
break;
case 't':
trace_name = optarg; // read the trace file path
// no break here: control falls through into default, which only breaks
default:
break;
}
}
//初始化
S = pow(2, s); //计算S=2^s
cache = (struct cache_line **)malloc(sizeof(struct cache_line *) * S);
for (int i = 0; i < S; i++)
cache[i] = (struct cache_line *)malloc(sizeof(struct cache_line *) * E); //cache开辟内存
for (int i = 0; i < S; i++)
for (int j = 0; j < E; j++)
cache[i][j].tag = cache[i][j].LRU_counter = 0; //cache初始化
cache_index = (int *)malloc(sizeof(int) * S); //cache_index开辟内存
memset(cache_index, 0, sizeof(int) * S); //cache_index初始化
//文件读入
FILE *pFile = fopen(trace_name, "r"); //打开文件
char identifier;
unsigned address;
int size;
while (fscanf(pFile, " %c %x,%d", &identifier, &address, &size) > 0) //按行读入
{
if (identifier == 'I') //忽略I模式
continue;
int t_address = address / ((int)pow(2, s + b)); //计算标记位
int s_address = address / ((int)pow(2, b)) % (int)(pow(2, s)); //计算索引位
if (identifier == 'M') //M模式两次solve
{
Solve(t_address, s_address);
Solve(t_address, s_address);
}
else //S模式或L模式
Solve(t_address, s_address);
}
//结束
free(cache);
free(cache_index); //释放内存
fclose(pFile); //关闭文件
printSummary(hit_count, miss_count, eviction_count); //输出
前提:同一个组内,在组未满的状态下,有效块按装入的先后顺序从第0行开始依次存放,并使用cache_index来记录该组当前已占用的行数(即下一个空闲行的下标)
int find_flag = 0; // set once the tag is found in the set
// Only the first cache_index[s_address] lines of the set hold valid blocks.
for (int i = 0; i < cache_index[s_address]; i++)
{
    if (cache[s_address][i].tag == t_address) // hit
    {
        ++hit_count;
        cache[s_address][i].LRU_counter = count;
        find_flag = 1;
        break;
    }
}
// Miss handling must be skipped after a hit; without this guard every hit
// was also counted as a miss (and possibly an eviction).
if (!find_flag)
{
    if (cache_index[s_address] != E) // the set still has a free line
    {
        ++miss_count;
        cache[s_address][cache_index[s_address]].tag = t_address; // fill the next free line
        cache[s_address][cache_index[s_address]++].LRU_counter = count; // stamp it and mark it used
    }
    else // the set is full: replace the least recently used line
    {
        ++eviction_count;
        ++miss_count;
        int min_count = cache[s_address][0].LRU_counter; // oldest stamp seen so far
        int min_count_index = 0; // line holding that stamp
        for (int i = 1; i < E; i++)
            if (cache[s_address][i].LRU_counter < min_count)
            {
                min_count = cache[s_address][i].LRU_counter;
                min_count_index = i;
            }
        cache[s_address][min_count_index].tag = t_address; // install the new block
        cache[s_address][min_count_index].LRU_counter = count; // stamp it
    }
}
++count; // advance the time tick
#include "cachelab.h"
#include <getopt.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
// One line of the simulated cache. Only the bookkeeping fields are kept;
// no block data is stored.
struct cache_line
{
int tag; // tag bits of the block held in this line
int LRU_counter; // time tick of the most recent access to this line
};
struct cache_line **cache; // indexed as cache[set][line]; allocated in main
int hit_count, miss_count, eviction_count; // totals handed to printSummary
int *cache_index; // per set: how many lines have been filled so far
int s, E, b, S; // values of -s, -E, -b; S is computed as 2^s
int count = 0; // time tick, advanced once per call to Solve
void Solve(int t_address, int s_address)
{
int find_flag = 0; //寻找哨兵
for (int i = 0; i < cache_index[s_address]; i++)
{
if (cache[s_address][i].tag == t_address) //找到
{
++hit_count;
cache[s_address][i].LRU_counter = count;
find_flag = 1;
break;
}
}
if (!find_flag) //未找到
{
if (cache_index[s_address] != E) //组未满
{
++miss_count; //未命中
cache[s_address][cache_index[s_address]].tag = t_address; //更新缓存
cache[s_address][cache_index[s_address]++].LRU_counter = count; //更新最后访问时间
}
else //组已满,需要LRU
{
++eviction_count; //驱逐
++miss_count; //未命中
int min_count = cache[s_address][0].LRU_counter; //最后访问时间最远块访问时间
int min_count_index = 0; //最后访问时间最远块下标
for (int i = 1; i < E; i++)
if (cache[s_address][i].LRU_counter < min_count)
{
min_count = cache[s_address][i].LRU_counter; //更新最后访问时间最远块访问时间
min_count_index = i; //更新最后访问时间最远块下标
}
cache[s_address][min_count_index].tag = t_address; //更新缓存
cache[s_address][min_count_index].LRU_counter = count; //更新最后访问时间
}
}
++count; //更新时间刻
}
int main(int argc, char *argv[])
{
int opt;
char *trace_name; //trace文件地址
//命令行读入
while ((opt = getopt(argc, argv, "s:E:b:t:")) != -1)
{
switch (opt)
{
case 's':
s = atoi(optarg); //读入s
break;
case 'E':
E = atoi(optarg); //读入E
break;
case 'b':
b = atoi(optarg); //读入b
break;
case 't':
trace_name = optarg; //读入地址
default:
break;
}
}
//初始化
S = pow(2, s); //计算S=2^s
cache = (struct cache_line **)malloc(sizeof(struct cache_line *) * S);
for (int i = 0; i < S; i++)
cache[i] = (struct cache_line *)malloc(sizeof(struct cache_line *) * E); //cache开辟内存
for (int i = 0; i < S; i++)
for (int j = 0; j < E; j++)
cache[i][j].tag = cache[i][j].LRU_counter = 0; //cache初始化
cache_index = (int *)malloc(sizeof(int) * S); //cache_index开辟内存
memset(cache_index, 0, sizeof(int) * S); //cache_index初始化
//文件读入
FILE *pFile = fopen(trace_name, "r"); //打开文件
char identifier;
unsigned address;
int size;
//处理
while (fscanf(pFile, " %c %x,%d", &identifier, &address, &size) > 0) //按行读入
{
if (identifier == 'I') //忽略I模式
continue;
int t_address = address / ((int)pow(2, s + b)); //计算标记位
int s_address = address / ((int)pow(2, b)) % (int)(pow(2, s)); //计算索引位
if (identifier == 'M') //M模式两次solve
{
Solve(t_address, s_address);
Solve(t_address, s_address);
}
else //S模式或L模式
Solve(t_address, s_address);
}
//结束
free(cache);
free(cache_index); //释放内存
fclose(pFile); //关闭文件
printSummary(hit_count, miss_count, eviction_count); //输出
return 0;
}
前提:从导出的trans.f0文件中可以得知,使用./test-trans命令测试得出的未命中数量比实际的多3(valgrind模拟额外开销)
以下提及的所有未命中均为实际未命中数量,如需转换到命令输出需加3
注意到A矩阵中元素仍然会和B矩阵中元素互相冲突,导致A矩阵的对角线和B矩阵对角线上一格元素发生冲突不命中,具体解释如图所示:
大部分冲突不命中均发生在A和B的相邻操作中(红色画圈部分),所以可以考虑先同时对A块中的同一行进行读取操作,再同时对B块对应列进行写入操作,这样便可以消除相邻操作带来的冲突不命中
经过测试,8*8对于32*32矩阵转置未命中为284,已达到满分标准
由于每次对A一次性读取一行,所以A矩阵的命中率得到了提升(12.5%未命中率)
但由于缓存只能存下矩阵前4行元素,B矩阵在读取后4列时会覆盖前4列,同时读取前4列时会覆盖后4列,导致每个元素都发生了冲突不命中(初始为冷不命中)
(红色箭头表示冲突不命中)
由于前后4列会互相覆盖,所以考虑将8*8分块改为4*4分块,每次一次性读取一行,这样便能降低B矩阵的未命中率
经过测试,4*4并一次性读取对于64*64矩阵转置未命中为1696,有了较大的提升,但依然没有小于1300
B矩阵由于降低分块大小且仍然一次性读取一行,不再100%未命中率,但由于采用了4*4分块,在非对角线块内部仍然会发生大量冲突不命中,未命中率为25%,而在对角线块上由于A矩阵和B矩阵所映射到相同的缓存块,导致A矩阵和B矩阵发生冲突不命中,未命中率更高
(非对角线块)
这里的思想参考了某位大神,这里着重分析
每个8*8块中的4*4块按行顺序将块被标记为1,2,3,4块
1.对A矩阵的1、2块按行读取(每次一次性读取一行的8个元素),将前4个元素按列移入B矩阵的1块,将后4个元素按列暂存到B矩阵的2块
2.对A矩阵的3块按列读取,对B矩阵的2块按行读取,将读取到的A矩阵按行移入B矩阵的2块,将读取到的B矩阵按行移入B矩阵的3块(两次读取后直接进行移入)
3.对A矩阵的4块每次一次性读取两行,转置后移入B矩阵的4块
经过测试,该方法对于64*64矩阵转置未命中为1160,已达到满分标准
对于对角块:
对于非对角块:
首先考虑和64*64矩阵一样的转置方法:
将61*67的56*64部分采用和之前一样的方法,而对于剩下的部分采用最原始的直接转置法:
经测试,未命中为2058,已经非常接近2000:
剩下的矩阵考虑分成3大部分:
第一部分(A[64][0]~A[66][59])
分成3*6的块,每次一次性读取一行进行转置
第二部分(A[0][56]~A[63][59])
一次性读取一行进行转置
第三部分(A[64][60]~A[66][60])
直接转置
经测试,未命中为1971,已达到了满分标准,但还有优化空间
经过测试,发现存在更优解:
将61*67的60*60部分分成20*4的块,每次一次性读取两行
剩下的矩阵考虑分成2大部分:
第一部分(A[0][60]~A[66][60])
直接转置
第二部分(A[60][0]~A[66][59])
一次性读取一列进行转置(非按行读取)
经测试,未命中为1750,有了较大幅度的优化,同时代码复杂度也大大下降
/*
 * transpose_submit - write the transpose of A into B, i.e. B[j][i] = A[i][j].
 *
 * M is the number of columns of A (rows of B); N is the number of rows of A
 * (columns of B). Three shapes have hand-tuned access orders:
 *   32x32  - 8x8 blocks, one full row of the block read before any write.
 *   64x64  - 8x8 blocks processed as four 4x4 quadrants in three passes,
 *            using the top-right quadrant of B as temporary storage.
 *   61x67  - 20x4 blocks over the leading 60x60 region, then the two edge
 *            strips (the author measured 1750 misses; an earlier 1971-miss
 *            variant based on the 64x64 scheme was superseded by this one).
 * Any other shape is handled by a plain element-by-element transpose, so the
 * tuned paths can never index outside the arrays.
 *
 * Only 11 local int variables are used and no local arrays.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, ii;
    int x0, x1, x2, x3, x4, x5, x6, x7;

    if (M == 32 && N == 32) // 32x32 matrix
    {
        for (i = 0; i < 32; i += 8)
            for (j = 0; j < 32; j += 8) // 8x8 blocks
                for (ii = i; ii < i + 8; ii++)
                {
                    // Read a whole row of the block first so the writes to B
                    // cannot evict the line of A being read.
                    x0 = A[ii][j];     x1 = A[ii][j + 1];
                    x2 = A[ii][j + 2]; x3 = A[ii][j + 3];
                    x4 = A[ii][j + 4]; x5 = A[ii][j + 5];
                    x6 = A[ii][j + 6]; x7 = A[ii][j + 7];
                    B[j][ii] = x0;     B[j + 1][ii] = x1;
                    B[j + 2][ii] = x2; B[j + 3][ii] = x3;
                    B[j + 4][ii] = x4; B[j + 5][ii] = x5;
                    B[j + 6][ii] = x6; B[j + 7][ii] = x7;
                }
    }
    else if (M == 64 && N == 64) // 64x64 matrix
    {
        for (i = 0; i < 64; i += 8)
            for (j = 0; j < 64; j += 8) // 8x8 blocks, three passes each
            {
                // Pass 1: top half of the A block (quadrants 1 and 2).
                for (ii = i; ii < i + 4; ii++)
                {
                    x0 = A[ii][j];     x1 = A[ii][j + 1];
                    x2 = A[ii][j + 2]; x3 = A[ii][j + 3];
                    x4 = A[ii][j + 4]; x5 = A[ii][j + 5];
                    x6 = A[ii][j + 6]; x7 = A[ii][j + 7];
                    // First four values go to their final place in quadrant 1 of B.
                    B[j][ii] = x0;     B[j + 1][ii] = x1;
                    B[j + 2][ii] = x2; B[j + 3][ii] = x3;
                    // Last four are parked, transposed, in quadrant 2 of B.
                    B[j][ii + 4] = x4;     B[j + 1][ii + 4] = x5;
                    B[j + 2][ii + 4] = x6; B[j + 3][ii + 4] = x7;
                }
                // Pass 2: quadrant 3 of A and the parked values in quadrant 2 of B.
                for (ii = j; ii < j + 4; ii++)
                {
                    // Column of A's quadrant 3.
                    x0 = A[i + 4][ii]; x1 = A[i + 5][ii];
                    x2 = A[i + 6][ii]; x3 = A[i + 7][ii];
                    // Row of B's quadrant 2 (parked in pass 1).
                    x4 = B[ii][i + 4]; x5 = B[ii][i + 5];
                    x6 = B[ii][i + 6]; x7 = B[ii][i + 7];
                    // A's column becomes the final row of B's quadrant 2.
                    B[ii][i + 4] = x0; B[ii][i + 5] = x1;
                    B[ii][i + 6] = x2; B[ii][i + 7] = x3;
                    // Parked row moves to its final place in B's quadrant 3.
                    B[ii + 4][i] = x4;     B[ii + 4][i + 1] = x5;
                    B[ii + 4][i + 2] = x6; B[ii + 4][i + 3] = x7;
                }
                // Pass 3: quadrant 4 of A, two rows at a time.
                for (ii = i + 4; ii < i + 8; ii += 2)
                {
                    x0 = A[ii][j + 4];     x1 = A[ii][j + 5];
                    x2 = A[ii][j + 6];     x3 = A[ii][j + 7];
                    x4 = A[ii + 1][j + 4]; x5 = A[ii + 1][j + 5];
                    x6 = A[ii + 1][j + 6]; x7 = A[ii + 1][j + 7];
                    B[j + 4][ii] = x0;     B[j + 5][ii] = x1;
                    B[j + 6][ii] = x2;     B[j + 7][ii] = x3;
                    B[j + 4][ii + 1] = x4; B[j + 5][ii + 1] = x5;
                    B[j + 6][ii + 1] = x6; B[j + 7][ii + 1] = x7;
                }
            }
    }
    else if (M == 61 && N == 67) // 61x67 matrix (A has 67 rows, 61 columns)
    {
        // Leading 60x60 region: blocks of 20 rows by 4 columns, two rows per step.
        for (i = 0; i < 60; i += 20)
            for (j = 0; j < 60; j += 4)
                for (ii = i; ii < i + 20; ii += 2)
                {
                    x0 = A[ii][j];         x1 = A[ii][j + 1];
                    x2 = A[ii][j + 2];     x3 = A[ii][j + 3];
                    x4 = A[ii + 1][j];     x5 = A[ii + 1][j + 1];
                    x6 = A[ii + 1][j + 2]; x7 = A[ii + 1][j + 3];
                    B[j][ii] = x0;         B[j + 1][ii] = x1;
                    B[j + 2][ii] = x2;     B[j + 3][ii] = x3;
                    B[j][ii + 1] = x4;     B[j + 1][ii + 1] = x5;
                    B[j + 2][ii + 1] = x6; B[j + 3][ii + 1] = x7;
                }
        // Last column of A (A[0][60]..A[66][60]): copied directly.
        for (i = 0; i < 67; i++)
            B[60][i] = A[i][60];
        // Last seven rows of A (A[60][0]..A[66][59]): one column per step.
        for (i = 0; i < 60; i++)
        {
            x0 = A[60][i]; x1 = A[61][i];
            x2 = A[62][i]; x3 = A[63][i];
            x4 = A[64][i]; x5 = A[65][i];
            x6 = A[66][i];
            B[i][60] = x0; B[i][61] = x1;
            B[i][62] = x2; B[i][63] = x3;
            B[i][64] = x4; B[i][65] = x5;
            B[i][66] = x6;
        }
    }
    else // any other shape: straightforward transpose, always in bounds
    {
        for (i = 0; i < N; i++)
            for (j = 0; j < M; j++)
                B[j][i] = A[i][j];
    }
}