通过本次实验,可算是搞清楚了高速缓存存储器的寻址逻辑。主要参考了这位老哥的博客《深入理解计算机系统-cachelab》,不过他的 Part B 做法有些麻烦,细节分析上好像也还有一些不对的地方(直接暴力分块时产生替换的位置)。课程提供的PPT也值得参考。
高速缓存共有 $2^s$ 组,每组有 E 行,每一行对应一个缓存块。每一行包括 1 个有效位、t 个标记位以及 $2^b$ 字节的数据(标记位用来和给定地址的标记位对比,判断给定地址在不在该行中;如果有效位为 1 且标记相同,则命中)。
判断高速缓存中是否存在对应地址的数据的操作为:
参数 s 和 b 将 m 位地址分为三段(标记 t 位、组索引 s 位、块偏移 b 位)。首先通过组索引找到对应的组,再在该组的各行中查找标记位与查询地址的标记位相等(且有效位为 1)的行,然后在该行包含的块中按块偏移定位到具体位置,即缓存命中。
直接映射高速缓存
在理解了组、行、标记、偏移地址概念后,直接映射就是每组只有一行(E=1),因此当两个不同地址映射到同一组时,第二次访问就会导致第一次的块被替换出去。而当每组不止一行时(组相联),就可以考虑用 LRU 等算法来选择被替换的行。
上面的概念至少要整明白,lab才能下手。
除了文档,课程提供的PPT也值得参考,有助于解决读测试样例和输入参数的问题。
开始之前还要安装 python2 和 valgrind 工具(编译、测试结果要用到)。
具体关于文档的解读可以参考开头给出的那位老哥的博客,实验步骤和注意事项就不重复描述了。
该部分要模拟缓存的行为,因此不需要存具体的数据,其中行替换策略使用 LRU。缓存可以用 3 维数组表示如下:
uint64 ***cache; // cache[set][line][k]: k = 0 valid bit, k = 1 tag, k = 2 LRU counter
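我实现LRU的主要逻辑如下:
1. 在地址对应的组中查找有效位为 1 且标记相同的行,找到则返回 hit,否则进行下一步。
2. 如果组内还有空行(有效位为 0),则直接放入该行,结果是 miss;否则替换计数器最大(最久未被访问)的行,结果是 miss eviction。
3. 被访问(或新放入)的行计数器清零,然后该组所有行的计数器加 1。
在测试样例中还输入了size,我看了很多博客都没有考虑这个,确实这个题目下没有问题。但是如果size的值大于了每一行缓存的数据块($2^b$ 字节)的大小,那么就需要多行来存了。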
代码如下:
csim.c
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "cachelab.h"
/* Unsigned 64-bit-wide value used for addresses, tags and LRU counters. */
typedef long unsigned int uint64;

/*
 * Debug trace helpers. Each one is wrapped in do { } while (0) so that it
 * expands to exactly one statement and `if (x) Debug(); else ...` parses
 * correctly (a bare `{ ... };` expansion would orphan the `else`).
 */
#define Debug() do { printf("wwwwqqqqq\n"); } while (0)
#define IN() do { printf("-------function in------\n"); } while (0)
#define OUT() do { printf("-------function out------\n"); } while (0)

/* Cache geometry as given on the command line. */
uint64 cache_s; /* number of set index bits */
uint64 cache_E; /* number of lines per set */
uint64 cache_b; /* number of block offset bits */

/* cache[set][line][k]: k = 0 valid bit, k = 1 tag, k = 2 LRU counter. */
uint64 ***cache;

/* Running totals reported by printSummary() at the end of the run. */
int _hits = 0, _misses = 0, _evictions = 0;

/* Non-zero when -v was given: print the outcome of every access. */
int verbose = 0;

/* Text for each LRU() return code: 0 = miss, 1 = miss eviction, 2 = hit. */
const char *ans[3] = {"miss", "miss eviction", "hit"};
/* Allocate `count` zero-filled objects of `size` bytes each; terminate the
 * program if the allocation cannot be satisfied. */
static void *alloc_zeroed(size_t count, size_t size) {
    void *mem = calloc(count, size);

    /* calloc(0, ...) may legitimately return NULL, so only a failed
     * non-empty request is treated as an error. */
    if (mem == NULL && count != 0) {
        fprintf(stderr, "csim: out of memory\n");
        exit(1);
    }
    return mem;
}

/*
 * set_sEb_cache - Record the cache geometry and allocate the simulated cache.
 *
 *   s: number of set index bits (the cache gets 2^s sets)
 *   E: number of lines per set
 *   b: number of block offset bits
 *
 * Stores the three values in cache_s / cache_E / cache_b and builds
 * cache[set][line][3]. Every field (valid bit, tag, LRU counter) starts at
 * zero, so LRU() never reads an indeterminate counter when it increments the
 * counters of lines that have not been filled yet. Exits on allocation
 * failure or when s is too large to shift a 64-bit value by.
 */
void set_sEb_cache(uint64 s, uint64 E, uint64 b) {
    cache_s = s;
    cache_E = E;
    cache_b = b;

    if (s >= 8 * sizeof(uint64)) {
        fprintf(stderr, "csim: too many set index bits\n");
        exit(1);
    }

    /* Shift a 64-bit one: `1u << s` is a 32-bit shift, undefined for s >= 32. */
    uint64 set_count = (uint64)1 << cache_s;

    cache = alloc_zeroed(set_count, sizeof *cache);
    for (uint64 i = 0; i < set_count; i++) {
        cache[i] = alloc_zeroed(cache_E, sizeof *cache[i]);
        for (uint64 j = 0; j < cache_E; j++) {
            cache[i][j] = alloc_zeroed(3, sizeof *cache[i][j]);
        }
    }
}
void free_cache() {
for (uint64 i = 0; i < (1u << cache_s); i++) {
for (uint64 j = 0; j < cache_E; j++)
free(*(*(cache + i) + j));
free(*(cache + i));
}
free(cache);
}
/* Bumped once per LRU() call; nothing in the visible source reads it. */
int T = 1;

/*
 * LRU - Simulate one access to `address` with LRU replacement.
 *
 * Returns 0 for a miss, 1 for a miss that evicted a valid line and 2 for a
 * hit (the same codes index the `ans` table), and updates the global
 * _hits / _misses / _evictions totals accordingly.
 *
 * Each line carries a counter that is reset to 0 when the line is touched
 * and incremented for every access to its set, so the valid line with the
 * largest counter is the least recently used one.
 */
int LRU(uint64 address) {
    uint64 tag = address >> (cache_s + cache_b);
    /* 64-bit mask: `1u << cache_s` would be a 32-bit shift. */
    uint64 set = (address >> cache_b) & (((uint64)1 << cache_s) - 1);
    uint64 **lines = cache[set];
    int result = 0;

    /* Look for a valid line holding this tag. */
    for (uint64 j = 0; j < cache_E; j++) {
        if (lines[j][0] == 1 && lines[j][1] == tag) {
            lines[j][2] = 0;
            _hits++;
            result = 2;
            break;
        }
    }

    if (result != 2) {
        /* Miss: take the first empty line, otherwise the oldest one. */
        _misses++;
        uint64 victim = 0; /* in-bounds default even if no line qualifies */
        uint64 oldest = 0; /* same width as the counters, no truncation */
        for (uint64 j = 0; j < cache_E; j++) {
            if (lines[j][0] == 0) {
                victim = j;
                break;
            }
            if (lines[j][2] > oldest) {
                victim = j;
                oldest = lines[j][2];
            }
        }
        if (lines[victim][0] == 1) {
            _evictions++;
            result = 1;
        }
        lines[victim][0] = 1;
        lines[victim][1] = tag;
        lines[victim][2] = 0;
    }

    /* Age every line of the set by one access. */
    for (uint64 j = 0; j < cache_E; j++) {
        lines[j][2]++;
    }
    T++;
    return result;
}
/*
 * printHelp - Print the command-line usage summary to stdout.
 *
 * The argument placeholders (<num>, <file>) are spelled out so the usage
 * line shows which options take a value.
 */
void printHelp() {
    printf("Usage: ./csim-ref [-hv] -s <num> -E <num> -b <num> -t <file>\n");
    printf("Options:\n");
    printf(" -h Print this help message.\n");
    printf(" -v Optional verbose flag.\n");
    printf(" -s <num> Number of set index bits.\n");
    printf(" -E <num> Number of lines per set.\n");
    printf(" -b <num> Number of block offset bits.\n");
    printf(" -t <file> Trace file.\n\n");
    printf("Examples:\n");
    printf(" linux> ./csim-ref -s 4 -E 1 -b 4 -t traces/yi.trace\n");
    printf(" linux> ./csim-ref -v -s 8 -E 2 -b 4 -t traces/yi.trace\n");
}
char* read_arg(int argc, char* argv[]) {
int opt;
uint64 s, E, b;
s = E = b = 0;
char* path;
while (-1 != (opt = getopt(argc, argv, "hvs:E:b:t:"))) {
switch (opt) {
case 'h':
printHelp(); break;
case 'v':
verbose = 1; break;
case 's':
s = (uint64)atoll(optarg); break;
case 'E':
E = (uint64)atoll(optarg); break;
case 'b':
b = (uint64)atoll(optarg); break;
case 't':
path = optarg; break;
default:
printHelp(); break;
}
}
if (s == 0 || E == 0 || b == 0) {
printHelp();
exit(0);
}
set_sEb_cache(s, E, b);
return path;
}
/*
 * read_file - Replay the memory trace at `path` through the cache simulator.
 *
 * Each record has the form "<op> <hex address>,<size>". Instruction loads
 * ('I') are skipped, 'M' (modify) counts as two accesses to the same
 * address, and every other operation counts as one. With `verbose` set, the
 * outcome of each access is printed. Exits with status 1 if the trace
 * cannot be opened.
 */
void read_file(char *path) {
    if (path == NULL) {
        fprintf(stderr, "csim: no trace file given\n");
        exit(1);
    }

    FILE *trace = fopen(path, "r");
    if (trace == NULL) {
        perror(path);
        exit(1);
    }

    char operation;
    uint64 address;
    uint64 size;

    /* Require all three fields; a partial match would leave address/size
     * holding stale or uninitialised values. */
    while (fscanf(trace, " %c %lx,%lu", &operation, &address, &size) == 3) {
        if (operation == 'I') {
            continue;
        }

        /* Run the accesses as separate statements: the order in which
         * function arguments are evaluated is unspecified, so calling LRU()
         * twice inside one printf() could print the two results swapped. */
        int first = LRU(address);
        int second = 0;
        if (operation == 'M') {
            second = LRU(address);
        }

        if (verbose == 1) {
            if (operation == 'M') {
                printf("%c %lx,%lu %s %s\n", operation, address, size, ans[first], ans[second]);
            } else {
                printf("%c %lx,%lu %s\n", operation, address, size, ans[first]);
            }
        }
    }
    fclose(trace);
}
/*
 * main - Entry point of the simulator: parse the arguments, replay the
 * trace, report the totals and release the cache.
 */
int main(int argc, char* argv[]) {
    /* read_arg() sets up the cache and hands back the trace path. */
    read_file(read_arg(argc, argv));

    printSummary(_hits, _misses, _evictions);

    free_cache();
    return 0;
}
直接通过实例进行分析
linux> make
linux> ./test-trans -M 32 -N 32
(s = 5, E = 1, b = 5) E=1,因此每一个组只有一行。
32*32数组中各元素所属的组号如下(对应位置的值表示所属的组号)。因为 int 占 4 字节(32 位),b=5($2^5=32$ 字节),因此每一行缓存存连续 8 个 int(考虑用 8*8 分块来求解,后面给出原因)。
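关于其中 8*8 分块的原因(A 转置得到 B):缓存一共 32 组、每组 32 字节;32*32 的 int 矩阵每行 128 字节,占 4 个缓存行,因此连续 8 行正好占满 32 个组,相隔 8 行的元素会映射到同一组。所以 8*8 的块内部不会互相冲突,而且每个缓存行里的 8 个 int 都能被用上。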
不过对角线上的块还会出现一些问题。以其中一个对角线块为例:因为对角线上的块在 A 和 B 中操作的是相同的组,对 A 和 B 交替读写时就会产生冲突,互相把对方的缓存行替换出去,造成额外的 miss。
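为了方便快捷地解决这个问题,我直接开了一个 8*8 的临时数组,一次性从 A 取出 8*8 的块,再一次性把这个块赋值回 B,从而避免该问题。(我也考虑过一个问题:万一自己定义的变量也有缓存命中不命中的问题该怎么办?但是由于该题只是模拟对 A、B 矩阵地址的缓存访问,所以不会遇到问题,我看其他博主貌似也没有考虑自己定义的变量对缓存是否产生影响。需要注意的是,实验文档规定转置函数中不允许定义数组,且每个函数最多使用 12 个 int 局部变量;符合规定的写法是用 8 个 int 局部变量一次读出 A 的一行 8 个元素,再写到 B 的对应一列。)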
/*
* You can define additional transpose functions below. We've defined
* a simple one below to help you get started.
*/
/*
 * transpose_32_32 - Blocked transpose B = A^T, tuned for the 32 x 32 case.
 *
 * Works on 8 x 8 blocks; M and N must be multiples of 8. For every row of a
 * block, all eight values are read from A into scalar locals before any of
 * them is written to B, so on diagonal blocks (where A and B compete for
 * the same cache sets) a line of A is never needed again after B has
 * evicted it.
 *
 * Uses 11 int locals and no array: the Cache Lab rules do not allow arrays
 * inside transpose functions and cap them at 12 local ints.
 *
 * Rows of A are bounded by N and columns by M.
 */
void transpose_32_32(int M, int N, int A[N][M], int B[M][N]) {
    int i, j, r;
    int v0, v1, v2, v3, v4, v5, v6, v7;

    for (i = 0; i < N; i += 8) {
        for (j = 0; j < M; j += 8) {
            for (r = i; r < i + 8; r++) {
                v0 = A[r][j];
                v1 = A[r][j + 1];
                v2 = A[r][j + 2];
                v3 = A[r][j + 3];
                v4 = A[r][j + 4];
                v5 = A[r][j + 5];
                v6 = A[r][j + 6];
                v7 = A[r][j + 7];

                B[j][r] = v0;
                B[j + 1][r] = v1;
                B[j + 2][r] = v2;
                B[j + 3][r] = v3;
                B[j + 4][r] = v4;
                B[j + 5][r] = v5;
                B[j + 6][r] = v6;
                B[j + 7][r] = v7;
            }
        }
    }
}
64*64 的情况同理 32*32。
61*67 这个大小就很迷了,我考虑了很久怎么写判断逻辑,但是感觉没什么好的办法。最终看了别人的实现,然后惊了,竟然直接暴力分块(下面的代码用的是 17*17 的分块),看结果。当遇到有些产生冲突的分块也不考虑,最终都能全部通过。
trans.c
/*
* trans.c - Matrix transpose B = A^T
*
* Each transpose function must have a prototype of the form:
* void trans(int M, int N, int A[N][M], int B[M][N]);
*
* A transpose function is evaluated by counting the number of misses
* on a 1KB direct mapped cache with a block size of 32 bytes.
*/
#include
#include "cachelab.h"
/* Returns 1 when B is the transpose of A and 0 otherwise (defined at the end of this file). */
int is_transpose(int M, int N, int A[N][M], int B[M][N]);
/* Size-specific transposes that transpose_submit() dispatches to; each writes the transpose of A (N rows, M columns) into B. */
void transpose_32_32(int M, int N, int A[N][M], int B[M][N]);
void transpose_64_64(int M, int N, int A[N][M], int B[M][N]);
void transpose_61_67(int M, int N, int A[N][M], int B[M][N]);
/*
* transpose_submit - This is the solution transpose function that you
* will be graded on for Part B of the assignment. Do not change
* the description string "Transpose submission", as the driver
* searches for that string to identify the transpose function to
* be graded.
*/
char transpose_submit_desc[] = "Transpose submission";
/*
 * transpose_submit - Write the transpose of A (N rows, M columns) into B.
 *
 * Dispatches to the routine tuned for each of the three sizes handled
 * specially in this file (32x32, 64x64, 61x67). Any other size falls back
 * to a plain element-by-element transpose so that B is always filled in
 * rather than silently left untouched.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N]) {
    int i, j;

    if (M == 32 && N == 32) {
        transpose_32_32(M, N, A, B);
    } else if (M == 64 && N == 64) {
        transpose_64_64(M, N, A, B);
    } else if (M == 61 && N == 67) {
        transpose_61_67(M, N, A, B);
    } else {
        /* No tuned variant for this size: correctness over miss count. */
        for (i = 0; i < N; i++) {
            for (j = 0; j < M; j++) {
                B[j][i] = A[i][j];
            }
        }
    }
}
/*
* You can define additional transpose functions below. We've defined
* a simple one below to help you get started.
*/
/*
 * transpose_32_32 - Blocked transpose B = A^T, tuned for the 32 x 32 case.
 *
 * Works on 8 x 8 blocks; M and N must be multiples of 8. For every row of a
 * block, all eight values are read from A into scalar locals before any of
 * them is written to B, so on diagonal blocks (where A and B compete for
 * the same cache sets) a line of A is never needed again after B has
 * evicted it.
 *
 * Uses 11 int locals and no array: the Cache Lab rules do not allow arrays
 * inside transpose functions and cap them at 12 local ints.
 *
 * Rows of A are bounded by N and columns by M.
 */
void transpose_32_32(int M, int N, int A[N][M], int B[M][N]) {
    int i, j, r;
    int v0, v1, v2, v3, v4, v5, v6, v7;

    for (i = 0; i < N; i += 8) {
        for (j = 0; j < M; j += 8) {
            for (r = i; r < i + 8; r++) {
                v0 = A[r][j];
                v1 = A[r][j + 1];
                v2 = A[r][j + 2];
                v3 = A[r][j + 3];
                v4 = A[r][j + 4];
                v5 = A[r][j + 5];
                v6 = A[r][j + 6];
                v7 = A[r][j + 7];

                B[j][r] = v0;
                B[j + 1][r] = v1;
                B[j + 2][r] = v2;
                B[j + 3][r] = v3;
                B[j + 4][r] = v4;
                B[j + 5][r] = v5;
                B[j + 6][r] = v6;
                B[j + 7][r] = v7;
            }
        }
    }
}
/*
 * transpose_64_64 - Blocked transpose B = A^T, tuned for the 64 x 64 case.
 *
 * Works on 8 x 8 blocks (M and N must be multiples of 8), but touches only
 * four rows of B at a time: with 64 ints per row, rows that are four apart
 * fall into the same sets of the 1KB direct-mapped cache, so writing all
 * eight rows of a B block in one pass would make them evict each other.
 *
 * For a block whose top-left corner is A[i][j]:
 *   1. Rows i..i+3 of A: the left half goes to its final place in
 *      B[j..j+3][i..i+3]; the right half is parked, already transposed, in
 *      B[j..j+3][i+4..i+7], inside cache lines that are loaded anyway.
 *   2. For each column of A's lower-left quarter: lift the parked values
 *      out of one B row, put the A column in their place (final), and move
 *      the parked values to their final place in B[j+4..j+7][i..i+3].
 *   3. The lower-right quarter of A is transposed directly.
 *
 * Uses 11 int locals and no array: the Cache Lab rules do not allow arrays
 * inside transpose functions and cap them at 12 local ints.
 *
 * Rows of A are bounded by N and columns by M.
 */
void transpose_64_64(int M, int N, int A[N][M], int B[M][N]) {
    int i, j, k;
    int a0, a1, a2, a3, a4, a5, a6, a7;

    for (i = 0; i < N; i += 8) {
        for (j = 0; j < M; j += 8) {
            /* Step 1: upper four rows of the A block. */
            for (k = i; k < i + 4; k++) {
                a0 = A[k][j];
                a1 = A[k][j + 1];
                a2 = A[k][j + 2];
                a3 = A[k][j + 3];
                a4 = A[k][j + 4];
                a5 = A[k][j + 5];
                a6 = A[k][j + 6];
                a7 = A[k][j + 7];

                B[j][k] = a0;
                B[j + 1][k] = a1;
                B[j + 2][k] = a2;
                B[j + 3][k] = a3;
                /* Parked: these belong in B[j+4..j+7][k]. */
                B[j][k + 4] = a4;
                B[j + 1][k + 4] = a5;
                B[j + 2][k + 4] = a6;
                B[j + 3][k + 4] = a7;
            }

            /* Step 2: lower-left quarter of A, and relocate parked values. */
            for (k = j; k < j + 4; k++) {
                a0 = A[i + 4][k];
                a1 = A[i + 5][k];
                a2 = A[i + 6][k];
                a3 = A[i + 7][k];

                a4 = B[k][i + 4];
                a5 = B[k][i + 5];
                a6 = B[k][i + 6];
                a7 = B[k][i + 7];

                B[k][i + 4] = a0;
                B[k][i + 5] = a1;
                B[k][i + 6] = a2;
                B[k][i + 7] = a3;

                B[k + 4][i] = a4;
                B[k + 4][i + 1] = a5;
                B[k + 4][i + 2] = a6;
                B[k + 4][i + 3] = a7;
            }

            /* Step 3: lower-right quarter of A. */
            for (k = i + 4; k < i + 8; k++) {
                a0 = A[k][j + 4];
                a1 = A[k][j + 5];
                a2 = A[k][j + 6];
                a3 = A[k][j + 7];

                B[j + 4][k] = a0;
                B[j + 5][k] = a1;
                B[j + 6][k] = a2;
                B[j + 7][k] = a3;
            }
        }
    }
}
/*
 * transpose_61_67 - Transpose A (N rows, M columns) into B using 17 x 17
 * tiles, clipping the tiles that hang over the right and bottom edges.
 *
 * Elements are copied straight from A to B, tile by tile, row-major inside
 * each tile.
 */
void transpose_61_67(int M, int N, int A[N][M], int B[M][N]) {
    enum { TILE = 17 };
    int row0, col0, row, col, row_end, col_end;

    for (row0 = 0; row0 < N; row0 += TILE) {
        row_end = (row0 + TILE < N) ? row0 + TILE : N;
        for (col0 = 0; col0 < M; col0 += TILE) {
            col_end = (col0 + TILE < M) ? col0 + TILE : M;
            for (row = row0; row < row_end; row++) {
                for (col = col0; col < col_end; col++) {
                    B[col][row] = A[row][col];
                }
            }
        }
    }
}
/*
* trans - A simple baseline transpose function, not optimized for the cache.
*/
char trans_desc[] = "Simple row-wise scan transpose";
/*
 * trans - Reference transpose: walk A one row at a time and store each
 * element at the mirrored position in B. No blocking is applied.
 */
void trans(int M, int N, int A[N][M], int B[M][N])
{
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < M; col++) {
            B[col][row] = A[row][col];
        }
    }
}
/*
 * registerFunctions - This function registers your transpose
 * functions with the driver. At runtime, the driver will
 * evaluate each of the registered functions and summarize their
 * performance. This is a handy way to experiment with different
 * transpose strategies.
 *
 * Two functions are registered here, in this order: the graded
 * solution (transpose_submit) and the unoptimized baseline (trans),
 * each paired with its description string.
 * registerTransFunction is not defined in this file; presumably it
 * is declared in cachelab.h.
 */
void registerFunctions()
{
/* Register your solution function */
registerTransFunction(transpose_submit, transpose_submit_desc);
/* Register any additional transpose functions */
registerTransFunction(trans, trans_desc);
}
/*
* is_transpose - This helper function checks if B is the transpose of
* A. You can check the correctness of your transpose by calling
* it before returning from the transpose function.
*/
/*
 * Compares every element A[row][col] with B[col][row]; the scan stops at
 * the first mismatch. Returns 1 when all N * M pairs agree, 0 otherwise.
 */
int is_transpose(int M, int N, int A[N][M], int B[M][N])
{
    int matches = 1;

    for (int row = 0; matches && row < N; row++) {
        for (int col = 0; matches && col < M; col++) {
            matches = (A[row][col] == B[col][row]);
        }
    }
    return matches;
}
本次LAB难度不如上次,主要是通过本次实验可算是搞明白了缓存。博客内容整理得仓促,写得简略。个人认为如果对缓存理解了,那本次实验没有难度。主要是懒得总结,赶紧开始下次LAB。