1.cache:S、E、B
2.地址格式:tag(t位)| set index(s位)| block offset(b位)
3.直接映射:one line per set(E=1)
4.E-way Set Associative Cache
5.write(dirty bit)
6.intel core i7 cache hierarchy
1.memory mountain test function
2.Rearranging loops to improve spatial locality
3.Using blocking to improve temporal locality
第一个实验是模拟cache
第二个实验是降低cache miss的次数,感觉很难(一开始没有考虑到A和B矩阵的特性:A和B矩阵中对应的同一位置会映射到同一行cache,由于一直没考虑到这个因素,后面一直想不明白为什么即使分了block,cache miss的次数还是这么高)
下面是第一个实验的代码:
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "cachelab.h"
/*
 * Bit-field helpers used to split an address into tag / set index / block
 * offset.  All arithmetic is done in unsigned long long, so shifts of 31 bits
 * or more are well defined (shifting the int constant 1 that far is undefined
 * behaviour).
 *
 * Arguments may be evaluated more than once: pass side-effect-free
 * expressions only.
 */
#define WORD_BITS 64
/* Value with only bit n set; requires 0 <= n < WORD_BITS. */
#define BIT(n) (1ULL << (n))
/* Low n bits set; n >= WORD_BITS yields all ones instead of over-shifting. */
#define MASK(n) ((n) >= WORD_BITS ? ~0ULL : (BIT(n) - 1))
/* Bits above the s set-index bits and b block-offset bits; requires s + b < WORD_BITS. */
#define TAG_MASK(s, b) (MASK(WORD_BITS - (s) - (b)) << ((s) + (b)))
/* The s set-index bits, located directly above the b block-offset bits. */
#define SET_MASK(s, b) (MASK(s) << (b))
/* The b lowest bits (block offset). */
#define BLOCK_MASK(b) (MASK(b))
/* Slot indices inside the three-element array that describes one cache line. */
enum
{
    VALID_POS = 0, /* non-zero once the line holds a block */
    TAG_POS = 1,   /* tag of the block stored in the line */
    TIME_POS = 2,  /* timestamp of the latest access, compared by lru() */
};
/* Unsigned word used for addresses and for every field of a cache line. */
typedef unsigned long int ulint;

/* Cache geometry, filled in from the command line by main(). */
int s_bits;  /* -s: number of set-index bits */
int s_bytes; /* number of sets, BIT(s_bits) */
int E_bytes; /* -E: number of lines per set */
int b_bits;  /* -b: number of block-offset bits */
int b_bytes; /* block size in bytes, BIT(b_bits) */

/* Running totals handed to printSummary(). */
int hit_count;
int miss_count;
int eviction_count;

/* cache[set][line] is an array indexed by VALID_POS / TAG_POS / TIME_POS. */
ulint ***cache;

/* Trace file opened by main(). */
FILE *fp;
/**
 * Extract the tag field of an address.
 *
 * The tag is every bit above the set-index and block-offset fields, so a
 * right shift of the unsigned address is sufficient and no mask is needed.
 *
 * @param addr memory address
 * @return addr with its low (s_bits + b_bits) bits shifted out
 */
ulint get_tag(ulint addr)
{
    const int low_bits = s_bits + b_bits;

    /* Shifting by the word width or more is undefined; no tag bits remain then. */
    if (low_bits >= WORD_BITS)
        return 0;
    return addr >> low_bits;
}
/**
 * Extract the set-index field of an address.
 *
 * The mask is built in ulint width so that it cannot overflow an int when
 * s_bits + b_bits is large.
 *
 * @param addr memory address
 * @return the s_bits bits located directly above the block offset
 */
ulint get_set(ulint addr)
{
    const ulint index_mask = ((ulint)1 << s_bits) - 1;

    return (addr >> b_bits) & index_mask;
}
/**
 * Extract the block-offset field of an address.
 *
 * @param addr memory address
 * @return the low b_bits bits of addr
 */
ulint get_offset(ulint addr)
{
    const ulint offset_mask = BLOCK_MASK(b_bits);

    return addr & offset_mask;
}
/**
 * Choose the line of a set that should receive a new block.
 *
 * The first line whose valid flag is clear is preferred.  When every line is
 * valid, the line with the smallest TIME_POS value (the least recently used
 * one) is chosen; on equal timestamps the lowest index wins.
 *
 * @param set index of the set to inspect
 * @return index of the chosen line within cache[set]
 */
int lru(ulint set)
{
    int victim = 0;

    for (int way = 0; way < E_bytes; ++way)
    {
        if (!cache[set][way][VALID_POS])
            return way; /* a free line always beats evicting a valid one */
        if (cache[set][way][TIME_POS] < cache[set][victim][TIME_POS])
            victim = way;
    }
    return victim;
}
/**
 * Look an address up in one set and update the hit / miss / eviction totals.
 *
 * On a hit the matching line's timestamp is refreshed.  On a miss the line
 * selected by lru() is overwritten with the new tag and timestamp; if that
 * line was already valid the replacement is counted as an eviction.
 *
 * @param addr address being accessed (only its tag is used here)
 * @param set  index of the set to search
 * @param time timestamp recorded in the touched line
 * @return 1 when the block was already cached, 0 when it had to be loaded
 */
int checkHIT(ulint addr, ulint set, int time)
{
    const ulint wanted = get_tag(addr);
    int way;

    for (way = 0; way < E_bytes; ++way)
    {
        ulint *line = cache[set][way];

        if (!line[VALID_POS] || line[TAG_POS] != wanted)
            continue;
        line[TIME_POS] = time;
        ++hit_count;
        return 1;
    }

    ++miss_count;

    ulint *victim = cache[set][lru(set)];

    if (victim[VALID_POS])
        ++eviction_count;
    victim[VALID_POS] = 1;
    victim[TAG_POS] = wanted;
    victim[TIME_POS] = time;
    return 0;
}
int main(int argc, char *argv[])
{
char c;
int i, j;
char op;
ulint addr, size;
int time;
while ((c = getopt(argc, argv, "s:E:b:t")) != -1)
{
switch (c)
{
case 's':
s_bits = atoi(optarg);
s_bytes = BIT(s_bits); // translate index bits to number
break;
case 'E':
E_bytes = atoi(optarg);
break;
case 'b':
b_bits = atoi(optarg);
b_bytes = BIT(b_bits);
break;
case 't':
fp = fopen(argv[optind], "r"); // cant use optarg
break;
default:
exit(1);
}
}
cache = (ulint ***)malloc(sizeof(ulint **) * s_bytes); //allocate set
for (i = 0; i < s_bytes; ++i)
{
cache[i] = (ulint **)malloc(sizeof(ulint *) * E_bytes); //allocate ways
for (j = 0; j < E_bytes; ++j)
{
cache[i][j] = (ulint *)malloc(sizeof(ulint) * (3)); // allocate blocks(tag+valid+time)
}
}
hit_count = miss_count = eviction_count = 0;
time = 0;
while (!feof(fp))
{
op = fgetc(fp);
if(op == 'I'){
fscanf(fp, " %lx,%ld\n", &addr, &size);
continue;
}
fscanf(fp, "%lx,%ld\n", &addr, &size);
// printf("%c %lx,%ld\n",op,addr,size);
switch (op)
{
case 'L':
checkHIT(addr, get_set(addr), time);
break;
case 'S':
checkHIT(addr, get_set(addr), time);
break;
case 'M':
//LOAD
checkHIT(addr, get_set(addr), time);
//STORE
// checkHIT(addr, get_set(addr), time);
++hit_count;
break;
default:
break;
}
time++;
}
printSummary(hit_count, miss_count, eviction_count);
return 0;
}
第二个实验没有做具体的实现,不过可以分析一下: