CMU 15-213: Cache Memories Notes and the Cache Lab

1. Notes

1.1 Cache basics

1. Cache organization: S, E, B

  • S: the number of sets
  • E: the number of lines per set
  • B: the number of bytes per block

2. Address format:

  • Typical: tag bits + set index + block offset
  • Alternative: set index + tag bits + block offset
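
For a concrete example, take s = 5, E = 1, b = 5 with 64-bit addresses (the 1 KB direct-mapped cache that the lab's transpose part assumes): S = 2^5 = 32 sets, B = 2^5 = 32 bytes per block, capacity C = S × E × B = 32 × 1 × 32 = 1024 bytes; the low 5 address bits are the block offset, the next 5 bits are the set index, and the remaining t = 64 - 5 - 5 = 54 bits are the tag.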

3. Direct-mapped cache: one line per set (E = 1)

4. E-way set associative cache (lookup steps):

  • locate the target set using the set-index bits
  • check the valid bit
  • compare the tag against every line in the set
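
A minimal sketch of that lookup (the checkHIT() function in the simulator below implements the same idea, plus an LRU timestamp; the struct here is only illustrative):

struct line { int valid; unsigned long tag; };

/* return 1 on a hit in the given set, 0 on a miss */
int lookup(struct line *set, int E, unsigned long tag)
{
    for (int i = 0; i < E; ++i)
        if (set[i].valid && set[i].tag == tag) /* valid bit first, then compare the tag */
            return 1;
    return 0;
}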

5. Write policies (dirty bit)

  • write-through
  • write-back
  • write-allocate
  • no-write-allocate
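
A rough sketch of the combination usually used together, write-back + write-allocate, showing where the dirty bit comes in (the struct and function here are hypothetical, not part of the lab):

struct line { int valid, dirty; unsigned long tag; /* block data omitted */ };

void handle_store(struct line *l, unsigned long tag)
{
    if (!(l->valid && l->tag == tag)) /* write miss */
    {
        /* write-back: if the victim line is dirty, flush it to memory first */
        /* write-allocate: then load the missed block into the cache */
        l->valid = 1;
        l->tag = tag;
    }
    /* the store itself only updates the cached copy; memory is updated
     * lazily, when this dirty line is eventually evicted */
    l->dirty = 1;
}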

6. Intel Core i7 cache hierarchy

  • Inside each core: an L1 cache (split into separate data and instruction caches) and a unified L2 cache
  • Shared by all cores: a unified L3 cache
  • Parameters: L1 (32 KB, 8-way, 4 cycles), L2 (256 KB, 8-way, 10 cycles), L3 (8 MB, 16-way, 40-75 cycles)
  • Block size: 64 B

1.2 Using the cache to improve program performance

1. The memory mountain test function

  • measures read throughput for different working-set sizes and strides to characterize cache performance (a simplified sketch follows)
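
A simplified version of that kernel (the real one in CS:APP unrolls the loop over several accumulators; names and sizes here are just for illustration):

#define MAXELEMS (64 * 1024 * 1024 / sizeof(long)) /* working sets up to 64 MB */
long data[MAXELEMS];

/* Read `elems` longs with the given stride; throughput = bytes read / elapsed time.
 * Small working sets stay in L1/L2 (temporal locality), small strides use every
 * word of a block (spatial locality), so throughput drops as either grows. */
long run(int elems, int stride)
{
    long acc = 0;
    for (int i = 0; i < elems; i += stride)
        acc += data[i];
    return acc; /* return the sum so the compiler cannot delete the loop */
}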

2. Rearranging loops to improve spatial locality

  • spatial locality: with the wrong loop order the same computation can run about 4x slower (see the sketch below)
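
C stores two-dimensional arrays row by row, so the loop order decides whether consecutive iterations stay inside one cache block; a sketch of the two traversal orders:

#define N 2048
int a[N][N];

long sum_rowwise(void) /* stride-1: the next element is usually in the same block */
{
    long s = 0;
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            s += a[i][j];
    return s;
}

long sum_colwise(void) /* stride-N: almost every access touches a new block */
{
    long s = 0;
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            s += a[i][j];
    return s;
}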

3. Using blocking to improve temporal locality (see the sketch below)
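
The standard example is blocked (tiled) matrix multiplication: work on BSIZE x BSIZE sub-blocks so each block is reused many times while it still sits in the cache. A minimal sketch, with BSIZE a tuning parameter chosen so that roughly three blocks fit in the cache at once:

#define N 512
#define BSIZE 32
double A[N][N], B[N][N], C[N][N]; /* globals start zeroed, so C can be accumulated into */

void blocked_matmul(void)
{
    for (int ii = 0; ii < N; ii += BSIZE)
        for (int jj = 0; jj < N; jj += BSIZE)
            for (int kk = 0; kk < N; kk += BSIZE)
                /* multiply one BSIZE x BSIZE block of A by one block of B */
                for (int i = ii; i < ii + BSIZE; i++)
                    for (int j = jj; j < jj + BSIZE; j++)
                    {
                        double sum = C[i][j];
                        for (int k = kk; k < kk + BSIZE; k++)
                            sum += A[i][k] * B[k][j];
                        C[i][j] = sum;
                    }
}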

2. The cache lab

The first part of the lab is a cache simulator.
The second part asks you to reduce the number of cache misses in a matrix transpose, which I found quite hard. At first I overlooked a property of the A and B matrices: elements at the same position in A and B map to the same cache set, so they evict each other. Because I missed this, I could not understand for a long time why the miss count stayed so high even after blocking.

Here is my code for the first part:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <getopt.h>
#include "cachelab.h"

#define WORD_BITS 64

#define BIT(n) (1UL << (n)) // unsigned long, so TAG_MASK works when the shift is >= 32
#define MASK(n) (BIT(n) - 1)
#define TAG_MASK(s, b) (MASK(WORD_BITS-s-b) << (s + b))
#define SET_MASK(s, b) (MASK(s) << b)
#define BLOCK_MASK(b) (MASK(b))

enum
{
    VALID_POS = 0,
    TAG_POS,
    TIME_POS,
};

typedef unsigned long int ulint;

int s_bits, s_bytes; // s = number of set-index bits, S = 2^s sets
int E_bytes;         // E = number of lines per set
int b_bits, b_bytes; // b = number of block-offset bits, B = 2^b bytes per block
int hit_count, miss_count, eviction_count;

ulint ***cache;
FILE *fp;

ulint get_tag(ulint addr)
{
    return ((addr & TAG_MASK(s_bits, b_bits)) >> (s_bits + b_bits));
}
ulint get_set(ulint addr)
{
    return ((addr & SET_MASK(s_bits, b_bits)) >> b_bits);
}
ulint get_offset(ulint addr)
{
    return (addr & BLOCK_MASK(b_bits));
}

/**
 * LRU replacement: return the index of an invalid (empty) line if one exists,
 * otherwise the index of the least-recently used line in the set.
 */
int lru(ulint set)
{
    int i, temp_pos = __INT_MAX__;
    for (i = 0; i < E_bytes; ++i)
    {
        if (!cache[set][i][VALID_POS])
            break;
        if (temp_pos == __INT_MAX__)
            temp_pos = i;
        if (cache[set][temp_pos][TIME_POS] > cache[set][i][TIME_POS])
            temp_pos = i;
    }
    return i != E_bytes ? i : temp_pos;
}

/**
 * Check whether the target address hits in the cache. On a miss, install the
 * block in the set, evicting the LRU line if necessary.
 * return value:
 *  0 - miss
 *  1 - hit
 */
int checkHIT(ulint addr, ulint set, int time)
{
    int i, pos;
    ulint tag = get_tag(addr);
    for (i = 0; i < E_bytes; ++i)
    {
        if (cache[set][i][VALID_POS])
        {
            if (cache[set][i][TAG_POS] == tag)
            {
                cache[set][i][TIME_POS] = time;
                ++hit_count;
                return 1;
            }
        }
    }
    ++miss_count;
    pos = lru(set);
    if (cache[set][pos][VALID_POS])
        ++eviction_count;
    cache[set][pos][VALID_POS] = 1;
    cache[set][pos][TAG_POS] = tag;
    cache[set][pos][TIME_POS] = time;
    return 0;
}

int main(int argc, char *argv[])
{
    int c; // getopt() returns an int, so don't store it in a char
    int i, j;
    char op;
    ulint addr, size;
    int time;

    while ((c = getopt(argc, argv, "s:E:b:t")) != -1)
    {
        switch (c)
        {
        case 's':
            s_bits = atoi(optarg);
            s_bytes = BIT(s_bits); // S = 2^s sets
            break;
        case 'E':
            E_bytes = atoi(optarg);
            break;
        case 'b':
            b_bits = atoi(optarg);
            b_bytes = BIT(b_bits);
            break;
        case 't':
            fp = fopen(argv[optind], "r"); // 't' has no ':' in the optstring, so optarg is not set; the next argv entry is the trace file
            break;
        default:
            exit(1);
        }
    }
    cache = (ulint ***)malloc(sizeof(ulint **) * s_bytes); //allocate set
    for (i = 0; i < s_bytes; ++i)
    {
        cache[i] = (ulint **)malloc(sizeof(ulint *) * E_bytes); //allocate ways
        for (j = 0; j < E_bytes; ++j)
        {
            cache[i][j] = (ulint *)malloc(sizeof(ulint) * 3); // per-line metadata: valid bit, tag, LRU timestamp (block data is not simulated)
        }
    }
    hit_count = miss_count = eviction_count = 0;
    time = 0;
    while (!feof(fp))
    {
        /* the trailing '\n' in the fscanf formats also consumes the leading
         * blank of the next trace line, so fgetc() returns the op character */
        op = fgetc(fp);
        if (op == 'I')
        {
            /* instruction accesses are ignored by the simulator */
            fscanf(fp, " %lx,%ld\n", &addr, &size);
            continue;
        }
        fscanf(fp, "%lx,%ld\n", &addr, &size);
        // printf("%c %lx,%ld\n",op,addr,size);
        switch (op)
        {
        case 'L':
        case 'S':
            /* a load or a store is a single cache access */
            checkHIT(addr, get_set(addr), time);
            break;
        case 'M':
            /* modify = load + store to the same address: the load goes
             * through the cache, and the following store always hits */
            checkHIT(addr, get_set(addr), time);
            ++hit_count;
            break;
        default:
            break;
        }
        time++;
    }

    printSummary(hit_count, miss_count, eviction_count);
    return 0;
}

I did not write a complete implementation for the second part, but the analysis goes like this:

  • For the 32x32 matrix: as described above, the A and B matrices conflict in the cache, so a whole row of an 8x8 block of A (8 ints) is first copied into local variables; when B is then accessed and evicts A's cache line, the values are already safe in registers. The overall idea is to let one matrix's elements stay in the cache as long as possible while the other matrix's elements live in local variables. Then for each cache line of B only the access to its first element misses, and the same holds for A (a sketch of this appears after this list).
  • For the 64x64 matrix: the idea is the same as in the 32x32 case, i.e. keep B's elements in the cache and hold A's elements in local variables. But with 8x8 blocks the miss count turns out to be almost the same as without any optimization. The reason is that four rows of a 64x64 int matrix are already enough to fill the cache, so rows five through eight evict rows one through four. The fix here is to use 4x4 blocks, while still using local variables for A.
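
Here is a sketch of the 32x32 idea from the first bullet (not code I actually submitted, just the approach written out): 8x8 blocks, with each row of A's block buffered in eight local variables so the A/B conflict cannot evict data that has not been used yet. In the lab this logic would go inside transpose_submit(), and it stays within the limit of 12 int locals.

void trans_32x32(int M, int N, int A[N][M], int B[M][N])
{
    for (int ii = 0; ii < N; ii += 8)
        for (int jj = 0; jj < M; jj += 8)
            for (int i = ii; i < ii + 8; i++)
            {
                /* read one row of the 8x8 block of A into registers first */
                int a0 = A[i][jj],     a1 = A[i][jj + 1];
                int a2 = A[i][jj + 2], a3 = A[i][jj + 3];
                int a4 = A[i][jj + 4], a5 = A[i][jj + 5];
                int a6 = A[i][jj + 6], a7 = A[i][jj + 7];
                /* the writes to B may now evict A's line without extra misses */
                B[jj][i] = a0;     B[jj + 1][i] = a1;
                B[jj + 2][i] = a2; B[jj + 3][i] = a3;
                B[jj + 4][i] = a4; B[jj + 5][i] = a5;
                B[jj + 6][i] = a6; B[jj + 7][i] = a7;
            }
}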
