深入理解计算机系统(第三版)第5章 家庭作业

声明:第四章的处理器体系结构

每节后面的练习题都能做一做,做错的对照答案也能弄清楚。
但家庭作业却无从下手,应该是因为没学过汇编语言,导致能看懂但不会写,哭
所以就不做第四章的家庭作业了,看了些大佬的博客,很多都说第四章可以不看
心里平衡了很多,嘻嘻

5.13

A:不知道用啥作图,就用了EXCEL(太LOW了,哈哈)
深入理解计算机系统(第三版)第5章 家庭作业_第1张图片
关键路径:红色箭头
深入理解计算机系统(第三版)第5章 家庭作业_第2张图片

B:下界为浮点数加法的延迟界限

C:整数加法的延迟界限

D:因为浮点数加法为关键路径

5.14

/*
 * Inner product of vectors u and v with 6x1 loop unrolling
 * (one accumulator, six elements per iteration).
 *
 * u, v  - input vectors; the length is taken from u only
 *         (assumes v holds at least as many elements -- TODO confirm).
 * dest  - receives the accumulated sum of u[i] * v[i].
 */
void inner4a(vec_ptr u, vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(u);
    /* The unrolled body reads indices i .. i+5, so it may only run
       while i + 5 < length. */
    long limit = length - 5;
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum = (data_t) 0;

    /* Main loop: combine six elements at a time. */
    for (i = 0; i < limit; i += 6)
    {
        sum = sum + udata[i] * vdata[i]
        + udata[i+1] * vdata[i+1]
        + udata[i+2] * vdata[i+2]
        + udata[i+3] * vdata[i+3]
        + udata[i+4] * vdata[i+4]
        + udata[i+5] * vdata[i+5];
    }

    /* Finish the remaining (at most five) elements one by one. */
    for (; i < length; i++)
    {
        sum = sum + udata[i] * vdata[i];
    }
    *dest = sum;
}

A:每个时钟周期最多只能加载2个值,而每个元素需要加载2个值(udata[i]和vdata[i]),所以CPE不可能低于1.00

B:因为关键路径为浮点加法

5.15

/*
 * Inner product of vectors u and v with 6x6 loop unrolling
 * (six independent accumulators, six elements per iteration).
 *
 * u, v  - input vectors; the length is taken from u only
 *         (assumes v holds at least as many elements -- TODO confirm).
 * dest  - receives the sum of all six partial accumulators.
 */
void inner4b(vec_ptr u, vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(u);
    /* The unrolled body reads indices i .. i+5, so it may only run
       while i + 5 < length. */
    long limit = length - 5;
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum0 = (data_t) 0;
    data_t sum1 = (data_t) 0;
    data_t sum2 = (data_t) 0;
    data_t sum3 = (data_t) 0;
    data_t sum4 = (data_t) 0;
    data_t sum5 = (data_t) 0;

    /* Main loop: each accumulator takes every sixth element, so the
       six additions do not depend on one another. */
    for (i = 0; i < limit; i += 6)
    {
        sum0 = sum0 + udata[i] * vdata[i];
        sum1 = sum1 + udata[i+1] * vdata[i+1];
        sum2 = sum2 + udata[i+2] * vdata[i+2];
        sum3 = sum3 + udata[i+3] * vdata[i+3];
        sum4 = sum4 + udata[i+4] * vdata[i+4];
        sum5 = sum5 + udata[i+5] * vdata[i+5];
    }

    /* Finish the remaining (at most five) elements one by one. */
    for (; i < length; i++)
    {
        sum0 = sum0 + udata[i] * vdata[i];
    }
    *dest = sum0 + sum1 + sum2 + sum3 + sum4 + sum5;
}

浮点数加法的吞吐量界限把性能限制在CPE等于1.00

5.16

/*
 * Inner product of vectors u and v with 6x1a loop unrolling
 * (one accumulator; the six products of an iteration are summed
 * with each other first and only then added to the accumulator).
 *
 * u, v  - input vectors; the length is taken from u only
 *         (assumes v holds at least as many elements -- TODO confirm).
 * dest  - receives the accumulated sum of u[i] * v[i].
 */
void inner4b(vec_ptr u, vec_ptr v, data_t *dest)
{
    long i;
    long length = vec_length(u);
    /* The unrolled body reads indices i .. i+5, so it may only run
       while i + 5 < length. */
    long limit = length - 5;
    data_t *udata = get_vec_start(u);
    data_t *vdata = get_vec_start(v);
    data_t sum = (data_t) 0;

    /* Main loop: the parentheses make the products combine among
       themselves, leaving a single addition into sum per iteration. */
    for (i = 0; i < limit; i += 6)
    {
        sum = sum + (udata[i] * vdata[i] +
                    (udata[i+1] * vdata[i+1] +
                    (udata[i+2] * vdata[i+2] +
                    (udata[i+3] * vdata[i+3] +
                    (udata[i+4] * vdata[i+4] +
                    (udata[i+5] * vdata[i+5]))))));
    }

    /* Finish the remaining (at most five) elements one by one. */
    for (; i < length; i++)
    {
        sum = sum + udata[i] * vdata[i];
    }
    *dest = sum;
}

5.17

/*
 * Fill the first n bytes at s with the low-order byte of c, storing a
 * whole unsigned long at a time wherever possible.
 *
 * s - start of the region to fill
 * c - fill value; only (unsigned char)c is used
 * n - number of bytes to fill
 *
 * Returns s.
 *
 * The fill runs in three phases: single bytes until the write pointer
 * is aligned to sizeof(unsigned long), whole words while at least one
 * full word remains, then single bytes for the tail.
 */
void *new_memset(void *s, int c, size_t n)
{
    const size_t word_size = sizeof(unsigned long);
    const unsigned char fill = (unsigned char)c;

    /* Build a word in which every byte equals the fill byte. */
    unsigned long w;
    unsigned char *pw = (unsigned char *)&w;
    for (size_t k = 0; k < word_size; k++)
    {
        pw[k] = fill;
    }

    unsigned char *schar = s;
    size_t i = 0;

    /* Phase 1: byte stores until aligned (or the region is exhausted). */
    while (i < n && (size_t)schar % word_size != 0)
    {
        *schar++ = fill;
        i++;
    }

    /* Phase 2: word stores. Comparing the remaining count (n - i) avoids
       the unsigned underflow that "n - word_size" would cause for small n. */
    while (n - i >= word_size)
    {
        *(unsigned long *)schar = w;
        schar += word_size;
        i += word_size;
    }

    /* Phase 3: leftover bytes that do not fill a whole word. */
    while (i < n)
    {
        *schar++ = fill;
        i++;
    }

    return s;
}

5.18

偷懒,就写了个最简单的~

/*
 * Evaluate the polynomial a[0] + a[1]*x + ... + a[degree]*x^degree.
 *
 * Terms are split between two accumulators, two terms per iteration,
 * so consecutive additions do not all depend on the same running sum.
 *
 * a      - coefficients, a[0] .. a[degree]
 * x      - point at which to evaluate
 * degree - highest exponent
 */
double poly(double a[], double x, long degree)
{
    double acc_first = a[0];   /* constant term plus the first term of each pair */
    double acc_second = 0;     /* second term of each pair */
    double power = x;          /* x^k for the current index k */
    double x_squared = x * x;  /* factor that advances power by two exponents */
    long k = 1;

    /* Paired loop: handles terms k and k+1 together. */
    while (k < degree - 1)
    {
        acc_first = acc_first + a[k] * power;
        acc_second = acc_second + a[k + 1] * power * x;
        power *= x_squared;
        k += 2;
    }

    /* Remaining terms, one at a time. */
    while (k <= degree)
    {
        acc_first = acc_first + a[k] * power;
        power *= x;
        k++;
    }

    return acc_first + acc_second;
}

5.19

/*
 * Prefix sum: p[i] = a[0] + a[1] + ... + a[i] for 0 <= i < n.
 *
 * The main loop is unrolled to produce four outputs per iteration;
 * a second loop handles whatever is left over.
 *
 * a - input values (n elements)
 * p - output prefix sums (n elements)
 * n - element count
 */
void psum_4_1a(float a[], float p[], long n)
{
    float running = 0;  /* sum of every element processed so far */
    long k = 0;

    /* Four elements per pass; each partial sum builds on the previous one. */
    while (k < n - 3)
    {
        float s0 = running + a[k];
        float s1 = s0 + a[k + 1];
        float s2 = s1 + a[k + 2];
        float s3 = s2 + a[k + 3];

        p[k] = s0;
        p[k + 1] = s1;
        p[k + 2] = s2;
        p[k + 3] = s3;

        running = s3;
        k += 4;
    }

    /* Leftover elements (fewer than four). */
    while (k < n)
    {
        running += a[k];
        p[k] = running;
        k++;
    }
}

你可能感兴趣的:(深入理解计算机系统(第三版))