一起来学《深入理解计算机系统》 第五章 家庭作业部分答案

  • 5.13
    A:略
    B:未从xmm1使用上一次的数据,所以xmm1肯定不是关键路径上的寄存器。xmm0位于关键路径上,浮点数加法的延迟为3,所以CPE下界为3。
    C:整数加法的延迟为1,所以CPE下界为1。
    D:首先,浮点数乘法可以与关键路径并行进行,且有多个浮点数乘法功能单元,多个加载器,所以只要配置够强大,浮点数乘法不能成为关键路径的阻碍。而关键路径浮点数加法的延迟为3。
  • 5.14
    A:因为只有两个加载器,而每个元素的循环就需要加载两个数值。
    B:即使进行了6x1循环展开,还是要依次进行6次浮点加法,算下来单个元素还是需要3个时钟周期。
  • 5.15
    只有两个加载器,而每个元素的循环就需要加载两个数值。
  • 5.16
  • 5.17
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#define K sizeof(unsigned long)

/*
 * word_memset - fill the first n bytes at s with the byte (unsigned char)c,
 * storing one unsigned long (K bytes) at a time wherever possible.
 *
 * The work is split into three phases:
 *   1. single-byte stores until the destination address is a multiple of K
 *      (or n bytes have been written),
 *   2. word stores while at least K bytes remain,
 *   3. single-byte stores for whatever is left (fewer than K bytes).
 *
 * Exactly n bytes are written; memory at and beyond s + n is never touched.
 *
 * Returns s, like the standard memset.
 *
 * NOTE(review): phase 2 stores through an unsigned long lvalue. That is what
 * the exercise asks for, but callers whose buffer has a different declared
 * type should be aware of the strict-aliasing implications.
 */
void *word_memset(void *s, int c, size_t n)
{
  unsigned char *bytes = s;
  const unsigned char fill = (unsigned char)c;
  size_t cnt = 0;

  /* Phase 1: advance byte by byte until the next store address is K-aligned.
     Aligning to sizeof is at least as strict as the type's real alignment. */
  while (cnt < n && (uintptr_t)(bytes + cnt) % K != 0)
  {
    bytes[cnt++] = fill;
  }

  /* Phase 2: whole words. */
  if (n - cnt >= K)
  {
    /* Replicate the fill byte into every byte of a word. Shift by one byte
       per step: shifting by the full width of the type is undefined. */
    unsigned long word = fill;
    for (size_t i = 1; i < K; ++i)
    {
      word = (word << CHAR_BIT) | fill;
    }

    unsigned long *slong = (unsigned long *)(void *)(bytes + cnt);
    /* Only store a word when all K of its bytes lie inside the request. */
    while (n - cnt >= K)
    {
      *slong++ = word;
      cnt += K;
    }
  }

  /* Phase 3: trailing bytes that do not fill a whole word. */
  while (cnt < n)
  {
    bytes[cnt++] = fill;
  }

  return s;
}
  • 5.18
/*
 * faster_poly - evaluate a[0] + a[1]*x + ... + a[degree]*x^degree.
 *
 * The main loop handles ten coefficients per iteration. Each of the ten
 * lanes owns its own partial sum and its own running power of x, so no
 * lane depends on another inside the loop. Every lane's power is advanced
 * by x^10 per iteration. Coefficients left over after the last full group
 * of ten are folded into lane 0 one at a time.
 *
 * a      : coefficients; indices 0..degree are read
 * x      : evaluation point
 * degree : index of the highest coefficient
 *
 * Returns the sum of all ten lanes.
 */
double faster_poly(double a[], double x, long degree)
{
    /* Lane 0 starts with the constant term; the other lanes start empty. */
    double sum0 = a[0];
    double sum1 = 0, sum2 = 0, sum3 = 0;
    double sum4 = 0, sum5 = 0, sum6 = 0;
    double sum7 = 0, sum8 = 0, sum9 = 0;

    /* pN holds x^N on entry to the main loop. */
    double p1 = x;
    double p2 = p1 * x;
    double p3 = p2 * x;
    double p4 = p3 * x;
    double p5 = p4 * x;
    double p6 = p5 * x;
    double p7 = p6 * x;
    double p8 = p7 * x;
    double p9 = p8 * x;
    double p10 = p9 * x;

    /* Per-iteration stride for every lane's power: x^10. */
    const double stride = p10;

    long idx = 1;
    while (idx + 9 <= degree)
    {
        sum0 += a[idx] * p1;
        sum1 += a[idx + 1] * p2;
        sum2 += a[idx + 2] * p3;
        sum3 += a[idx + 3] * p4;
        sum4 += a[idx + 4] * p5;
        sum5 += a[idx + 5] * p6;
        sum6 += a[idx + 6] * p7;
        sum7 += a[idx + 7] * p8;
        sum8 += a[idx + 8] * p9;
        sum9 += a[idx + 9] * p10;

        p1 *= stride;
        p2 *= stride;
        p3 *= stride;
        p4 *= stride;
        p5 *= stride;
        p6 *= stride;
        p7 *= stride;
        p8 *= stride;
        p9 *= stride;
        p10 *= stride;

        idx += 10;
    }

    /* Leftover terms: p1 equals x^idx here, so step it by x per term. */
    while (idx <= degree)
    {
        sum0 += a[idx] * p1;
        p1 *= x;
        ++idx;
    }

    /* Fold the lanes into lane 0 in ascending order. */
    sum0 += sum1;
    sum0 += sum2;
    sum0 += sum3;
    sum0 += sum4;
    sum0 += sum5;
    sum0 += sum6;
    sum0 += sum7;
    sum0 += sum8;
    sum0 += sum9;
    return sum0;
}

如果是整型,编译器能够自行将程序优化成这样的代码
5.19(我暂时还没搞明白)

/*
 * faster_psum1a - prefix sum: p[i] = a[0] + a[1] + ... + a[i] for 0 <= i < n.
 *
 * Three elements are handled per main-loop iteration. The partial sums of
 * the current triple (tmp1..tmp3) are built from a[] alone, so they do not
 * depend on the running total; the running total `val` is updated only once
 * per iteration. A cleanup loop handles the last n % 3 elements.
 *
 * Because the triple is summed before being added to `val`, the additions
 * are associated differently from a plain left-to-right prefix sum.
 *
 * a : input values; indices 0..n-1 are read
 * p : output; indices 0..n-1 are written
 * n : element count (nothing is read or written when n <= 0)
 */
void faster_psum1a(float a[], float p[], long n)
{
    long i;
    float val = 0;  /* sum of every element before index i */
    for (i = 0; (i+2) < n; i += 3)
    {
        /* Running sums inside the triple, independent of val. */
        float tmp1 = a[i];
        float tmp2 = tmp1 + a[i+1];
        float tmp3 = tmp2 + a[i+2];

        p[i] = val + tmp1;
        p[i+1] = val + tmp2;
        val = val + tmp3;
        p[i+2] = val;
    }
    /* Remaining 0..2 elements. */
    for (; i < n; ++i)
    {
        val += a[i];
        p[i] = val;
    }
}

代码都引用自https://www.cnblogs.com/liqiuhao/p/7989634.html

你可能感兴趣的:(#,C++与理论基础)