完整讲解视频:野生Gprof会梦见存储器山嘛?_哔哩哔哩_bilibili
别忘了一键三连哦
提供的代码:
mm.c
/* matrix multiply permutations */
#include <stdio.h>
#include <stdlib.h>
#include "mm.h"
#include "fcycmm.h"
#include "clock.h"
/*
 * Flag handed to fcyc() as its third argument by run(): whether or not
 * fcyc should clear the cache (fcyc itself is declared in fcycmm.h).
 */
#define CLEARCACHE 1
/*
 * Global matrices. main() fills ga and gb with 1.0 through init(), and
 * run() validates gc with checkresult() after each measurement.
 * NOTE(review): presumably ga/gb are the operands and gc the product used
 * by the timing harness -- the harness is not visible here; confirm.
 */
array ga, gb, gc;
/*
 * checkresult - verify that every entry of the n x n result matrix equals n.
 *
 * c: matrix to inspect (only rows/columns 0..n-1 are read)
 * n: matrix dimension, and also the value every entry is expected to hold
 *
 * main() initializes both input matrices to 1.0, so an entry of their
 * product is a sum of n ones, which a double holds exactly; that is why an
 * exact comparison is used. (This presumes the result matrix was zeroed
 * before the multiply ran -- reset() exists for that, but its call site is
 * not in this file.)
 *
 * On the first mismatch the entry is reported on stdout and the process
 * terminates with a failure status, so a wrong result is visible to
 * whatever invoked the benchmark.
 */
void checkresult(array c, int n)
{
    int i, j;

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            if (c[i][j] != (double)n) {
                printf("Error: bad number (%f) in result matrix (%d,%d)\n",
                       c[i][j], i, j);
                fflush(stdout);
                /* Previously exit(0), which signalled success on an error. */
                exit(EXIT_FAILURE);
            }
        }
    }
}
/*
 * run - measure one matrix-multiply variant and return clocks per
 *       inner-loop iteration.
 *
 * f: multiply routine to measure
 * n: matrix dimension; each variant in this file executes its innermost
 *    statement n*n*n times
 *
 * Returns the value produced by fcyc(f, n, CLEARCACHE) divided by n^3.
 * After the measurement the global result matrix gc is validated with
 * checkresult(), which terminates the program on a bad entry.
 */
double run(test_funct f, int n)
{
    /*
     * Build n^3 in double precision: as an int product, n*n*n overflows
     * once n exceeds 1290 with 32-bit int. For smaller n the value (and
     * therefore the quotient) is exactly what the int product gave.
     */
    double iterations = (double)n * n * n;
    double cpi = fcyc(f, n, CLEARCACHE) / iterations;

    checkresult(gc, n);
    return cpi;
}
/*
 * reset - store 0.0 in every entry c[r][q] with 0 <= r, q < n.
 *
 * c: matrix to clear
 * n: number of rows and columns to clear
 */
void reset(array c, int n)
{
    for (int row = 0; row < n; row++)
        for (int col = 0; col < n; col++)
            c[row][col] = 0.0;
}
/*
 * init - set the leading n x n entries of both input matrices to 1.0.
 *
 * a, b: matrices to fill
 * n:    number of rows and columns to fill
 */
void init(array a, array b, int n)
{
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            a[row][col] = 1.0;
            b[row][col] = 1.0;
        }
    }
}
/*
 * printarray - debugging aid: write the leading n x n entries of a to
 *              stdout, one matrix row per output line, each entry
 *              formatted with "%5.1f " (so every line ends in a space).
 */
void printarray(array a, int n)
{
    int row = 0;

    while (row < n) {
        int col = 0;

        while (col < n) {
            printf("%5.1f ", a[row][col]);
            col++;
        }
        printf("\n");
        row++;
    }
}
/***********************************************
* Six different versions of matrix multiply
***********************************************/
/*
 * ijk - matrix multiply with loop nest order i (outer), j, k (inner).
 *
 * For each (i, j) the products A[i][k]*B[k][j] are accumulated over k in a
 * local scalar, which is then ADDED to C[i][j]; C must therefore already
 * hold the desired starting values (zero for a plain product).
 */
void ijk(array A, array B, array C, int n)
{
    /* $begin mm-ijk */
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            double dot = 0.0;

            for (int k = 0; k < n; k++) {
                dot += A[i][k]*B[k][j];
            }
            C[i][j] += dot;
        }
    }
    /* $end mm-ijk */
}
/*
 * jik - matrix multiply with loop nest order j (outer), i, k (inner).
 *
 * Same inner loop as ijk -- a scalar accumulates A[i][k]*B[k][j] over k and
 * is then added to C[i][j] -- but the result entries are visited column by
 * column (j outermost) instead of row by row.
 */
void jik(array A, array B, array C, int n)
{
    /* $begin mm-jik */
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < n; i++) {
            double dot = 0.0;

            for (int k = 0; k < n; k++) {
                dot += A[i][k]*B[k][j];
            }
            C[i][j] += dot;
        }
    }
    /* $end mm-jik */
}
/*
 * ikj - matrix multiply with loop nest order i (outer), k, j (inner).
 *
 * A[i][k] is read once into a scalar; the inner loop then adds that scalar
 * times B[k][j] to C[i][j] for every j. Contributions are added to C, so C
 * must already hold the desired starting values.
 */
void ikj(array A, array B, array C, int n)
{
    /* $begin mm-ikj */
    for (int i = 0; i < n; i++) {
        for (int k = 0; k < n; k++) {
            const double a_ik = A[i][k];

            for (int j = 0; j < n; j++) {
                C[i][j] += a_ik*B[k][j];
            }
        }
    }
    /* $end mm-ikj */
}
/*
 * kij - matrix multiply with loop nest order k (outer), i, j (inner).
 *
 * Same inner loop as ikj -- the scalar A[i][k] times B[k][j] is added to
 * C[i][j] for every j -- with the two outer loops interchanged.
 */
void kij(array A, array B, array C, int n)
{
    /* $begin mm-kij */
    for (int k = 0; k < n; k++) {
        for (int i = 0; i < n; i++) {
            const double a_ik = A[i][k];

            for (int j = 0; j < n; j++) {
                C[i][j] += a_ik*B[k][j];
            }
        }
    }
    /* $end mm-kij */
}
/*
 * kji - matrix multiply with loop nest order k (outer), j, i (inner).
 *
 * B[k][j] is read once into a scalar; the inner loop then adds A[i][k]
 * times that scalar to C[i][j] for every i. Contributions are added to C,
 * so C must already hold the desired starting values.
 */
void kji(array A, array B, array C, int n)
{
    /* $begin mm-kji */
    for (int k = 0; k < n; k++) {
        for (int j = 0; j < n; j++) {
            const double b_kj = B[k][j];

            for (int i = 0; i < n; i++) {
                C[i][j] += A[i][k]*b_kj;
            }
        }
    }
    /* $end mm-kji */
}
/*
 * jki - matrix multiply with loop nest order j (outer), k, i (inner).
 *
 * Same inner loop as kji -- A[i][k] times the scalar B[k][j] is added to
 * C[i][j] for every i -- with the two outer loops interchanged.
 */
void jki(array A, array B, array C, int n)
{
    /* $begin mm-jki */
    for (int j = 0; j < n; j++) {
        for (int k = 0; k < n; k++) {
            const double b_kj = B[k][j];

            for (int i = 0; i < n; i++) {
                C[i][j] += A[i][k]*b_kj;
            }
        }
    }
    /* $end mm-jki */
}
/*
 * main - benchmark driver for the six matrix-multiply variants.
 *
 * Fills the global inputs ga and gb with 1.0 over the full MAXN x MAXN
 * extent, prints a header, and then emits one table row per matrix size n
 * from MINN to MAXN in steps of INCN. Each column is the value returned by
 * run() for one variant, i.e. clock cycles per inner-loop iteration.
 * Output is flushed after the header and after each row.
 */
int main()
{
    init(ga, gb, MAXN);

    printf("matmult cycles/loop iteration\n");
    printf("%3s%6s%6s%6s%6s%6s%6s\n", "n",
           "jki", "kji", "ijk", "jik", "kij", "ikj");
    fflush(stdout);

    int n = MINN;

    while (n <= MAXN) {
        /* Column order must match the header printed above. */
        printf("%3d ", n);
        printf("%5.2f ", run(jki, n));
        printf("%5.2f ", run(kji, n));
        printf("%5.2f ", run(ijk, n));
        printf("%5.2f ", run(jik, n));
        printf("%5.2f ", run(kij, n));
        printf("%5.2f ", run(ikj, n));
        printf("\n");
        fflush(stdout);

        n += INCN;
    }

    return 0;
}
mountain.c
/* mountain.c - Generate the memory mountain. */
/* $begin mountainmain */
#include <stdlib.h>
#include <stdio.h>
#include "fcyc2.h" /* measurement routines */
#include "clock.h" /* routines to access the cycle counter */
#define MINBYTES (1 << 14) /* Smallest working set size measured (16 KB) */
#define MAXBYTES (1 << 27) /* Largest working set size measured (128 MB) */
#define MAXSTRIDE 15 /* Largest stride measured, in array elements */
/*
 * Number of long elements that fit in MAXBYTES. The replacement list is
 * parenthesized so the division stays intact when the macro appears inside
 * a larger expression (e.g. x % MAXELEMS).
 */
#define MAXELEMS (MAXBYTES / sizeof(long))
/* $begin mountainfuns */
long data[MAXELEMS]; /* The global array we'll be traversing */
/* $end mountainfuns */
/* $end mountainmain */
/* Forward declarations of the helpers defined after main(). */
void init_data(long *data, int n); /* set data[i] = i for 0 <= i < n */
int test(int elems, int stride); /* strided sum over the first elems entries of the global data[] */
double run(int size, int stride, double Mhz); /* read throughput in MB/s for a size-byte working set */
/* $begin mountainmain */
/*
 * main - print the memory mountain as a tab-separated table.
 *
 * After initializing the global array and estimating the clock rate, the
 * program prints a header row of stride labels (s1..sMAXSTRIDE) and then
 * one row per working-set size, halving from MAXBYTES down to MINBYTES.
 * Each cell is the throughput returned by run() for that size and stride.
 */
int main()
{
    init_data(data, MAXELEMS); /* Initialize each element in data */
    double clock_mhz = mhz(0); /* Estimate the clock frequency */
    /* $end mountainmain */
    /* Not shown in the text */
    printf("Clock frequency is approx. %.1f MHz\n", clock_mhz);
    printf("Memory mountain (MB/sec)\n");
    printf("\t");
    int label = 1;
    while (label <= MAXSTRIDE) {
        printf("s%d\t", label);
        label++;
    }
    printf("\n");
    /* $begin mountainmain */
    for (int size = MAXBYTES; size >= MINBYTES; size /= 2) {
        /* $end mountainmain */
        /* Not shown in the text */
        /* Row label: sizes up to and including 1 MB are shown in KB. */
        if (size <= (1 << 20)) {
            printf("%dk\t", size / 1024);
        } else {
            printf("%dm\t", size / (1 << 20));
        }
        /* $begin mountainmain */
        for (int stride = 1; stride <= MAXSTRIDE; stride++) {
            double mb_per_sec = run(size, stride, clock_mhz);

            printf("%.0f\t", mb_per_sec);
        }
        printf("\n");
    }
    return 0;
}
/* $end mountainmain */
/*
 * init_data - store each element's own index in it.
 *
 * data: array to fill (at least n elements)
 * n:    number of elements to write; nothing is written when n <= 0
 *
 * On return data[i] == i for every 0 <= i < n.
 */
void init_data(long *data, int n)
{
    long *slot = data;

    for (int idx = 0; idx < n; idx++, slot++) {
        *slot = idx;
    }
}
/* $begin mountainfuns */
/* test - Sum elements of the global array "data" at indices 0, stride,
 *        2*stride, ... below "elems", using 4x4 loop unrolling: four
 *        elements are consumed per iteration of the main loop, each feeding
 *        its own accumulator.
 *
 * elems:  number of leading elements of data[] to cover
 * stride: distance, in array elements, between consecutive reads
 *
 * Returns the combined sum of the four accumulators, converted to int.
 */
int test(int elems, int stride)
{
    const long count = elems;
    const long step2 = stride*2;
    const long step3 = stride*3;
    const long step4 = stride*4;
    /* The unrolled loop runs while a full group of four still fits. */
    const long unrolled_end = count - step4;
    long sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
    long pos = 0;

    /* Combine 4 elements at a time */
    while (pos < unrolled_end) {
        sum0 += data[pos];
        sum1 += data[pos+stride];
        sum2 += data[pos+step2];
        sum3 += data[pos+step3];
        pos += step4;
    }

    /* Finish any remaining elements */
    while (pos < count) {
        sum0 += data[pos];
        pos += stride;
    }

    return ((sum0 + sum1) + (sum2 + sum3));
}
/* run - Run test(elems, stride) and return read throughput (MB/s).
 *
 * size:   working set size in bytes
 * stride: distance between consecutive reads, in array elements
 * Mhz:    CPU clock frequency in MHz
 *
 * The element count is derived from sizeof(long) because the array that
 * test() traverses is declared as long; using sizeof(double) only gave the
 * right count on platforms where the two types have the same size.
 *
 * Throughput: about size/stride bytes are read per call, and cycles/Mhz is
 * the elapsed time in microseconds, so the quotient is bytes per
 * microsecond, i.e. MB/s.
 */
double run(int size, int stride, double Mhz)
{
    double cycles;
    int elems = size / sizeof(long); /* data[] holds longs */

    test(elems, stride); /* Warm up the cache */ //line:mem:warmup
    /* NOTE(review): the final 0 is presumably fcyc2's clear-cache flag -- confirm in fcyc2.h */
    cycles = fcyc2(test, elems, stride, 0); /* Call test(elems,stride) */ //line:mem:fcyc
    return (size / stride) / (cycles / Mhz); /* Convert cycles to MB/s */ //line:mem:bwcompute
}
/* $end mountainfuns */