The latency of data access becomes greater with each cache level. Latency of memory access is best measured in CPU clock cycles. One cycle occupies from 4 to 6 nanoseconds, depending on the CPU clock speed. The latencies to the different levels of the memory hierarchy are as follows:
CPU Register: 0 cycles.
L1 cache hit: 2 or 3 cycles.
L1 cache miss satisfied by L2 cache hit: 8 to 10 cycles.
L2 cache miss satisfied from main memory, no TLB miss: 75 to 250 cycles; that is, 300 to 1100 nanoseconds, depending on the node where the memory resides (see Table 1-3).
TLB miss requiring only reload of the TLB to refer to a virtual page already in memory: approximately 2000 cycles.
TLB miss requiring virtual page to load from backing store: hundreds of millions of cycles; that is, tens to hundreds of milliseconds.
A miss at each level of the memory hierarchy multiplies the latency by an order of magnitude or more. Clearly a program can sustain high performance only by achieving a very high ratio of cache hits at every level. Fortunately, hit ratios of 95% and higher are commonly achieved.
test_std_faster
只用1秒多
test_std_mul
用7秒多
#include "../../profile.h"

// NOTE(review): the three system header names on the following lines were lost
// when this listing was extracted (each was a bare `#include`). <cstdio> is the
// only one the visible code demonstrably needs; restore the others if known.
#include <cstdio>
void test(int *arr, const int N, const int K)
{
for(int i=0; i
template
void test_std_faster(T** a, T** b, T** c, int n)
{
int i,j,k;
LIGHT_PROFILE_FUNCTION_SCOPE();
for (k = 0; k < n; k++)
{
for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
c[i][j] = c[i][j] + a[i][k]*b[k][j];
}
}
}
}
template
void test_std_mul(T** a, T** b, T** c, int n)
{
int i,j,k;
LIGHT_PROFILE_FUNCTION_SCOPE();
for (k = 0; k < n; k++)
{
for (i = 0; i < n; i++)
{
for (j = 0; j < n; j++)
{
c[k][i] = c[k][i] + a[k][j]*b[j][i];
}
}
}
}
int main()
{
using namespace light;
typedef float real;
typedef int large;
large m=1000, n=1000;
real** A=new_array2d(m,n);
real** B=new_array2d(m,n);
real** C=new_array2d(m,n);
test_std_mul(A,B,C,n);
test_std_faster(A,B,C,n);
delete_array2d(A);
delete_array2d(B);
delete_array2d(C);
const int dimX=64, dimY=1024, dimZ=1024;
const int len=dimX*dimY*dimZ;
int *arr=new int[dimX*dimY*dimZ];
{
int K=1;
PROFILE_SCOPE("test1");
test(arr, len, K);
}
for(int k=1; k<128; k+=2){
char str[20];
sprintf(str, "K=%d", k);
{
PROFILE_SCOPE(str);
test(arr, len, k);
}
}
delete []arr;
return 0;
}
具体结果:
In [test_std_mul]:7.69271374 seconds
In [test_std_faster]:1.06887197 seconds
In [test1]:0.126983577 seconds
In [K=1]:0.071336323 seconds
In [K=3]:0.070501934 seconds
In [K=5]:0.070173198 seconds
In [K=7]:0.069914513 seconds
In [K=9]:0.070830328 seconds
In [K=11]:0.069679724 seconds
In [K=13]:0.070218294 seconds
In [K=15]:0.070343338 seconds
In [K=17]:0.06793199 seconds
In [K=19]:0.065443525 seconds
In [K=21]:0.063027865 seconds
In [K=23]:0.059502513 seconds
In [K=25]:0.055603195 seconds
In [K=27]:0.05297714 seconds
In [K=29]:0.050074194 seconds
In [K=31]:0.048622758 seconds
In [K=33]:0.042584798 seconds
In [K=35]:0.0476709 seconds
In [K=37]:0.034328566 seconds
In [K=39]:0.046996806 seconds
In [K=41]:0.039327118 seconds
In [K=43]:0.028445512 seconds
In [K=45]:0.03679236 seconds
In [K=47]:0.039664869 seconds
In [K=49]:0.040579371 seconds
In [K=51]:0.03415459 seconds
In [K=53]:0.034461676 seconds
In [K=55]:0.022190094 seconds
In [K=57]:0.031438485 seconds
In [K=59]:0.031826945 seconds
In [K=61]:0.031029584 seconds
In [K=63]:0.030291529 seconds
In [K=65]:0.029730277 seconds
In [K=67]:0.029219664 seconds
In [K=69]:0.027173476 seconds
In [K=71]:0.026998234 seconds
In [K=73]:0.019982384 seconds
In [K=75]:0.027318676 seconds
In [K=77]:0.026787412 seconds
In [K=79]:0.026784303 seconds
In [K=81]:0.024683844 seconds
In [K=83]:0.023558035 seconds
In [K=85]:0.015945637 seconds
In [K=87]:0.023418765 seconds
In [K=89]:0.018921708 seconds
In [K=91]:0.016292988 seconds
In [K=93]:0.022331199 seconds
In [K=95]:0.012433372 seconds
In [K=97]:0.021208601 seconds
In [K=99]:0.01638945 seconds
In [K=101]:0.01571856 seconds
In [K=103]:0.019857388 seconds
In [K=105]:0.010748321 seconds
In [K=107]:0.01605354 seconds
In [K=109]:0.013885589 seconds
In [K=111]:0.015515276 seconds
In [K=113]:0.014337689 seconds
In [K=115]:0.012188802 seconds
In [K=117]:0.015207584 seconds
In [K=119]:0.009342447 seconds
In [K=121]:0.018276332 seconds
In [K=123]:0.008979882 seconds
In [K=125]:0.015199975 seconds
In [K=127]:0.010937606 seconds
Reduce demands on memory bandwidth by pre-loading into local variables
// Baseline 3-tap filter loop (illustrative fragment; the loop condition is
// elided). Each iteration reads filter[0..2] and signal[0..2] through their
// pointers, stores one result through res, and then advances signal by one
// element.
// NOTE(review): the coefficients are presumably re-read from memory on every
// pass because the store through res could alias filter — confirm for the
// compiler in use.
while( … ) {
*res++ = filter[0]*signal[0]
+ filter[1]*signal[1]
+ filter[2]*signal[2];
signal++;
}
// Same 3-tap filter loop with the coefficients copied into locals once, before
// the loop (illustrative fragment; the loop condition is elided).
float f0 = filter[0];
float f1 = filter[1];
float f2 = filter[2];
// Inside the loop only signal[] is read through a pointer; f0..f2 are plain
// local variables that the store through res cannot modify.
while( … ) {
*res++ = f0*signal[0]
+ f1*signal[1]
+ f2*signal[2];
signal++;
}
Expose instruction-level parallelism
// 3-tap filter loop unrolled by three, with both the coefficients and a
// three-sample window of the signal held in locals (illustrative fragment; the
// loop condition is elided).
float f0 = filter[0], f1 = filter[1], f2 = filter[2];
float s0 = signal[0], s1 = signal[1], s2 = signal[2];
// First output is produced before the loop from the three preloaded samples.
*res++ = f0*s0 + f1*s1 + f2*s2;
// Each pass loads three new samples and writes three outputs. s0, s1 and s2
// take turns holding the newest sample, so every signal element is loaded
// exactly once and the window is never shifted by copying between locals.
do {
signal += 3;
// s0 now holds the newest sample; the window order is (s1, s2, s0).
s0 = signal[0];
res[0] = f0*s1 + f1*s2 + f2*s0;
// s1 now holds the newest sample; the window order is (s2, s0, s1).
s1 = signal[1];
res[1] = f0*s2 + f1*s0 + f2*s1;
// s2 now holds the newest sample; the window is back to (s0, s1, s2).
s2 = signal[2];
res[2] = f0*s0 + f1*s1 + f2*s2;
res += 3;
} while( … );