Outline Cache mountain Matrix multiplication Suggested Reading: 6.6, 6.7
2 Outline • Cache mountain • Matrix multiplication • Suggested Reading: 6.6, 6.7
6.6 Putting it Together: The Impact of Caches on Program Performance 6.6.1 The Memory Mountain
3 6.6 Putting it Together: The Impact of Caches on Program Performance 6.6.1 The Memory Mountain
The Memory Mountain P512 Read throughput (read bandwidth) The rate that a program reads data from the memory system Memory mountain A two-dimensional function of read bandwidth versus temporal and spatial locality Characterizes the capabilities of the memory system for each computer
4 The Memory Mountain P512 • Read throughput (read bandwidth) – The rate that a program reads data from the memory system • Memory mountain – A two-dimensional function of read bandwidth versus temporal and spatial locality – Characterizes the capabilities of the memory system for each computer
Memory mountain main routine Figure 6.41 P513 /* mountain.c - Generate the memory mountain. */ #define MINBYTES (1 << 10) /* Working set size ranges from 1 KB */ #define MAXBYTES (1 << 23) /* ... up to 8 MB */ #define MAXSTRIDE 16 /* Strides range from 1 to 16 */ #define MAXELEMS MAXBYTES/sizeof(int) int data[MAXELEMS]; /* The array we'll be traversing */
5 Memory mountain main routine Figure 6.41 P513 /* mountain.c - Generate the memory mountain. */ #define MINBYTES (1 << 10) /* Working set size ranges from 1 KB */ #define MAXBYTES (1 << 23) /* ... up to 8 MB */ #define MAXSTRIDE 16 /* Strides range from 1 to 16 */ #define MAXELEMS MAXBYTES/sizeof(int) int data[MAXELEMS]; /* The array we'll be traversing */
Memory mountain main routine int main() { int size; /* Working set size (in bytes) */ int stride; /* Stride (in array elements) */ double Mhz; /* Clock frequency */ init_data(data, MAXELEMS); /* Initialize each element in data to 1 */ Mhz = mhz(0); /* Estimate the clock frequency */
6 Memory mountain main routine int main() { int size; /* Working set size (in bytes) */ int stride; /* Stride (in array elements) */ double Mhz; /* Clock frequency */ init_data(data, MAXELEMS); /* Initialize each element in data to 1 */ Mhz = mhz(0); /* Estimate the clock frequency */
Memory mountain main routine for (size = MAXBYTES; size >= MINBYTES; size >>= 1) { for (stride = 1; stride <= MAXSTRIDE; stride++) printf("%.1f\t", run(size, stride, Mhz)); printf("\n"); } exit(0); }
7 Memory mountain main routine for (size = MAXBYTES; size >= MINBYTES; size >>= 1) { for (stride = 1; stride <= MAXSTRIDE; stride++) printf("%.1f\t", run(size, stride, Mhz)); printf("\n"); } exit(0); }
Memory mountain test function Figure 6.40 P512 /* The test function */ void test (int elems, int stride) { int i, result = 0; volatile int sink; for (i = 0; i < elems; i += stride) result += data[i]; sink = result; /* So compiler doesn't optimize away the loop */ }
8 Memory mountain test function Figure 6.40 P512 /* The test function */ void test (int elems, int stride) { int i, result = 0; volatile int sink; for (i = 0; i < elems; i += stride) result += data[i]; sink = result; /* So compiler doesn't optimize away the loop */ }
Memory mountain test function /* Run test (elems, stride) and return read throughput (MB/s) */ double run (int size, int stride, double Mhz) { double cycles; int elems = size / sizeof(int); test (elems, stride); /* warm up the cache */ cycles = fcyc2(test, elems, stride, 0); /* call test (elems,stride) */ return (size / stride) / (cycles / Mhz); /* convert cycles to MB/s */ }
9 Memory mountain test function /* Run test (elems, stride) and return read throughput (MB/s) */ double run (int size, int stride, double Mhz) { double cycles; int elems = size / sizeof(int); test (elems, stride); /* warm up the cache */ cycles = fcyc2(test, elems, stride, 0); /* call test (elems,stride) */ return (size / stride) / (cycles / Mhz); /* convert cycles to MB/s */ }