197 lines
5.1 KiB
Plaintext
197 lines
5.1 KiB
Plaintext
# SYNOPSIS
|
|
#
|
|
# AX_CACHE_SIZE_TUNE
|
|
#
|
|
# DESCRIPTION
|
|
#
|
|
# Find L1, L2, L3 caches size by running some timing experiments.
|
|
# The results are available in the defines __M4RI_CPU_L1_CACHE,
|
|
# __M4RI_CPU_L2_CACHE and __M4RI_CPU_L3_CACHE.
|
|
#
|
|
# This macro depends on AC_PROG_SED, AC_PROG_CC.
|
|
#
|
|
# LAST MODIFICATION
|
|
#
|
|
# 2011-04-11
|
|
#
|
|
# COPYLEFT
|
|
#
|
|
# Copyright (c) 2009,2010 Martin Albrecht <martinralbrecht@googlemail.com>
|
|
#
|
|
# Copying and distribution of this file, with or without modification, are
|
|
# permitted in any medium without royalty provided the copyright notice
|
|
# and this notice are preserved.
|
|
|
|
AC_DEFUN([AX_CACHE_SIZE_TUNE],
|
|
[ AC_REQUIRE([AC_PROG_CC])
|
|
AC_REQUIRE([AC_PROG_SED])
|
|
|
|
AC_LANG_PUSH([C])
|
|
AC_CACHE_CHECK(for cache sizes, ax_cv_cache_sizes,
|
|
[AC_RUN_IFELSE([AC_LANG_PROGRAM([[
|
|
#include <time.h>
|
|
#include <sys/time.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
double walltime(double t0) {
|
|
double mic, time;
|
|
double mega = 0.000001;
|
|
struct timeval tp;
|
|
static long base_sec = 0;
|
|
static long base_usec = 0;
|
|
|
|
(void) gettimeofday(&tp,NULL);
|
|
if (base_sec == 0) {
|
|
base_sec = tp.tv_sec;
|
|
base_usec = tp.tv_usec;
|
|
}
|
|
|
|
time = (double) (tp.tv_sec - base_sec);
|
|
mic = (double) (tp.tv_usec - base_usec);
|
|
time = (time + mic * mega) - t0;
|
|
return(time);
|
|
}
|
|
|
|
double run_experiment(size_t size, size_t trials) {
|
|
size_t i,j;
|
|
unsigned long *a = (unsigned long*)malloc(size/4);
|
|
unsigned long *b = (unsigned long*)malloc(size/4);
|
|
unsigned long *c = (unsigned long*)malloc(size/4);
|
|
unsigned long *d = (unsigned long*)malloc(size/4);
|
|
|
|
size_t n = size/4/(sizeof(unsigned long));
|
|
|
|
/* we setup a lookup table with a random-ish pattern */
|
|
a[0] = 1337;
|
|
b[0] = 5345345;
|
|
for(j=1; j<n ; j++) {
|
|
a[j] = a[j-1] * 1073741827;
|
|
a[j-1] %= n;
|
|
b[j] = b[j-1] * 1073741827;
|
|
b[j-1] %= n;
|
|
}
|
|
a[n-1] %= n;
|
|
b[n-1] %= n;
|
|
|
|
double wt = walltime(0.0);
|
|
clock_t ct = clock();
|
|
for(i=0; i<trials; i++) {
|
|
for(j=0; j<n; j+=8) {
|
|
d[b[j+0]] = c[a[j+0]];
|
|
d[b[j+1]] = c[a[j+1]];
|
|
d[b[j+2]] = c[a[j+2]];
|
|
d[b[j+3]] = c[a[j+3]];
|
|
d[b[j+4]] = c[a[j+4]];
|
|
d[b[j+5]] = c[a[j+5]];
|
|
d[b[j+6]] = c[a[j+6]];
|
|
d[b[j+7]] = c[a[j+7]];
|
|
}
|
|
}
|
|
ct = clock() - ct;
|
|
wt = walltime(wt);
|
|
free(a);
|
|
free(b);
|
|
free(c);
|
|
free(d);
|
|
return (double)wt;
|
|
}
|
|
|
|
|
|
#define NUMBER_OF_EXPERIMENTS 8
|
|
|
|
size_t cache_size(const size_t *candidates, const size_t n, size_t trials) {
|
|
double times[NUMBER_OF_EXPERIMENTS][n];
|
|
double dtimes[NUMBER_OF_EXPERIMENTS][n];
|
|
size_t i,j;
|
|
double wt, result;
|
|
|
|
|
|
for(j=0; j<NUMBER_OF_EXPERIMENTS; j++) {
|
|
size_t mult = 1;
|
|
size_t _trials = trials;
|
|
run_experiment(candidates[0]*1024,_trials);
|
|
wt = walltime(0.0);
|
|
times[j][0] = run_experiment(candidates[0]*1024,_trials);
|
|
wt = walltime(wt);
|
|
dtimes[j][0] = 1.0;
|
|
printf("s: %5zu, rx: %6.2f, x: %6.2f, wt: %6.2f, dx: NaN\n",candidates[0],times[j][0],times[j][0],wt);
|
|
fflush(NULL);
|
|
|
|
for(i=1;i<n;i++) {
|
|
run_experiment(candidates[i]*1024,_trials);
|
|
wt = walltime(0.0);
|
|
result = run_experiment(candidates[i]*1024,_trials);
|
|
wt = walltime(wt);
|
|
times[j][i] = mult*result;
|
|
dtimes[j][i] = candidates[i-1]*times[j][i]/times[j][i-1]/candidates[i];
|
|
|
|
printf("s: %5zu, rx: %6.2f, x: %6.2f, wt: %6.2f, dx: %6.2f\n",candidates[i],result,times[j][i],wt,dtimes[j][i]);
|
|
fflush(NULL);
|
|
|
|
while(wt > 0.25) {
|
|
_trials = _trials/2;
|
|
mult = 2*mult;
|
|
wt /= 2.0;
|
|
result /= 2.0;
|
|
}
|
|
}
|
|
printf("\n");
|
|
}
|
|
for(i=0;i<n;i++) {
|
|
double tmp = 0.0;
|
|
for(j=0; j<NUMBER_OF_EXPERIMENTS; j++) {
|
|
tmp += dtimes[j][i];
|
|
}
|
|
dtimes[0][i] = tmp/NUMBER_OF_EXPERIMENTS;
|
|
}
|
|
|
|
size_t max = 1;
|
|
for(i=1;i<n;i++){
|
|
if (dtimes[0][i] > dtimes[0][max] ) {
|
|
max = i;
|
|
}
|
|
}
|
|
return candidates[max-1];
|
|
}
|
|
]],
|
|
[[
|
|
const size_t c1[] = { 4, 8, 16, 32, 64, 128};
|
|
const size_t c2[] = { 128, 256, 512};
|
|
const size_t c3[] = {1024,1536,2048,3072,4096,6144,8192,16384,32768};
|
|
|
|
FILE *f;
|
|
printf("\n");
|
|
size_t _l1 = cache_size(c1, 6, 1ULL<<15);
|
|
size_t _l2 = cache_size(c2, 3, 1ULL<<12);
|
|
size_t _l3 = cache_size(c3, 9, 1ULL<< 9);
|
|
|
|
f = fopen("conftest_cache_sizes", "w"); if (!f) return 1;
|
|
fprintf(f,"%lu:%lu:%lu\n",(unsigned long)(_l1*1024),(unsigned long)(_l2*1024),(unsigned long)(_l3*1024));
|
|
fclose(f);
|
|
return 0;
|
|
]])],
|
|
[ax_cv_cache_sizes=`cat conftest_cache_sizes`; rm -f conftest_cache_sizes],
|
|
[ax_cv_cache_sizes=unknown; rm -f conftest_cache_sizes],
|
|
[ax_cv_cache_sizes=unknown])])
|
|
AC_LANG_POP([C])
|
|
AC_MSG_CHECKING(the L1 cache size)
|
|
ax_l1_size=`echo $ax_cv_cache_sizes | cut -d ':' -f 1`
|
|
AC_MSG_RESULT( $ax_l1_size Bytes)
|
|
|
|
AC_MSG_CHECKING(the L2 cache size)
|
|
ax_l2_size=`echo $ax_cv_cache_sizes | cut -d ':' -f 2`
|
|
AC_MSG_RESULT( $ax_l2_size Bytes)
|
|
|
|
AC_MSG_CHECKING(the L3 cache size)
|
|
ax_l3_size=`echo $ax_cv_cache_sizes | cut -d ':' -f 3`
|
|
AC_MSG_RESULT( $ax_l3_size Bytes)
|
|
|
|
M4RI_CPU_L1_CACHE=${ax_l1_size}
|
|
M4RI_CPU_L2_CACHE=${ax_l2_size}
|
|
M4RI_CPU_L3_CACHE=${ax_l3_size}
|
|
AC_SUBST(M4RI_CPU_L1_CACHE)
|
|
AC_SUBST(M4RI_CPU_L2_CACHE)
|
|
AC_SUBST(M4RI_CPU_L3_CACHE)
|
|
])
|