# SYNOPSIS
#
#   AX_CACHE_SIZE_TUNE
#
# DESCRIPTION
#
#   Find L1, L2, L3 caches size by running some timing experiments.
#   The results are available in the defines __M4RI_CPU_L1_CACHE,
#   __M4RI_CPU_L2_CACHE and __M4RI_CPU_L3_CACHE.
#
#   This macro depends on AC_PROG_SED, AC_PROG_CC.
#
# LAST MODIFICATION
#
#   2011-04-11
#
# COPYLEFT
#
#   Copyright (c) 2009,2010 Martin Albrecht <martinralbrecht@googlemail.com>
#
#   Copying and distribution of this file, with or without modification, are
#   permitted in any medium without royalty provided the copyright notice
#   and this notice are preserved.

AC_DEFUN([AX_CACHE_SIZE_TUNE],
[ AC_REQUIRE([AC_PROG_CC])
  AC_REQUIRE([AC_PROG_SED])

  AC_LANG_PUSH([C])
  AC_CACHE_CHECK(for cache sizes, ax_cv_cache_sizes,
  [AC_RUN_IFELSE([AC_LANG_PROGRAM([[
#include <time.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>

double walltime(double t0) {
  double mic, time;
  double mega = 0.000001;
  struct timeval tp;
  static long base_sec = 0;
  static long base_usec = 0;

  (void) gettimeofday(&tp,NULL);
  if (base_sec == 0) {
    base_sec = tp.tv_sec;
    base_usec = tp.tv_usec;
  }

  time = (double) (tp.tv_sec - base_sec);
  mic = (double) (tp.tv_usec - base_usec);
  time = (time + mic * mega) - t0;
  return(time);
}

double run_experiment(size_t size, size_t trials) {
  size_t i,j;
  unsigned long *a = (unsigned long*)malloc(size/4);
  unsigned long *b = (unsigned long*)malloc(size/4);
  unsigned long *c = (unsigned long*)malloc(size/4);
  unsigned long *d = (unsigned long*)malloc(size/4);

  size_t n = size/4/(sizeof(unsigned long));

  /* we setup a lookup table with a random-ish pattern */
  a[0] = 1337;
  b[0] = 5345345;
  for(j=1; j<n ; j++) {
    a[j] = a[j-1] * 1073741827;
    a[j-1] %= n;
    b[j] = b[j-1] * 1073741827;
    b[j-1] %= n;
  }
  a[n-1] %= n;
  b[n-1] %= n;

  double wt = walltime(0.0);
  clock_t ct = clock();
  for(i=0; i<trials; i++) {
    for(j=0; j<n; j+=8) {
      d[b[j+0]] = c[a[j+0]];
      d[b[j+1]] = c[a[j+1]];
      d[b[j+2]] = c[a[j+2]];
      d[b[j+3]] = c[a[j+3]];
      d[b[j+4]] = c[a[j+4]];
      d[b[j+5]] = c[a[j+5]];
      d[b[j+6]] = c[a[j+6]];
      d[b[j+7]] = c[a[j+7]];
    }
  }
  ct = clock() - ct;
  wt = walltime(wt);
  free(a);
  free(b);
  free(c);
  free(d);
  return (double)wt;
}


#define NUMBER_OF_EXPERIMENTS 8

size_t cache_size(const size_t *candidates, const size_t n, size_t trials) {
  double times[NUMBER_OF_EXPERIMENTS][n];
  double dtimes[NUMBER_OF_EXPERIMENTS][n];
  size_t i,j;
  double wt, result;


  for(j=0; j<NUMBER_OF_EXPERIMENTS; j++) {
    size_t mult  = 1;
    size_t _trials = trials;
    run_experiment(candidates[0]*1024,_trials);
    wt = walltime(0.0);
    times[j][0] = run_experiment(candidates[0]*1024,_trials);
    wt = walltime(wt);
    dtimes[j][0] = 1.0;
    printf("s: %5zu, rx: %6.2f, x: %6.2f, wt: %6.2f, dx:    NaN\n",candidates[0],times[j][0],times[j][0],wt);
    fflush(NULL);

    for(i=1;i<n;i++) {
      run_experiment(candidates[i]*1024,_trials);
      wt = walltime(0.0);
      result = run_experiment(candidates[i]*1024,_trials);
      wt = walltime(wt);
      times[j][i] = mult*result;
      dtimes[j][i] = candidates[i-1]*times[j][i]/times[j][i-1]/candidates[i];

      printf("s: %5zu, rx: %6.2f, x: %6.2f, wt: %6.2f, dx: %6.2f\n",candidates[i],result,times[j][i],wt,dtimes[j][i]);
      fflush(NULL);

      while(wt > 0.25) {
        _trials = _trials/2;
        mult = 2*mult;
        wt /= 2.0;
        result /= 2.0;
      }
    }
    printf("\n");
  }
  for(i=0;i<n;i++) {
    double tmp = 0.0;
    for(j=0; j<NUMBER_OF_EXPERIMENTS; j++) {
      tmp += dtimes[j][i];
    }
    dtimes[0][i] = tmp/NUMBER_OF_EXPERIMENTS;
  }

  size_t max = 1;
  for(i=1;i<n;i++){
    if (dtimes[0][i] > dtimes[0][max] ) {
      max = i;
    }
  }
  return candidates[max-1];
}
      ]],
      [[
  const size_t c1[] = {   4,   8,  16,  32,  64, 128};
  const size_t c2[] = { 128, 256, 512};
  const size_t c3[] = {1024,1536,2048,3072,4096,6144,8192,16384,32768};

  FILE *f;
  printf("\n");
  size_t _l1 = cache_size(c1,  6, 1ULL<<15);
  size_t _l2 = cache_size(c2,  3, 1ULL<<12);
  size_t _l3 = cache_size(c3,  9, 1ULL<< 9);

  f = fopen("conftest_cache_sizes", "w"); if (!f) return 1;
  fprintf(f,"%lu:%lu:%lu\n",(unsigned long)(_l1*1024),(unsigned long)(_l2*1024),(unsigned long)(_l3*1024));
  fclose(f);
  return 0;
   ]])],
    [ax_cv_cache_sizes=`cat conftest_cache_sizes`; rm -f conftest_cache_sizes],
    [ax_cv_cache_sizes=unknown; rm -f conftest_cache_sizes],
    [ax_cv_cache_sizes=unknown])])
  AC_LANG_POP([C])
  AC_MSG_CHECKING(the L1 cache size)
  ax_l1_size=`echo $ax_cv_cache_sizes | cut -d ':' -f 1`
  AC_MSG_RESULT( $ax_l1_size Bytes)

  AC_MSG_CHECKING(the L2 cache size)
  ax_l2_size=`echo $ax_cv_cache_sizes | cut -d ':' -f 2`
  AC_MSG_RESULT( $ax_l2_size Bytes)

  AC_MSG_CHECKING(the L3 cache size)
  ax_l3_size=`echo $ax_cv_cache_sizes | cut -d ':' -f 3`
  AC_MSG_RESULT( $ax_l3_size Bytes)

  M4RI_CPU_L1_CACHE=${ax_l1_size}
  M4RI_CPU_L2_CACHE=${ax_l2_size}
  M4RI_CPU_L3_CACHE=${ax_l3_size}
  AC_SUBST(M4RI_CPU_L1_CACHE)
  AC_SUBST(M4RI_CPU_L2_CACHE)
  AC_SUBST(M4RI_CPU_L3_CACHE)
])