[Commits] [svn:einsteintoolkit] incoming/MemSpeed/ (Rev. 88)

Fri Jun 21 20:07:31 CDT 2013

User: eschnett
Date: 2013/06/21 08:07 PM

Added:
 /MemSpeed/
  README, configuration.ccl, interface.ccl, param.ccl, schedule.ccl
 /MemSpeed/doc/
  documentation.tex
 /MemSpeed/par/
 /MemSpeed/src/
  make.code.defn, memspeed.cc
 /MemSpeed/test/

Log:
 New thorn MemSpeed

File Changes:

Directory: /MemSpeed/
=====================

File [added]: README
Delta lines: +9 -0
===================================================================

--- MemSpeed/README	                        (rev 0)
+++ MemSpeed/README	2013-06-22 01:07:31 UTC (rev 88)
@@ -0,0 +1,9 @@
+Cactus Code Thorn MemSpeed
+Author(s)    : Erik Schnetter <schnetter at gmail.com>
+Maintainer(s): Erik Schnetter <schnetter at gmail.com>
+Licence      : n/a
+--------------------------------------------------------------------------
+
+1. Purpose
+
+Determine the latencies and bandwidths of caches and main memory.

File [added]: configuration.ccl
Delta lines: +3 -0
===================================================================
--- MemSpeed/configuration.ccl	                        (rev 0)
+++ MemSpeed/configuration.ccl	2013-06-22 01:07:31 UTC (rev 88)
@@ -0,0 +1,3 @@
+# Configuration definitions for thorn MemSpeed
+
+REQUIRES Vectors

File [added]: interface.ccl
Delta lines: +17 -0
===================================================================
--- MemSpeed/interface.ccl	                        (rev 0)
+++ MemSpeed/interface.ccl	2013-06-22 01:07:31 UTC (rev 88)
@@ -0,0 +1,17 @@
+# Interface definition for thorn MemSpeed
+
+IMPLEMENTS: MemSpeed
+
+USES INCLUDE HEADER: vectors.h
+
+
+
+CCTK_INT FUNCTION GetCacheInfo1                            \
+    (CCTK_POINTER_TO_CONST ARRAY OUT names,                \
+     CCTK_INT              ARRAY OUT types,                \
+     CCTK_POINTER_TO_CONST ARRAY OUT sizes,                \
+     CCTK_INT              ARRAY OUT linesizes,            \
+     CCTK_INT              ARRAY OUT strides,              \
+     CCTK_INT              ARRAY OUT num_puss,             \
+     CCTK_INT                    IN  max_num_cache_levels)
+REQUIRES FUNCTION GetCacheInfo1

File [added]: param.ccl
Delta lines: +5 -0
===================================================================
--- MemSpeed/param.ccl	                        (rev 0)
+++ MemSpeed/param.ccl	2013-06-22 01:07:31 UTC (rev 88)
@@ -0,0 +1,5 @@
+# Parameter definitions for thorn MemSpeed
+
+BOOLEAN verbose "Verbose output" STEERABLE=always
+{
+} "no"

File [added]: schedule.ccl
Delta lines: +7 -0
===================================================================
--- MemSpeed/schedule.ccl	                        (rev 0)
+++ MemSpeed/schedule.ccl	2013-06-22 01:07:31 UTC (rev 88)
@@ -0,0 +1,7 @@
+# Schedule definitions for thorn MemSpeed
+
+SCHEDULE MemSpeed_MeasureSpeed AT wragh
+{
+  LANG: C
+  OPTIONS: meta
+} "Measure memory and cache speeds"

Directory: /MemSpeed/doc/
=========================

File [added]: documentation.tex
Delta lines: +144 -0
===================================================================
--- MemSpeed/doc/documentation.tex	                        (rev 0)
+++ MemSpeed/doc/documentation.tex	2013-06-22 01:07:31 UTC (rev 88)
@@ -0,0 +1,144 @@
+% *======================================================================*
+%  Cactus Thorn template for ThornGuide documentation
+%  Author: Ian Kelley
+%  Date: Sun Jun 02, 2002
+%  $Header$
+%
+%  Thorn documentation in the latex file doc/documentation.tex
+%  will be included in ThornGuides built with the Cactus make system.
+%  The scripts employed by the make system automatically include
+%  pages about variables, parameters and scheduling parsed from the
+%  relevant thorn CCL files.
+%
+%  This template contains guidelines which help to assure that your
+%  documentation will be correctly added to ThornGuides. More
+%  information is available in the Cactus UsersGuide.
+%
+%  Guidelines:
+%   - Do not change anything before the line
+%       "% START CACTUS THORNGUIDE",
+%     except for filling in the title, author, date, etc. fields.
+%        - Each of these fields should only be on ONE line.
+%        - Author names should be separated with a \\ or a comma.
+%   - You can define your own macros, but they must appear after
+%     the START CACTUS THORNGUIDE line, and must not redefine standard
+%     latex commands.
+%   - To avoid name clashes with other thorns, 'labels', 'citations',
+%     'references', and 'image' names should conform to the following
+%     convention:
+%       ARRANGEMENT_THORN_LABEL
+%     For example, an image wave.eps in the arrangement CactusWave and
+%     thorn WaveToyC should be renamed to CactusWave_WaveToyC_wave.eps
+%   - Graphics should only be included using the graphicx package.
+%     More specifically, with the "\includegraphics" command.  Do
+%     not specify any graphic file extensions in your .tex file. This
+%     will allow us to create a PDF version of the ThornGuide
+%     via pdflatex.
+%   - References should be included with the latex "\bibitem" command.
+%   - Use \begin{abstract}...\end{abstract} instead of \abstract{...}
+%   - Do not use \appendix, instead include any appendices you need as
+%     standard sections.
+%   - For the benefit of our Perl scripts, and for future extensions,
+%     please use simple latex.
+%
+% *======================================================================*
+%
+% Example of including a graphic image:
+%    \begin{figure}[ht]
+% 	\begin{center}
+%    	   \includegraphics[width=6cm]{MyArrangement_MyThorn_MyFigure}
+% 	\end{center}
+% 	\caption{Illustration of this and that}
+% 	\label{MyArrangement_MyThorn_MyLabel}
+%    \end{figure}
+%
+% Example of using a label:
+%   \label{MyArrangement_MyThorn_MyLabel}
+%
+% Example of a citation:
+%    \cite{MyArrangement_MyThorn_Author99}
+%
+% Example of including a reference
+%   \bibitem{MyArrangement_MyThorn_Author99}
+%   {J. Author, {\em The Title of the Book, Journal, or periodical}, 1 (1999),
+%   1--16. {\tt http://www.nowhere.com/}}
+%
+% *======================================================================*
+
+% If you are using CVS use this line to give version information
+% $Header$
+
+\documentclass{article}
+
+% Use the Cactus ThornGuide style file
+% (Automatically used from Cactus distribution, if you have a
+%  thorn without the Cactus Flesh download this from the Cactus
+%  homepage at www.cactuscode.org)
+\usepackage{../../../../doc/latex/cactus}
+
+\begin{document}
+
+% The author of the documentation
+\author{Erik Schnetter \textless schnetter at gmail.com\textgreater}
+
+% The title of the document (not necessarily the name of the Thorn)
+\title{MemSpeed}
+
+% the date your document was last changed, if your document is in CVS,
+% please use:
+%    \date{$ $Date: 2004-01-07 14:12:39 -0600 (Wed, 07 Jan 2004) $ $}
+\date{June 17 2013}
+
+\maketitle
+
+% Do not delete next line
+% START CACTUS THORNGUIDE
+
+% Add all definitions used in this documentation here
+%   \def\mydef etc
+
+% Add an abstract for this thorn's documentation
+\begin{abstract}
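+Measures CPU floating-point and integer throughput as well as the read and write latencies and bandwidths of caches and main memory.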
+\end{abstract}
+
+% The following sections are suggestive only.
+% Remove them or add your own.
+
+\section{Introduction}
+
+\section{Physical System}
+
+\section{Numerical Implementation}
+
+\section{Using This Thorn}
+
+\subsection{Obtaining This Thorn}
+
+\subsection{Basic Usage}
+
+\subsection{Special Behaviour}
+
+\subsection{Interaction With Other Thorns}
+
+\subsection{Examples}
+
+\subsection{Support and Feedback}
+
+\section{History}
+
+\subsection{Thorn Source Code}
+
+\subsection{Thorn Documentation}
+
+\subsection{Acknowledgements}
+
+
+\begin{thebibliography}{9}
+
+\end{thebibliography}
+
+% Do not delete next line
+% END CACTUS THORNGUIDE
+
+\end{document}

Directory: /MemSpeed/src/
=========================

File [added]: make.code.defn
Delta lines: +7 -0
===================================================================
--- MemSpeed/src/make.code.defn	                        (rev 0)
+++ MemSpeed/src/make.code.defn	2013-06-22 01:07:31 UTC (rev 88)
@@ -0,0 +1,7 @@
+# Main make.code.defn file for thorn MemSpeed
+
+# Source files in this directory
+SRCS = memspeed.cc
+
+# Subdirectories containing source files
+SUBDIRS = 

File [added]: memspeed.cc
Delta lines: +483 -0
===================================================================
--- MemSpeed/src/memspeed.cc	                        (rev 0)
+++ MemSpeed/src/memspeed.cc	2013-06-22 01:07:31 UTC (rev 88)
@@ -0,0 +1,483 @@
+#include <cctk.h>
+#include <cctk_Arguments.h>
+#include <cctk_Parameters.h>
+
+#include <vectors.h>
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+
+
+#ifdef _OPENMP
+#  include <omp.h>
+#else
+#  include <sys/time.h>
+namespace {
+  // Stand-in for omp_get_wtime() when the file is built without OpenMP:
+  // returns the current gettimeofday() reading, in seconds.
+  double omp_get_wtime()
+  {
+    struct timeval now;
+    gettimeofday(&now, NULL);
+    const double whole_seconds = now.tv_sec;
+    const double microseconds  = now.tv_usec;
+    return whole_seconds + 1.0e-6 * microseconds;
+  }
+}
+#endif
+
+
+
+namespace {
+  
+  struct cpu_info_t {  // measured compute throughput of one PU
+    double flop_speed;  // floating-point operations per second, set by measure_cpu_flop_speed()
+    double iop_speed;  // integer operations per second, set by measure_cpu_iop_speed()
+  };
+  cpu_info_t cpu_info;  // file-local result record filled in by the two measure_cpu_* routines
+  
+  struct cache_info_t {  // description and measured speeds of one cache/memory level
+    string    name;  // level name from GetCacheInfo1; used as the label in the printed results
+    int       type;  // level kind from GetCacheInfo1; calc_sizes() treats 1 as memory and anything else as a cache
+    ptrdiff_t size;  // capacity in bytes; calc_sizes() derives the test buffer sizes from it
+    int       linesize;  // line size from GetCacheInfo1; measure_read_latency() requires it to be >0
+    int       stride;  // stride from GetCacheInfo1; stored but not used elsewhere in this file
+    int       num_pus;  // PU count from GetCacheInfo1; printed next to the bandwidth results
+    
+    double read_latency;  // seconds per dependent load, set by measure_read_latency()
+    double read_bandwidth;  // bytes per second, set by measure_read_bandwidth()
+    double write_latency;  // seconds per store, set by measure_write_latency()
+    double write_bandwidth;  // bytes per second, set by measure_write_bandwidth()
+  };
+  vector<cache_info_t> cache_info;  // one entry per level, in the order delivered by GetCacheInfo1
+  
+  
+  
+  // Query the cache/memory hierarchy through the aliased function
+  // GetCacheInfo1 and store one cache_info_t entry per level in the
+  // file-local vector cache_info.  Only the descriptive fields (name,
+  // type, size, linesize, stride, num_pus) are filled in here; the
+  // measured latencies and bandwidths are set by the measure_* routines.
+  void load_cache_info()
+  {
+    // The first call passes no output arrays and a capacity of zero; its
+    // return value is used as the number of levels
+    const int num_cache_levels = GetCacheInfo1(0, 0, 0, 0, 0, 0, 0);
+    assert(num_cache_levels >= 0);
+    cache_info.resize(num_cache_levels);
+    if (num_cache_levels == 0) {
+      // Nothing to fetch; forming &v[0] on the empty vectors below would
+      // be undefined behaviour
+      return;
+    }
+    vector<CCTK_POINTER_TO_CONST> names_(num_cache_levels);
+    vector<CCTK_INT>              types_(num_cache_levels);
+    vector<CCTK_POINTER_TO_CONST> sizes_(num_cache_levels);
+    vector<CCTK_INT>              linesizes_(num_cache_levels);
+    vector<CCTK_INT>              strides_(num_cache_levels);
+    vector<CCTK_INT>              num_puss_(num_cache_levels);
+    GetCacheInfo1(&names_[0], &types_[0],
+                  &sizes_[0], &linesizes_[0], &strides_[0], &num_puss_[0],
+                  num_cache_levels);
+    for (int n=0; n<num_cache_levels; ++n) {
+      // Names arrive as C strings and sizes as integers, both carried in
+      // pointer-typed slots
+      cache_info[n].name     = (const char*)(names_[n]);
+      cache_info[n].type     = types_[n];
+      cache_info[n].size     = ptrdiff_t(sizes_[n]);
+      cache_info[n].linesize = linesizes_[n];
+      cache_info[n].stride   = strides_[n];
+      cache_info[n].num_pus  = num_puss_[n];
+    }
+  }
+  
+  
+  
+  void measure_cpu_flop_speed()  // time a loop of kmadd operations on local vector variables; result goes to cpu_info.flop_speed
+  {
+    DECLARE_CCTK_PARAMETERS;  // brings the thorn parameter "verbose" into scope
+    
+    printf("  CPU floating point performance:");
+    if (verbose) {
+      printf("\n");
+    }
+    double min_elapsed = 1.0;  // a pass must take at least this many seconds to be accepted
+    ptrdiff_t max_count = 1000000;  // loop iterations of the first pass
+    double elapsed = 0.0;
+    for (;;) {  // repeat with more iterations until a pass is long enough
+      if (verbose) {
+        printf("    iterations=%td...", max_count);
+        fflush(stdout);
+      }
+      const double t0 = omp_get_wtime();
+      CCTK_REAL_VEC s0, s1, s2, s3, s4, s5, s6, s7;  // eight accumulators that do not depend on each other
+      s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7 = vec_set1(1.0);
+      for (ptrdiff_t count=0; count<max_count; ++count) {
+        s0 = kmadd(vec_set1(1.1), s0, vec_set1(-0.1));  // presumably 1.1*s0 - 0.1 (kmadd comes from vectors.h) -- confirm
+        s1 = kmadd(vec_set1(1.1), s1, vec_set1(-0.1));
+        s2 = kmadd(vec_set1(1.1), s2, vec_set1(-0.1));
+        s3 = kmadd(vec_set1(1.1), s3, vec_set1(-0.1));
+        s4 = kmadd(vec_set1(1.1), s4, vec_set1(-0.1));
+        s5 = kmadd(vec_set1(1.1), s5, vec_set1(-0.1));
+        s6 = kmadd(vec_set1(1.1), s6, vec_set1(-0.1));
+        s7 = kmadd(vec_set1(1.1), s7, vec_set1(-0.1));
+      }
+      volatile CCTK_REAL_VEC use_s CCTK_ATTRIBUTE_UNUSED =  // volatile sink for the combined accumulators, so their values are used
+        kadd(kadd(kadd(s0, s1), kadd(s2, s3)),
+             kadd(kadd(s4, s5), kadd(s6, s7)));
+      const double t1 = omp_get_wtime();
+      elapsed = t1 - t0;
+      if (verbose) {
+        printf(" time=%g sec\n", elapsed);
+      }
+      if (elapsed >= min_elapsed) break;  // accepted: max_count and elapsed describe the same pass
+      max_count *= llrint(max(2.0, min(10.0, 1.1 * min_elapsed / elapsed)));  // grow by a factor between 2 and 10
+    }
+    cpu_info.flop_speed = max_count * 8 * CCTK_REAL_VEC_SIZE * 2 / elapsed;  // 8 kmadd per iteration, counted as 2 operations per vector element
+    if (verbose) {
+      printf("    result:");
+    }
+    printf(" %g Gflop/sec for each PU\n", cpu_info.flop_speed / 1.0e+9);
+  }
+  
+  
+  
+  void measure_cpu_iop_speed()  // time a loop of integer address computations; result goes to cpu_info.iop_speed
+  {
+    DECLARE_CCTK_PARAMETERS;  // brings the thorn parameter "verbose" into scope
+    
+    printf("  CPU integer performance:");
+    if (verbose) {
+      printf("\n");
+    }
+    double min_elapsed = 1.0;  // a pass must take at least this many seconds to be accepted
+    ptrdiff_t max_count = 1000000;  // loop iterations of the first pass
+    double elapsed = 0.0;
+    for (;;) {  // repeat with more iterations until a pass is long enough
+      if (verbose) {
+        printf("    iterations=%td...", max_count);
+        fflush(stdout);
+      }
+      const double t0 = omp_get_wtime();
+      vector<CCTK_REAL> base(1000);  // NOTE(review): constructed after t0, so the allocation is part of the timed region
+      ptrdiff_t s0, s1, s2, s3, s4, s5, s6, s7;  // eight chains that do not depend on each other
+      s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7 = 0; 
+      for (ptrdiff_t count=0; count<max_count; ++count) {
+        s0 = ptrdiff_t(&base[  s0]);  // only the element address is formed; nothing is loaded through it
+        s1 = ptrdiff_t(&base[2*s1]);  // NOTE(review): after the first pass s1 holds an address, so the index is far outside the 1000 elements -- confirm this is acceptable
+        s2 = ptrdiff_t(&base[3*s2]);
+        s3 = ptrdiff_t(&base[4*s3]);
+        s4 = ptrdiff_t(&base[5*s4]);
+        s5 = ptrdiff_t(&base[6*s5]);
+        s6 = ptrdiff_t(&base[7*s6]);
+        s7 = ptrdiff_t(&base[8*s7]);
+      }
+      volatile ptrdiff_t use_s CCTK_ATTRIBUTE_UNUSED =  // volatile sink for the combined results, so their values are used
+        s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;
+      const double t1 = omp_get_wtime();
+      elapsed = t1 - t0;
+      if (verbose) {
+        printf(" time=%g sec\n", elapsed);
+      }
+      if (elapsed >= min_elapsed) break;  // accepted: max_count and elapsed describe the same pass
+      max_count *= llrint(max(2.0, min(10.0, 1.1 * min_elapsed / elapsed)));  // grow by a factor between 2 and 10
+    }
+    cpu_info.iop_speed = max_count * 8 * 2 / elapsed;  // 8 statements per iteration, each counted as 2 integer operations
+    if (verbose) {
+      printf("    result:");
+    }
+    printf(" %g Giop/sec for each PU\n", cpu_info.iop_speed / 1.0e+9);
+  }
+  
+  
+  
+  // Choose the buffer sizes used to probe level "cache" of cache_info.
+  //   skipsize: bytes the caller allocates first and leaves unused
+  //   size:     bytes of the buffer that is actually measured
+  // A cache is probed with 3/4 of its capacity.  Memory (type 1) is
+  // probed with half its capacity, unless the previous level is memory
+  // as well: then the previous level's capacity is skipped and a quarter
+  // of what remains is probed.
+  void calc_sizes(int cache, ptrdiff_t& skipsize, ptrdiff_t& size)
+  {
+    const cache_info_t& level = cache_info[cache];
+    
+    const bool is_memory = level.type == 1;
+    if (!is_memory) {
+      // Cache
+      skipsize = 0;
+      size = level.size * 3 / 4;
+      return;
+    }
+    
+    const bool below_other_memory = cache > 0 && cache_info[cache-1].type == 1;
+    if (!below_other_memory) {
+      // Local memory or only memory
+      skipsize = 0;
+      size = level.size / 2;
+      return;
+    }
+    
+    // Global memory, and there is also local memory
+    skipsize = cache_info[cache-1].size;
+    size = (level.size - skipsize) / 4;
+    assert(size >= skipsize/4);
+  }
+  
+  
+  
+  // Measure the latency of dependent loads for every level in cache_info
+  // and store it, in seconds per load, in the level's read_latency.
+  //
+  // For each level a buffer of the size chosen by calc_sizes() is linked
+  // into one closed chain of pointers that visits every element exactly
+  // once.  The timed loop follows that chain, so each load needs the
+  // result of the previous one.
+  void measure_read_latency()
+  {
+    DECLARE_CCTK_PARAMETERS;
+    
+    printf("  Read latency:\n");
+    for (int cache=0; cache<int(cache_info.size()); ++cache) {
+      ptrdiff_t skipsize, size;
+      calc_sizes(cache, skipsize, size);
+      assert(size>0);
+      assert(cache_info[cache].linesize>0);
+      if (verbose) {
+        printf("    %s read latency (using %td bytes):\n",
+               cache_info[cache].name.c_str(), size);
+        fflush(stdout);
+      } else {
+        printf("    %s read latency:", cache_info[cache].name.c_str());
+      }
+      // Allocated before the test buffer and never read
+      vector<char> skiparray(skipsize, 1);
+      // Unsigned, so that the constant stays positive even where
+      // ptrdiff_t has only 32 bits
+      const size_t offset = 0xa1d2d5ff; // a random number
+      const ptrdiff_t nmax = size / sizeof(void*);
+      assert(nmax>0);
+      // Walking the buffer with a fixed stride visits every element once
+      // and returns to the start only if the stride is coprime to nmax.
+      // Start from the random offset and take the next coprime value;
+      // nmax-1 is always coprime, so the search stays below nmax.
+      ptrdiff_t stride = ptrdiff_t(offset % size_t(nmax));
+      for (;;) {
+        // Euclid's algorithm: a ends up as gcd(stride, nmax)
+        ptrdiff_t a = stride, b = nmax;
+        while (b != 0) {
+          const ptrdiff_t r = a % b;
+          a = b;
+          b = r;
+        }
+        if (a == 1) break;
+        ++stride;
+      }
+      vector<void*> array(nmax);
+      {
+        ptrdiff_t i = 0;
+        for (ptrdiff_t n=0; n<nmax; ++n) {
+          const ptrdiff_t next_i = (i+stride) % nmax;
+          // Each element must be linked exactly once
+          assert(!array[i]);
+          array[i] = &array[next_i];
+          i = next_i;
+        }
+        // The chain is closed
+        assert(i == 0);
+      }
+      double min_elapsed = 1.0;
+      ptrdiff_t max_count = 1000;
+      double elapsed = 0.0;
+      for (;;) {
+        if (verbose) {
+          printf("      iterations=%td...", max_count);
+          fflush(stdout);
+        }
+        const double t0 = omp_get_wtime();
+        void* ptr = &array[0];
+        for (ptrdiff_t count=0; count<max_count; ++count) {
+          // 100 dependent loads per iteration
+#define REPEAT10(x) x x x x x x x x x x
+          REPEAT10(REPEAT10(ptr = *(void**)ptr;));
+#undef REPEAT10
+        }
+        // Volatile sink, so that the pointer chase has a used result
+        volatile bool use_ptr CCTK_ATTRIBUTE_UNUSED = ptr;
+        const double t1 = omp_get_wtime();
+        elapsed = t1 - t0;
+        if (verbose) {
+          printf(" time=%g sec\n", elapsed);
+        }
+        // Accept the pass once it ran long enough; otherwise repeat
+        // with 2 to 10 times as many iterations
+        if (elapsed >= min_elapsed) break;
+        max_count *= llrint(max(2.0, min(10.0, 1.1 * min_elapsed / elapsed)));
+      }
+      cache_info[cache].read_latency = elapsed / (max_count * 100);
+      if (verbose) {
+        printf("      result:");
+      }
+      printf(" %g nsec\n", cache_info[cache].read_latency * 1.0e+9);
+    }
+  }
+  
+  
+  
+  // Measure the streaming read bandwidth for every level in cache_info
+  // and store it, in bytes per second, in the level's read_bandwidth.
+  //
+  // For each level a buffer of about the size chosen by calc_sizes() is
+  // read front to back with vector loads, max_count times per pass.
+  void measure_read_bandwidth()
+  {
+    DECLARE_CCTK_PARAMETERS;
+    
+    printf("  Read bandwidth:\n");
+    for (int cache=0; cache<int(cache_info.size()); ++cache) {
+      ptrdiff_t skipsize, size;
+      calc_sizes(cache, skipsize, size);
+      assert(size>0);
+      // One trip through the unrolled loop below loads 16 vectors.  Round
+      // the element count down to a whole number of trips so that no load
+      // reaches past the end of the buffer.
+      const ptrdiff_t dn = CCTK_REAL_VEC_SIZE;
+      const ptrdiff_t chunk = 16*dn;
+      const ptrdiff_t nmax =
+        ptrdiff_t(size / sizeof(CCTK_REAL)) / chunk * chunk;
+      assert(nmax>0);
+      // Bytes actually read per sweep
+      const ptrdiff_t nbytes = nmax * ptrdiff_t(sizeof(CCTK_REAL));
+      if (verbose) {
+        printf("    %s read bandwidth (using %td bytes):\n",
+               cache_info[cache].name.c_str(), nbytes);
+        fflush(stdout);
+      } else {
+        printf("    %s read bandwidth:", cache_info[cache].name.c_str());
+      }
+      // Allocated before the test buffer and never read
+      vector<char> skiparray(skipsize, 1);
+      // Over-allocate by one vector minus one element, then round the
+      // start address down to a multiple of the vector size; at least
+      // nmax elements remain behind the aligned pointer
+      vector<CCTK_REAL> raw_array(nmax + CCTK_REAL_VEC_SIZE-1, 1.0);
+      CCTK_REAL* restrict array = &raw_array[CCTK_REAL_VEC_SIZE-1];
+      array = (CCTK_REAL*)(ptrdiff_t(array) & -sizeof(CCTK_REAL_VEC));
+      double min_elapsed = 1.0;
+      ptrdiff_t max_count = 1;
+      double elapsed = 0.0;
+      for (;;) {
+        if (verbose) {
+          printf("      iterations=%td...", max_count);
+          fflush(stdout);
+        }
+        const double t0 = omp_get_wtime();
+        for (ptrdiff_t count=0; count<max_count; ++count) {
+          CCTK_REAL_VEC s0, s1, s2, s3, s4, s5, s6, s7;
+          s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7 = vec_set1(0.0);
+          // nmax is a multiple of 16*dn, so n reaches nmax exactly
+          for (ptrdiff_t n=0; n<nmax;) {
+            s0 = kmadd(vec_load(array[n]), s0, vec_load(array[n+dn]));
+            n += 2*dn;
+            s1 = kmadd(vec_load(array[n]), s1, vec_load(array[n+dn]));
+            n += 2*dn;
+            s2 = kmadd(vec_load(array[n]), s2, vec_load(array[n+dn]));
+            n += 2*dn;
+            s3 = kmadd(vec_load(array[n]), s3, vec_load(array[n+dn]));
+            n += 2*dn;
+            s4 = kmadd(vec_load(array[n]), s4, vec_load(array[n+dn]));
+            n += 2*dn;
+            s5 = kmadd(vec_load(array[n]), s5, vec_load(array[n+dn]));
+            n += 2*dn;
+            s6 = kmadd(vec_load(array[n]), s6, vec_load(array[n+dn]));
+            n += 2*dn;
+            s7 = kmadd(vec_load(array[n]), s7, vec_load(array[n+dn]));
+            n += 2*dn;
+          }
+          // Volatile sink, so that the loaded values are used
+          volatile CCTK_REAL_VEC use_s CCTK_ATTRIBUTE_UNUSED =
+            kadd(kadd(kadd(s0, s1), kadd(s2, s3)),
+                 kadd(kadd(s4, s5), kadd(s6, s7)));
+        }
+        const double t1 = omp_get_wtime();
+        elapsed = t1 - t0;
+        if (verbose) {
+          printf(" time=%g sec\n", elapsed);
+        }
+        // Accept the pass once it ran long enough; otherwise repeat
+        // with 2 to 10 times as many sweeps
+        if (elapsed >= min_elapsed) break;
+        max_count *= llrint(max(2.0, min(10.0, 1.1 * min_elapsed / elapsed)));
+      }
+      cache_info[cache].read_bandwidth = max_count * nbytes / elapsed;
+      if (verbose) {
+        printf("      result:");
+      }
+      printf(" %g GByte/sec for %d PUs\n",
+             cache_info[cache].read_bandwidth / 1.0e+9,
+             cache_info[cache].num_pus);
+    }
+  }
+  
+  
+  
+  // Measure the latency of scattered single-byte stores for every level
+  // in cache_info and store it, in seconds per store, in the level's
+  // write_latency.
+  //
+  // For each level the size chosen by calc_sizes() is rounded down to a
+  // power of two, and bytes are written at positions that advance by a
+  // large odd offset, wrapped into the buffer with a bit mask.
+  void measure_write_latency()
+  {
+    DECLARE_CCTK_PARAMETERS;
+    
+    printf("  Write latency:\n");
+    for (int cache=0; cache<int(cache_info.size()); ++cache) {
+      ptrdiff_t skipsize, size;
+      calc_sizes(cache, skipsize, size);
+      assert(size>0);
+      // Largest power of two not exceeding size, so that "& size_mask"
+      // wraps an index into the buffer
+      size = ptrdiff_t(1) << ilogb(double(size));
+      const ptrdiff_t size_mask = size - 1;
+      const ptrdiff_t offset = 0xa1d2d5ff; // a random number
+      assert(size>0);
+      if (verbose) {
+        printf("    %s write latency (using %td bytes):\n",
+               cache_info[cache].name.c_str(), size);
+        fflush(stdout);
+      } else {
+        printf("    %s write latency:", cache_info[cache].name.c_str());
+      }
+      // Allocated before the test buffer and never written again
+      vector<char> skiparray(skipsize, 1);
+      vector<char> array_(size, 1);
+      char* restrict array = &array_[0];
+      double min_elapsed = 1.0;
+      ptrdiff_t max_count = 1000;
+      double elapsed = 0.0;
+      for (;;) {
+        if (verbose) {
+          printf("      iterations=%td...", max_count);
+          fflush(stdout);
+        }
+        const double t0 = omp_get_wtime();
+        // Unsigned, so that the running index may wrap around in a
+        // well-defined way; size is a power of two, so the masked
+        // positions are unaffected by the wrap
+        size_t n = 0;
+        for (ptrdiff_t count=0; count<max_count; ++count) {
+          // 8 stores per iteration
+          array[n & size_mask] = 2;
+          n += offset;
+          array[n & size_mask] = 2;
+          n += offset;
+          array[n & size_mask] = 2;
+          n += offset;
+          array[n & size_mask] = 2;
+          n += offset;
+          array[n & size_mask] = 2;
+          n += offset;
+          array[n & size_mask] = 2;
+          n += offset;
+          array[n & size_mask] = 2;
+          n += offset;
+          array[n & size_mask] = 2;
+          n += offset;
+        }
+        // Volatile sink, so that the stores have an observed result
+        volatile char use_array CCTK_ATTRIBUTE_UNUSED = array[0];
+        const double t1 = omp_get_wtime();
+        elapsed = t1 - t0;
+        if (verbose) {
+          printf(" time=%g sec\n", elapsed);
+        }
+        // Leave the loop BEFORE scaling max_count: the result below must
+        // use the iteration count of the pass that was just timed
+        if (elapsed >= min_elapsed) break;
+        max_count *= llrint(max(2.0, min(10.0, 1.1 * min_elapsed / elapsed)));
+      }
+      cache_info[cache].write_latency = elapsed / (max_count * 8);
+      if (verbose) {
+        printf("      result:");
+      }
+      printf(" %g nsec\n", cache_info[cache].write_latency * 1.0e+9);
+    }
+  }
+  
+  
+  
+  void measure_write_bandwidth()  // time repeated memset() fills of a buffer per level; result goes to write_bandwidth
+  {
+    DECLARE_CCTK_PARAMETERS;  // brings the thorn parameter "verbose" into scope
+    
+    printf("  Write bandwidth:\n");
+    for (int cache=0; cache<int(cache_info.size()); ++cache) {
+      ptrdiff_t skipsize, size;
+      calc_sizes(cache, skipsize, size);  // size: bytes of the test buffer; skipsize: bytes allocated first and left unused
+      assert(size>0);
+      if (verbose) {
+        printf("    %s write bandwidth (using %td bytes):\n",
+               cache_info[cache].name.c_str(), size);
+        fflush(stdout);
+      } else {
+        printf("    %s write bandwidth:", cache_info[cache].name.c_str());
+      }
+      vector<char> skiparray(skipsize, 1);  // allocated before the test buffer and never written again
+      vector<char> array(size, 1);
+      double min_elapsed = 1.0;  // a pass must take at least this many seconds to be accepted
+      ptrdiff_t max_count = 1;  // fills of the whole buffer in the first pass
+      double elapsed = 0.0;
+      for (;;) {  // repeat with more fills until a pass is long enough
+        if (verbose) {
+          printf("      iterations=%td...", max_count);
+          fflush(stdout);
+        }
+        const double t0 = omp_get_wtime();
+        for (ptrdiff_t count=0; count<max_count; ++count) {
+          memset(&array[0], count % 256, size);  // the fill value changes with every fill
+          volatile char use_array CCTK_ATTRIBUTE_UNUSED = array[count % size];  // volatile read of one byte, so each fill has an observed result
+        }
+        const double t1 = omp_get_wtime();
+        elapsed = t1 - t0;
+        if (verbose) {
+          printf(" time=%g sec\n", elapsed);
+        }
+        if (elapsed >= min_elapsed) break;  // accepted: max_count and elapsed describe the same pass
+        max_count *= llrint(max(2.0, min(10.0, 1.1 * min_elapsed / elapsed)));  // grow by a factor between 2 and 10
+      }
+      cache_info[cache].write_bandwidth = max_count * size / elapsed;  // bytes written divided by seconds
+      if (verbose) {
+        printf("      result:");
+      }
+      printf(" %g GByte/sec for %d PUs\n",
+             cache_info[cache].write_bandwidth / 1.0e+9,
+             cache_info[cache].num_pus);
+    }
+  }
+  
+}
+
+
+
+// Scheduled routine of this thorn: on process 0 only, load the cache
+// description and then run every measurement once, in a fixed order.
+// All other processes return without doing anything.
+extern "C"
+void MemSpeed_MeasureSpeed(CCTK_ARGUMENTS)
+{
+  DECLARE_CCTK_ARGUMENTS;
+  
+  const bool is_first_process = CCTK_MyProc(cctkGH) == 0;
+  if (is_first_process) {
+    CCTK_INFO("Measuring CPU, cache, and memory speeds:");
+    
+    // The cache description must be loaded before the per-level tests
+    load_cache_info();
+    
+    // CPU throughput
+    measure_cpu_flop_speed();
+    measure_cpu_iop_speed();
+    
+    // Per-level memory tests
+    measure_read_latency();
+    measure_read_bandwidth();
+    measure_write_latency();
+    measure_write_bandwidth();
+  }
+}