1 files changed, 427 insertions, 0 deletions
diff --git a/benchmarks/stream/stream_d.f b/benchmarks/stream/stream_d.f
new file mode 100644
index 0000000..9924be1
--- /dev/null
+++ b/benchmarks/stream/stream_d.f
@@ -0,0 +1,427 @@
+* Program: STREAM
+* Programmer: John D. McCalpin
+* Revision: 5.0, July 30, 2000
+*
+* This program measures memory transfer rates in MB/s for simple
+* computational kernels coded in Fortran.  
+* The intent is to demonstrate the extent to which ordinary user
+* code can exploit the main memory bandwidth of the system under
+* test.
+*=========================================================================
+* The STREAM web page is at:
+*          http://www.streambench.org
+*
+* Most of the content is currently hosted at:
+*          http://www.cs.virginia.edu/stream/
+*
+* BRIEF INSTRUCTIONS: 
+*       0) See http://www.cs.virginia.edu/stream/ref.html for details
+*       1) STREAM requires a timing function called second().
+*          Several examples are provided in this directory.
+*          "CPU" timers are only allowed for uniprocessor runs.
+*          "Wall-clock" timers are required for all multiprocessor runs.
+*       2) The STREAM array sizes must be set to size the test.
+*          The value "N" must be chosen so that each of the three
+*          arrays is at least 4x larger than the sum of all the last-
+*          level caches used in the run, or 1 million elements, which-
+*          ever is larger.
+*          ------------------------------------------------------------
+*          Note that you are free to use any array length and offset
+*          that makes each array 4x larger than the last-level cache.
+*          The intent is to determine the *best* sustainable bandwidth
+*          available with this simple coding.  Of course, lower values
+*          are usually fairly easy to obtain on cached machines, but 
+*          by keeping the test to the *best* results, the answers are
+*          easier to interpret.
+*          You may put the arrays in common or not, at your discretion.
+*          There is a commented-out COMMON statement below.
+*          Fortran90 "allocatable" arrays are fine, too.
+*          ------------------------------------------------------------
+*       3) Compile the code with full optimization.  Many compilers
+*          generate unreasonably bad code before the optimizer tightens
+*          things up.  If the results are unreasonably good, on the
+*          other hand, the optimizer might be too smart for me
+*          Please let me know if this happens.
+*       4) Mail the results to mccalpin@cs.virginia.edu
+*          Be sure to include:
+*               a) computer hardware model number and software revision
+*               b) the compiler flags
+*               c) all of the output from the test case.
+*          Please let me know if you do not want your name posted along
+*          with the submitted results.
+*       5) See the web page for more comments about the run rules and
+*          about interpretation of the results.
+*
+* Thanks,
+*   Dr. Bandwidth
+*=========================================================================
+*
+      PROGRAM stream
+*     IMPLICIT NONE
+C     .. Parameters ..
+C     Set thread_nbr equal to the number of threads in this run
+      INTEGER n,offset,ndim,ntimes,thread_nbr
+      PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10,
+     $           thread_nbr=1)
+C     ..
+C     .. Local Scalars ..
+      DOUBLE PRECISION dummy,scalar,t
+      INTEGER j,k,nbpw,quantum
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
+     $                 times(4,ntimes)
+      INTEGER bytes(4)
+      CHARACTER label(4)*11
+C     ..
+C     .. External Functions ..
+      DOUBLE PRECISION second
+      INTEGER checktick,realsize
+      EXTERNAL second,checktick,realsize
+C     ..
+C     .. Intrinsic Functions ..
+C
+      INTRINSIC dble,max,min,nint,sqrt
+C     ..
+C     .. Arrays in Common ..
+      DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
+C     ..
+C     .. Common blocks ..
+*     COMMON a,b,c
+C     ..
+C     .. Data statements ..
+      DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+      DATA label/'Copy:      ','Scale:     ','Add:       ',
+     $     'Triad:     '/
+      DATA bytes/2,2,3,3/,dummy/0.0d0/
+C     ..
+
+*       --- SETUP --- determine precision and check timing ---
+
+      nbpw = realsize()
+
+      WRITE (*,FMT=9010) 'Array size = ',n
+      WRITE (*,FMT=9010) 'Offset     = ',offset
+      WRITE (*,FMT=9020) 'The total memory requirement is ',
+     $  3*nbpw*n/ (1024*1024),' MB'
+      WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
+      WRITE (*,FMT=9030) '--'
+      WRITE (*,FMT=9030) 'The *best* time for each test is used'
+      WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations'
+
+
+      IF ( thread_nbr.GT.1 ) THEN
+C     Uncomment the call below when running multi-thread OpenMP tests
+C        CALL omp_set_num_threads(thread_nbr);
+C     Comment out the five statements below if running OpenMP tests
+         PRINT *,'--------------------------------------------------'
+         PRINT *,'When running with OpenMP, you must uncomment'
+         PRINT *,'the call to omp_set_num_threads and comment out'
+         PRINT *,'these statements.  Check stream_d.f.' 
+         GO TO 9060
+      END IF
+
+!$OMP PARALLEL DO
+      DO 10 j = 1,n
+          a(j) = 2.0d0
+          b(j) = 0.5D0
+          c(j) = 0.0D0
+   10 CONTINUE
+      t = second(dummy)
+!$OMP PARALLEL DO
+      DO 20 j = 1,n
+          a(j) = 0.5d0*a(j)
+   20 CONTINUE
+      t = second(dummy) - t
+      PRINT *,'----------------------------------------------------'
+      quantum = checktick()
+      WRITE (*,FMT=9000)
+     $  'Your clock granularity/precision appears to be ',quantum,
+     $  ' microseconds'
+      PRINT *,'----------------------------------------------------'
+
+*       --- MAIN LOOP --- repeat test cases NTIMES times ---
+      scalar = 0.5d0*a(1)
+      DO 70 k = 1,ntimes
+
+          t = second(dummy)
+          a(1) = a(1) + t
+!$OMP PARALLEL DO
+          DO 30 j = 1,n
+              c(j) = a(j)
+   30     CONTINUE
+          t = second(dummy) - t
+          c(n) = c(n) + t
+          times(1,k) = t
+
+          t = second(dummy)
+          c(1) = c(1) + t
+!$OMP PARALLEL DO
+          DO 40 j = 1,n
+              b(j) = scalar*c(j)
+   40     CONTINUE
+          t = second(dummy) - t
+          b(n) = b(n) + t
+          times(2,k) = t
+
+          t = second(dummy)
+          a(1) = a(1) + t
+!$OMP PARALLEL DO
+          DO 50 j = 1,n
+              c(j) = a(j) + b(j)
+   50     CONTINUE
+          t = second(dummy) - t
+          c(n) = c(n) + t
+          times(3,k) = t
+
+          t = second(dummy)
+          b(1) = b(1) + t
+!$OMP PARALLEL DO
+          DO 60 j = 1,n
+              a(j) = b(j) + scalar*c(j)
+   60     CONTINUE
+          t = second(dummy) - t
+          a(n) = a(n) + t
+          times(4,k) = t
+   70 CONTINUE
+
+*       --- SUMMARY ---
+      DO 90 k = 2,ntimes-1
+          DO 80 j = 1,4
+              avgtime(j) = avgtime(j) + times(j,k)
+              mintime(j) = min(mintime(j),times(j,k))
+              maxtime(j) = max(maxtime(j),times(j,k))
+   80     CONTINUE
+   90 CONTINUE
+      WRITE (*,FMT=9040)
+      DO 100 j = 1,4
+          avgtime(j) = avgtime(j)/dble(ntimes-2)
+          WRITE (*,FMT=9050) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6,
+     $      avgtime(j),mintime(j),maxtime(j)
+  100 CONTINUE
+      PRINT *,'----------------------------------------------------'
+      CALL checksums (a,b,c,n,ntimes)
+      PRINT *,'----------------------------------------------------'
+
+ 9000 FORMAT (1x,a,i6,a)
+ 9010 FORMAT (1x,a,i10)
+ 9020 FORMAT (1x,a,i4,a)
+ 9030 FORMAT (1x,a,i3,a,a)
+ 9040 FORMAT ('Function',5x,'Rate (MB/s)  Avg time   Min time  Max time'
+     $       )
+ 9050 FORMAT (a,4 (f10.4,2x))
+ 9060 END
+
+*-------------------------------------
+* INTEGER FUNCTION dblesize()
+*
+* A semi-portable way to determine the precision of DOUBLE PRECISION
+* in Fortran.
+* Here used to guess how many bytes of storage a DOUBLE PRECISION
+* number occupies.
+*
+      INTEGER FUNCTION realsize()
+*     IMPLICIT NONE
+
+C     .. Local Scalars ..
+      DOUBLE PRECISION result,test
+      INTEGER j,ndigits
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION ref(30)
+C     ..
+C     .. External Subroutines ..
+      EXTERNAL confuse
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC abs,acos,log10,sqrt
+C     ..
+
+C       Test #1 - compare single(1.0d0+delta) to 1.0d0
+
+   10 DO 20 j = 1,30
+          ref(j) = 1.0d0 + 10.0d0** (-j)
+   20 CONTINUE
+
+      DO 30 j = 1,30
+          test = ref(j)
+          ndigits = j
+          CALL confuse(test,result)
+          IF (test.EQ.1.0D0) THEN
+              GO TO 40
+          END IF
+   30 CONTINUE
+      GO TO 50
+
+   40 WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+     $  ndigits,' digits of accuracy'
+      IF (ndigits.LE.8) THEN
+          realsize = 4
+      ELSE
+          realsize = 8
+      END IF
+      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+     $  ' bytes per DOUBLE PRECISION word'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      RETURN
+
+   50 PRINT *,'Hmmmm.  I am unable to determine the size.'
+      PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
+     $  ' number : '
+      READ (*,FMT=*) realsize
+      IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+          PRINT *,'Your answer ',realsize,' does not make sense.'
+          PRINT *,'Try again.'
+          PRINT *,'Please enter the number of Bytes per ',
+     $      'DOUBLE PRECISION number : '
+          READ (*,FMT=*) realsize
+      END IF
+      PRINT *,'You have manually entered a size of ',realsize,
+     $  ' bytes per DOUBLE PRECISION number'
+      WRITE (*,FMT='(a)')
+     $  '----------------------------------------------'
+      END
+
+      SUBROUTINE confuse(q,r)
+*     IMPLICIT NONE
+C     .. Scalar Arguments ..
+      DOUBLE PRECISION q,r
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC cos
+C     ..
+      r = cos(q)
+      RETURN
+      END
+
+* A semi-portable way to determine the clock granularity
+* Adapted from a code by John Henning of Digital Equipment Corporation
+*
+      INTEGER FUNCTION checktick()
+*     IMPLICIT NONE
+
+C     .. Parameters ..
+      INTEGER n
+      PARAMETER (n=20)
+C     ..
+C     .. Local Scalars ..
+      DOUBLE PRECISION dummy,t1,t2
+      INTEGER i,j,jmin
+C     ..
+C     .. Local Arrays ..
+      DOUBLE PRECISION timesfound(n)
+C     ..
+C     .. External Functions ..
+      DOUBLE PRECISION second
+      EXTERNAL second
+C     ..
+C     .. Intrinsic Functions ..
+      INTRINSIC max,min,nint
+C     ..
+      i = 0
+      dummy = 0.0d0
+      t1 = second(dummy)
+
+   10 t2 = second(dummy)
+      IF (t2.EQ.t1) GO TO 10
+
+      t1 = t2
+      i = i + 1
+      timesfound(i) = t1
+      IF (i.LT.n) GO TO 10
+
+      jmin = 1000000
+      DO 20 i = 2,n
+          j = nint((timesfound(i)-timesfound(i-1))*1d6)
+          jmin = min(jmin,max(j,0))
+   20 CONTINUE
+
+      IF (jmin.GT.0) THEN
+          checktick = jmin
+      ELSE
+          PRINT *,'Your clock granularity appears to be less ',
+     $      'than one microsecond'
+          checktick = 1
+      END IF
+      RETURN
+
+*      PRINT 14, timesfound(1)*1d6
+*      DO 20 i=2,n
+*         PRINT 14, timesfound(i)*1d6,
+*     &       nint((timesfound(i)-timesfound(i-1))*1d6)
+*   14    FORMAT (1X, F18.4, 1X, i8)
+*   20 CONTINUE
+
+      END
+
+
+
+
+      SUBROUTINE checksums(a,b,c,n,ntimes)
+*     IMPLICIT NONE
+C     ..
+C     .. Arguments ..
+      DOUBLE PRECISION a(*),b(*),c(*)
+      INTEGER n,ntimes
+C     ..
+C     .. Local Scalars ..
+      DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon
+      INTEGER k
+C     ..
+
+C     Repeat the main loop, but with scalars only.
+C     This is done to check the sum & make sure all
+C     iterations have been executed correctly.
+
+      aa = 2.0D0
+      bb = 0.5D0
+      cc = 0.0D0
+      aa = 0.5D0*aa
+      scalar = 0.5d0*aa
+      DO k = 1,ntimes
+          cc = aa
+          bb = scalar*cc
+          cc = aa + bb
+          aa = bb + scalar*cc
+      END DO
+      aa = aa*DBLE(n-2)
+      bb = bb*DBLE(n-2)
+      cc = cc*DBLE(n-2)
+
+C     Now sum up the arrays, excluding the first and last
+C     elements, which are modified using the timing results
+C     to confuse aggressive optimizers.
+
+      suma = 0.0d0
+      sumb = 0.0d0
+      sumc = 0.0d0
+!$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc)
+      DO 110 j = 2,n-1
+          suma = suma + a(j)
+          sumb = sumb + b(j)
+          sumc = sumc + c(j)
+  110 CONTINUE
+
+      epsilon = 1.D-6
+
+      IF (ABS(suma-aa)/suma .GT. epsilon) THEN
+          PRINT *,'Failed Validation on array a()'
+          PRINT *,'Target   Sum of a is = ',aa
+          PRINT *,'Computed Sum of a is = ',suma
+      ELSEIF (ABS(sumb-bb)/sumb .GT. epsilon) THEN
+          PRINT *,'Failed Validation on array b()'
+          PRINT *,'Target   Sum of b is = ',bb
+          PRINT *,'Computed Sum of b is = ',sumb
+      ELSEIF (ABS(sumc-cc)/sumc .GT. epsilon) THEN
+          PRINT *,'Failed Validation on array c()'
+          PRINT *,'Target   Sum of c is = ',cc
+          PRINT *,'Computed Sum of c is = ',sumc
+      ELSE
+          PRINT *,'Solution Validates!'
+      ENDIF
+
+      END
+