aboutsummaryrefslogtreecommitdiff
path: root/benchmarks/stream/stream_d.f
diff options
context:
space:
mode:
Diffstat (limited to 'benchmarks/stream/stream_d.f')
-rw-r--r--benchmarks/stream/stream_d.f427
1 files changed, 427 insertions, 0 deletions
diff --git a/benchmarks/stream/stream_d.f b/benchmarks/stream/stream_d.f
new file mode 100644
index 0000000..9924be1
--- /dev/null
+++ b/benchmarks/stream/stream_d.f
@@ -0,0 +1,427 @@
+* Program: STREAM
+* Programmer: John D. McCalpin
+* Revision: 5.0, July 30, 2000
+*
+* This program measures memory transfer rates in MB/s for simple
+* computational kernels coded in Fortran.
+* The intent is to demonstrate the extent to which ordinary user
+* code can exploit the main memory bandwidth of the system under
+* test.
+*=========================================================================
+* The STREAM web page is at:
+* http://www.streambench.org
+*
+* Most of the content is currently hosted at:
+* http://www.cs.virginia.edu/stream/
+*
+* BRIEF INSTRUCTIONS:
+* 0) See http://www.cs.virginia.edu/stream/ref.html for details
+* 1) STREAM requires a timing function called second().
+* Several examples are provided in this directory.
+* "CPU" timers are only allowed for uniprocessor runs.
+* "Wall-clock" timers are required for all multiprocessor runs.
+* 2) The STREAM array sizes must be set to size the test.
+* The value "N" must be chosen so that each of the three
+* arrays is at least 4x larger than the sum of all the last-
+* level caches used in the run, or 1 million elements, which-
+* ever is larger.
+* ------------------------------------------------------------
+* Note that you are free to use any array length and offset
+* that makes each array 4x larger than the last-level cache.
+* The intent is to determine the *best* sustainable bandwidth
+* available with this simple coding. Of course, lower values
+* are usually fairly easy to obtain on cached machines, but
+* by keeping the test to the *best* results, the answers are
+* easier to interpret.
+* You may put the arrays in common or not, at your discretion.
+* There is a commented-out COMMON statement below.
+* Fortran90 "allocatable" arrays are fine, too.
+* ------------------------------------------------------------
+* 3) Compile the code with full optimization. Many compilers
+* generate unreasonably bad code before the optimizer tightens
+* things up. If the results are unreasonably good, on the
+* other hand, the optimizer might be too smart for me
+* Please let me know if this happens.
+* 4) Mail the results to mccalpin@cs.virginia.edu
+* Be sure to include:
+* a) computer hardware model number and software revision
+* b) the compiler flags
+* c) all of the output from the test case.
+* Please let me know if you do not want your name posted along
+* with the submitted results.
+* 5) See the web page for more comments about the run rules and
+* about interpretation of the results.
+*
+* Thanks,
+* Dr. Bandwidth
+*=========================================================================
+*
+ PROGRAM stream
+* IMPLICIT NONE
+C .. Parameters ..
+C Set thread_nbr equal to the number of threads in this run
+ INTEGER n,offset,ndim,ntimes,thread_nbr
+ PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10,
+ $ thread_nbr=1)
+C ..
+C .. Local Scalars ..
+ DOUBLE PRECISION dummy,scalar,t
+ INTEGER j,k,nbpw,quantum
+C ..
+C .. Local Arrays ..
+ DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
+ $ times(4,ntimes)
+ INTEGER bytes(4)
+ CHARACTER label(4)*11
+C ..
+C .. External Functions ..
+ DOUBLE PRECISION second
+ INTEGER checktick,realsize
+ EXTERNAL second,checktick,realsize
+C ..
+C .. Intrinsic Functions ..
+C
+ INTRINSIC dble,max,min,nint,sqrt
+C ..
+C .. Arrays in Common ..
+ DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
+C ..
+C .. Common blocks ..
+* COMMON a,b,c
+C ..
+C .. Data statements ..
+ DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
+ DATA label/'Copy: ','Scale: ','Add: ',
+ $ 'Triad: '/
+ DATA bytes/2,2,3,3/,dummy/0.0d0/
+C ..
+
+* --- SETUP --- determine precision and check timing ---
+
+ nbpw = realsize()
+
+ WRITE (*,FMT=9010) 'Array size = ',n
+ WRITE (*,FMT=9010) 'Offset = ',offset
+ WRITE (*,FMT=9020) 'The total memory requirement is ',
+ $ 3*nbpw*n/ (1024*1024),' MB'
+ WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
+ WRITE (*,FMT=9030) '--'
+ WRITE (*,FMT=9030) 'The *best* time for each test is used'
+ WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations'
+
+
+ IF ( thread_nbr.GT.1 ) THEN
+C Uncomment the call below when running multi-thread OpenMP tests
+C CALL omp_set_num_threads(thread_nbr);
+C Comment out the five statements below if running OpenMP tests
+ PRINT *,'--------------------------------------------------'
+ PRINT *,'When running with OpenMP, you must uncomment'
+ PRINT *,'the call to omp_set_num_threads and comment out'
+ PRINT *,'these statements. Check stream_d.f.'
+ GO TO 9060
+ END IF
+
+!$OMP PARALLEL DO
+ DO 10 j = 1,n
+ a(j) = 2.0d0
+ b(j) = 0.5D0
+ c(j) = 0.0D0
+ 10 CONTINUE
+ t = second(dummy)
+!$OMP PARALLEL DO
+ DO 20 j = 1,n
+ a(j) = 0.5d0*a(j)
+ 20 CONTINUE
+ t = second(dummy) - t
+ PRINT *,'----------------------------------------------------'
+ quantum = checktick()
+ WRITE (*,FMT=9000)
+ $ 'Your clock granularity/precision appears to be ',quantum,
+ $ ' microseconds'
+ PRINT *,'----------------------------------------------------'
+
+* --- MAIN LOOP --- repeat test cases NTIMES times ---
+ scalar = 0.5d0*a(1)
+ DO 70 k = 1,ntimes
+
+ t = second(dummy)
+ a(1) = a(1) + t
+!$OMP PARALLEL DO
+ DO 30 j = 1,n
+ c(j) = a(j)
+ 30 CONTINUE
+ t = second(dummy) - t
+ c(n) = c(n) + t
+ times(1,k) = t
+
+ t = second(dummy)
+ c(1) = c(1) + t
+!$OMP PARALLEL DO
+ DO 40 j = 1,n
+ b(j) = scalar*c(j)
+ 40 CONTINUE
+ t = second(dummy) - t
+ b(n) = b(n) + t
+ times(2,k) = t
+
+ t = second(dummy)
+ a(1) = a(1) + t
+!$OMP PARALLEL DO
+ DO 50 j = 1,n
+ c(j) = a(j) + b(j)
+ 50 CONTINUE
+ t = second(dummy) - t
+ c(n) = c(n) + t
+ times(3,k) = t
+
+ t = second(dummy)
+ b(1) = b(1) + t
+!$OMP PARALLEL DO
+ DO 60 j = 1,n
+ a(j) = b(j) + scalar*c(j)
+ 60 CONTINUE
+ t = second(dummy) - t
+ a(n) = a(n) + t
+ times(4,k) = t
+ 70 CONTINUE
+
+* --- SUMMARY ---
+ DO 90 k = 2,ntimes-1
+ DO 80 j = 1,4
+ avgtime(j) = avgtime(j) + times(j,k)
+ mintime(j) = min(mintime(j),times(j,k))
+ maxtime(j) = max(maxtime(j),times(j,k))
+ 80 CONTINUE
+ 90 CONTINUE
+ WRITE (*,FMT=9040)
+ DO 100 j = 1,4
+ avgtime(j) = avgtime(j)/dble(ntimes-2)
+ WRITE (*,FMT=9050) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6,
+ $ avgtime(j),mintime(j),maxtime(j)
+ 100 CONTINUE
+ PRINT *,'----------------------------------------------------'
+ CALL checksums (a,b,c,n,ntimes)
+ PRINT *,'----------------------------------------------------'
+
+ 9000 FORMAT (1x,a,i6,a)
+ 9010 FORMAT (1x,a,i10)
+ 9020 FORMAT (1x,a,i4,a)
+ 9030 FORMAT (1x,a,i3,a,a)
+ 9040 FORMAT ('Function',5x,'Rate (MB/s) Avg time Min time Max time'
+ $ )
+ 9050 FORMAT (a,4 (f10.4,2x))
+ 9060 END
+
+*-------------------------------------
+* INTEGER FUNCTION dblesize()
+*
+* A semi-portable way to determine the precision of DOUBLE PRECISION
+* in Fortran.
+* Here used to guess how many bytes of storage a DOUBLE PRECISION
+* number occupies.
+*
+ INTEGER FUNCTION realsize()
+* IMPLICIT NONE
+
+C .. Local Scalars ..
+ DOUBLE PRECISION result,test
+ INTEGER j,ndigits
+C ..
+C .. Local Arrays ..
+ DOUBLE PRECISION ref(30)
+C ..
+C .. External Subroutines ..
+ EXTERNAL confuse
+C ..
+C .. Intrinsic Functions ..
+ INTRINSIC abs,acos,log10,sqrt
+C ..
+
+C Test #1 - compare single(1.0d0+delta) to 1.0d0
+
+ 10 DO 20 j = 1,30
+ ref(j) = 1.0d0 + 10.0d0** (-j)
+ 20 CONTINUE
+
+ DO 30 j = 1,30
+ test = ref(j)
+ ndigits = j
+ CALL confuse(test,result)
+ IF (test.EQ.1.0D0) THEN
+ GO TO 40
+ END IF
+ 30 CONTINUE
+ GO TO 50
+
+ 40 WRITE (*,FMT='(a)')
+ $ '----------------------------------------------'
+ WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
+ $ ndigits,' digits of accuracy'
+ IF (ndigits.LE.8) THEN
+ realsize = 4
+ ELSE
+ realsize = 8
+ END IF
+ WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
+ $ ' bytes per DOUBLE PRECISION word'
+ WRITE (*,FMT='(a)')
+ $ '----------------------------------------------'
+ RETURN
+
+ 50 PRINT *,'Hmmmm. I am unable to determine the size.'
+ PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
+ $ ' number : '
+ READ (*,FMT=*) realsize
+ IF (realsize.NE.4 .AND. realsize.NE.8) THEN
+ PRINT *,'Your answer ',realsize,' does not make sense.'
+ PRINT *,'Try again.'
+ PRINT *,'Please enter the number of Bytes per ',
+ $ 'DOUBLE PRECISION number : '
+ READ (*,FMT=*) realsize
+ END IF
+ PRINT *,'You have manually entered a size of ',realsize,
+ $ ' bytes per DOUBLE PRECISION number'
+ WRITE (*,FMT='(a)')
+ $ '----------------------------------------------'
+ END
+
+ SUBROUTINE confuse(q,r)
+* IMPLICIT NONE
+C .. Scalar Arguments ..
+ DOUBLE PRECISION q,r
+C ..
+C .. Intrinsic Functions ..
+ INTRINSIC cos
+C ..
+ r = cos(q)
+ RETURN
+ END
+
+* A semi-portable way to determine the clock granularity
+* Adapted from a code by John Henning of Digital Equipment Corporation
+*
+ INTEGER FUNCTION checktick()
+* IMPLICIT NONE
+
+C .. Parameters ..
+ INTEGER n
+ PARAMETER (n=20)
+C ..
+C .. Local Scalars ..
+ DOUBLE PRECISION dummy,t1,t2
+ INTEGER i,j,jmin
+C ..
+C .. Local Arrays ..
+ DOUBLE PRECISION timesfound(n)
+C ..
+C .. External Functions ..
+ DOUBLE PRECISION second
+ EXTERNAL second
+C ..
+C .. Intrinsic Functions ..
+ INTRINSIC max,min,nint
+C ..
+ i = 0
+ dummy = 0.0d0
+ t1 = second(dummy)
+
+ 10 t2 = second(dummy)
+ IF (t2.EQ.t1) GO TO 10
+
+ t1 = t2
+ i = i + 1
+ timesfound(i) = t1
+ IF (i.LT.n) GO TO 10
+
+ jmin = 1000000
+ DO 20 i = 2,n
+ j = nint((timesfound(i)-timesfound(i-1))*1d6)
+ jmin = min(jmin,max(j,0))
+ 20 CONTINUE
+
+ IF (jmin.GT.0) THEN
+ checktick = jmin
+ ELSE
+ PRINT *,'Your clock granularity appears to be less ',
+ $ 'than one microsecond'
+ checktick = 1
+ END IF
+ RETURN
+
+* PRINT 14, timesfound(1)*1d6
+* DO 20 i=2,n
+* PRINT 14, timesfound(i)*1d6,
+* & nint((timesfound(i)-timesfound(i-1))*1d6)
+* 14 FORMAT (1X, F18.4, 1X, i8)
+* 20 CONTINUE
+
+ END
+
+
+
+
+ SUBROUTINE checksums(a,b,c,n,ntimes)
+* IMPLICIT NONE
+C ..
+C .. Arguments ..
+ DOUBLE PRECISION a(*),b(*),c(*)
+ INTEGER n,ntimes
+C ..
+C .. Local Scalars ..
+ DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon
+ INTEGER k
+C ..
+
+C Repeat the main loop, but with scalars only.
+C This is done to check the sum & make sure all
+C iterations have been executed correctly.
+
+ aa = 2.0D0
+ bb = 0.5D0
+ cc = 0.0D0
+ aa = 0.5D0*aa
+ scalar = 0.5d0*aa
+ DO k = 1,ntimes
+ cc = aa
+ bb = scalar*cc
+ cc = aa + bb
+ aa = bb + scalar*cc
+ END DO
+ aa = aa*DBLE(n-2)
+ bb = bb*DBLE(n-2)
+ cc = cc*DBLE(n-2)
+
+C Now sum up the arrays, excluding the first and last
+C elements, which are modified using the timing results
+C to confuse aggressive optimizers.
+
+ suma = 0.0d0
+ sumb = 0.0d0
+ sumc = 0.0d0
+!$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc)
+ DO 110 j = 2,n-1
+ suma = suma + a(j)
+ sumb = sumb + b(j)
+ sumc = sumc + c(j)
+ 110 CONTINUE
+
+ epsilon = 1.D-6
+
+ IF (ABS(suma-aa)/suma .GT. epsilon) THEN
+ PRINT *,'Failed Validation on array a()'
+ PRINT *,'Target Sum of a is = ',aa
+ PRINT *,'Computed Sum of a is = ',suma
+ ELSEIF (ABS(sumb-bb)/sumb .GT. epsilon) THEN
+ PRINT *,'Failed Validation on array b()'
+ PRINT *,'Target Sum of b is = ',bb
+ PRINT *,'Computed Sum of b is = ',sumb
+ ELSEIF (ABS(sumc-cc)/sumc .GT. epsilon) THEN
+ PRINT *,'Failed Validation on array c()'
+ PRINT *,'Target Sum of c is = ',cc
+ PRINT *,'Computed Sum of c is = ',sumc
+ ELSE
+ PRINT *,'Solution Validates!'
+ ENDIF
+
+ END
+