! Test of matrix multiplication for different aspect ratios ! ! l 2,500 4,000 1,600 550 12,000 ! m 10 100 1,000 10,000 100 ! n 12,000 3,500 1,400 250 1,500 ! storage (mb) 459.06 222.40 69.27 105.09 294.11 ! op count (mFlops) 300 1,400 2,240 1,375 1,800 ! call Perform_test ( 1600, 1000, 1400, 'Uniform Array Size') call Perform_Test ( 4500, 100, 3500, 'Test with small m') call Perform_Test ( 12000, 100, 1500, 'Test with large l') call Perform_Test ( 550, 10000, 250, 'Test with large m') call Perform_Test ( 2500, 10, 12000, 'Test with large n') ! end MODULE AUDIT ! timing using RDTSC ( cpu_clock@ ) integer*8 :: tick_a, tick_b, tick_c, tick_d, tick1 real*8 :: tick_rate = -1 contains subroutine report_ticks (num_tick, names) integer*4 num_tick character names(num_tick)*12 ! if (tick_rate < 0) then tick1 = get_RDTSC_Rate () tick_rate = tick1 end if ! if (num_tick >= 1) write (*,1) names(1), dble(tick_a)/tick_rate if (num_tick >= 2) write (*,1) names(2), dble(tick_b)/tick_rate if (num_tick >= 3) write (*,1) names(3), dble(tick_c)/tick_rate if (num_tick >= 4) write (*,1) names(4), dble(tick_d)/tick_rate 1 format (2x,a,' =',f10.4,' sec') end subroutine report_ticks integer*8 function Get_Query_Perform_tick () ! Win API Routine ! STDCALL QUERYPERFORMANCECOUNTER 'QueryPerformanceCounter' (REF):LOGICAL*4 ! integer*8 :: tick integer*8 :: first_tick = -1 logical*4 :: ll ! tick = 1 ll = QueryPerformanceCounter (tick) ! cycle count; assume ll is ok if (first_tick < 0) first_tick = tick Get_Query_Perform_tick = tick - first_tick ! END function Get_Query_Perform_tick integer*8 function Get_Query_Perform_rate () ! Win API Routine ! STDCALL QUERYPERFORMANCEFREQUENCY 'QueryPerformanceFrequency' (REF):LOGICAL*4 ! integer*8 :: tick logical*4 :: ll ! tick = 1 ll = QueryPerformanceFrequency (tick) ! cycle rate Get_Query_Perform_rate = tick ! END function Get_Query_Perform_rate integer*8 function get_RDTSC_tick () ! api RDTSC interface to processor clock rate ! integer*8 :: tick integer*8 :: first_tick = -1 real*10 cpu_clock@ intrinsic cpu_clock@ ! tick = cpu_clock@ () if (first_tick < 0) then first_tick = tick write (*,*) ' RDTSC first tick =', first_tick end if get_RDTSC_tick = tick - first_tick ! end function get_RDTSC_tick integer*8 function get_RDTSC_rate () ! ! initialises rdtsc pointer and estimates rdtsc tick rate based on timing with ticks of QueryPerformanceCounter ! ! calibrate using num_cycle of QueryPerform ! integer*8 rd_list(2), qp_list(2), tick_rd, last_qp, tick_qp, Query_ticks, RDTSC_ticks, call_rate, query_rate real*8 Test_Seconds integer*4 i, i_list(2), k, kk, Test_Cycles ! ! RDTSC timing integer*8, parameter :: million = 1000000 integer*8, parameter :: thousand = 1000 integer*8 :: processor_rate = 3000 * million ! estimate of processor rate approximate only integer*8 :: cpu_cycles_per_call = 200 ! estimate of cpu cycles per call (approximate) integer*8 :: rdtsc_rate = -1 ! or 2666701126 ticks per second ~ processor clock rate integer*4 :: num_cycles ! number of call cycles for test ! if ( rdtsc_rate <= 0) then write (*,10) ' ' write (*,11) 'cpu_clock@ Initialisation ', rdtsc_rate ! tick_rd = get_RDTSC_tick () ! API RDTSC interface last_qp = Get_Query_Perform_tick () ! API QueryPerform interface query_rate = Get_Query_Perform_rate () ! num_cycles = ( processor_rate / cpu_cycles_per_call ) / thousand ! calls expected in .001 seconds ! k = -1 kk = 1 last_qp = -1 do i = 0, num_cycles tick_rd = get_RDTSC_tick () ! API RDTSC interface tick_qp = Get_Query_Perform_tick () ! API QueryPerform interface if (tick_qp == last_qp) cycle ! wait for Get_Query_Perform_tick to tick over ! ! only record when QueryPerform changes value last_qp = tick_qp k = k+1 ! number of changes of QueryPerform if (k==2) kk = 2 ! shift to second storage i_list(kk) = i ! number of cycles qp_list(kk) = tick_qp ! ticks of QueryPerform rd_list(kk) = tick_rd ! ticks of RDTSC end do ! ! Get statistics from recording interval Test_Cycles = i_list(2) - i_list(1) ! number of calls Query_ticks = qp_list(2)-qp_list(1) ! number of Query_Perform ticks RDTSC_ticks = rd_list(2)-rd_list(1) ! number of RDTSC ticks Test_Seconds = dble (Query_ticks) / dble (Query_rate) ! duration of test ! rdtsc_rate = dble (RDTSC_ticks) / Test_Seconds ! RDTSC rate (ticks per second) ! call_rate = dble (Test_Cycles) / Test_Seconds cpu_cycles_per_call = RDTSC_ticks / Test_Cycles write (*,12) 'initialise duration =', Test_Seconds, ' seconds' write (*,11) 'cpu_clock@ duration =', RDTSC_ticks, ' ticks' write (*,11) 'cpu_clock@ cycles =', Test_Cycles, ' calls' write (*,11) 'cpu_clock@ call rate =', call_rate, ' calls per second' write (*,11) 'ticks per test cycle =', cpu_cycles_per_call, ' ticks' ! write (*,11) 'cpu_clock@ rate =', rdtsc_rate, ' ticks per second' write (*,10) 'cpu_clock@ initialised' write (*,10) ' ' ! end if ! get_RDTSC_rate = rdtsc_rate ! 10 format (3x,a) 11 format (3x,a,i15,a) 12 format (3x,a,f15.6,a) ! 13 format (3x,a,f15.1,a) ! end function get_RDTSC_rate end MODULE AUDIT ! subroutine Perform_test (l,m,n, description) ! program to run the test ! use audit ! character description *(*) integer*4 l,m,n ! integer*4 i,j real*8, allocatable, dimension (:,:) :: a,b,c,d, at real*8 mb ! tick1 = get_rdtsc_rate() ! write (*,2000) l,m,n, description 2000 format (/80('=')/' Test for l,m,n = ',i0,1x,i0,1x,i0,' : ',a) ! mb = dble (l*m*2 + m*n + l*n*2 ) / (2.**17) write (*,*) ' ' write (*,*) ' Initialise', mb,' Mb' ! ! Allocate arrays tick1 = get_rdtsc_tick() allocate ( A(l,m) ) allocate ( B(m,n) ) allocate ( C(l,n) ) allocate ( D(l,n) ) allocate ( At(m,l) ) tick_a = get_rdtsc_tick() - tick1 ! ! Initialise A and B do j = 1,m do i = 1,l call random_number (A(i,j)) At(j,i) = A(i,j) end do end do tick_b = get_rdtsc_tick() - tick1 ! tick1 = get_rdtsc_tick() do j = 1,n do i = 1,m call random_number (B(i,j)) end do end do tick_c = get_rdtsc_tick() - tick1 call report_ticks (3, (/ 'Allocate ', 'Initial A ', 'Initial B ' /) ) ! write (*,*) ' ' write (*,*) 'MATRIX_transpose MULTIPLICATION TEST' call Matrix_Mult_tran ( At, B, D, l, m, n ) ! write (*,*) ' ' write (*,*) 'MATRIX MULTIPLICATION TEST' call Matrix_Mult ( A, B, C, l, m, n ) ! call test_accuracy ( C, D, l, n ) write (*,*) ' ' write (*,*) 'MATMUL Intrinsic TEST' tick1 = get_rdtsc_tick() c = matmul (a,b) tick_a = get_rdtsc_tick() - tick1 call report_ticks (1, (/ 'MATMUL int ' /) ) ! call test_accuracy ( C, D, l, n ) ! deallocate ( A ) deallocate ( B ) deallocate ( C ) deallocate ( D ) deallocate ( At ) end Subroutine Matrix_Mult ( A, B, C, l, m, n ) ! ! calculate Matrix Multiply C(l,n) = A(l,m) x B(m,n) ! USE AUDIT ! integer*4, intent (IN) :: l, m, n real*8, intent (IN) :: A(l,m) real*8, intent (IN) :: B(m,n) real*8, intent (OUT) :: C(l,n) ! integer*4 :: i,j,k real*8 :: s ! ! Normal calculation order tick1 = get_rdtsc_tick() do i = 1,l do j = 1,n s = 0 do k = 1,m ! C(i,j) = C(i,j) + A(i,k) * B(k,j) s = s + A(i,k) * B(k,j) end do C(i,j) = s end do end do tick_a = get_rdtsc_tick() - tick1 ! ! Re-order to use Vector addition ! C = 0 ! do j = 1,n ! do k = 1,m ! do i = 1,l ! C(i,j) = C(i,j) + A(i,k) * B(k,j) ! end do ! end do ! end do ! ! this becomes tick1 = get_rdtsc_tick() C = 0 do j = 1,n do k = 1,m C(:,j) = C(:,j) + A(:,k) * B(k,j) end do end do tick_b = get_rdtsc_tick() - tick1 ! ! or ! do j = 1,n ! C(:,j) = 0 ! do k = 1,m ! C(:,j) = C(:,j) + A(:,k) * B(k,j) ! end do ! end do ! ! or tick1 = get_rdtsc_tick() do j = 1,n C(:,j) = 0 do k = 1,m call Vec_Add ( C(1,j), A(1,k), B(k,j), l) end do end do tick_c = get_rdtsc_tick() - tick1 ! ! or tick1 = get_rdtsc_tick() do j = 1,n C(:,j) = 0 do k = 1,m call Vec_Add_SSE ( C(1,j), A(1,k), B(k,j), l) end do end do tick_d = get_rdtsc_tick() - tick1 ! call report_ticks (4, (/ 'Basic Loops ', 'Array syntax', 'Vec_Add ', 'Vec_Add_SSE ' /) ) end Subroutine Matrix_Mult Subroutine Matrix_Mult_Tran ( A, B, C, l, m, n ) USE AUDIT ! ! calculate Matrix Multiply C(l,n) = A(m,l)' x B(m,n) ! integer*4, intent (IN) :: l, m, n real*8, intent (IN) :: A(m,l) real*8, intent (IN) :: B(m,n) real*8, intent (OUT) :: C(l,n) real*8 vec_sum, Vec_Sum_SSE external Vec_Sum, Vec_Sum_SSE ! integer*4 :: i,j,k ! ! Normal calculation order tick1 = get_rdtsc_tick() C = 0 do i = 1,l do j = 1,n do k = 1,m C(i,j) = C(i,j) + A(k,i) * B(k,j) end do end do end do tick_a = get_rdtsc_tick() - tick1 ! ! Utilise Dot_Product, this becomes tick1 = get_rdtsc_tick() do i = 1,l do j = 1,n C(i,j) = Dot_Product ( A(:,i), B(:,j) ) end do end do tick_b = get_rdtsc_tick() - tick1 ! ! Utilise Dot_Product, this becomes tick1 = get_rdtsc_tick() do i = 1,l do j = 1,n C(i,j) = Vec_Sum ( A(1,i), B(1,j), m) end do end do tick_c = get_rdtsc_tick() - tick1 ! tick1 = get_rdtsc_tick() do i = 1,l do j = 1,n C(i,j) = Vec_Sum_SSE ( A(1,i), B(1,j), m) end do end do tick_d = get_rdtsc_tick() - tick1 ! call report_ticks (4, (/ 'Basic Loops ', 'Dot_Product ', 'Vec_Sum ', 'Vec_Sum_SSE ' /) ) ! end Subroutine Matrix_Mult_Tran subroutine test_accuracy ( C, D, l, n ) ! ! check the difference between matrix calculation C(l,n) and D(l,n) ! USE AUDIT ! integer*4 l, n, i,j real*8 C(l,n), D(l,n), emax, ei write (*,*) ' ' write (*,*) 'CHECK Accuracy' tick1 = get_rdtsc_tick() emax = 0 do j = 1,n do i = 1,l ei = abs (c(i,j)-d(i,j)) if (ei > emax) emax = ei end do end do tick_a = get_rdtsc_tick() - tick1 call report_ticks (1, (/ 'Check C D ' /) ) write (*,*) 'emax=',emax ! end subroutine test_accuracy !===Vector=Array=Library==== SUBROUTINE Vec_Add (A, B, FAC, N) ! ! Performs the vector opperation [A] = [A] + [B] * FAC ! integer*4, intent (in) :: n real*8, dimension(n), intent (inout) :: A real*8, dimension(n), intent (in) :: B real*8, intent (in) :: fac ! A = A + B * fac return ! END SUBROUTINE Vec_Mult (C, A, B, N) ! ! Performs the vector opperation [C] = [A] * [B] ! integer*4, intent (in) :: n real*8, dimension(n), intent (out) :: C real*8, dimension(n), intent (in) :: A real*8, dimension(n), intent (in) :: B ! C = A * B RETURN ! END REAL*8 FUNCTION Vec_Sum (A, B, N) ! ! Performs a vector dot product VECSUM = [A] . [B] ! account is NOT taken of the zero terms in the vectors ! integer*4, intent (in) :: n real*8, dimension(n), intent (in) :: A real*8, dimension(n), intent (in) :: B ! real*8 c integer*4 i ! c = 0.0 do i = 1,n c = c + A(i)*B(i) end do Vec_Sum = c RETURN ! END subroutine Vec_Add_SSE (y, x, a, n) integer n real*8 x(*), y(*), a integer nn, m real*8 v(2) ! ! Does SAXPY update Y = Y + A*X ! ! On exit X is unchanged, Y is overwritten with Y + A*X ! ! Last edited by davidb on Sun Jun 03, 2012 2:18 am; edited 6 times in total ! if (n==1) then y(1) = y(1) + a*x(1) ! else if (n > 1) then ! nn = n/4 ! Number of sets of 4 m = n - 4*nn ! Number of residuals v = a ! Vector containing (/a, a/) ! ! Assembly code is between code, edoc lines code movupd xmm7%, v ; move v array to xmm7 mov eax%, =x ; address of x mov ecx%, =y ; address of y mov edx%, nn ; initialise loop counter for vectorised loop mov edi%, m ; initialise loop counter for scalar loop cmp edx%, 0 jz $60 ; no terms in vectorized loop ! Check for alignment of x(1) and y(1) on 16 byte boundaries and eax%, 15 ; low nibble of x address and ecx%, 15 ; low nibble of y address cmp eax%, 0 jnz $10 ; x(1) is unaligned, check for alignment of x(2) cmp ecx%, 0 jnz $10 ; y(1) is unaligned, check for alignment of y(2) ! x(1) and y(1) are aligned on 16 byte boundaries mov eax%, =x ; address of x mov ecx%, =y ; address of y jmp $40 ; branch to aligned vectorized loop ! Check for alignment of x(2) and y(2) on 16 byte boundaries 10 xor eax%, 8 jnz $20 ; branch to unaligned vectorized loop xor ecx%, 8 jnz $20 ; branch to unaligned vectorized loop ! x(2) and y(2) are aligned on 16 byte boundaries mov eax%, =x ; address of x argument mov ecx%, =y ; address of y argument movsd xmm0%, [eax%] movsd xmm2%, [ecx%] mulsd xmm0%, xmm7% addsd xmm0%, xmm2% movsd [ecx%], xmm0% ; form y(1) = y(1) + a*x(1) lea eax%, [eax%+8] ; address of x(2) lea ecx%, [ecx%+8] ; address of y(2) dec edi% ; decrease loop counter for scalar loop jge $40 ; branch to aligned vectorized loop mov edi%, 3 ; adjust loop counter for scalar loop dec edx% ; adjust loop counter for vectorised loop jz $60 ; no terms in vectorized loop jmp $40 ; branch to aigned vectorized loop ! ! Initialise unaligned vectorised loop 20 mov eax%, =x ; address of x argument mov ecx%, =y ; address of y argument ! Unaligned vectorised loop to calculate y = y + a*x for element sets 1 to nn 30 movupd xmm0%, [eax%] ; move next 2 doubles in x into xmm0 movupd xmm1%, [eax%+16] ; move next 2 doubles in x into xmm1 movupd xmm2%, [ecx%] ; move next 2 doubles in y into xmm2 movupd xmm3%, [ecx%+16] ; move next 2 doubles in y into xmm3 mulpd xmm0%, xmm7% ; multiply x vector by scalar mulpd xmm1%, xmm7% ; multiply x vector by scalar addpd xmm0%, xmm2% ; form y + a*x vector addpd xmm1%, xmm3% ; form y + a*x vector movupd [ecx%], xmm0% ; move xmm0 into next 2 doubles in y movupd [ecx%+16], xmm1% ; move xmm1 into next 2 doubles in y lea eax%, [eax%+32] lea ecx%, [ecx%+32] dec edx% ; decrement counter jnz $30 ; conditional branch back jmp $60 ! Aligned vectorised loop to calculate y = y + a*x for element sets 1 to nn 40 movapd xmm0%, [eax%] ; move next 2 doubles in x into xmm0 movapd xmm1%, [eax%+16] ; move next 2 doubles in x into xmm1 movapd xmm2%, [ecx%] ; move next 2 doubles in y into xmm2 movapd xmm3%, [ecx%+16] ; move next 2 doubles in y into xmm3 mulpd xmm0%, xmm7% ; multiply x vector by scalar mulpd xmm1%, xmm7% ; multiply x vector by scalar addpd xmm0%, xmm2% ; form y + a*x vector addpd xmm1%, xmm3% ; form y + a*x vector movapd [ecx%], xmm0% ; move xmm0 into next 2 doubles in y movapd [ecx%+16], xmm1% ; move xmm1 into next 2 doubles in y lea eax%, [eax%+32] lea ecx%, [ecx%+32] dec edx% ; decrement counter jnz $40 ; conditional branch back ! Check for entries in the scalar loop 60 cmp edi%, 0 jz $80 ; conditional skip loop, no terms in scalar loop ! Scalar loop to calculate y = y + a*x for the last nn+1 to n elements 70 movsd xmm0%, [eax%] movsd xmm2%, [ecx%] mulsd xmm0%, xmm7% addsd xmm0%, xmm2% movsd [ecx%], xmm0% lea eax%, [eax%+8] lea ecx%, [ecx%+8] dec edi% jnz $70 ! End of assembly 80 nop edoc end if end subroutine Vec_Add_SSE !** DOUBLE PRECISION VERSION ! now DO loop version real*8 function Vec_Sum_SSE (v1,v2,n) integer n real*8 v1(*), v2(*) ! integer nn, m real*8 :: v(2) ! if (n < 1) then Vec_Sum_SSE = 0 else if (n==1) then Vec_Sum_SSE = v1(1)*v2(1) else nn = n/4 ! Number of sets of 4 m = n - 4*nn ! Number of residuals ! Assembly code is between code, edoc lines code xorps xmm0%, xmm0% ; set xmm0 to zero xorps xmm1%, xmm1% ; set xmm1 to zero mov eax%, =v1 ; address of v1 mov ecx%, =v2 ; address of v2 mov edx%, nn ; initialise loop counter for vectorised loop mov edi%, m ; initialise loop counter for scalar loop cmp edx%, 0 jz $60 ; no terms in vectorized loop ! Check for alignment of v1(1) and v2(1) on 16 byte boundaries and eax%, 15 ; low nibble of v1 address and ecx%, 15 ; low nibble of v2 address cmp eax%, 0 jnz $10 ; v1(1) is unaligned, check for alignment of v1(2) cmp ecx%, 0 jnz $10 ; v2(1) is unaligned, check for alignment of v2(2) ! v1(1) and v2(1) are aligned on 16 byte boundaries mov eax%, =v1 ; address of v1 mov ecx%, =v2 ; address of v2 jmp $40 ; branch to aligned vectorized loop ! Check for alignment of v1(2) and v2(2) on 16 byte boundaries 10 xor eax%, 8 jnz $20 ; branch to unaligned vectorized loop xor ecx%, 8 jnz $20 ; branch to unaligned vectorized loop ! v1(2) and v2(2) are aligned on 16 byte boundaries mov eax%, =v1 ; address of v1 argument mov ecx%, =v2 ; address of v2 argument movsd xmm0%, [eax%] mulsd xmm0%, [ecx%] ; initialise with product v1(1)*v2(1) lea eax%, [eax%+8] ; address of v1(2) lea ecx%, [ecx%+8] ; address of v2(2) dec edi% ; decrease loop counter for scalar loop jge $40 ; branch to aligned vectorized loop mov edi%, 3 ; adjust loop counter for scalar loop dec edx% ; adjust loop counter for vectorised loop jz $60 ; no terms in vectorized loop jmp $40 ; branch to aligned vectorized loop ! Initialise unaligned vectorised loop 20 mov eax%, =v1 ; address of v1 argument mov ecx%, =v2 ; address of v2 argument ! Unaligned vectorised loop to accumulate dot product of elements 1 to nn 30 movupd xmm2%, [eax%] ; move next 2 doubles in v1 into xmm2 movupd xmm4%, [eax%+16] ; move next 2 doubles in v1 into xmm4 movupd xmm3%, [ecx%] ; move next 2 doubles in v2 into xmm3 movupd xmm5%, [ecx%+16] ; move next 2 doubles in v2 into xmm5 mulpd xmm2%, xmm3% ; multiply values and store in xmm2 mulpd xmm4%, xmm5% ; multiply values and store in xmm4 addpd xmm0%, xmm2% ; accumulate sum in xmm0 addpd xmm1%, xmm4% ; accumulate sum in xmm1 lea eax%, [eax%+32] lea ecx%, [ecx%+32] dec edx% ; decrement counter jnz $30 ; conditional branch back jmp $50 ! Aligned vectorised loop to accumulate dot product of elements 1 to nn 40 movapd xmm2%, [eax%] ; move next 2 doubles in v1 into xmm2 movapd xmm4%, [eax%+16] ; move next 2 doubles in v1 into xmm4 mulpd xmm2%, [ecx%] ; multiply values and store in xmm2 mulpd xmm4%, [ecx%+16] ; multiply values and store in xmm4 addpd xmm0%, xmm2% ; accumulate sum in xmm0 addpd xmm1%, xmm4% ; accumulate sum in xmm1 lea eax%, [eax%+32] lea ecx%, [ecx%+32] dec edx% ; decrement counter jnz $40 ; conditional branch back 50 addpd xmm0%, xmm1% ; add xmm0 and xmm1 ! Check for entries in the scalar loop 60 cmp edi%, 0 jz $80 ; conditional skip loop, no terms in scalar loop ! Scalar loop to accumulate dot product of last nn+1 to n elements in low 64 bits of xmm0 70 movsd xmm1%, [eax%] mulsd xmm1%, [ecx%] addsd xmm0%, xmm1% lea eax%, [eax%+8] lea ecx%, [ecx%+8] dec edi% jnz $70 ! Can't get final reduction to work, so will do this in Fortran for now 80 movupd v, xmm0% ; move xmm0 to v array edoc ! Final reduction, the result is the sum of the two values in v Vec_Sum_SSE = sum(v) end if end function Vec_Sum_SSE