!  Test of matrix multiplication for different aspect ratios
!
!	l                  2,500   4,000   1,600     550  12,000 
!	m                     10     100   1,000  10,000     100 
!	n                 12,000   3,500   1,400     250   1,500 
!	storage (mb)      459.06  222.40   69.27  105.09  294.11 
!	op count (mFlops)    300   1,400   2,240   1,375   1,800 
!
      call Perform_test (  1600,  1000,  1400, 'Uniform Array Size')
      call Perform_Test (  4500,   100,  3500, 'Test with small m')
      call Perform_Test ( 12000,   100,  1500, 'Test with large l')
      call Perform_Test (   550, 10000,   250, 'Test with large m')
      call Perform_Test (  2500,    10, 12000, 'Test with large n')
!
      end

MODULE AUDIT   !  timing using RDTSC  ( cpu_clock@ )

      integer*8 :: tick_a, tick_b, tick_c, tick_d, tick1
      real*8    :: tick_rate = -1

contains

       subroutine report_ticks (num_tick, names)
       integer*4 num_tick
       character names(num_tick)*12
!
       if (tick_rate < 0) then
          tick1     = get_RDTSC_Rate ()
          tick_rate = tick1
       end if
!       
       if (num_tick >= 1) write (*,1) names(1), dble(tick_a)/tick_rate
       if (num_tick >= 2) write (*,1) names(2), dble(tick_b)/tick_rate
       if (num_tick >= 3) write (*,1) names(3), dble(tick_c)/tick_rate
       if (num_tick >= 4) write (*,1) names(4), dble(tick_d)/tick_rate
1      format (2x,a,' =',f10.4,' sec')         
       end subroutine report_ticks

       integer*8 function Get_Query_Perform_tick ()     !     Win API Routine
!
       STDCALL QUERYPERFORMANCECOUNTER   'QueryPerformanceCounter' (REF):LOGICAL*4
!
       integer*8 :: tick
       integer*8 :: first_tick = -1
       logical*4 :: ll
!
       tick = 1
       ll   = QueryPerformanceCounter (tick)          ! cycle count; assume ll is ok
       if (first_tick < 0) first_tick = tick
       Get_Query_Perform_tick   = tick - first_tick
!
       END function Get_Query_Perform_tick
 
       integer*8 function Get_Query_Perform_rate ()      !     Win API Routine
!
       STDCALL QUERYPERFORMANCEFREQUENCY 'QueryPerformanceFrequency' (REF):LOGICAL*4
!
       integer*8 :: tick
       logical*4 :: ll
!
       tick = 1
       ll   = QueryPerformanceFrequency (tick)     ! cycle rate
       Get_Query_Perform_rate = tick
!
       END function Get_Query_Perform_rate
          
       integer*8 function get_RDTSC_tick ()        !  api RDTSC interface to processor clock rate
!
       integer*8 :: tick
       integer*8 :: first_tick = -1
       real*10   cpu_clock@
       intrinsic cpu_clock@
!
       tick = cpu_clock@ ()
       if (first_tick < 0) then
         first_tick = tick
         write (*,*) '  RDTSC first tick     =', first_tick
       end if
       get_RDTSC_tick = tick - first_tick
!
       end function get_RDTSC_tick
 
      integer*8 function get_RDTSC_rate ()
!
! initialises rdtsc pointer and estimates rdtsc tick rate based on timing with ticks of QueryPerformanceCounter
!
!  calibrate using num_cycle of QueryPerform
!
       integer*8  rd_list(2), qp_list(2), tick_rd, last_qp, tick_qp, Query_ticks, RDTSC_ticks, call_rate, query_rate
       real*8     Test_Seconds
       integer*4  i, i_list(2), k, kk,  Test_Cycles
!
!    RDTSC timing
       integer*8, parameter :: million              = 1000000
       integer*8, parameter :: thousand             = 1000     
       integer*8            :: processor_rate       = 3000 * million   ! estimate of processor rate approximate only
       integer*8            :: cpu_cycles_per_call  = 200              ! estimate of cpu cycles per call (approximate)
       integer*8            :: rdtsc_rate           = -1               ! or 2666701126  ticks per second ~ processor clock rate
       integer*4            :: num_cycles                              ! number of call cycles for test
!
       if ( rdtsc_rate <= 0) then
         write (*,10) ' '
         write (*,11) 'cpu_clock@ Initialisation ', rdtsc_rate
!
         tick_rd    = get_RDTSC_tick ()           ! API RDTSC interface
         last_qp    = Get_Query_Perform_tick ()   ! API QueryPerform interface
         query_rate = Get_Query_Perform_rate ()
!
         num_cycles = ( processor_rate / cpu_cycles_per_call ) / thousand  ! calls expected in .001 seconds
!
         k   = -1
         kk  =  1
         last_qp = -1
         do i = 0, num_cycles                  
            tick_rd = get_RDTSC_tick ()            ! API RDTSC interface
            tick_qp = Get_Query_Perform_tick ()    ! API QueryPerform interface
            if (tick_qp == last_qp) cycle          ! wait for Get_Query_Perform_tick to tick over
!
!         only record when QueryPerform changes value
            last_qp      = tick_qp
            k            = k+1       ! number of changes of QueryPerform
            if (k==2) kk = 2         ! shift to second storage
            i_list(kk)   = i         ! number of cycles
            qp_list(kk)  = tick_qp   ! ticks of QueryPerform
            rd_list(kk)  = tick_rd   ! ticks of RDTSC
         end do
!
!      Get statistics from recording interval
         Test_Cycles         = i_list(2) - i_list(1)                   ! number of calls
         Query_ticks         = qp_list(2)-qp_list(1)                   ! number of Query_Perform ticks
         RDTSC_ticks         = rd_list(2)-rd_list(1)                   ! number of RDTSC ticks
         Test_Seconds        = dble (Query_ticks) / dble (Query_rate)  ! duration of test
!
         rdtsc_rate          = dble (RDTSC_ticks) / Test_Seconds       ! RDTSC rate (ticks per second)
!
         call_rate           = dble (Test_Cycles) / Test_Seconds
         cpu_cycles_per_call = RDTSC_ticks / Test_Cycles
         write (*,12) 'initialise duration  =', Test_Seconds, ' seconds'
         write (*,11) 'cpu_clock@ duration  =', RDTSC_ticks,  ' ticks'
         write (*,11) 'cpu_clock@ cycles    =', Test_Cycles,  ' calls'
         write (*,11) 'cpu_clock@ call rate =', call_rate,    ' calls per second'
         write (*,11) 'ticks per test cycle =', cpu_cycles_per_call, ' ticks'
!
         write (*,11) 'cpu_clock@ rate      =', rdtsc_rate,   ' ticks per second'
         write (*,10) 'cpu_clock@ initialised'
         write (*,10) ' '
!
       end if
!
       get_RDTSC_rate = rdtsc_rate
!
    10 format (3x,a)
    11 format (3x,a,i15,a)
    12 format (3x,a,f15.6,a)
!    13 format (3x,a,f15.1,a)
!
       end function get_RDTSC_rate

end MODULE AUDIT
!

subroutine Perform_test (l,m,n, description)
!   program to run the test
!
use audit
!
      character description *(*)
      integer*4 l,m,n
!
      integer*4 i,j
      real*8, allocatable, dimension (:,:) :: a,b,c,d, at
      real*8  mb
!
      tick1 = get_rdtsc_rate()
!
      write (*,2000) l,m,n, description
2000  format (/80('=')/' Test for l,m,n = ',i0,1x,i0,1x,i0,' : ',a)
!      
      mb = dble (l*m*2 + m*n + l*n*2 ) / (2.**17)
      write (*,*) ' '
      write (*,*) ' Initialise', mb,' Mb'
!
!   Allocate arrays
      tick1 = get_rdtsc_tick()
      allocate ( A(l,m) )
      allocate ( B(m,n) )
      allocate ( C(l,n) )
      allocate ( D(l,n) )
      allocate ( At(m,l) )
      tick_a = get_rdtsc_tick() - tick1
!
!   Initialise A and B
      do j = 1,m
        do i = 1,l
          call random_number (A(i,j))
          At(j,i) = A(i,j)
        end do
      end do
      tick_b = get_rdtsc_tick() - tick1
!
      tick1 = get_rdtsc_tick()
      do j = 1,n
        do i = 1,m
          call random_number (B(i,j))
        end do
      end do
      tick_c = get_rdtsc_tick() - tick1
      call report_ticks (3, (/ 'Allocate    ', 'Initial A   ', 'Initial B   ' /) )
!
      write (*,*) ' '
      write (*,*) 'MATRIX_transpose MULTIPLICATION TEST'
      call Matrix_Mult_tran ( At, B, D, l, m, n )
!
      write (*,*) ' '
      write (*,*) 'MATRIX MULTIPLICATION TEST'
      call Matrix_Mult ( A, B, C, l, m, n )
!
      call test_accuracy ( C, D, l, n )

      write (*,*) ' '
      write (*,*) 'MATMUL Intrinsic TEST'
      tick1 = get_rdtsc_tick()
      c = matmul (a,b)
      tick_a = get_rdtsc_tick() - tick1
      call report_ticks (1, (/ 'MATMUL int  ' /) )
!
      call test_accuracy ( C, D, l, n )
!
      deallocate ( A )
      deallocate ( B )
      deallocate ( C )
      deallocate ( D )
      deallocate ( At )

      end

      Subroutine Matrix_Mult ( A, B, C, l, m, n )
!
!   calculate Matrix Multiply C(l,n) = A(l,m) x B(m,n)      
!
USE AUDIT
!
      integer*4, intent (IN)  :: l, m, n
      real*8,    intent (IN)  :: A(l,m)
      real*8,    intent (IN)  :: B(m,n)
      real*8,    intent (OUT) :: C(l,n)
!
      integer*4 :: i,j,k
      real*8    :: s
!
!   Normal calculation order
      tick1 = get_rdtsc_tick()
      do i = 1,l
        do j = 1,n
          s = 0
          do k = 1,m
!            C(i,j) = C(i,j) + A(i,k) * B(k,j)
            s = s + A(i,k) * B(k,j)
          end do
          C(i,j) = s
        end do
      end do  
      tick_a = get_rdtsc_tick() - tick1
!
!   Re-order to use Vector addition
!      C = 0
!        do j = 1,n
!          do k = 1,m
!      do i = 1,l
!            C(i,j) = C(i,j) + A(i,k) * B(k,j)
!          end do
!        end do
!      end do  
!
!    this becomes
      tick1 = get_rdtsc_tick()
      C = 0
        do j = 1,n
          do k = 1,m
            C(:,j) = C(:,j) + A(:,k) * B(k,j)
          end do
        end do  
      tick_b = get_rdtsc_tick() - tick1
!
!     or      
!        do j = 1,n
!          C(:,j) = 0
!          do k = 1,m
!            C(:,j) = C(:,j) + A(:,k) * B(k,j)
!          end do
!        end do  
!
!     or      
      tick1 = get_rdtsc_tick()
        do j = 1,n
          C(:,j) = 0
          do k = 1,m
            call Vec_Add ( C(1,j), A(1,k), B(k,j), l)
          end do
        end do  
      tick_c = get_rdtsc_tick() - tick1
!
!     or      
      tick1 = get_rdtsc_tick()
        do j = 1,n
          C(:,j) = 0
          do k = 1,m
            call Vec_Add_SSE ( C(1,j), A(1,k), B(k,j), l)
          end do
        end do  
      tick_d = get_rdtsc_tick() - tick1
!
      call report_ticks (4, (/ 'Basic Loops ', 'Array syntax', 'Vec_Add     ', 'Vec_Add_SSE ' /) )
      end Subroutine Matrix_Mult

      Subroutine Matrix_Mult_Tran ( A, B, C, l, m, n )
USE AUDIT
!
!   calculate Matrix Multiply C(l,n) = A(m,l)' x B(m,n)      
!
      integer*4, intent (IN)  :: l, m, n
      real*8,    intent (IN)  :: A(m,l)
      real*8,    intent (IN)  :: B(m,n)
      real*8,    intent (OUT) :: C(l,n)
      real*8   vec_sum, Vec_Sum_SSE
      external Vec_Sum, Vec_Sum_SSE
!
      integer*4 :: i,j,k
!
!   Normal calculation order
      tick1 = get_rdtsc_tick()
      C = 0
      do i = 1,l
        do j = 1,n
          do k = 1,m
            C(i,j) = C(i,j) + A(k,i) * B(k,j)
          end do
        end do
      end do  
      tick_a = get_rdtsc_tick() - tick1
!
!   Utilise Dot_Product, this becomes
      tick1 = get_rdtsc_tick()
      do i = 1,l
        do j = 1,n
          C(i,j) = Dot_Product ( A(:,i), B(:,j) )
        end do
      end do  
      tick_b = get_rdtsc_tick() - tick1
!
!   Utilise Dot_Product, this becomes
      tick1 = get_rdtsc_tick()
      do i = 1,l
        do j = 1,n
          C(i,j) = Vec_Sum ( A(1,i), B(1,j), m)
        end do
      end do  
      tick_c = get_rdtsc_tick() - tick1
!
      tick1 = get_rdtsc_tick()
      do i = 1,l
        do j = 1,n
          C(i,j) = Vec_Sum_SSE ( A(1,i), B(1,j), m)
        end do
      end do  
      tick_d = get_rdtsc_tick() - tick1
!
      call report_ticks (4, (/ 'Basic Loops ', 'Dot_Product ', 'Vec_Sum     ', 'Vec_Sum_SSE ' /) )
!
      end Subroutine Matrix_Mult_Tran

      subroutine test_accuracy ( C, D, l, n )
!
!   check the difference between matrix calculation C(l,n) and D(l,n)
!
USE AUDIT
!
      integer*4 l, n,  i,j
      real*8    C(l,n), D(l,n),  emax, ei
      
      write (*,*) ' '
      write (*,*) 'CHECK Accuracy'      
      tick1 = get_rdtsc_tick()
      emax = 0
      do j = 1,n
        do i = 1,l
          ei = abs (c(i,j)-d(i,j))
          if (ei > emax) emax = ei
        end do
      end do
      tick_a = get_rdtsc_tick() - tick1
      call report_ticks (1, (/ 'Check C D   ' /) )
      write (*,*) 'emax=',emax
!
     end subroutine test_accuracy

!===Vector=Array=Library====

      SUBROUTINE Vec_Add (A, B, FAC, N)
!
!   Performs the vector opperation  [A] = [A] + [B] * FAC
!
      integer*4,                 intent (in)    :: n
      real*8,    dimension(n),   intent (inout) :: A
      real*8,    dimension(n),   intent (in)    :: B
      real*8,                    intent (in)    :: fac
!
      A = A + B * fac
      return
!
      END

      SUBROUTINE Vec_Mult (C, A, B, N)
!
!   Performs the vector opperation  [C] = [A] * [B]
!
      integer*4,                 intent (in)    :: n
      real*8,    dimension(n),   intent (out)   :: C
      real*8,    dimension(n),   intent (in)    :: A
      real*8,    dimension(n),   intent (in)    :: B
!
      C = A * B
      RETURN
!
      END

      REAL*8 FUNCTION Vec_Sum (A, B, N)
!
!   Performs a vector dot product  VECSUM = [A] . [B]
!     account is NOT taken of the zero terms in the vectors
!
      integer*4,                 intent (in)    :: n
      real*8,    dimension(n),   intent (in)    :: A
      real*8,    dimension(n),   intent (in)    :: B
!
      real*8    c
      integer*4 i
!
      c = 0.0
      do i = 1,n
         c = c + A(i)*B(i)
      end do
      Vec_Sum = c
      RETURN
!
      END

subroutine Vec_Add_SSE (y, x, a, n)
   integer n
   real*8 x(*), y(*), a
 
   integer nn, m
   real*8 v(2)
 
!
! Does SAXPY update Y = Y + A*X
!
! On exit X is unchanged, Y is overwritten with Y + A*X
!
! Last edited by davidb on Sun Jun 03, 2012 2:18 am; edited 6 times in total
!
   if (n==1) then
     y(1) = y(1) + a*x(1)
!
   else if (n > 1) then
!
    nn = n/4            ! Number of sets of 4
    m  = n - 4*nn       ! Number of residuals
    v  = a              ! Vector containing (/a, a/)
!
   ! Assembly code is between code, edoc lines
    code
 
      movupd xmm7%, v            ; move v array to xmm7
 
      mov eax%, =x               ; address of x
      mov ecx%, =y               ; address of y
 
      mov edx%, nn               ; initialise loop counter for vectorised loop
      mov edi%, m                ; initialise loop counter for scalar loop
      cmp edx%, 0
      jz $60                     ; no terms in vectorized loop
 
      ! Check for alignment of x(1) and y(1) on 16 byte boundaries
      and eax%, 15               ; low nibble of x address
      and ecx%, 15               ; low nibble of y address
      cmp eax%, 0
      jnz $10                    ; x(1) is unaligned, check for alignment of x(2)
      cmp ecx%, 0
      jnz $10                    ; y(1) is unaligned, check for alignment of y(2)
 
      ! x(1) and y(1) are aligned on 16 byte boundaries
      mov eax%, =x               ; address of x
      mov ecx%, =y               ; address of y
      jmp $40                    ; branch to aligned vectorized loop
 
      ! Check for alignment of x(2) and y(2) on 16 byte boundaries
  10  xor eax%, 8
      jnz $20                    ; branch to unaligned vectorized loop
      xor ecx%, 8
      jnz $20                    ; branch to unaligned vectorized loop
 
      ! x(2) and y(2) are aligned on 16 byte boundaries
      mov eax%, =x               ; address of x argument
      mov ecx%, =y               ; address of y argument
      movsd xmm0%, [eax%]
      movsd xmm2%, [ecx%]
      mulsd xmm0%, xmm7%
      addsd xmm0%, xmm2%
      movsd [ecx%], xmm0%        ; form y(1) = y(1) + a*x(1)
      lea eax%, [eax%+8]         ; address of x(2)
      lea ecx%, [ecx%+8]         ; address of y(2)
      dec edi%                   ; decrease loop counter for scalar loop
      jge $40                    ; branch to aligned vectorized loop
      mov edi%, 3                ; adjust loop counter for scalar loop
      dec edx%                   ; adjust loop counter for vectorised loop
      jz $60                     ; no terms in vectorized loop
      jmp $40                    ; branch to aigned vectorized loop
!
 
      ! Initialise unaligned vectorised loop
  20  mov eax%, =x               ; address of x argument
      mov ecx%, =y               ; address of y argument
 
      ! Unaligned vectorised loop to calculate y = y + a*x for element sets 1 to nn
  30  movupd xmm0%, [eax%]       ; move next 2 doubles in x into xmm0
      movupd xmm1%, [eax%+16]    ; move next 2 doubles in x into xmm1
      movupd xmm2%, [ecx%]       ; move next 2 doubles in y into xmm2
      movupd xmm3%, [ecx%+16]    ; move next 2 doubles in y into xmm3
      mulpd xmm0%, xmm7%         ; multiply x vector by scalar
      mulpd xmm1%, xmm7%         ; multiply x vector by scalar
      addpd xmm0%, xmm2%         ; form y + a*x vector
      addpd xmm1%, xmm3%         ; form y + a*x vector
      movupd [ecx%], xmm0%       ; move xmm0 into next 2 doubles in y
      movupd [ecx%+16], xmm1%    ; move xmm1 into next 2 doubles in y
      lea eax%, [eax%+32]
      lea ecx%, [ecx%+32]
      dec edx%                   ; decrement counter
      jnz $30                    ; conditional branch back
      jmp $60
 
      ! Aligned vectorised loop to calculate y = y + a*x for element sets 1 to nn
  40  movapd xmm0%, [eax%]       ; move next 2 doubles in x into xmm0
      movapd xmm1%, [eax%+16]    ; move next 2 doubles in x into xmm1
      movapd xmm2%, [ecx%]       ; move next 2 doubles in y into xmm2
      movapd xmm3%, [ecx%+16]    ; move next 2 doubles in y into xmm3
      mulpd xmm0%, xmm7%         ; multiply x vector by scalar
      mulpd xmm1%, xmm7%         ; multiply x vector by scalar
      addpd xmm0%, xmm2%         ; form y + a*x vector
      addpd xmm1%, xmm3%         ; form y + a*x vector
      movapd [ecx%], xmm0%       ; move xmm0 into next 2 doubles in y
      movapd [ecx%+16], xmm1%    ; move xmm1 into next 2 doubles in y
      lea eax%, [eax%+32]
      lea ecx%, [ecx%+32]
      dec edx%                   ; decrement counter
      jnz $40                    ; conditional branch back
 
      ! Check for entries in the scalar loop
  60  cmp edi%, 0
      jz $80                     ; conditional skip loop, no terms in scalar loop
 
      ! Scalar loop to calculate y = y + a*x for the last nn+1 to n elements
  70  movsd xmm0%, [eax%]
      movsd xmm2%, [ecx%]
      mulsd xmm0%, xmm7%
      addsd xmm0%, xmm2%
      movsd [ecx%], xmm0%
      lea eax%, [eax%+8]
      lea ecx%, [ecx%+8]
      dec edi%
      jnz $70
 
      ! End of assembly
  80  nop
 
     edoc
   end if
 
end subroutine Vec_Add_SSE

!** DOUBLE PRECISION VERSION   ! now DO loop version
real*8 function Vec_Sum_SSE (v1,v2,n)
   integer n
   real*8 v1(*), v2(*)
!
   integer nn, m
   real*8 :: v(2)
!
   if (n < 1) then
     Vec_Sum_SSE = 0
   else if (n==1) then
     Vec_Sum_SSE = v1(1)*v2(1)
   else
 
    nn = n/4        !  Number of sets of 4
    m  = n - 4*nn   !  Number of residuals
 
!  Assembly code is between code, edoc lines
    code
      xorps xmm0%, xmm0%         ; set xmm0 to zero
      xorps xmm1%, xmm1%         ; set xmm1 to zero
 
      mov eax%, =v1              ; address of v1
      mov ecx%, =v2              ; address of v2
 
      mov edx%, nn               ; initialise loop counter for vectorised loop
      mov edi%, m                ; initialise loop counter for scalar loop
      cmp edx%, 0
      jz $60                     ; no terms in vectorized loop
 
!    Check for alignment of v1(1) and v2(1) on 16 byte boundaries
      and eax%, 15               ; low nibble of v1 address
      and ecx%, 15               ; low nibble of v2 address
      cmp eax%, 0
      jnz $10                    ; v1(1) is unaligned, check for alignment of v1(2)
      cmp ecx%, 0
      jnz $10                    ; v2(1) is unaligned, check for alignment of v2(2)
 
!    v1(1) and v2(1) are aligned on 16 byte boundaries
      mov eax%, =v1              ; address of v1
      mov ecx%, =v2              ; address of v2
      jmp $40                    ; branch to aligned vectorized loop
 
!    Check for alignment of v1(2) and v2(2) on 16 byte boundaries
  10  xor eax%, 8
      jnz $20                    ; branch to unaligned vectorized loop
      xor ecx%, 8
      jnz $20                    ; branch to unaligned vectorized loop
 
!    v1(2) and v2(2) are aligned on 16 byte boundaries
      mov eax%, =v1              ; address of v1 argument
      mov ecx%, =v2              ; address of v2 argument
      movsd xmm0%, [eax%]
      mulsd xmm0%, [ecx%]        ; initialise with product v1(1)*v2(1)
      lea eax%, [eax%+8]         ; address of v1(2)
      lea ecx%, [ecx%+8]         ; address of v2(2)
      dec edi%                   ; decrease loop counter for scalar loop
      jge $40                    ; branch to aligned vectorized loop
      mov edi%, 3                ; adjust loop counter for scalar loop
      dec edx%                   ; adjust loop counter for vectorised loop
      jz $60                     ; no terms in vectorized loop
      jmp $40                    ; branch to aligned vectorized loop
 
!    Initialise unaligned vectorised loop
  20  mov eax%, =v1              ; address of v1 argument
      mov ecx%, =v2              ; address of v2 argument
 
!    Unaligned vectorised loop to accumulate dot product of elements 1 to nn
  30  movupd xmm2%, [eax%]       ; move next 2 doubles in v1 into xmm2
      movupd xmm4%, [eax%+16]    ; move next 2 doubles in v1 into xmm4
      movupd xmm3%, [ecx%]       ; move next 2 doubles in v2 into xmm3
      movupd xmm5%, [ecx%+16]    ; move next 2 doubles in v2 into xmm5
      mulpd xmm2%, xmm3%         ; multiply values and store in xmm2
      mulpd xmm4%, xmm5%         ; multiply values and store in xmm4
      addpd xmm0%, xmm2%         ; accumulate sum in xmm0
      addpd xmm1%, xmm4%         ; accumulate sum in xmm1
      lea eax%, [eax%+32]
      lea ecx%, [ecx%+32]
      dec edx%                   ; decrement counter
      jnz $30                    ; conditional branch back
      jmp $50
 
!    Aligned vectorised loop to accumulate dot product of elements 1 to nn
  40  movapd xmm2%, [eax%]       ; move next 2 doubles in v1 into xmm2
      movapd xmm4%, [eax%+16]    ; move next 2 doubles in v1 into xmm4
      mulpd xmm2%, [ecx%]        ; multiply values and store in xmm2
      mulpd xmm4%, [ecx%+16]     ; multiply values and store in xmm4
      addpd xmm0%, xmm2%         ; accumulate sum in xmm0
      addpd xmm1%, xmm4%         ; accumulate sum in xmm1
      lea eax%, [eax%+32]
      lea ecx%, [ecx%+32]
      dec edx%                   ; decrement counter
      jnz $40                    ; conditional branch back
 
  50  addpd xmm0%, xmm1%         ; add xmm0 and xmm1
 
!    Check for entries in the scalar loop
  60  cmp edi%, 0
      jz $80                     ; conditional skip loop, no terms in scalar loop
 
!    Scalar loop to accumulate dot product of last nn+1 to n elements in low 64 bits of xmm0
  70  movsd xmm1%, [eax%]
      mulsd xmm1%, [ecx%]
      addsd xmm0%, xmm1%
      lea eax%, [eax%+8]
      lea ecx%, [ecx%+8]
      dec edi%
      jnz $70
 
!    Can't get final reduction to work, so will do this in Fortran for now
  80  movupd v, xmm0%            ; move xmm0 to v array
    edoc
 
! Final reduction, the result is the sum of the two values in v
    Vec_Sum_SSE = sum(v)
   end if
 
end function Vec_Sum_SSE