sslug-teknik team mailing list archive
-
sslug-teknik team
-
Mailing list archive
-
Message #02487
Re: Hastighed: Linux vs. Windows !! [BIG]
Falko Jens Wagner wrote:
....
> Tilgengæld kan man forsøge sig med selv at omstrukturere løkkerne, for at få
> klemt rækkerne i matricen ned i cachen, og så kan man faktisk spore en
> forbedring af performance ved de matrice-størrelser, hvor der kan ligge et
> helt multiplum af rækker i matricen, dvs. når de arrangerer sig fint
> ("symmetrisk").
...
Med cacher er det ikke altid smart at tingene er for symmetriske.
Som eksempel er vedlagt to FORTRAN-programmer: f1.f er kildeteksten, og f2.f er
pseudo-FORTRAN, som SGI-compileren har flyttet "lidt" om på.
Da jeg kiggede på det i sin tid lærte jeg flg:
1. Tingene er ikke som man tror
2. FORTRAN-compilere er som regel meget bedre til dette her end C-compilere,
da de nemmere kan "overskue" arrays etc.
3. Compileren var bedre til det end jeg :-)
- dette er mine 0.02 Euro
Mogens
--
Mogens Kjaer, Carlsberg Laboratory, Dept. of Chemistry
Gamle Carlsberg Vej 10, DK-2500 Valby, Denmark
Phone: +45 33 27 53 25, Fax: +45 33 27 47 08
Email: mk@xxxxxx Homepage: http://www.crc.dk
!> Dense square matrix product: c = a * b.
!!
!! Arguments:
!!   n : order of the matrices (number of rows and of columns)
!!   a : left operand,  n-by-n, read only
!!   b : right operand, n-by-n, read only
!!   c : result,        n-by-n, every element is overwritten
!!
!! Each c(i,j) is accumulated over k = 1..n in ascending order in a local
!! scalar and stored once, so the order of the floating-point additions is
!! fixed. The loop nest is deliberately the plain i/j/k form. For n < 1 the
!! loops do not execute and c is left untouched.
subroutine matmult(n,a,b,c)
  implicit none
  ! Same type as DOUBLE PRECISION, so existing callers remain compatible.
  integer, parameter :: dp = kind(1.0d0)
  integer, intent(in) :: n
  real(dp), intent(in) :: a(n,n), b(n,n)
  real(dp), intent(out) :: c(n,n)
  real(dp) :: acc   ! running inner product (not named "sum": that is an intrinsic)
  integer :: i, j, k

  do i = 1, n
    do j = 1, n
      acc = 0.0_dp
      do k = 1, n
        acc = acc + a(i,k)*b(k,j)
      end do
      c(i,j) = acc
    end do
  end do
  return
end subroutine matmult
C ***********************************************************
C Fortran file translated from WHIRL Wed Feb 17 14:27:31 1999
C ***********************************************************
! matmult: compiler-restructured listing of a routine that takes n and three
! n-by-n REAL*8 arrays a, b and c.
! For every tile of up to 300 values of i by up to 60 values of j it
!   1. zeroes a scratch buffer,
!   2. adds products a(i,k)*b(k,j) into that buffer, walking k in tiles of 60
!      and i in sub-tiles of 156, with the j loop stepped by 2 and the k loop
!      stepped by 4 (each followed by a remainder loop),
!   3. copies the buffer into c(i,j).
! The "C PREFETCH ..." lines and the "! prefetch ..." fragments are comments
! in the listing; they carry no executable statements.
SUBROUTINE matmult(n, a, b, c)
IMPLICIT NONE
INTEGER*4 n
REAL*8 a(n, n)
REAL*8 b(n, n)
REAL*8 c(n, n)
C
C **** Variables and functions ****
C
INTEGER*4 i
INTEGER*4 j
INTEGER*4 k
! Cray-style pointer pair: se1_F8 holds the address returned by ALLOCA below,
! deref_se1_F8 is the assumed-size REAL*8 array reached through it.
REAL*8 deref_se1_F8(*)
POINTER(se1_F8, deref_se1_F8)
C
C **** Temporary variables ****
C
INTEGER*4 se1__$stk
INTEGER*4 seonly0i
INTEGER*4 setile2j
INTEGER*4 tile2k
INTEGER*4 tile2i
INTEGER*4 j0
REAL*8 mi0
REAL*8 mi1
REAL*8 mi2
REAL*8 mi3
REAL*8 mi4
REAL*8 mi5
REAL*8 mi6
REAL*8 mi7
INTEGER*4 i0
INTEGER*4 wd_k0
REAL*8 mi8
REAL*8 mi9
INTEGER*4 i1
INTEGER*4 wd_j
INTEGER*4 k0
REAL*8 mi10
REAL*8 mi11
REAL*8 mi12
REAL*8 mi13
INTEGER*4 i2
INTEGER*4 wd_k
REAL*8 mi14
INTEGER*4 i3
INTEGER*4 j1
INTEGER*4 i4
INTEGER*4 tmp0
C
C **** statements ****
C
! Keep the value returned by INTRN_U4READSTACKPOINTER; it is handed back to
! INTRN_U4I4SETSTACKPOINTER just before RETURN (presumably to release the
! ALLOCA space - confirm against the compiler documentation).
se1__$stk = INTRN_U4READSTACKPOINTER()
! Scratch buffer sized for MIN(n,60)*MIN(n,300) 8-byte elements.
! NOTE(review): read literally, the subscript used throughout,
!   ((i - seonly0i) + 1) * MIN(n, 60) + (j - setile2j) + 1,
! runs from MIN(n,60)+1 up to MIN(n,300)*MIN(n,60)+MIN(n,60), i.e. MIN(n,60)
! elements past the requested size. Presumably an artifact of how the
! translator prints the linearized index - confirm before reusing this code.
se1_F8 = ALLOCA(((MIN(n, 60) * MIN(n, 300)) * 8))
! Outer tiling: i in steps of 300, j in steps of 60.
DO seonly0i = 1, n, 300
DO setile2j = 1, n, 60
! Step 1: zero the scratch entries of the current (i, j) tile.
! Both branches of the IF execute the same assignment; they differ only in
! the PREFETCH comment lines attached to them.
DO j = setile2j, MIN((setile2j + 59), n), 1
IF(IAND(j, 31) .EQ. 16) THEN
DO i = seonly0i, MIN((seonly0i + 299), n), 1
C PREFETCH(269093936) deref_se1_F8(((i - seonly0i) + 1) * MIN(n, 60) + (j - setile2j) + 1) OFFS=1600
C write strid1=0 strid2=1 conf=2
C PREFETCH(269093320) deref_se1_F8(((i - seonly0i) + 1) * MIN(n, 60) + (j - setile2j) + 1) OFFS=1600
C write strid1=1 strid2=0 conf=2
C prefetch (ptr, lrnum): 1st <269093320, 1>2nd <269093936, 1>
deref_se1_F8(((i - seonly0i) + 1) * MIN(n, 60) + (j - setile2j) + 1) = 0.0D00
END DO
ELSE
DO i = seonly0i, MIN((seonly0i + 299), n), 1
C PREFETCH(269093544) deref_se1_F8(((i - seonly0i) + 1) * MIN(n, 60) + (j - setile2j) + 1) OFFS=1600
C write strid1=1 strid2=0 conf=2
C prefetch (ptr, lrnum): 1st <269093544, 1>
deref_se1_F8(((i - seonly0i) + 1) * MIN(n, 60) + (j - setile2j) + 1) = 0.0D00
END DO
ENDIF
END DO
! Step 2: accumulate products into the scratch tile.
! k is walked in tiles of 60, i in sub-tiles of 156 inside the 300-wide i tile.
DO tile2k = 1, n, 60
DO tile2i = seonly0i, MIN((seonly0i + 299), n), 156
! Main j loop: two columns (j0 and j0 + 1) per iteration.
DO j0 = setile2j, MIN((n + -1), (setile2j + 58)), 2
! Main k loop: four k values per iteration. The eight b elements needed are
! copied into the scalars mi0..mi7 before the i0 loop starts.
DO k = tile2k, MIN((n + -3), (tile2k + 56)), 4
mi0 = b(k, j0)
mi1 = b(k + 3, j0 + 1)
mi2 = b(k + 3, j0)
mi3 = b(k, j0 + 1)
mi4 = b(k + 2, j0 + 1)
mi5 = b(k + 2, j0)
mi6 = b(k + 1, j0)
mi7 = b(k + 1, j0 + 1)
! Eight updates per i0: columns j0 and j0 + 1, for k, k+1, k+2 and k+3.
DO i0 = tile2i, MIN((tile2i + 155), MIN((seonly0i + 299), n)), 1
C PREFETCH(270058544) a(i0, k + 3) OFFS=64
C read strid1=4 strid2=0 conf=2
C PREFETCH(270058248) a(i0, k + 2) OFFS=64
C read strid1=4 strid2=0 conf=2
C PREFETCH(270057952) a(i0, k + 1) OFFS=64
C read strid1=4 strid2=0 conf=2
C PREFETCH(270057704) a(i0, k) OFFS=64
C read strid1=4 strid2=0 conf=2
deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 1) = (deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 1) +(a(i0, k)
1 ! prefetch (ptr, lrnum): 1st <270057704, 1>
1 * mi0))
deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 2) = (deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 2) +(a(i0, k)
1 ! prefetch (ptr, lrnum): 1st <270057704, 1>
1 * mi3))
deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 1) = (deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 1) +(a(i0, k + 1)
1 ! prefetch (ptr, lrnum): 1st <270057952, 1>
1 * mi6))
deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 2) = (deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 2) +(a(i0, k + 1)
1 ! prefetch (ptr, lrnum): 1st <270057952, 1>
1 * mi7))
deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 1) = (deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 1) +(a(i0, k + 2)
1 ! prefetch (ptr, lrnum): 1st <270058248, 1>
1 * mi5))
deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 2) = (deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 2) +(a(i0, k + 2)
1 ! prefetch (ptr, lrnum): 1st <270058248, 1>
1 * mi4))
deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 1) = (deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 1) +(a(i0, k + 3)
1 ! prefetch (ptr, lrnum): 1st <270058544, 1>
1 * mi2))
deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 2) = (deref_se1_F8(((i0 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 2) +(a(i0, k + 3)
1 ! prefetch (ptr, lrnum): 1st <270058544, 1>
1 * mi1))
END DO
END DO
! k remainder for the two-column case: continues from the value k holds after
! the stride-4 loop, one k at a time, up to the end of this k tile.
DO wd_k0 = k, MIN((tile2k + 59), n), 1
mi8 = b(wd_k0, j0)
mi9 = b(wd_k0, j0 + 1)
DO i1 = tile2i, MIN((tile2i + 155), MIN((seonly0i + 299), n)), 1
C PREFETCH(270058840) a(i1, wd_k0) OFFS=64
C read strid1=4 strid2=0 conf=2
deref_se1_F8(((i1 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 1) = (deref_se1_F8(((i1 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 1) +(a(i1, wd_k0)
1 ! prefetch (ptr, lrnum): 1st <270058840, 1>
1 * mi8))
deref_se1_F8(((i1 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 2) = (deref_se1_F8(((i1 - seonly0i) + 1) * MIN(n, 60) + (j0 - setile2j) + 2) +(a(i1, wd_k0)
1 ! prefetch (ptr, lrnum): 1st <270058840, 1>
1 * mi9))
END DO
END DO
END DO
! j remainder: continues from the value j0 holds after the stride-2 loop and
! handles one column (wd_j) at a time, again with a stride-4 k loop followed
! by a single-step k remainder loop.
DO wd_j = j0, MIN((setile2j + 59), n), 1
DO k0 = tile2k, MIN((n + -3), (tile2k + 56)), 4
mi10 = b(k0, wd_j)
mi11 = b(k0 + 3, wd_j)
mi12 = b(k0 + 1, wd_j)
mi13 = b(k0 + 2, wd_j)
DO i2 = tile2i, MIN((tile2i + 155), MIN((seonly0i + 299), n)), 1
C PREFETCH(270059976) a(i2, k0 + 3) OFFS=64
C read strid1=4 strid2=0 conf=2
C PREFETCH(270059680) a(i2, k0 + 2) OFFS=64
C read strid1=4 strid2=0 conf=2
C PREFETCH(270059384) a(i2, k0 + 1) OFFS=64
C read strid1=4 strid2=0 conf=2
C PREFETCH(270059136) a(i2, k0) OFFS=64
C read strid1=4 strid2=0 conf=2
deref_se1_F8(((i2 - seonly0i) + 1) * MIN(n, 60) + (wd_j - setile2j) + 1) = (deref_se1_F8(((i2 - seonly0i) + 1) * MIN(n, 60) + (wd_j - setile2j) + 1) +(a(i2, k0)
1 ! prefetch (ptr, lrnum): 1st <270059136, 1>
1 * mi10))
deref_se1_F8(((i2 - seonly0i) + 1) * MIN(n, 60) + (wd_j - setile2j) + 1) = (deref_se1_F8(((i2 - seonly0i) + 1) * MIN(n, 60) + (wd_j - setile2j) + 1) +(a(i2, k0 + 1)
1 ! prefetch (ptr, lrnum): 1st <270059384, 1>
1 * mi12))
deref_se1_F8(((i2 - seonly0i) + 1) * MIN(n, 60) + (wd_j - setile2j) + 1) = (deref_se1_F8(((i2 - seonly0i) + 1) * MIN(n, 60) + (wd_j - setile2j) + 1) +(a(i2, k0 + 2)
1 ! prefetch (ptr, lrnum): 1st <270059680, 1>
1 * mi13))
deref_se1_F8(((i2 - seonly0i) + 1) * MIN(n, 60) + (wd_j - setile2j) + 1) = (deref_se1_F8(((i2 - seonly0i) + 1) * MIN(n, 60) + (wd_j - setile2j) + 1) +(a(i2, k0 + 3)
1 ! prefetch (ptr, lrnum): 1st <270059976, 1>
1 * mi11))
END DO
END DO
! k remainder for the single-column case (starts from k0's final value).
DO wd_k = k0, MIN((tile2k + 59), n), 1
mi14 = b(wd_k, wd_j)
DO i3 = tile2i, MIN((tile2i + 155), MIN((seonly0i + 299), n)), 1
C PREFETCH(270060272) a(i3, wd_k) OFFS=64
C read strid1=4 strid2=0 conf=2
deref_se1_F8(((i3 - seonly0i) + 1) * MIN(n, 60) + (wd_j - setile2j) + 1) = (deref_se1_F8(((i3 - seonly0i) + 1) * MIN(n, 60) + (wd_j - setile2j) + 1) +(a(i3, wd_k)
1 ! prefetch (ptr, lrnum): 1st <270060272, 1>
1 * mi14))
END DO
END DO
END DO
END DO
END DO
! Step 3: copy the finished scratch tile into c. As in step 1, both IF
! branches perform the same assignment and differ only in PREFETCH comments.
DO j1 = setile2j, MIN((setile2j + 59), n), 1
IF(IAND(j1, 31) .EQ. 16) THEN
DO i4 = seonly0i, MIN((seonly0i + 299), n), 1
C PREFETCH(270062040) deref_se1_F8(((i4 - seonly0i) + 1) * MIN(n, 60) + (j1 - setile2j) + 1) OFFS=1600
C read strid1=0 strid2=1 conf=2
C PREFETCH(270061256) deref_se1_F8(((i4 - seonly0i) + 1) * MIN(n, 60) + (j1 - setile2j) + 1) OFFS=1600
C read strid1=1 strid2=0 conf=2
C PREFETCH(270060864) c(i4, j1) OFFS=256
C write strid1=0 strid2=16 conf=2
C PREFETCH(270060568) c(i4, j1) OFFS=64
C write strid1=4 strid2=0 conf=2
C prefetch (ptr, lrnum): 1st <270060568, 1>2nd <270060864, 1>
c(i4, j1) = deref_se1_F8(((i4 - seonly0i) + 1) * MIN(n, 60) + (j1 - setile2j) + 1)
1 ! prefetch (ptr, lrnum): 1st <270061256, 1>2nd <270062040, 1>
1
END DO
ELSE
DO i4 = seonly0i, MIN((seonly0i + 299), n), 1
C PREFETCH(270061648) deref_se1_F8(((i4 - seonly0i) + 1) * MIN(n, 60) + (j1 - setile2j) + 1) OFFS=1600
C read strid1=1 strid2=0 conf=2
c(i4, j1) = deref_se1_F8(((i4 - seonly0i) + 1) * MIN(n, 60) + (j1 - setile2j) + 1)
1 ! prefetch (ptr, lrnum): 1st <270061648, 1>
1
END DO
ENDIF
END DO
END DO
END DO
! Hand the value saved at entry back to INTRN_U4I4SETSTACKPOINTER; the
! function result in tmp0 is not used afterwards.
tmp0 = INTRN_U4I4SETSTACKPOINTER(se1__$stk)
RETURN
END ! matmult
References