PROGRAM matmul
INTEGER :: N, i, j, k
REAL(kind=8) :: a(10,10), b(10,10), c(10,10), ct(10,10), mysum
DO i = 1, 10, 1
DO j = 1, 10, 1
a(i,j) = i + j
b(i,j) = i - j
c(i,j) = 0.0
ct(i,j) = 0.0
END DO
b(i,i) = 1.0
END DO
DO j = 1, 10, 1
DO k = 1, 10, 1
DO i = 1, 10, 1
c(i,j) = c(i,j) + a(i,k) * b(k,j)
END DO
END DO
END DO
CALL gemm(10,a,b,ct)
mysum = 0.0
DO i = 1, 10, 1
DO j = 1, 10, 1
mysum = c(i,j) - ct(i,j)
END DO
END DO
IF (abs(mysum) >= 0.00001) THEN
WRITE (*, FMT=*) "Something wrong"
ELSE
WRITE (*, FMT=*) "Output matches"
END IF
END PROGRAM matmul

SUBROUTINE gemm(N,A,B,C)
INTEGER :: t12
INTEGER :: t10
INTEGER :: t8
INTEGER :: t6
INTEGER :: t4
INTEGER :: t2
INTEGER :: chill_t64
INTEGER :: chill_t63
INTEGER :: chill_t62
INTEGER :: chill_t61
INTEGER :: chill_t60
INTEGER :: chill_t59
INTEGER :: chill_t58
INTEGER :: chill_t57
INTEGER :: chill_t56
INTEGER :: chill_t55
INTEGER :: chill_t54
INTEGER :: chill_t53
INTEGER :: chill_t52
INTEGER :: chill_t51
INTEGER :: chill_t50
INTEGER :: chill_t49
INTEGER :: chill_t48
INTEGER :: chill_t47
INTEGER :: over2
INTEGER :: chill_t46
INTEGER :: chill_t45
INTEGER :: chill_t44
INTEGER :: chill_t43
INTEGER :: chill_t42
INTEGER :: chill_t41
INTEGER :: chill_t40
INTEGER :: chill_t39
INTEGER :: chill_t38
INTEGER :: chill_t37
INTEGER :: chill_t36
INTEGER :: chill_t35
INTEGER :: chill_t34
INTEGER :: chill_t33
INTEGER :: chill_t32
INTEGER :: chill_t31
INTEGER :: chill_t30
INTEGER :: chill_t29
INTEGER :: chill_t28
INTEGER :: chill_t27
INTEGER :: chill_t26
INTEGER :: chill_t25
INTEGER :: chill_t24
INTEGER :: chill_t23
INTEGER :: over1
INTEGER :: chill_t22
INTEGER :: chill_t21
INTEGER :: chill_t20
INTEGER :: chill_t19
INTEGER :: chill_t18
INTEGER :: chill_t17
INTEGER :: chill_t16
INTEGER :: chill_t15
REAL(kind=8), DIMENSION(8,512) :: f_P2
INTEGER :: chill_t14
INTEGER :: chill_t13
INTEGER :: chill_t12
INTEGER :: chill_t11
INTEGER :: chill_t10
INTEGER :: chill_t9
INTEGER :: chill_t8
INTEGER :: chill_t7
REAL(kind=8), DIMENSION(512,128) :: f_P1
INTEGER :: chill_t1
INTEGER :: chill_t2
INTEGER :: chill_t4
INTEGER :: chill_t6
INTEGER :: chill_t5
INTEGER :: N
REAL(kind=8) :: A(N,N), B(N,N), C(N,N)
INTEGER :: I, J, K
over1 = 0
over2 = 0
DO t2 = 1, N, 512
DO t4 = 1, N, 128
DO t6 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
DO t8 = t4, merge(t4 + 127,N,t4 + 127 <= N), 1
f_P1(t8 - t4 + 1,t6 - t2 + 1) = A(t8,t6)
END DO
END DO
DO t6 = 1, N, 8
DO t8 = t6, merge(N,t6 + 7,N <= t6 + 7), 1
DO t10 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
f_P2(t10 - t2 + 1,t8 - t6 + 1) = B(t10,t8)
END DO
END DO
over1 = MOD(N,2)
DO t8 = t4, merge(-over1 + N,t4 + 126,-over1 + N <= t4 + 126), 2
over2 = MOD(N,2)
DO t10 = t6, merge(t6 + 6,N - over2,t6 + 6 <= N - over2), 2
DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1
C(t8,t10) = C(t8,t10) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
C(t8 + 1,t10) = C(t8 + 1,t10) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
C(t8,t10 + 1) = C(t8,t10 + 1) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1)
C(t8 + 1,t10 + 1) = C(t8 + 1,t10 + 1) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1)
END DO
END DO
IF (N - 7 <= t6 .AND. 1 <= over2) THEN
DO t12 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
C(t8,N) = C(t8,N) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1)
C(t8 + 1,N) = C(t8 + 1,N) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1)
END DO
END IF
END DO
IF (N - 127 <= t4 .AND. 1 <= over1) THEN
DO t10 = t6, merge(t6 + 7,N,t6 + 7 <= N), 1
DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1
C(N,t10) = C(N,t10) + f_P1(N - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
END DO
END DO
END IF
END DO
END DO
END DO
END SUBROUTINE