# Matrix multiply (GEMM) transformation recipe, large array size, tuned for an
# Intel machine.
#
# Each directive must sit on its own line: '#' starts a comment that runs to
# the end of the line, so collapsing the script onto one line disables it.

# --- Input selection -------------------------------------------------------
# File to transform, the procedure inside it, the front end to use, and the
# index of the loop nest within that procedure that the commands below act on.
source: gemm.f90
procedure: gemm
format: rose
loop: 0

# --- Tuning parameters -----------------------------------------------------
# TI / TJ / TK are the tile sizes passed to the tile() calls below.
# The commented-out values are alternative settings kept for experimentation.
TI = 128
#TI = 4
TJ = 8
#TK = 3
TK = 512

# UI / UJ are the unroll amounts passed to the unroll() calls below.
UI = 2
UJ = 2

# --- Transformation sequence -----------------------------------------------
# Order matters: every command refers to loop levels as they exist after the
# commands before it have run.
# NOTE(review): the level arguments (2, 2, 5, 3, 4, 4, 5) presumably account
# for the control loops that each tile() inserts -- confirm with 'print space'.

# Reorder the loops of the nest into the order [3,1,2].
permute([3,1,2])

# Tile statement 0 three times, with sizes TJ, TI and TK.
# Uncomment a 'print space' line to inspect the state after that step.
tile(0,2,TJ)
#print space
tile(0,2,TI)
#print space
tile(0,5,TK)
#print space

# Data-copy arrays A and B for statement 0.
# NOTE(review): the extra arguments for A (false, -1) are presumably the
# padding flag and the fastest-changing-dimension selector -- confirm against
# the CHiLL manual before changing them.
datacopy(0,3,A,false,-1)
#print space
datacopy(0,4,B)

# Unroll two loop levels of statement 0 by UI and UJ respectively.
unroll(0,4,UI)
unroll(0,5,UJ)