1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
>>> # Perform imperfect loop interchange of LU decomposition
>>> # to get jki form and then block the k and i loops
>>> #
>>> #for k = 1 to n do
>>> # for i = k+1 to n do
>>> # a(i,k) = a(i,k) / a(k,k)
>>> # for j = k+1 to n do
>>> # a(i,j) = a(i,j) - a(k,j)*a(i,k)
>>> # endfor
>>> # endfor
>>> #endfor
>>> #
>>> #
>>>
>>> symbolic n;
>>>
>>> # Iteration spaces of the two statements in the loop nest above:
>>> # IS10 holds the (k,i) pairs of the division a(i,k) = a(i,k) / a(k,k),
>>> # IS20 holds the (k,i,j) triples of the update a(i,j) = a(i,j) - a(k,j)*a(i,k).
>>> IS10 := {[k,i] : 1 <= k <= n && k+1 <= i <= n};
>>> IS20 := {[k,i,j] : 1 <= k <= n && k+1 <= i <= n && k+1 <= j <= n};
>>>
>>> # T10 maps each (k,i) of the first statement to [t1,t2,k,k,i], where
>>> # t1 is the largest value of the form 64*beta+1 that is <= k and
>>> # t2 is the largest multiple of 64 that is <= i, i.e. 64-wide blocks
>>> # of k (starting at 1) and of i (starting at 0).
>>> T10 := {[k,i] -> [t1,t2,k,k,i]:
>>> exists (alpha,beta: t1 = 64beta+1 && k-1 = alpha + 64 beta &&
>>> alpha >= 0 && alpha <= 63)
>>> && exists (gamma,delta: t2 = 64delta && i = gamma +64delta &&
>>> gamma >= 0 && gamma <= 63)};
>>>
>>> # T20 applies the same blocking to the second statement but places j in
>>> # the third position, giving [t1,t2,j,k,i].
>>> T20 := {[k,i,j] -> [t1,t2,j,k,i]:
>>> exists (alpha,beta: t1 = 64beta+1 && k-1 = alpha + 64 beta &&
>>> alpha >= 0 && alpha <= 63)
>>> && exists (gamma,delta: t2 = 64delta && i = gamma +64delta &&
>>> gamma >= 0 && gamma <= 63)};
>>>
>>> T10;
{[k,i] -> [t1,t2,k,k,i] : exists ( alpha,beta : t1 = 1+64alpha && t2 = 64beta && k-63 <= t1 <= k && i-63 <= t2 <= i)}
>>> T20;
{[k,i,j] -> [t1,t2,j,k,i] : exists ( alpha,beta : t1 = 1+64alpha && t2 = 64beta && k-63 <= t1 <= k && i-63 <= t2 <= i)}
>>>
>>> # Generate code with different amounts of overhead removed
>>> # The more overhead we remove, the more code duplication may occur
>>> codegen 0 T10:IS10,T20:IS20;
for(t1 = 1; t1 <= n-1; t1 += 64) {
for(t2 = t1-1; t2 <= n; t2 += 64) {
for(t3 = t1; t3 <= n; t3++) {
for(t4 = t1; t4 <= min(t3-1,t2+62,t1+63); t4++) {
for(t5 = max(t2,t4+1); t5 <= min(n,t2+63); t5++) {
s1(t4,t5,t3);
}
}
if (t1 >= t3-63) {
for(t5 = max(t3+1,t2); t5 <= min(n,t2+63); t5++) {
s0(t3,t5);
}
}
}
}
}
>>> codegen 1 T10:IS10,T20:IS20;
for(t1 = 1; t1 <= n-1; t1 += 64) {
for(t2 = t1-1; t2 <= n; t2 += 64) {
for(t3 = t1; t3 <= n; t3++) {
for(t4 = t1; t4 <= min(t3-1,t2+62,t1+63); t4++) {
for(t5 = max(t2,t4+1); t5 <= min(n,t2+63); t5++) {
s1(t4,t5,t3);
}
}
if (t1 >= t3-63) {
for(t5 = max(t3+1,t2); t5 <= min(n,t2+63); t5++) {
s0(t3,t5);
}
}
}
}
}
>>> codegen 2 T10:IS10,T20:IS20;
for(t1 = 1; t1 <= n-1; t1 += 64) {
for(t2 = t1-1; t2 <= n; t2 += 64) {
for(t3 = t1; t3 <= n; t3++) {
for(t4 = t1; t4 <= min(t3-1,t2+62,t1+63); t4++) {
for(t5 = max(t2,t4+1); t5 <= min(n,t2+63); t5++) {
s1(t4,t5,t3);
}
}
if (t1 >= t3-63) {
for(t5 = max(t3+1,t2); t5 <= min(n,t2+63); t5++) {
s0(t3,t5);
}
}
}
}
}
>>> codegen 3 T10:IS10,T20:IS20;
for(t1 = 1; t1 <= n-1; t1 += 64) {
for(t2 = t1-1; t2 <= n; t2 += 64) {
for(t3 = t1; t3 <= min(t1+63,n); t3++) {
for(t4 = t1; t4 <= min(t2+62,t3-1); t4++) {
for(t5 = max(t2,t4+1); t5 <= min(t2+63,n); t5++) {
s1(t4,t5,t3);
}
}
for(t5 = max(t2,t3+1); t5 <= min(n,t2+63); t5++) {
s0(t3,t5);
}
}
for(t3 = t1+64; t3 <= n; t3++) {
for(t4 = t1; t4 <= min(t2+62,t1+63); t4++) {
for(t5 = max(t2,t4+1); t5 <= min(t2+63,n); t5++) {
s1(t4,t5,t3);
}
}
}
}
}
>>>
>>>
|