1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
# Perform imperfect loop interchange of LU decomposition
# to get jki form and then block the k and i loops
#
# for k = 1 to n do
#   for i = k+1 to n do
#     a(i,k) = a(i,k) / a(k,k)
#     for j = k+1 to n do
#       a(i,j) = a(i,j) - a(k,j)*a(i,k)
#     endfor
#   endfor
# endfor
#
#
# n is the symbolic (unknown at analysis time) matrix dimension used as the
# upper bound of every loop below.
symbolic n;
# Iteration space of the scaling statement a(i,k) = a(i,k) / a(k,k):
# one instance per (k,i) with 1 <= k <= n and k < i <= n.
IS10 := {[k,i] : 1 <= k <= n && k+1 <= i <= n};
# Iteration space of the update statement a(i,j) = a(i,j) - a(k,j)*a(i,k):
# one instance per (k,i,j) with 1 <= k <= n, k < i <= n and k < j <= n.
IS20 := {[k,i,j] : 1 <= k <= n && k+1 <= i <= n && k+1 <= j <= n};
# Mapping for the scaling statement into the common 5-dimensional target
# space [t1,t2,j,k,i]:
#   t1 = 64*beta + 1 with k-1 = alpha + 64*beta, 0 <= alpha <= 63
#        -> first k of the 64-wide block containing k (blocks 1..64, 65..128, ...)
#   t2 = 64*delta with i = gamma + 64*delta, 0 <= gamma <= 63
#        -> first i of the 64-wide block containing i (blocks 0..63, 64..127, ...)
# The statement has no j index, so the third (j) position is filled with k.
T10 := {[k,i] -> [t1,t2,k,k,i]:
exists (alpha,beta: t1 = 64beta+1 && k-1 = alpha + 64 beta &&
alpha >= 0 && alpha <= 63)
&& exists (gamma,delta: t2 = 64delta && i = gamma +64delta &&
gamma >= 0 && gamma <= 63)};
# Mapping for the update statement into the same target space. The block
# coordinates t1 and t2 are defined exactly as in T10; inside a block the
# order is j, then k, then i (the "jki" form named in the header comment).
T20 := {[k,i,j] -> [t1,t2,j,k,i]:
exists (alpha,beta: t1 = 64beta+1 && k-1 = alpha + 64 beta &&
alpha >= 0 && alpha <= 63)
&& exists (gamma,delta: t2 = 64delta && i = gamma +64delta &&
gamma >= 0 && gamma <= 63)};
# Evaluate both mappings as bare expressions so the calculator reports them.
T10;
T20;
# Generate code with different amounts of overhead removed.
# The more overhead we remove, the more code duplication may occur.
# The integer after `codegen` selects that amount (0 = least, 3 = most here);
# each call applies mapping T10 over IS10 and mapping T20 over IS20.
codegen 0 T10:IS10,T20:IS20;
codegen 1 T10:IS10,T20:IS20;
codegen 2 T10:IS10,T20:IS20;
codegen 3 T10:IS10,T20:IS20;
|