summaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
authorDerick Huth <derickhuth@gmail.com>2015-09-24 11:26:53 -0600
committerDerick Huth <derickhuth@gmail.com>2015-09-24 11:26:53 -0600
commitc285135eb903c31cd221f90f03e288a6b67770cd (patch)
tree1f6ea3120a09feef7236dac579d5a2d5b774aaa7 /examples
parentf5c39e4c6ff55520948c2ef331c968cd84b817d9 (diff)
downloadchill-c285135eb903c31cd221f90f03e288a6b67770cd.tar.gz
chill-c285135eb903c31cd221f90f03e288a6b67770cd.tar.bz2
chill-c285135eb903c31cd221f90f03e288a6b67770cd.zip
pre-v0.2.1
Diffstat (limited to 'examples')
-rw-r--r--examples/chill/gemm.c18
-rw-r--r--examples/chill/gemm.script31
-rw-r--r--examples/chill/gemv.c15
-rw-r--r--examples/chill/gemv.script9
-rw-r--r--examples/chill/jacobi1.c13
-rw-r--r--examples/chill/jacobi1.script18
-rw-r--r--examples/chill/jacobi2.c15
-rw-r--r--examples/chill/jacobi2.script21
-rw-r--r--examples/chill/unroll.c33
-rw-r--r--examples/chill/unroll.script35
-rw-r--r--examples/cuda-chill/cp.c29
-rw-r--r--examples/cuda-chill/cp.lua46
-rw-r--r--examples/cuda-chill/cudaize.lua1004
-rwxr-xr-xexamples/cuda-chill/cudaize.py1047
-rw-r--r--examples/cuda-chill/mm.c10
-rw-r--r--examples/cuda-chill/mm.lua38
-rwxr-xr-xexamples/cuda-chill/mpeg4.c23
-rw-r--r--examples/cuda-chill/mpeg4.lua45
-rwxr-xr-xexamples/cuda-chill/mriq-fh.c38
-rwxr-xr-xexamples/cuda-chill/mriq-fh.lua73
-rw-r--r--examples/cuda-chill/mriq.c33
-rw-r--r--examples/cuda-chill/mriq.lua55
-rw-r--r--examples/cuda-chill/mv-shadow.c9
-rw-r--r--examples/cuda-chill/mv-shadow.lua65
-rw-r--r--examples/cuda-chill/mv.c9
-rw-r--r--examples/cuda-chill/mv.lua65
-rw-r--r--examples/cuda-chill/mv_try.c9
-rw-r--r--examples/cuda-chill/mv_try.lua14
-rw-r--r--examples/cuda-chill/nbody.c66
-rw-r--r--examples/cuda-chill/nbody.lua53
-rw-r--r--examples/cuda-chill/tmv-shadow.c9
-rw-r--r--examples/cuda-chill/tmv-shadow.lua50
-rw-r--r--examples/cuda-chill/tmv.c9
-rw-r--r--examples/cuda-chill/tmv.lua50
-rw-r--r--examples/fortran/README10
-rw-r--r--examples/fortran/ccd.f32
-rw-r--r--examples/fortran/ccd.script18
-rw-r--r--examples/fortran/gemm.f9058
-rw-r--r--examples/fortran/gemm.script30
-rw-r--r--examples/fortran/rose_gemm.f90155
40 files changed, 0 insertions, 3360 deletions
diff --git a/examples/chill/gemm.c b/examples/chill/gemm.c
deleted file mode 100644
index a565511..0000000
--- a/examples/chill/gemm.c
+++ /dev/null
@@ -1,18 +0,0 @@
-
-#define N 512
-
-/* Matrix-multiply kernel: accumulates a[i][k] * b[k][j] into c[i][j] over a
- * j-k-i loop nest of N iterations each. The arrays are never initialized and
- * nothing is printed; the function only provides the loop nest. */
-int main() {
-
- float a[N][N], b[N][N], c[N][N];
-
- int i, j, k;
-
- /* Loop order is j (outer), k, i (inner). */
- for (j = 0; j < N; j++)
- for (k = 0; k < N; k++)
- for (i = 0; i < N; i++) {
- c[i][j] = c[i][j] + a[i][k] * b[k][j];
- }
-
- return 0;
-}
-
diff --git a/examples/chill/gemm.script b/examples/chill/gemm.script
deleted file mode 100644
index ed91567..0000000
--- a/examples/chill/gemm.script
+++ /dev/null
@@ -1,31 +0,0 @@
-# Matrix multiply with a large array size, for an Intel machine.
-source: gemm.c
-procedure: main
-format: rose
-loop: 0
-
-# Sizes passed as the last argument of the tile() calls below.
-TI = 128
-TJ = 8
-TK = 512
-# Amounts passed as the last argument of the unroll() calls below.
-UI = 2
-UJ = 2
-
-permute([3,1,2])
-tile(0,2,TJ)
-#print space
-tile(0,2,TI)
-#print space
-tile(0,5,TK)
-#print space
-
-datacopy(0,3,a,false,1)
-#print space
-
-datacopy(0,4,b)
-print
-unroll(0,4,UI)#print space
-print
-unroll(0,5,UJ)
-#print space
-print
-
diff --git a/examples/chill/gemv.c b/examples/chill/gemv.c
deleted file mode 100644
index 610d4cb..0000000
--- a/examples/chill/gemv.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#define N 10
-
-/* Matrix-vector multiply kernel: accumulates c[i][j] * b[j] into a[i].
- * Both loops start at index 1, so row 0 and column 0 are never touched.
- * The arrays are never initialized and there is no explicit return. */
-int main() {
- // int n;
- float a[N];
- float b[N];
- float c[N][N];
-
- int i, j;
-
- for (i = 1; i < N; i++)
- for (j = 1; j < N; j++)
- a[i] = a[i] + c[i][j] * b[j];
-
-}
diff --git a/examples/chill/gemv.script b/examples/chill/gemv.script
deleted file mode 100644
index f1d5f89..0000000
--- a/examples/chill/gemv.script
+++ /dev/null
@@ -1,9 +0,0 @@
-source: gemv.c # matrix-vector multiply
-procedure: main
-format : rose
-loop: 0
-
-
-
-# The only commands issued for loop 0 are original() and print.
-original()
-print
diff --git a/examples/chill/jacobi1.c b/examples/chill/jacobi1.c
deleted file mode 100644
index 0fcaee4..0000000
--- a/examples/chill/jacobi1.c
+++ /dev/null
@@ -1,13 +0,0 @@
-#define N 512
-
-/* Jacobi-style stencil with an explicit time dimension: row t of `a` is
- * computed from three neighbouring elements of row t - 1. The loop bounds
- * are left exactly as written; only the array declaration is adjusted. */
-int main() {
-  int i, t;
-
-  /* One extra column: the stencil reads a[t - 1][i + 1] with i == N - 1,
-   * which is one element past the end of a row of N elements. */
-  float a[N][N + 1];
-
-  for (t = 2; t <= 100; t++)
-    for (i = 2; i <= N - 1; i++)
-      a[t][i] = a[t - 1][i - 1] + a[t - 1][i] + a[t - 1][i + 1];
-
-  return 0;
-}
diff --git a/examples/chill/jacobi1.script b/examples/chill/jacobi1.script
deleted file mode 100644
index c0dec8d..0000000
--- a/examples/chill/jacobi1.script
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# Tiling a perfect Jacobi loop nest with a time step: apply a
-# unimodular transformation first (only applicable to a
-# perfect loop nest) to make tiling legal.
-#
-
-source: jacobi1.c
-procedure: main
-format : rose
-loop: 0
-
-print dep
-
-nonsingular([[1,0],[1,1]]) # unimodular matrix, determinant is one
-tile(0,2,64)
-
-print dep
-print
diff --git a/examples/chill/jacobi2.c b/examples/chill/jacobi2.c
deleted file mode 100644
index b8d8d7b..0000000
--- a/examples/chill/jacobi2.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#define N 512
-
-/* Two-statement Jacobi sweep: for each of 100 time steps, b receives a
- * weighted average of neighbouring elements of a, then b is copied back
- * into a. The loop bounds are left exactly as written; only the
- * declaration of `a` is adjusted. */
-int main() {
-  /* One extra element: the first inner loop reads a[i + 1] with
-   * i == N - 1, i.e. a[N]. */
-  double a[N + 1];
-  double b[N];
-  int t, i;
-  for (t = 1; t <= 100; t++) {
-    for (i = 2; i <= N - 1; i++)
-      b[i] = (double) 0.25 * (a[i - 1] + a[i + 1]) + (double) 0.5 * a[i];
-
-    for (i = 2; i <= N - 1; i++)
-      a[i] = b[i];
-  }
-  return 0;
-}
diff --git a/examples/chill/jacobi2.script b/examples/chill/jacobi2.script
deleted file mode 100644
index afe14c6..0000000
--- a/examples/chill/jacobi2.script
+++ /dev/null
@@ -1,21 +0,0 @@
-#
-# Tiling an imperfect Jacobi loop nest. More details are in the paper
-# "Automatic Tiling of Iterative Stencil Loops" by Zhiyuan Li and
-# Yonghong Song, TOPLAS, 2004.
-#
-
-source: jacobi2.c
-procedure: main
-format: rose
-loop: 0
-
-print dep
-
-original()
-shift([1], 2, 1)
-fuse([0,1], 2) # optional
-skew([0,1], 2, [2,1])
-tile(0, 2, 32, 1)
-
-print dep
-print
diff --git a/examples/chill/unroll.c b/examples/chill/unroll.c
deleted file mode 100644
index e74dea3..0000000
--- a/examples/chill/unroll.c
+++ /dev/null
@@ -1,33 +0,0 @@
-
-#define N 14
-#define DT 0.314
-
-/*
- * Four independent loops over caller-supplied arrays:
- *   1. x[1..14] = 1.0 with unit stride;
- *   2. y[1], y[4], ..., y[13] = 1.0 with stride 3;
- *   3. z[N + 1], z[N + 4], ..., z[N + 19] = 1.0 with stride 3;
- *   4. for each i in 0..N, f3[i] accumulates f1[j] * w[j - i] for
- *      j in i..i + N and is then scaled by DT.
- *
- * The caller must provide at least 15 elements for x, 14 for y,
- * N + 20 for z, N + 1 for f3 and w, and 2 * N + 1 for f1.
- * The parameter n is not used. Nothing is returned.
- */
-void foo(int n, float* x, float* y, float* z, float* f3, float* f1, float* w) {
-
-  int i, j;
-
-  for (i = 1; i <= 14; i++)
-    x[i] = 1.0;
-
-  for (i = 1; i <= 14; i += 3)
-    y[i] = 1.0;
-
-  for (i = N + 1; i <= N + 20; i += 3)
-    z[i] = 1.0;
-
-  for (i = 0; i <= N; i++) {
-    for (j = i; j <= i + N; j++)
-      f3[i] = f3[i] + f1[j] * w[j - i];
-    f3[i] = f3[i] * DT;
-  }
-
-  /* No return statement: a void function must not return a value. */
-}
-
-/* Driver: calls foo() once on zero-initialized local arrays. */
-int main() {
-  /* Sized for the largest index foo() touches: x[14], y[13], z[N + 19],
-   * f3[N], w[N] and f1[2 * N]. Arrays of N elements would be overrun. */
-  float x[N + 1] = {0}, y[N + 1] = {0}, z[N + 20] = {0};
-  float f3[N + 1] = {0}, f1[2 * N + 1] = {0}, w[N + 1] = {0};
-
-  foo(N, x, y, z, f3, f1, w);
-  return 0;
-}
-
diff --git a/examples/chill/unroll.script b/examples/chill/unroll.script
deleted file mode 100644
index e64acb6..0000000
--- a/examples/chill/unroll.script
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Test unroll-and-jam. The last loop is adapted from the simple
-# convolution example on p463 of "Optimizing Compilers for
-# Modern Architectures", by Randy Allen and Ken Kennedy.
-#
-
-source: unroll.c
-procedure: foo
-format: rose
-# fully unroll a loop with known iteration count
-loop: 0
-original()
-unroll(0,1,3)
-print
-print space
-
-
-# a strided loop
-loop: 1
-original()
-unroll(0,1,2)
-print
-print space
-
-# lower and upper bounds are not constant
-loop: 2
-original()
-unroll(0,1,20)
-print
-
-# parallelogram iteration space
-loop: 3
-original()
-unroll(0,1,2)
-print
diff --git a/examples/cuda-chill/cp.c b/examples/cuda-chill/cp.c
deleted file mode 100644
index 837d7a6..0000000
--- a/examples/cuda-chill/cp.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#define N 1
-
-#define VOLSIZEY 512
-#define VOLSIZEX 512
-#define VOLSIZEZ 1
-#define ATOMCOUNT 4000
-#define GRIDSPACING 0.1
-#define zDim 0
-
-extern float sqrtf(float);
-
-/*
- * For every (j, i) point of a VOLSIZEY x VOLSIZEX grid, accumulates
- * atoms[n+3] / sqrtf(dx*dx + dy*dy + dz*dz) into one element of `energy`,
- * where (dx, dy, dz) is the offset between the grid point
- * (GRIDSPACING * i, GRIDSPACING * j, z) and (atoms[n], atoms[n+1], atoms[n+2]).
- * `energy` is only added to, never cleared, so the caller supplies its
- * initial contents.
- * NOTE(review): `atoms` is declared with ATOMCOUNT*4 elements but n only
- * runs over [0, ATOMCOUNT) in steps of 4 -- confirm this is intended.
- */
-void cenergy_cpu(float atoms[ATOMCOUNT*4],float *energy,float z)
-{
-int i,j,n;float dx,dy,dz;
-
- for (j=0; j<VOLSIZEY; j++) {
- for (i=0; i<VOLSIZEX; i++) {
- for (n=0;n<ATOMCOUNT;n+=4) {
- dx = (GRIDSPACING * i) - atoms[n];
- dy = (GRIDSPACING * j) - atoms[n+1];
- dz = z - atoms[n+2];
- energy[(j*VOLSIZEX + i)+VOLSIZEX*VOLSIZEY*zDim] += atoms[n+3]/sqrtf( (dx*dx) + (dy*dy)+ (dz*dz) ) ;
- }
-
-
- }
- }
-}
-
diff --git a/examples/cuda-chill/cp.lua b/examples/cuda-chill/cp.lua
deleted file mode 100644
index 1ef2264..0000000
--- a/examples/cuda-chill/cp.lua
+++ /dev/null
@@ -1,46 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is the procedure and third is the loop #
-init("cp.c", "cenergy_cpu", 0)
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-V=512
---NOTE(review): N is assigned twice; the value in effect for cudaize() below is 1.
-N=4000
-N=1
-
-Tj=32
-Ti=16
-Tii=16
-Tjj=16
-
---normalize_index("j")
---normalize_index("i")
-print_code()
-normalize_index("n")
--- TILE COMMANDS ZEROOOOOOOOOOO:3
---permute(0,{"i","j","n"})
---tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","n"})--CU=-1
-tile_by_index({"j","i"},{Tj,Ti},{l1_control="jj",l2_control="ii"},{"jj","ii","j","i","n"})--CU=-1
---tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1
-
---tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3
---tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","i","iii","j","jjj","n"})--CU=3
---tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"})
---tile_by_index({"i"}, {Tii}, {l1_control="iii",l1_tile="i"}, {"ii", "jj", "iii","i","j","n"})
-print_code()
-cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"jj","ii"}, thread={"j","i"}})--CU=3
---cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"ii","jj"}, thread={"i","j"}})--CU=3
-print_code()
-copy_to_shared("tx","atoms",-16)
-copy_to_registers("tx","energy")
---copy_to_texture("atoms")
---unroll_to_depth(1)
---unroll(0,9,0)
---unroll(0,5,0)
-
---unroll(0,8,256)
-print_code()
diff --git a/examples/cuda-chill/cudaize.lua b/examples/cuda-chill/cudaize.lua
deleted file mode 100644
index 7359cca..0000000
--- a/examples/cuda-chill/cudaize.lua
+++ /dev/null
@@ -1,1004 +0,0 @@
-
--- THIS IS CUDAIZE.LUA
-
---Report whether `key` is present as a key of `table`.
---The stored value is irrelevant: a key mapped to false still counts.
-function table.contains_key(table, key)
-   return rawget(table, key) ~= nil
-end
-
---Check that every index name listed in `indices` is one of the current
---index names of statement `stmt`, as reported by cur_indices().
---Returns true when all names are present, false otherwise.
-function valid_indices(stmt, indices)
-   --`cur` is left global on purpose: the other lookup routines in this
-   --file assign the same global.
-   cur = cur_indices(stmt)
-   local present = {}
-   for _, name in pairs(cur) do
-      present[name] = true
-   end
-   --Compare the names themselves. Comparing table positions (the keys of
-   --both lists) would accept any name as long as the list is short enough.
-   for _, name in pairs(indices) do
-      if not present[name] then
-         return false
-      end
-   end
-   return true
-end
-
---Return the position of the first entry after `level` in `cur_idxs` whose
---length is greater than zero, or -1 when no such entry remains.
-function next_clean_level(cur_idxs,level)
-   local pos = level + 1
-   local last = #cur_idxs
-   while pos <= last do
-      if 0 < #cur_idxs[pos] then
-         return pos
-      end
-      pos = pos + 1
-   end
-   --sentinel: there were no non-dummy indices left
-   return -1
-end
-
---Build the index order for one tiling step from `final_order`.
---  final_order:    list of index names in their intended final order.
---  tile_idx_names: list searched (from position cur_level+2 on) to decide
---                  whether a substitution from tile_idx_map is applied.
---  ctrl_idx_names: list of control index names; a name found in it at
---                  position cur_level+2 or later is left out of the result.
---  tile_idx_map:   table name -> replacement name; the replacement is used
---                  only when it still occurs in tile_idx_names at position
---                  cur_level+2 or later.
---  cur_level:      current step (callers in this file pass -1 and i-1).
---Returns a new list; no global variables are written.
-function build_order(final_order, tile_idx_names, ctrl_idx_names, tile_idx_map, cur_level)
-   local first_pending = cur_level + 2
-
-   --True when `name` occurs in `names` at position first_pending or later.
-   local function is_pending(names, name)
-      for j = first_pending, #names do
-         if names[j] == name then
-            return true
-         end
-      end
-      return false
-   end
-
-   local order = {}
-   for _, name in ipairs(final_order) do
-      local entry = name
-      --possibly substitute the tile index if necessary
-      local replacement = tile_idx_map[name]
-      if replacement ~= nil and is_pending(tile_idx_names, replacement) then
-         entry = replacement
-      end
-      --control loops below our current level should not be in the current order
-      if not is_pending(ctrl_idx_names, name) then
-         table.insert(order, entry)
-      end
-   end
-   return order
-end
-
---Helpful debug output: join the entries of `str_list` with ", ".
---Returns an empty string for an empty list. Uses only local state, so the
---global `l` written elsewhere in this file is not disturbed.
-function list_to_string(str_list)
-   local parts = {}
-   --ipairs stops at the first nil entry
-   for _, str in ipairs(str_list) do
-      parts[#parts + 1] = str
-   end
-   return table.concat(parts, ", ")
-end
-
-
---Look up `idx` among the current indices of statement `stmt` and return its
---1-based position. Raises an error when the name is not present.
---The freshly fetched index list is stored in the global `cur`.
-function find_cur_level(stmt,idx)
-   cur = cur_indices(stmt)
-   local pos = 1
-   --walk the list until the first nil entry
-   while cur[pos] ~= nil do
-      if cur[pos] == idx then
-         return pos
-      end
-      pos = pos + 1
-   end
-   error("Unable to find "..idx.." in current list of indices")
-end
-
-
---Non-raising variant of the index lookup: return the 1-based position of
---`idx` among the current indices of statement `stmt`, or -1 when absent.
---The freshly fetched index list is stored in the global `cur`.
-function chk_cur_level(stmt,idx)
-   cur = cur_indices(stmt)
-   local pos = 1
-   --walk the list until the first nil entry
-   while cur[pos] ~= nil do
-      if cur[pos] == idx then
-         return pos
-      end
-      pos = pos + 1
-   end
-   return -1
-end
-
-
---Compute the offset between a tile loop and its control loop in `cur_order`.
---  cur_order: list of index names.
---  tile:      name of the tile loop.
---  control:   name of the control loop.
---When a name occurs more than once, its last position is used.
---Returns control_pos - tile_pos + 1 when the control loop comes before the
---tile loop, otherwise control_pos - tile_pos.
---Raises an error when either name is missing from `cur_order`.
-function find_offset(cur_order, tile, control)
-   local tile_pos = -1
-   local control_pos = -1
-   for i, name in ipairs(cur_order) do
-      if name == tile then
-         tile_pos = i
-      end
-      if name == control then
-         control_pos = i
-      end
-   end
-   --tostring() keeps the message intact when a name is nil
-   if tile_pos < 0 then
-      error("Unable to find tile " .. tostring(tile) .. " in current list of indices")
-   end
-   if control_pos < 0 then
-      error("Unable to find control " .. tostring(control) .. " in current list of indices")
-   end
-   if control_pos < tile_pos then
-      return control_pos - tile_pos + 1
-   end
-   return control_pos - tile_pos
-end
-
---Tile the loops named in `tile_indices` of statement 0, permuting after
---each step according to `final_order`.
---  tile_indices: list of current index names to tile, in processing order.
---  sizes:        sizes[i] is the size passed to tile() for tile_indices[i].
---  index_names:  table whose keys have the form "l<i>_control" or
---                "l<i>_tile" (i = position in tile_indices, 1-based); the
---                value names the control loop or the tile loop for that
---                position. Any other key raises an error.
---  final_order:  list of index names handed to build_order() before every
---                permute() call.
---  tile_method:  forwarded to tile(); defaults to the global `counted`.
---Raises an error when valid_indices() rejects `tile_indices`.
---All working variables (stmt, cur, tile_idx_names, ctrl_idx_names,
---tile_idx_map, cur_order, level, offset, ...) are globals.
-function tile_by_index(tile_indices, sizes, index_names, final_order, tile_method)
- --print "STARTING TILE BY INDEX"
- --io.flush()
- stmt = 0 --assume stmt 0
- cur = cur_indices(stmt)
- --print("Cur indices "..list_to_string(cur))
- if not valid_indices(stmt,tile_indices) then
- error('One of the indices in the first parameter were not '..
- 'found in the current set of indices.')
- end
- if not tile_method then tile_method = counted end
- tile_idx_names = {}
- for i,s in ipairs(tile_indices) do tile_idx_names[i]=s end --shallow copy
- --print("tile_index_names: ['"..list_to_string(tile_indices).."']")
-
- --print("index_names: ")
- --for k,v in pairs(index_names) do print(k,v) end
-
- --io.flush()
-
- ctrl_idx_names = {}
- tile_idx_map = {}
- --Parse every "l<i>_control" / "l<i>_tile" key of index_names.
- for k,v in pairs(index_names) do
- valid = false
- if(string.sub(k,1,1) == "l") then
- if string.sub(k,-8) == "_control" then
- i = tonumber(string.sub(k,2,-9))
- if i and i >= 1 and i <= (# tile_indices) then
- ctrl_idx_names[i] = v
- --print(string.format("Handling control %s for loop level %d",v,i))
- --print("control "..k.." name "..v.." ")
- valid = true
- end
- elseif string.sub(k,-5) == "_tile" then
- i = tonumber(string.sub(k,2,-6))
- if i and i >= 1 and i <= (# tile_indices) then
- --print(string.format("tile %s -> %s",tile_indices[i], v))
- tile_idx_names[i] = v
- tile_idx_map[v] = tile_indices[i]
- --print(string.format("tile %s -> %s",tile_indices[i], v))
- valid = true
- end
- end
- end
- if not valid then error(string.format("%s is not a proper key for specifying "..
- "tile or control loop indices\n", k)) end
- end
-
- --filter out control indices (and do name substitution of unprocessed tile indices) for a given level
- cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, -1)
- permute(stmt, cur_order)
-
- for i,cur_idx in ipairs(tile_indices) do
- --print(string.format("i %d cur_idx %s calling build order ********", i-1, cur_idx))
- cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
- --Find an offset between tile loop and control loop
- -- 0 = control loop one level above tile loop
- -- -1 = control loop two levels above tile loop
- -- > 0 = tile loop above control loop
- -- In the last case, we do two extra tile commands to get the control
- -- above the tile and then rely on the final permute to handle the
- -- rest
- level = find_cur_level(stmt,cur_idx)
- offset = find_offset(cur_order, tile_idx_names[i], ctrl_idx_names[i])
- --print(string.format("offset %d", offset))
-
- if (offset <= 0) then
- --print(string.format("[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %s)",stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method))
- tile(stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method)
- else
- --print(string.format("2tile(%d, %d, %d, %d, %s, %s, %s)", stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method))
- tile(stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method);--regular level
- --flip tile and control loop
- --print(string.format("3tile(%d, %d, %d)",stmt, level+1, level+1))
- tile(stmt, level+1, level+1);
- --print(string.format("4tile(%d, %d, %d)",stmt, level+1, level))
- tile(stmt, level+1, level);
- --print(string.format("\n[offset>0]tile(%d, %d, %d, %d,%s,%s,%s)",stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method))
- --print_code()
-
- end
-
- --Do permutation based on cur_order
- --print "permute based on build order calling build_order()"
- --print "cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)"
- cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
- --print "permute(stmt, cur_order);"
- permute(stmt, cur_order);
- --print "\nafter permute(), code is:"
- --print_code()
- end
- --print "ENDING TILE BY INDEX"
- --print_code()
-end
-
---Look up the current level of the loop named `index` in statement 0 and
---issue tile(stmt, level, level) on it (labelled "[Normalize]" in the debug
---output of this file). Raises an error, via find_cur_level(), when `index`
---is not a current index name.
-function normalize_index(index)
-   stmt = 0 --assume stmt 0 (kept global, like the other routines in this file)
-   --local, so the global `l` used by list_to_string is not overwritten
-   local level = find_cur_level(stmt, index)
-   tile(stmt, level, level)
-   --print(string.format("\n[Normalize]tile(%d, %d, %d)",stmt, level, level))
-end
-
---Return true when `idx` is one of the current index names of statement
---`stmt`, false otherwise. The fetched index list is stored in the global
---`cur`, as the other lookup routines in this file do.
-function is_in_indices(stmt, idx)
-   cur = cur_indices(stmt)
-   --Lua lists start at 1. Starting at 0 compared `idx` against the nil slot
-   --cur[0], so a nil `idx` was reported as present.
-   for i = 1, #cur do
-      if cur[i] == idx then
-         return true
-      end
-   end
-   return false
-end
-
-
---Privatize `array_name` at loop `start_loop` of statement 0 by calling
---datacopy_privatized().
---  start_loop: name of the loop index at which the copy is made.
---  array_name: name of the array handed to datacopy_privatized().
---Steps, in order:
---  1. Reposition loops with tile() so that "ty" follows "tx" and the loops
---     after them are re-tiled (see the [Malik] notes below).
---  2. Collect into hold_constant every block and thread index whose current
---     level is >= the level of start_loop.
---  3. Call datacopy_privatized(stmt, start_loop, array_name, hold_constant).
---The follow-up normalization and unrolling of the generated copy loops is
---commented out, so nothing further is done to them here.
---All working variables are globals. Raises an error, via find_cur_level(),
---when start_loop (or an index looked up along the way) is not present.
-function copy_to_registers(start_loop, array_name)
-
- --print("\n\n****** starting copy to registers")
- io.flush()
-
- stmt = 0 --assume stmt 0
-
- -- [Malik] first we make sure that tx and ty are consecutive loops in the 2D thread setup, otherwise all levels for subsequent operations are messed up. Start logic.
- cur = cur_indices(stmt)
- table_Size = table.getn(cur)
-
- --print(string.format("Cur indices %s,",list_to_string(cur)))
- --print(string.format("The table size is %d", table_Size))
- --table.foreach(cur, print)
- --print_code()
-
- level_tx = -1
- level_ty = -1
- if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end
- if is_in_indices(stmt,"ty") then level_ty = find_cur_level(stmt,"ty") end
- --print(string.format("level_tx %d level_ty %d", level_tx, level_ty))
-
- ty_lookup_idx = ""
- org_level_ty = level_ty
-
- --if(cur[level_tx+1]~=nil and cur[level_tx+1]~="") then ty_lookup = ty_lookup+1 end
- if(cur[level_ty+1]~=nil and cur[level_ty+1]~="") then
- --print(string.format("IF cur[%d] = %s", level_ty+1, cur[level_ty+1]))
- ty_lookup_idx = cur[level_ty+1]
- else
- --if cur[level_ty] ~= nil then print(string.format("ELSE ty_lookup_idx = cur[%d] = %s", level_ty, cur[level_ty])) -- TODO
- --else print "ELSE (dangerous)" end
- ty_lookup_idx = cur[level_ty] -- may assign nil !?
- end
- --if ty_lookup_idx ~= nil then print(string.format("ty_lookup_idx '%s'", ty_lookup_idx)) -- TODO
- --else print "ty_lookup_idx is NIL"
- --end
-
- if level_ty > 0 then
- --print(string.format("\ntile3(%d,%d,%d)",stmt,level_ty,level_tx+1))
- tile(stmt,level_ty,level_tx+1)
- end
- --print_code()
-
- --print("\ntylookup is %d",ty_lookup)
- --exit(0)
- --
- cur = cur_indices(stmt)
- table_Size = table.getn(cur)
- --print(string.format("Cur indices %s,",list_to_string(cur)))
- --print("The table size is "..table.getn(cur))
- --table.foreach(cur, print)
-
- if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end
- if ty_lookup_idx then
- if is_in_indices(stmt,ty_lookup_idx) then level_ty = find_cur_level(stmt,ty_lookup_idx) end
- end
-
- ty_lookup = 1
- idx_flag = -1
- -- find the level of the next valid index after ty+1
- --print(string.format("\nlevel_ty %d", level_ty))
- if level_ty > 0 then
- --print(string.format("table_Size %d", table_Size))
- for num= level_ty+ty_lookup,table_Size do
- --print(string.format("num=%d cur[num] = '%s'",num, cur[num]))
- if(cur[num] ~= "") then
- idx_flag = find_cur_level(stmt,cur[num])
- --print (string.format("idx_flag = %d", idx_flag))
- break
- end
- end
- end
-
- --print(string.format("\n(first) I am checking all indexes after ty+1 %s",idx_flag))
- --print_code()
- --print ""
-
- how_many_levels = 1
- startat = idx_flag + 1
- if startat == 0 then startat = 1 end -- avoid attempt to examine an illegal array offset
- --print(string.format("idx_flag = %d I will check levels starting with %d", idx_flag, idx_flag+1))
-
- for ch_lev = startat,table_Size,1 do -- was for ch_lev = idx_flag+1,table_Size,1 do
- --print(string.format("ch_lev %d", ch_lev))
- if(cur[ch_lev] ~= nil and cur[ch_lev] ~= "") then
- --print(string.format("cur[%d] = '%s'", ch_lev, cur[ch_lev]))
- how_many_levels = how_many_levels+1
- end
- end
- --print("\nHow Many Levels",how_many_levels)
-
- -- change this all to reflect the real logic which is to normalize all loops inside the thread loops.
- if(how_many_levels <2) then
- while( idx_flag >= 0) do
- for num = level_ty+ty_lookup,(table_Size) do
- --print(string.format("at top of loop, num is %d", num))
- --print(string.format("num %d", num))
- --print(string.format("cur[num] = '%s'", cur[num]))
- if(cur[num] ~= "") then
- idx=cur[num]
- --print(string.format("idx '%s'", idx))
-
- curlev = find_cur_level(stmt,idx)
- --print(string.format("curlev %d", curlev))
-
- --print_code()
- --print(string.format("\n[COPYTOREG]tile(%d,%d,%d)",stmt,find_cur_level(stmt,idx),level_tx))
- tile(stmt,find_cur_level(stmt,idx),find_cur_level(stmt,idx))
- curlev = find_cur_level(stmt,idx)
- --print(string.format("curlev %d", curlev))
- tile(stmt,find_cur_level(stmt,idx),level_tx)
- --print(string.format("hehe '%s'",cur[num]))
-
- cur = cur_indices(stmt)
- --print("Cur indices INSIDE"..list_to_string(cur))
- table_Size = table.getn(cur)
- --print(string.format("Table Size is: %d",table_Size))
- level_tx = find_cur_level(stmt,"tx")
- --print(string.format("\n level TX is: %d",level_tx))
- level_ty = find_cur_level(stmt,ty_lookup_idx)
- --print(string.format("\n level TY is: %d",level_ty))
- idx_flag = -1
- --print "idx_flag = -1"
-
- -- find the level of the next valid index after ty+1
-
- -- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?)
- for num= level_ty+ty_lookup,table_Size do
- --print(string.format("num mucking num = %d", num))
- if(cur[num] ~= nil and cur[num] ~= "") then
- idx_flag = find_cur_level(stmt,cur[num])
- --print("\n(second) I am checking all indexes after ty+1 %s",cur[num])
- break
- end
- end
- --print(string.format("num mucked to %d idx_flag = %d", num, idx_flag))
-
- end
- --print(string.format("at bottom of loop, num is %d", num))
- end
- end
- end
- --print "done with levels"
-
-
-
-
- --print "ARE WE SYNCED HERE?"
- --print_code()
- --print("\ntile(%d,%d,%d)",stmt,level_k,level_k)
- --tile(stmt,level_k,level_k)
-
- -- [Malik] end logic
- --print_code()
- start_level = find_cur_level(stmt, start_loop)
- --We should hold constant any block or tile loop
- block_idxs = block_indices()
- thread_idxs = thread_indices()
- --print("\nblock indices are")
- --table.foreach(block_idxs, print)
- --print("\nthread indices are")
- --table.foreach(thread_idxs, print)
- --print(string.format("\nStart Level: %d",start_level))
-
- hold_constant = {}
- --print("\n Now in Blocks")
- for i,idx in ipairs(block_idxs) do
- --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx)))
- if find_cur_level(stmt,idx) >= start_level then
- table.insert(hold_constant, idx)
- --print(string.format("\nJust inserted block %s in hold_constant",idx))
- end
- end
-
-
- --print("\n Now in Threads")
- for i,idx in ipairs(thread_idxs) do
- --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx)))
- if find_cur_level(stmt,idx) >= start_level then
- table.insert(hold_constant, idx)
- --print(string.format("\nJust inserted thread %s in hold_constant",idx))
- end
- end
-
- --print "\nhold constant table is: "
- --table.foreach(hold_constant, print)
-
- --print("\nbefore datacopy pvt")
- old_num_stmts = num_statements()
- --print_code()
- --print(string.format("\n[DataCopy]datacopy_privatized(%d, %s, %s, vector having privatized levels)",stmt, start_loop, array_name))
- --table.foreach(hold_constant, print)
- datacopy_privatized(stmt, start_loop, array_name, hold_constant)
-
- --print(hold_constant)
- new_num_stmts = num_statements()
- --print("\nthe num of statements:%d\n",new_num_stmt)
- --print_code()
- --exit(0)
- -- [Malik] normalize the copy loops created.
- cur = cur_indices(old_num_stmts)
- --print("Cur indices "..list_to_string(cur))
- for cidx,i in ipairs(cur) do
- if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
- --tile(old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i))
- --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i))
- end
- end
- --print_code()
- --print("\nthe num of statements OLD+1 :",(old_num_stmts+1))
-
-
---[[
- is this commented out? why yes, yes it is block comment
- if( (old_num_stmts+1) <= new_num_stmts) then
- cur = cur_indices(old_num_stmts+1)
- --print("Cur indices+1 "..list_to_string(cur))
- for cidx,i in ipairs(cur) do
- if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
- tile(old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
- --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
- end
- end
- end
---]]
-
-
- --Unroll to the last thread level
- --for stmt=old_num_stmts,new_num_stmts-1 do
- -- level = find_cur_level(stmt,thread_idxs[#thread_idxs])--get last thread level
- --if level < #cur_indices(stmt) then
- -- unroll(stmt,level+1,0)
- --print(string.format("\n[Unroll]unroll(%d, %d, 0)",stmt, level+1))
- ----print_code()
- --end
- --end
- io.flush()
- --print("****** ending copy to registers\n\n")
- --io.flush()
-end
-
-function copy_to_shared(start_loop, array_name, alignment)
- --print(string.format("\nstarting copy to shared(%s, %s, %d )",start_loop,array_name,alignment))
- stmt = 0 --assume stmt 0
- cur = cur_indices(stmt)
- --print("Cur indices "..list_to_string(cur))
-
- start_level = find_cur_level(stmt, start_loop)
- --print(string.format("start_level %d", start_level))
-
- old_num_stmts = num_statements()
- --print(string.format("old_num_statements %d", old_num_stmts))
-
- --Now, we give it indices for up to two dimentions for copy loop
- copy_loop_idxs = {"tmp1","tmp2"}
- --print(string.format("\n[DataCopy]datacopy(%d, %d, %s, {\"tmp1\",\"tmp2\"},false,0,1,%d,true)",stmt, start_level, array_name, alignment))
- datacopy(stmt, start_level, array_name, copy_loop_idxs, false, 0, 1, alignment,true)
-
- add_sync(stmt,start_loop)
- new_num_stmts = num_statements()
-
- --This is fairly CUBLAS2 specific, not sure how well it generalizes,
- --but for a 2D copy, what we want to do is "normalize" the first loop
- --"tmp1" then get its hard upper bound. We then want to tile it to
- --make the control loop of that tile "ty". We then tile "tmp2" with a
- --size of 1 and make it "tx".
- --print(string.format("fairly CUBLAS2 specific, OLD %d NEW %d", old_num_stmts, new_num_stmts ))
-
- for stmt=old_num_stmts,new_num_stmts-1 do
- --print(string.format("for stmt = %d", stmt))
- was_no_error, level = pcall(find_cur_level, stmt, "tmp2")
-
- if was_no_error then
- --print_code()
- --print("\nCopy to shared: [If was no error]\n")
- find_cur_level(stmt,"tmp2")
- tile(stmt, level, level)
-
- lower,upper = hard_loop_bounds(stmt, level)
- upper = upper + 1
- --print(string.format("lower %d upper %d", lower, upper))
-
- tx,ty = thread_dims()
- --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx)
-
- level = find_cur_level(stmt,"tmp1")
- --print(string.format("level %d", level))
-
- if tx == upper and ty == 1 then
- --print(string.format("tx = %d upper = %d ty = %d", tx, upper, ty))
- --print "Don't need"
-
- --Don't need an extra tile level, just move this loop up
- second_level = find_cur_level(stmt,"tmp2")
- --print(string.format("\n[Tile0]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx"))
- tile(stmt, second_level, 1, level, "tx", "tx", counted)
- else
- --print "DO need?"
- --print_code()
- if(ty == 1) then new_ctrl = "tmp3" else new_ctrl = "ty" end
-
-
---[[ Commenting out a block of Gabe's code in this control flow
- -- level = find_cur_level(stmt,"tmp1")
- tile(stmt, level, level)
-
- lower,upper = hard_loop_bounds(stmt, level)
- upper = upper + 1
- --print_code()
- --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx..", level: "..level)
- if(math.ceil(upper/ty) > 1)then
- tile(stmt, level, math.ceil(upper/ty), level, "tmp", new_ctrl, counted)
- --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level, math.ceil(upper/ty),upper,ty, level, "tmp", new_ctrl))
- else
- tile(stmt, level, math.ceil(upper/ty), level, "ty", new_ctrl, counted)
- --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level, math.ceil(upper/ty),upper,ty, level, "tx", new_ctrl))
- end
-
- --print_code()
- -- [Malik] If here we have the loop upper bound > tx, then we should tile once more after the next tile, to carve out the correct tx.
- lower1,upper1 = hard_loop_bounds(stmt,level)
- level1 = level
- stmt1 = stmt
- -- [Malik] Do the tile after the second level tile with if condition. Just to keep the original order, the tile is being pushed to the end.
-
- --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1)
-
- --print_code()
- --level = find_cur_level(stmt,"tmp")
- --tile(stmt,level,level)
- --print_code()
-
- --[Malik] if you are moving the loop above the level1, you need to update level1 with new position which would be level1+2 or second_level
- if(level <= level1) then level1 = level1+2 end
- --print(string.format("\n[Tile2]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx"))
- --print("\n----------------------------------")
- --print_code()
- --print("\n**********************************")
- --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1)
- -- [Malik] If the upper bound > tx, we do another tile to carve out the correct tx from a bigger loop. Else just normalize the bounds.
- if( upper1 > ty) then
- third_level = find_cur_level(stmt1,"tmp")
- --print("\n\n\n\t\t\t\tthirdlevel:"..third_level)
- tile(stmt1, third_level, ty, third_level, "ty", "tmp", counted)
- --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt1, third_level, ty,third_level, "ty", "tmp"))
- tile(stmt1,third_level+1,third_level+1)
- --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level+1))
- tile(stmt1,third_level+1,third_level)
- --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level))
- else
- tile(stmt1,level1,level1)
- --print(string.format("\n[Tile3ELSE]tile(%d, %d, %d)",stmt1,level1,level1))
- end
-
- --print("\nStarting tmp2\n");--print_code();
- second_level = find_cur_level(stmt,"tmp2")
- lower,upper = hard_loop_bounds(stmt,second_level)
- level = second_level
- --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..", level:"..level)
-
- if(math.ceil(upper/tx) > 1)then
- tile(stmt, second_level,math.ceil(upper/tx), level, "tmp", "tx", counted)
- --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tmp", "tx"))
- else
- tile(stmt, second_level,math.ceil(upper/tx), level, "tx", "tx", counted)
- --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tx", "tx"))
- end
- --print_code()
- lower2,upper2 = hard_loop_bounds(stmt,level)
- level2 = level
- stmt2 = stmt
- --print("[Malik]-loop cleanup@tmp2: lower2, upper2: "..lower2..", "..upper2..", tx: "..tx..", level:"..level2)
- -- now for the second level.
- if( upper2 > tx) then
- forth_level = find_cur_level(stmt2,"tmp")
- --print("\n\n\n\t\t\t\tforthlevel:"..forth_level)
- --print_code()
- tile(stmt2, forth_level, 1, forth_level, "tx", "tmp", counted)
- --print(string.format("\n[Tile3B]tile(%d, %d, %d,%d,%s,%s,counted)",stmt2, forth_level, tx,forth_level, "ty", "tmp"))
- --print_code()
- --tile(stmt2,forth_level+1,forth_level+1)
- --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level+1))
- --tile(stmt2,forth_level+1,forth_level)
- --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level))
- else
- new_level = find_cur_level(stmt2,"ty")
- tile(stmt2,level2,1,new_level,"tx","tx",counted)
- --print(string.format("\n[Tile3BELSE]tile(%d, %d, %d)",stmt2,level2,level2))
- tmp_level = find_cur_level(stmt2,"tmp")
- tile(stmt2,tmp_level,tmp_level)
- end
-
- --print_code()
- --print("\n----------------------------------")
---]]
-
- --print_code()
- --print("\nStarting tmp2\n");--print_code();
- first_level = find_cur_level(stmt,"tmp1")
- second_level = find_cur_level(stmt,"tmp2")
- lower,upper = hard_loop_bounds(stmt,second_level)
-
- --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..",first level:"..first_level..",second_level:"..second_level)
-
- -- Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
- --print(string.format("\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,1,first_level, "tx", "tx"))
- tile(stmt,second_level,1,first_level,"tx","tx",counted)
- --print_code()
-
- first_level = find_cur_level(stmt,"tmp1")
- lower_1,upper_1 = hard_loop_bounds(stmt,first_level)
- tx_level = find_cur_level(stmt,"tx")
- lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level)
- --print(string.format("UL_1 %d %d UL_tx %d %d", lower_1, upper_1, lower_tx, upper_tx))
-
- if(math.ceil(upper_tx/tx) > 1)then
- --print "ceil I say"
- --print(string.format("\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,tx,tx_level, "tx", "tmp1"))
- tile(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)
- --print_code()
-
- peat = find_cur_level(stmt,"tx")
- --print(string.format("\n[Tile1]tile(%d, %d, %d)",stmt, peat, peat))
- tile(stmt, peat, peat ) --find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))
- --print_code()
-
- if (find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx")) then
- --print(string.format("\nagain [Tile1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")))
- tile(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
- --print_code()
- end
- --else
- --tile(stmt, tx_level,1, tx_level, "tx", "tx", counted)
- --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,1,tx_level, "tx", "tx"))
- end
- --print_code()
- --]] -- this apparently is NOT the end of a block comment
-
- --print("\nStarting tmp1\n")
- -- Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
- tile(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))
- --print_code()
-
- ty_level = find_cur_level(stmt,"tmp1")
- lower_ty,upper_ty = hard_loop_bounds(stmt,ty_level)
-
- tx_level = find_cur_level(stmt,"tx")
- lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level)
- --print("[Malik]-loop cleanup@tmp1: lowerty, upperty: "..lower_ty..", "..upper_ty..", ty: "..ty..",ty level:"..ty_level..",tx_level:"..tx_level..", stmt: "..stmt)
-
- --print "before ceil"
- if(math.ceil(upper_ty/ty) > 1)then
- --print "CEIL IF"
- --print("\n Inside upper_ty/ty > 1\n");
-
- --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,ty,ty_level, "ty", "tmp_ty"))
- tile(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
- --print_code()
-
- --print(string.format("\n[Tile2-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt ,"ty"),find_cur_level(stmt,"ty")))
- tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))
- --print_code()
-
- -----------------------------------------------------------------------
- ----------------------------------------------------------------------
- cur_idxs = cur_indices(stmt)
- --print("\n cur indexes are "..list_to_string(cur_idxs))
-
- -- Putting ty before any tmp_tx
- idx_flag = -1
- for num= 0,table.getn(cur_idxs) do
- if(cur[num] == "tmp_tx") then
- idx_flag = find_cur_level(stmt,cur[num])
- break
- end
- end
- --print(string.format("\n (1) so i have found out the value of idx flag as %d",idx_flag) )
-
- if(idx_flag >=0 ) then
- if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then
- --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
- tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- --print_code()
- end
- end
-
- -- Now Putting ty before any tmp_ty
- idx_flag = -1
- for num= 0,table.getn(cur_idxs) do
- if(cur[num] == "tmp_ty") then
- idx_flag = find_cur_level(stmt,cur[num])
- break
- end
- end
- --print(string.format("\n IF so i have found out the value of idx flag as %d",idx_flag) )
- if(idx_flag >=0 ) then
- --print "one more test"
- if ((find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"))) then
- --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
- tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- --print_code()
- end
- end
- else
- --print "CEIL ELSE"
- --cur_idxs = cur_indices(stmt)
- --print("\n Inside upper_ty/ty <= 1\n");
-
- --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,1,ty_level, "ty", "ty"))
- tile(stmt, ty_level,1, ty_level, "ty", "ty", counted)
- --print_code()
-
- --print(string.format("\n[Tile3-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1))
- tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
- --print_code()
-
- idx_flag = -1
- if(cur_idxs) then
- --print "CAN NEVER GET HERE? cur_idxs"
- for num= 0,table.getn(cur_idxs) do
- if(cur[num] == "tmp_ty") then
- idx_flag = find_cur_level(stmt,cur[num])
- break
- end
- end
- end
- --print(string.format("\n ELSE so i have found out the value of idx flag as %d",idx_flag) )
- if(idx_flag >=0 ) then
- if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then
- --print(string.format("tile( stmt %d, level ty %d, level ty %d",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
- tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- --print(string.format("\n[Tile3-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
- end
- end
- end
-
- --print_code()
- end
-
-
- --print "\n\n *** at bottom of if in copy to shared, "
- --print_code()
- --print "end of if"
-
- else
- --copy to shared only created one level, not two, so we use a different approach (MV & TMV)
- --print("\nCopy to shared: [If was error]\n")
- level = find_cur_level(stmt,"tmp1")
- tile(stmt, level, level)
-
- --print(string.format("\n[Tile]tile(%d, %d, %d)",stmt, level, level))
- tx,ty = thread_dims()
- lower,upper = hard_loop_bounds(stmt, level)
- upper = upper+1 --upper bound given as <=, compare to dimensions tx which is <
- --print("upper "..upper.." tx "..tx)
- if upper == tx then
- rename_index(stmt, "tmp1", "tx")
- else
- --print("upper is not tx")
- --TODO: Don't know, maybe do some tileing etc
- --print_code()
- --print("upper "..upper.." tx "..tx.." stmt: "..stmt.." level: "..level)
- tile(stmt, level,tx,level, "tx", "tmp_tx", counted)
- --print_code()
-
- --print("stmt:"..stmt.." level+1: "..level+1)
- --print("TILE 7")
- tile(stmt, level+1,1,level+1,"tx", "tx",counted)
- --print("TILE 3")
- tile(stmt,level+1,level)
- --print_code()
-
- if(ty > 1) then
- --print_code()
- --print("GOING IN")
- lower,upper = hard_loop_bounds(stmt, level+1)
- --print(string.format("ty %d lower %d upper %d", ty, lower, upper))
- --upper=125
- --print("NOW FOR Y: upper "..upper.." ty "..ty.." stmt: "..stmt.." level: "..(level+1).." bound:"..math.ceil(upper/ty))
- tile(stmt, level+1,math.ceil(upper/ty),level+1, "tmp_ty", "ty", counted)
- --tile(stmt, level+2,math.ceil(upper/ty),level+2, "tmp_ty", "ty", counted)
- end
- --print_code()
- --rename_index(stmt, "tmp1", "tx")
- --print("Warning: Need to implement some logic here to tile the single level shared copy loop to match thread dimensions")
- end
- end
- --Always add sync
- add_sync(stmt,start_loop)
-
- end
- --print("ending copy to shared\n")
- --print_code()
-end
-
-function unroll_to_depth(max_depth)
- -- Unrolls up to max_depth loop levels of every statement that contains the last thread index.
- -- Phase 1 records, in common_loops, the index names (other than bx/by/tx/ty and empty names)
- -- that a statement shares with a later statement at levels numbered below its "tx" level.
- -- Phase 2 repeatedly walks all statements, collects the levels returned by next_clean_level()
- -- starting from the guard (last thread) index, and calls unroll(stmt, level, 0) on them from the
- -- last collected level back to the first, until a pass no longer changes num_statements().
- -- NOTE(review): nothing in this function is declared `local`, so every variable is a global.
- --print(string.format("\n\nunroll_to_depth(%d)", max_depth ))
- --print "SYNC UP"
-
- cur = cur_indices(0)
- thread_idxs = thread_indices()
- guard_idx = thread_idxs[#thread_idxs]
-
- --print(string.format("cur indices %s",list_to_string(cur)))
- --print(string.format("thread indices %s",list_to_string(thread_idxs)))
- --print(string.format("#thread_idxs = %d", #thread_idxs))
- --print(string.format("guard_idx = %s", guard_idx))
-
- ---- HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS
- -- NOTE(review): within this function common_loops and comm_loops_cnt are only read by the
- -- commented-out print that follows this phase.
- common_loops = {}
- comm_loops_cnt = 0
- num_stmts = num_statements()
- --print(string.format("num statements %d", num_stmts))
-
- for stmt=0,num_stmts-1 do
- cur_idxs = cur_indices(stmt)
-
- --print(string.format("\nSTMT %d Current Indices: %s",stmt,list_to_string(cur_idxs)))
-
- if(chk_cur_level(stmt,"tx")>0) then
- for ii=1,find_cur_level(stmt,"tx")-1 do -- started at 0
- --print(string.format("ii = %d", ii)) -- index starts at 1, what does index 0 do?
- --if cur_idxs[ii] == nil then print "cur_idxs[i]] is NIL"
- --else print(string.format("cur_idxs[%d] = '%s'", ii, cur_idxs[ii])) -- index starts at 1, what does index 0 do?
- --end
-
- if(cur_idxs[ii] ~= "bx" and cur_idxs[ii] ~= "by" and cur_idxs[ii] ~= nil and cur_idxs[ii] ~= "tx" and cur_idxs[ii] ~= "ty" and cur_idxs[ii] ~= "") then
-
- --print(string.format("id %s is not in the list", cur_idxs[ii] ))
-
- for stmt1=stmt+1,num_stmts-1 do
- --print(string.format("\nii %d stmt1 is %d", ii, stmt1))
- cur_idxs1 = cur_indices(stmt1)
- --print("\nstmt1 cur_idxs1 is "..list_to_string(cur_idxs1))
-
- --print(string.format("cur level(%d, %s) = %d", stmt, "tx", find_cur_level(stmt,"tx")))
-
- -- endrange is only used by the commented-out print below; the loop recomputes the bound itself
- endrange = find_cur_level(stmt,"tx")-1
- --print(string.format("for iii=1, %d do", endrange))
-
- -- the range for the later statement (stmt1) is bounded by the "tx" level of stmt, not of stmt1
- for iii=1,find_cur_level(stmt,"tx")-1 do -- started at 0
- --print(string.format("stmt %d ii %d iii %d ", stmt, ii, iii))
- --if(cur_idxs1[iii] ~= nil) then
- -- print(string.format("stmt %d ii %d iii %d cur_idxs1[%d] = '%s'", stmt, ii, iii, iii, cur_idxs1[iii]))
- --else
- -- print(string.format("stmt %d ii %d iii %d cur_idxs1[%d] = NIL", stmt, ii, iii, iii))
- --end
-
- if(cur_idxs1[iii] ~= "bx" and cur_idxs1[iii] ~= "by" and cur_idxs1[iii] ~= nil and cur_idxs1[iii] ~= "tx" and cur_idxs1[iii] ~= "ty" and cur_idxs1[iii] ~= "") then
- if(cur_idxs[ii] == cur_idxs1[iii]) then
- --print("\nfound idx:"..cur_idxs[ii])
- --if(comm_loops_cnt == 0) then print "\n\n*** WARNING *** assigning to array index ZERO in Lua" end
- common_loops[comm_loops_cnt] = cur_idxs[ii]
- --print(string.format("cl[%d] = '%s'", comm_loops_cnt, common_loops[comm_loops_cnt]))
- comm_loops_cnt = comm_loops_cnt + 1
- end
- end
- end
- end
- end
- end
- end
- end
- ----
- --if(comm_loops_cnt>0) then
- -- print("\n COMM LOOPS :TOTAL "..comm_loops_cnt..", and are "..list_to_string(common_loops).." this loop :"..common_loops[0])
- --else
- -- print "UNROLL can't unroll any loops?"
- --end
-
-
-
-
- -- Phase 2: keep making passes over the statements until a pass leaves num_statements() unchanged
- repeat
- old_num_stmts = num_statements()
- --print(string.format("old_num_statements %d", old_num_stmts))
-
- for stmt=0,old_num_stmts-1 do
- cur_idxs = cur_indices(stmt)
- --print(string.format("stmt %d cur_idxs = %s", stmt, list_to_string(cur_idxs)))
- if(#cur_idxs > 0) then
- gaurd_level = -1
- if(chk_cur_level(stmt,guard_idx)>0) then
- gaurd_level = find_cur_level(stmt,guard_idx)
- end
- --print(string.format("guard_level(sp) = %d", gaurd_level))
-
- if(gaurd_level>-1) then
- level = next_clean_level(cur_idxs,gaurd_level)
- --print(string.format("next clean level %d", level))
-
- --need to handle max_depth
- num_unrolled = 0
- level_unroll_comm = level
- level_arr = {}
- -- collect at most max_depth levels; level_arr is filled starting at index 0
- while level >= 0 do
- --print(string.format("while: level = %d", level))
-
- if num_unrolled == max_depth then break end
- --print("Unrolling "..stmt.." at level "..(level).." index ".. cur_idxs[gaurd_level+1])
-
- level_arr[num_unrolled] = level
- num_unrolled = num_unrolled + 1
-
- -- NOTE(review): this assigns `guard_level`, while the rest of the function uses the
- -- spelling `gaurd_level`; the value stored here is not read anywhere in this function.
- guard_level = find_cur_level(stmt,guard_idx)
- level = next_clean_level(cur_idxs,level+1)
- end
- --dies print("How many levels for unroll commands"..table.getn(level_arr).." which is "..level_arr[0].." and "..level_arr[#level_arr])
- --if(table.getn(level_arr) ~= nil) then
-
- --print "OK, NOW WE UNROLL"
-
- if(level_unroll_comm >= 0)then
- -- NOTE(review): level_arr starts at index 0, and table.getn presumably counts from index 1,
- -- so it would yield the highest used index here. With max_depth == 0 level_arr would be
- -- empty and level_arr[0] nil — confirm both.
- for i = table.getn(level_arr),0,-1 do
- --print(string.format("\ni=%d", i))
- --print(string.format("[Unroll]unroll(%d, %d, 0)",stmt, level_arr[i]))
-
- unroll(stmt,level_arr[i],0)
- --print("finished unroll]]\n")
- --print_code()
- end
- end
-------
- end
---[[
-
-THERE WAS A BIG BLOCK OF COMMENTED OUT CODE HERE
-
-
---]]
-------
- end
- end
- new_num_stmts = num_statements()
-
- until old_num_stmts == new_num_stmts
-
-end
-
-
diff --git a/examples/cuda-chill/cudaize.py b/examples/cuda-chill/cudaize.py
deleted file mode 100755
index ffef009..0000000
--- a/examples/cuda-chill/cudaize.py
+++ /dev/null
@@ -1,1047 +0,0 @@
-#! /usr/bin/python
-
-# THIS IS CUDAIZE.PY
-
-import chill
-import sys
-import math
-
-# Tile-method selectors handed to the tile calls as their last argument; tile_by_index() falls
-# back to 1 ("counted") when no method is given.
-# NOTE(review): 0 is presumably the strided variant — confirm against the chill module.
-strided = 0
-counted = 1
-
-# Ask chill to print the current code, then emit an empty line and flush stdout.
-# NOTE(review): the flush presumably keeps Python output ordered with chill's own output — confirm.
-def print_code():
- chill.print_code()
- print ""
- sys.stdout.flush()
-
-
-def table_contains_key( table, key ): # use a dict for the 'table'?
- return table.has_key(key) # (key in table)?
-
-def print_array( arr ): # a useful function to mimic lua output
- for a in arr[:-1]:
- print "%s," % a,
- print "%s" % arr[-1]
- sys.stdout.flush()
-
-def valid_indices( statement, indices ):
- #print "valid_indices() python calling C cur_indices"
- #print statement
- cur = chill.cur_indices(statement) # calls C
- #print "python valid_indices(), cur = ",
- #print cur
- #print "indices = ",
- #print indices
-
- for index in indices:
- if not index in cur:
- return False
- return True
-
-def next_clean_level( indices_at_each_level, level):
- #print "next_clean_level( ..., %d )" % level
- #print "indices_at_each_level ",
- print_array( indices_at_each_level )
-
- numlevels = len(indices_at_each_level)
- #print "loop to %d" % numlevels
- for i in range(level+1, numlevels+1):
- pythoni = i-1 # LUA index starts at 1
- #print "Checking level %d = '%s'" % (i, indices_at_each_level[pythoni])
- sys.stdout.flush()
- if len(indices_at_each_level[pythoni]) > 0: # LUA INDEX STARTS AT 1
- #print "returning %d" % i
- return i # MATCH lua return value, LUA index starts at one
- return -1 # no non-dummy indices
-
-
-
-
-def build_order( final_order, tile_index_names, control_index_names, tile_index_map, current_level):
- order = []
- #print "\nbuild_order()"
- #print "build_order(): final_order = (",
- count = 0
- for f in final_order:
- #if count+1 == len(final_order):
- # print "%s )" % f
- #else:
- # print "%s," % f ,
- count += 1
-
- keys = control_index_names.keys()
- keys.sort()
- #if (2 == len(keys)):
- # print "build_order(): ctrl_idx_names = (%s, %s)" % (control_index_names[0], control_index_names[1])
- #else:
- # print "build_order(): ctrl_idx_names = (%s" % control_index_names[0],
- # for k in keys[1:]:
- # print ", %s" % control_index_names[k],
- # print ")"
-
- #print control_index_names
- #print "cur_level %d" % current_level
-
- #print "tile index map: ",
- #print tile_index_map
-
-
- for i in range(len(final_order)):
- k = final_order[i] # not used?
- skip = False
- cur = final_order[i]
- # control loops below our current level should not be in the current order
-
- # skip = cur in control_index_names[current_level+2:]
- #print "\n%d control_index_names, " % len(control_index_names)
- #print control_index_names
-
- for j in range(current_level+1, len(control_index_names)):
- #print "comparing cur %s with cin[%d] %s" % ( cur, j, control_index_names[j])
- if control_index_names[j] == cur:
- skip = True
- #print "SKIP %s " % cur
-
- # possibly substitute tile indices if necessary
- if tile_index_map.has_key(cur):
- approved_sub = False
- sub_string = tile_index_map[cur]
- #print "sub_string = ",
- #print sub_string
-
- # approved_sub = sub_string in tile_index_names[current_level+2:]
- for j in range(current_level+1, len(tile_index_names)):
- if tile_index_names[j] == sub_string:
- approved_sub = True
- if approved_sub:
- cur = sub_string
-
- if not skip:
- order.append( cur)
- #print "build_order() returning order (",
- #print order
- #for o in order:
- # print "%s," % o,
- #print ")"
- return order
-
-def find_cur_level( stmt, idx ):
- #print "find_cur_level(stmt %d, idx %s) Cur indices" % ( stmt, idx ),
-
- cur = chill.cur_indices(stmt)
- #for c in cur[:-1]:
- # print "%s," % c,
- #print "%s" % cur[ -1 ]
-
- index = 1 # lua starts indices at 1 !!
- for c in cur:
- if c == idx:
- #print "found it at index %d" % index
- #sys.stdout.flush()
- #print "in find_cur_level, returning ",
- #print index
- return index
- index += 1
- #print "find_cur_level(), Unable to find index %s in" % idx,
- #print cur
- #print "in find_cur_level, returning -1"
- return -1 # special meaning "it's not there"
-
-def chk_cur_level( stmt, idx ):
- # search cur_indices for a ind at stmt
- cur = chill.cur_indices(stmt)
- if idx in cur:
- return 1 + cur.index(idx) # lua index starts at 1 !
- return -1
-
-def find_offset( cur_order, tile, control):
- #print "Looking for tile '%s' and control '%s' in (" % (tile, control),
- #print cur_order
- #for o in cur_order:
- # print "%s," % o,
- #print ")"
-
- idx1 = -1
- idx2 = -1
- if tile in cur_order:
- idx1 = 1 + cur_order.index(tile) # lua indexes from 1!
- else:
- print "find_offset(), unable to find tile %s in current list of indices" % tile
- sys.exit(-1)
-
- if control in cur_order:
- idx2 = 1 + cur_order.index(control) # lua indexes from 1!
- else:
- print "find_offset(), unable to find control %s in current list of indices" % control
- sys.exit(-1)
-
- #print "found at level %d and %d" % ( idx2, idx1 )
- # this appears horrible
- if idx2 < idx1:
- return idx2-idx1+1 # bad ordering
- else:
- return idx2-idx1
-
-
-
-# Tile the loops of statement 0 named in tile_indices and permute the result after each tile.
-#   tile_indices -- names of the indices to tile; all must be current indices of statement 0
-#   sizes        -- sizes[i] is passed to chill.tile7 for tile_indices[i]
-#   index_names  -- (key, name) pairs; key "l<N>_control" names the control loop of tile level N,
-#                   key "l<N>_tile" renames the tile loop of level N (N counts from 1)
-#   final_order  -- index names in the wanted final order, filtered through build_order()
-#   tile_method  -- passed to chill.tile7; None is replaced by 1 ("counted")
-# Exits the process with status -1 when tile_indices contains a name that is not current.
-def tile_by_index( tile_indices, sizes, index_names, final_order, tile_method):
- #print "STARTING TILE BY INDEX"
- #print "tile_by_index() tile_method ",
- #print tile_method
- #print "index_names: ",
- #print index_names
-
- stmt = 0 # assume statement 0
- if not valid_indices( stmt, tile_indices):
- print "python tile_by_index() one or more of ",
- print tile_indices,
- print " is not valid"
- sys.exit(-1)
-
- if tile_method == None:
- #print "CREATING tile_method = 1"
- tile_method = 1 # "counted"
-
- tile_index_names = []
- for ti in tile_indices:
- tile_index_names.append( ti ) # make a copy?
- #print "tile_index_names:",
- #print tile_index_names
-
- control_index_names = {} # a dictionary?
- tile_index_map = {}
-
- #print "index_names: "
- #print index_names
-
- # Parse the "l<N>_control" / "l<N>_tile" keys; N is 1-based, storage is keyed by N-1
- for pair in index_names:
- valid = False
- control = pair[0]
- name = pair[1]
- #print "control %s name %s" % ( control, name )
-
- # NOTE(review): control[1] raises IndexError for a one-character key — confirm keys are always longer
- if control[0] == "l" and control[1].isdigit():
- if control.endswith("_control"):
- index = int(control[1: -8])
- control_index_names[index-1] = name
- valid = True
-
- elif control.endswith("_tile"):
- index = int(control[1: -5])
- #print "index %d" % index
- tile_index_names[index-1] = name # ??
- tile_index_map[name] = tile_indices[index-1]
- valid = True
- # an unrecognised key is only reported; processing continues with the next pair
- if not valid:
- print "%s is not a proper key for specifying tile or control loop indices\n" % control
-
- #print "control_index_names = ",
- #print control_index_names
-
- #print "tile_index_names = ",
- #print tile_index_names
-
- #print "before call to build_order(), tile_index_map = ",
- #print tile_index_map
-
-
- # filter out control indices (and do name substitution of unprocessed tile indices) for a given level
- # NOTE(review): build_order() is given tile_indices (the original names), not the renamed
- # tile_index_names list built above — looks deliberate given tile_index_map's values, but confirm
- cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1)
-
- #print "returned from build_order python\n\n"
-
- # print("permute("..stmt..", {"..list_to_string(cur_order).."})")
- #print "permute(%d, {" % stmt,
- #print "cur_order = ",
- #print cur_order,
- #print "})"
-
- # chill.permute takes one tuple: the statement number followed by the index order
- cur_order.insert(0, stmt)
- #print cur_order
- chill.permute( tuple( cur_order))
- #print "in cudaize.py, returned from C code chill.permute()\n"
-
- for i in range(len(tile_indices)):
- cur_idx = tile_indices[i]
- #print "i %d cur_idx %s calling build order ********" % (i, cur_idx)
- cur_order = build_order( final_order, tile_indices, control_index_names, tile_index_map, i)
- #print "cur_idx %s return from build order" % cur_idx
-
- # Find an offset between tile loop and control loop
- # 0 = control loop one level above tile loop
- # -1 = control loop two levels above tile loop
- # > 0 = tile loop above control loop
- # In the last case, we do two extra tile commands to get the control
- # above the tile and then rely on the final permute to handle the
- # rest
- level = find_cur_level(stmt,cur_idx)
- #print "level %d\n" % level
-
- # NOTE(review): control_index_names[i] raises KeyError when no "l<i+1>_control" pair was supplied
- offset = find_offset(cur_order, tile_index_names[i], control_index_names[i])
- #print "offset %d" % offset
-
- if offset <= 0:
- #print "[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method )
- chill.tile7( stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method )
- #print "in cudaize.py, returned from C code chill.tile7\n"
-
- else:
- #print "2tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method )
- chill.tile7( stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method ) # regular level
-
- # flip and tile control loop
- #print "3tile(%d, %d, %d)" % ( stmt, level+1, level+1)
- chill.tile3( stmt, level+1, level+1)
-
- #print "4tile(%d, %d, %d)" % ( stmt, level+1, level)
- chill.tile3( stmt, level+1, level)
-
- #print_code()
-
- # Do permutation based on cur_order
- #print("permute based on build order calling build_order()")
- cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i)
-
- #print("permute based on build order return from build_order()")
-
- # print("permute("..stmt..", {"..list_to_string(cur_order).."})")
- # topermute is the same list object as cur_order, so the insert below modifies cur_order too
- topermute = cur_order
- topermute.insert(0, stmt)
- chill.permute( tuple(topermute) )
- #print "\nafter permute(), code is:"
- #print_code()
-
-def normalize_index( index ):
- #print "in cudaize.py, normalize_index( %s )" % index
- stmt = 0 # assume stmt 0
- l = find_cur_level( stmt, index )
- chill.tile3( stmt, l, l )
-
-def is_in_indices( stmt, idx):
- cur = chill.cur_indices(stmt)
- return idx in cur
-
-# Prepare statement 0 and call chill.datacopy_privatized for `array_name` at loop `start_loop`.
-#   start_loop -- index name; its level decides which block/thread indices are held constant
-#   array_name -- passed through to chill.datacopy_privatized
-# Steps, in order: locate the "tx"/"ty" levels, issue chill.tile3 calls on the levels that follow
-# "ty", collect the block and thread indices whose level is >= the level of start_loop, and pass
-# (stmt, start_loop, array_name, count, held indices...) to chill.datacopy_privatized as one tuple.
-# Levels returned by find_cur_level() count from 1 (Lua style) while `cur` is a 0-based Python
-# sequence, which is why offsets below are shifted by one in several places.
-def copy_to_registers( start_loop, array_name ):
- #print "\n\n****** starting copy to registers"
- #sys.stdout.flush()
-
- stmt = 0 # assume stmt 0
- cur = chill.cur_indices(stmt) # calls C
- table_Size = len(cur)
-
- #print "Cur indices",
- #print_array(cur)
- #print "\nThe table size is %d" % table_Size
- #count=1
- #for c in cur:
- # print "%d\t%s" % (count,c)
- # count += 1
-
- #print_code()
-
- # would be much cleaner if not translating this code from lua!
- level_tx = -1
- level_ty = -1
- if is_in_indices(stmt,"tx"):
- level_tx = find_cur_level(stmt,"tx")
- if is_in_indices(stmt,"ty"):
- level_ty = find_cur_level(stmt,"ty")
- #print "level_tx %d level_ty %d" % ( level_tx, level_ty )
- #sys.stdout.flush()
-
- ty_lookup_idx = ""
- org_level_ty = level_ty
-
- # UGLY logic. Lua index starts at 1, so all tests etc here are off by 1 from the lua code
- # level_ty initializes to -1 , which is not a valid index, and so there is added code to
- # make it not try to acccess offset -1. -1 IS a valid python array index
- # to top it off, the else below can assign a NIL to ty_lookup_idx!
- # NOTE(review): level_ty counts from 1, so cur[level_ty] is the entry AFTER "ty"; this would
- # raise IndexError when "ty" is the last index. The else branch also runs when level_ty is -1,
- # reading cur[-2]. Confirm both are intended.
- if level_ty != -1 and cur[level_ty] != "":
- #print "IF cur[%d] = %s" % ( level_ty, cur[level_ty] )
- ty_lookup_idx = cur[level_ty]
- else:
- #print "ELSE ty_lookup_idx = cur[%d] = %s" % ( level_ty, cur[level_ty-1])
- ty_lookup_idx = cur[level_ty-1]
- #print "ty_lookup_idx '%s'" % ty_lookup_idx
-
- if level_ty > -1:
- #print "\ntile3(%d,%d,%d)" % (stmt,level_ty,level_tx+1)
- chill.tile3(stmt,level_ty,level_tx+1)
- #print_code()
-
- # re-read the indices: the tile3 call above may have changed them
- cur = chill.cur_indices(stmt) # calls C
- table_Size = len(cur)
- #print "Cur indices ",
- #for c in cur:
- # print "%s," % c,
- #print "\nThe table size is %d" % len(cur)
- #count=1
- #for c in cur:
- # print "%d\t%s" % (count,c)
- # count += 1
- #sys.stdout.flush()
-
- if is_in_indices(stmt,"tx"):
- level_tx = find_cur_level(stmt,"tx")
- if ty_lookup_idx != "": # perhaps incorrect test
- if is_in_indices(stmt,ty_lookup_idx):
- level_ty = find_cur_level(stmt,ty_lookup_idx)
-
- ty_lookup = 1
- idx_flag = -1
- # find the level of the next valid index after ty+1
- #print "\nlevel_ty %d" % level_ty
- if level_ty > -1:
- #print "table_Size %d" % table_Size
- for num in range(-1 + level_ty+ty_lookup,table_Size): # ?? off by one?
- #print "num=%d cur[num] = '%s'" % (num+1, cur[num]) # num+1 is lua index ????
- sys.stdout.flush()
- if cur[num] != "":
- idx_flag = find_cur_level(stmt,cur[num])
- #print "idx_flag = %d" % idx_flag
- break
-
- #print "\n(first) I am checking all indexes after ty+1 %s" % idx_flag
- #print_code()
- #print ""
-
- # how_many_levels starts at 1 and gains one for every non-empty index from level `startat` on
- how_many_levels = 1
-
- #print "idx_flag = %d I will check levels starting with %d" % (idx_flag, idx_flag+1)
- # lua arrays start at index 1. the next loop in lua starts at offset 0, since idx_flag can be -1
- # thus the check for "not equal nil" in lua (bad idea)
- # python arrays start at 0, so will check for things that lua doesn't (?)
- startat = idx_flag + 1
- if idx_flag == -1:
- startat = 1 # pretend we're lua for now. TODO: fix the logic
-
- for ch_lev in range(startat,table_Size+1): # logic may be wrong (off by one)
- #print "ch_lev %d" % ch_lev
- if ch_lev <= table_Size and cur[ch_lev-1] != "":
- #print "cur[%d] = '%s'" % ( ch_lev, cur[ch_lev-1] )
- how_many_levels += 1
-
- #print "\nHow Many Levels %d" % how_many_levels
- sys.stdout.flush()
- sys.stdout.flush()
-
- # NOTE(review): the while loop ends only once a pass of the for loop below leaves idx_flag at -1;
- # cur, table_Size, level_tx and level_ty are reassigned inside the for loop while it is iterating
- if how_many_levels< 2:
- while( idx_flag >= 0):
- for num in range(level_ty+ty_lookup,table_Size+1):
- #print "at top of loop, num is %d" % num
- #print "cur[num] = '%s'" % cur[num-1]
- if cur[num-1] != "":
- idx = cur[num-1]
- #print "idx '%s'" % idx
- sys.stdout.flush()
- curlev = find_cur_level(stmt,idx)
- #print "curlev %d" % curlev
-
- #print "\n[COPYTOREG]tile(%d,%d,%d)"%(stmt,curlev,level_tx)
-
- chill.tile3(stmt, curlev, curlev)
- curlev = find_cur_level(stmt,idx)
- #print "curlev %d" % curlev
- chill.tile3(stmt,curlev,level_tx)
- #print "hehe '%s'" % cur[num-1]
-
- cur = chill.cur_indices(stmt)
- #print "Cur indices INSIDE",
- #for c in cur:
- # print "%s," % c,
- table_Size = len(cur)
- #print "\nTable Size is: %d" % len(cur)
-
- level_tx = find_cur_level(stmt,"tx")
- #print "\n level TX is: %d" % level_tx
- level_ty = find_cur_level(stmt,ty_lookup_idx)
- #print "\n level TY is: %d" %level_ty
- idx_flag = -1
- #print "idx_flag = -1"
-
-
- #- find the level of the next valid index after ty+1
- #- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?)
- for num2 in range( -1 + level_ty+ty_lookup ,table_Size): # lua starts index at one
- #print "num mucking num = %d" % num2
- if(cur[num2] != ""):
- #print "cur[%d] = '%s'" % ( num2, cur[num2] )
- idx_flag = find_cur_level(stmt,cur[num2])
- #print("\n(second) I am checking all indexes after ty+1 %s",cur[num2])
- break
-
- #print "num mucked to %d idx_flag = %d" % (num, idx_flag)
-
- #print "at bottom of loop, num is %d" % num
-
- #print "done with levels"
-
- # this was a block comment ???
-
-# for num in range(level_ty+1, table_Size+1):
-# print "num %d" % num
-# if cur[num-1] != "":
-# idx_flag = find_cur_level(stmt,cur[num-1]) ## ugly
-# print "idx_flag = %d" % idx_flag
-
- # change this all to reflect the real logic which is to normalize all loops inside the thread loops.
-# print "change this all ...\n"
-# print "level_ty+1 %d table_Size-1 %d idx_flag %d" %( level_ty+1, table_Size-1, idx_flag)
-# sys.stdout.flush()
-# sys.stdout.flush()
-
-# while level_ty+1 < (table_Size-1) and idx_flag >= 0:
-# print "*** level_ty %d" % level_ty
-# for num in range(level_ty+2,table_Size+1): # lua for includes second value
-# print "num %d cur[num] %s" % (num, cur[num])
-# if cur[num] != "":
-# idx = cur[num]
-# print "idx='%s'" % idx
-# #print_code()
-
-
-
-
- #print "ARE WE SYNCED HERE?"
- #print_code()
-
- # [Malik] end logic
- start_level = find_cur_level(stmt, start_loop) # start_loop was passed parameter!
-
- # We should hold constant any block or tile loop
- block_idxs = chill.block_indices()
- thread_idxs = chill.thread_indices()
- #print"\nblock indices are"
- #for index, val in enumerate(block_idxs):
- # print "%d\t%s" % ( int(index)+1 , val )
- #print"\nthread indices are"
- #for index, val in enumerate(thread_idxs):
- # print "%d\t%s" % ( int(index)+1 , val )
- #print "\nStart Level: %d" % start_level
-
- # hold_constant gets every block index, then every thread index, whose level is >= start_level
- hold_constant = []
- #print("\n Now in Blocks")
- for idx in block_idxs:
- blocklevel = find_cur_level(stmt,idx)
- if blocklevel >= start_level:
- hold_constant.append(idx)
- #print "\nJust inserted block %s in hold_constant" %idx
-
- #print("\n Now in Threads")
- for idx in thread_idxs:
- blocklevel = find_cur_level(stmt,idx)
- if blocklevel >= start_level:
- hold_constant.append(idx)
- #print "\nJust inserted thread %s in hold_constant" %idx
- #print "\nhold constant table is: "
- #for index, val in enumerate(hold_constant):
- # print "%d\t%s" % ( int(index)+1 , val )
-
- #print("\nbefore datacopy pvt")
- # NOTE(review): named old_num_stmts here, but the commented-out unroll loop after this function
- # body refers to old_num_statements; the value is not read by any live code in this function
- old_num_stmts = chill.num_statements()
- #sys.stdout.flush()
-
- #print "\n[DataCopy]datacopy_privatized(%d, %s, %s, " % (stmt, start_loop, array_name),
- #print hold_constant,
- #print ")"
- # argument tuple: stmt, start_loop, array_name, number of held indices, then the held indices
- passtoC = [stmt, start_loop, array_name ] # a list
- passtoC.append( len(hold_constant ) )
- for h in hold_constant:
- passtoC.append( h )
- chill.datacopy_privatized( tuple( passtoC ))
- sys.stdout.flush()
- sys.stdout.flush()
-
- new_num_statements = chill.num_statements()
- #print "new num statements %d" % new_num_statements
-
- # Unroll to the last thread level
-# for stmt in range(old_num_statements, new_num_statements):
-# print "unrolling statement %d" % stmt
-# level = find_cur_level(stmt,thread_idxs[-1]) #get last thread level
-# print "level is %d" % level
-# idxs = chill.cur_indices(stmt)
-# if level < len(idxs):
-# chill.unroll(stmt,level+1,0)
-
-
-
-def copy_to_shared( start_loop, array_name, alignment ):
- #print "\nstarting copy to shared( %s, %s, %d)" % (start_loop, array_name, alignment )
- #print "copy_to_shared( %s, %s, %d) in cudaize.py" % ( start_loop, array_name, alignment )
- stmt = 0 # assume statement 0
-
- cur = chill.cur_indices(stmt)
- #print "Cur indices ",
- #print_array( cur )
-
- start_level = find_cur_level( stmt, start_loop )
- #print "start_level %d" % start_level
-
- old_num_statements = chill.num_statements()
- #print "old_num_statements %d" % old_num_statements
-
-
- # Now, we give it indices for up to two dimensions for copy loop
- copy_loop_idxs = ["tmp1","tmp2"]
- #chill.datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs, False, 0, 1, alignment,True)
- passtoC = [stmt, start_level, array_name] # a list
- passtoC.append( len(copy_loop_idxs))
- for i in copy_loop_idxs:
- passtoC.append(i)
- passtoC.append( 0 ) # False
- passtoC.append( 0 )
- passtoC.append( 1 )
- passtoC.append( alignment )
- passtoC.append( 1 ) # True
- #print "\n[DataCopy]datacopy( ",
- #print passtoC,
- #print ")"
-
- #if array_name == "b":
- # chill.cheat(1)
- #if array_name == "c":
- # chill.cheat(2)
-
- chill.datacopy_9arg( tuple( passtoC ))
-
- #print "back from datacopy_9arg\n\n\n"
- #sys.stdout.flush()
-
-
- #print "calling add_sync( %d, %s )" % ( stmt, start_loop )
- chill.add_sync( stmt, start_loop )
- #print "back from add_sync()\n\n"
-
- new_num_statements = chill.num_statements()
-
- # This is fairly CUBLAS2 specific, not sure how well it generalizes,
- # but for a 2D copy, what we want to do is "normalize" the first loop
- # "tmp1" then get its hard upper bound. We then want to tile it to
- # make the control loop of that tile "ty". We then tile "tmp2" with a
- # size of 1 and make it "tx".
-
- #print "fairly CUBLAS2 specific, OLD %d NEW %d" % ( old_num_statements, new_num_statements)
- sys.stdout.flush()
- sys.stdout.flush()
-
- for stmt in range(old_num_statements, new_num_statements):
- #print "for stmt = %d" % stmt
- level = find_cur_level( stmt, "tmp2")
- #print "FOUND CUR LEVEL? level '",
- #print level,
- #print "'"
-
- #print "in loop, stmt %d level %d" % ( stmt, level )
- if level != -1:
- #print "\nCopy to shared: [If was no error]\n"
- find_cur_level(stmt,"tmp2")
- chill.tile3( stmt, level, level )
-
- #print "hard_loop_bounds( %d, %d )" % (stmt, level)
- bounds = chill.hard_loop_bounds(stmt, level)
- lower = bounds[0]
- upper = 1+ bounds[1]
- #print "lower %d upper %d" % ( lower, upper )
-
- dims = chill.thread_dims()
- #print "in cudaize.py copy_to_shared, dims =",
- #print dims
- tx = dims[0]
- ty = dims[1]
- #print "2-loop cleanup: lower, upper: %d, %d, tx: %d" % ( lower, upper, tx)
-
- level = find_cur_level(stmt,"tmp1")
- #print "level %d" % level
- if tx == upper and ty == 1:
- #print "tx = %d upper = %d ty = %d"% (tx, upper, ty)
- #print "Don't need"
-
- # Don't need an extra tile level, just move this loop up
- second_level = find_cur_level(stmt,"tmp2")
- chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted)
-
- else:
- #print "DO need?"
- if ty == 1:
- new_ctrl = "tmp3"
- else:
- new_ctrl = "ty"
-
- # LOTS of commented out code here in cudaize.lua
-
- #print_code()
- #print "\nStarting tmp2\n"
- first_level = find_cur_level(stmt,"tmp1")
- second_level = find_cur_level(stmt,"tmp2")
- bounds = chill.hard_loop_bounds(stmt, second_level)
- lower = bounds[0]
- upper = 1 + bounds[1] # BROKEN?
-
- #print "[Malik]-loop cleanup@tmp2: lower, upper: %d, %d, tx: %d,first level:%d,second_level:%d" % ( lower, upper-1, tx, first_level, second_level)
-
- # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
- #print "\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, second_level,1,first_level, "tx", "tx")
- chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted)
- #print_code()
-
- first_level = find_cur_level(stmt,"tmp1")
- bounds = chill.hard_loop_bounds(stmt, first_level)
- lower_1 = bounds[0]
- upper_1 = 1 + bounds[1]
- tx_level = find_cur_level(stmt,"tx")
- bounds = chill.hard_loop_bounds(stmt,tx_level)
- lower_tx = bounds[0]
- upper_tx = 1+bounds[1]
- #print "UL_1 %d %d UL_tx %d %d" % ( lower_1, upper_1-1, lower_tx, upper_tx-1)
-
- if int(math.ceil( float(upper_tx)/float(tx))) > 1:
- #print "ceil I say"
- #print "\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, tx_level,tx,tx_level, "tx", "tmp1")
- chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)
- #print_code()
-
- repeat = find_cur_level(stmt,"tx")
- #print "\n[Tile1]tile(%d, %d, %d)" % (stmt, repeat, repeat)
- chill.tile3(stmt, repeat, repeat) #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))
- #print_code()
-
- if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"):
- #print "\nagain [Tile1]tile(%d, %d, %d)" % (stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
- chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
- #print_code()
-
- #print_code()
-
- #print "\nStarting tmp1\n"
- # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
- chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))
- #print_code()
-
- ty_level = find_cur_level(stmt,"tmp1")
- bounds = chill.hard_loop_bounds(stmt,ty_level)
- lower_ty = bounds[0]
- upper_ty = 1 + bounds[1]
-
- tx_level = find_cur_level(stmt,"tx")
- bounds = chill.hard_loop_bounds(stmt,tx_level)
- lower_tx = bounds[0]
- upper_tx = 1 + bounds[1]
-
- #print "[Malik]-loop cleanup@tmp1: lowerty, upperty: %d, %d, ty: %d,ty level:%d,tx_level:%d, stmt: %d" % ( lower_ty, upper_ty-1, ty, ty_level, tx_level, stmt)
-
- #print "before ceil"
- #sys.stdout.flush()
-
- if(math.ceil(float(upper_ty)/float(ty)) > 1):
- #print "CEIL IF"
- #print "\n Inside upper_ty/ty > 1\n"
-
- #print "\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, ty_level,ty,ty_level, "ty", "tmp_ty")
- chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
- #print_code()
-
- #print "\n[Tile2-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt ,"ty"),find_cur_level(stmt,"ty"))
- chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))
- #print_code()
-
- cur_idxs = chill.cur_indices(stmt)
- #print "\n cur indexes are ",
- #print_array( cur_idxs)
- #sys.stdout.flush()
-
- # Putting ty before any tmp_tx
- idx_flag = -1
- if "tmp_tx" in cur_idxs:
- idx_flag = 1 + cur_idxs.index("tmp_tx") # lua index starts at 1
- #print "\n (1) so i have found out the value of idx flag as %d" % idx_flag
- #sys.stdout.flush()
-
- if idx_flag >= 0:
- if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"):
- #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- #print_code()
-
-
- # Now Putting ty before any tmp_ty
- sys.stdout.flush()
- idx_flag = -1
- if "tmp_ty" in cur_idxs:
- idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1
- #print "\n IF so i have found out the value of idx flag as %d" % idx_flag
- #sys.stdout.flush()
-
- if idx_flag >= 0:
- #print "one more test"
- sys.stdout.flush()
- if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"):
- #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- #sys.stdout.flush()
- chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- #print_code()
-
-
-
- else:
- #print "CEIL ELSE"
- #print "\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, ty_level,1,ty_level, "ty", "ty")
- #sys.stdout.flush()
- chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted )
- #print_code()
-
- #print "\n[Tile3-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
- sys.stdout.flush()
-
- chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
- #print_code()
-
-
- idx_flag = -1
- # LUA code checks to see if cur_idxs exists? it is unused except in the other clause of this is
- #if(cur_idxs) then
- #print "CAN NEVER GET HERE? cur_idxs"
- #for num= 0,table.getn(cur_idxs) do
- #if(cur[num] == "tmp_ty") then
- #idx_flag = find_cur_level(stmt,cur[num])
- #break
- #end
- #end
- print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag
- if idx_flag >= 0: # can't happen
- print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- #chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-
-
-
-
-
- #print "\n\n *** at bottom of if in copy to shared, "
- #print_code()
- #print "end of if"
-
- else:
- # copy to shared only created one level, not two, so we use a different approach (MV & TMV)
- #print "\nCopy to shared: [If was error]\n"
- level = find_cur_level(stmt,"tmp1")
- chill.tile3(stmt, level, level)
-
- dims = chill.thread_dims()
- #print dims
- tx = dims[0]
- ty = dims[1]
-
- bounds = chill.hard_loop_bounds(stmt, level)
- lower = bounds[0]
- upper = bounds[1]
-
- #print "bounds lower %d upper %d" % (lower, upper)
- upper = upper+1 # upper bound given as <=, compare to dimensions tx which is <
- if upper == tx:
- #print "upper == tx"
- chill.rename_index( stmt, "tmp1", "tx")
- else:
- #print "upper is not tx"
- #print "upper %d tx %d stmt: %d level: %d" % ( upper, tx, stmt, level)
- chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted)
- #print_code()
-
- #print "stmt:%d level+1: %d" % ( stmt, level+1)
- #print("TILE 7")
- chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted)
- #print("TILE 3")
- chill.tile3( stmt, level+1, level)
- #print_code()
-
-
- if ty > 1:
- #print "GOING IN"
- bounds = chill.hard_loop_bounds(stmt, level+1)
- lower = bounds[0]
- upper = bounds[1]
- #print "ty %d lower %d upper %d" % ( ty, lower, upper )
- floatdiv = float(upper)/float(ty)
- bound = int(math.ceil(float(upper)/float(ty)))
- #print "NOW FOR Y: upper %d ty %d stmt: %d level: %d bound: %d" % ( upper, ty, stmt, level+1, bound)
- chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted)
-
- # Always add sync
- chill.add_sync( stmt, start_loop )
- #print "ending copy to shared\n"
- #sys.stdout.flush()
- #print_code()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def unroll_to_depth( max_depth ):
- print "\n\nunroll_to_depth(%d)" % max_depth
- print "SYNC UP"
- sys.stdout.flush()
-
- cur = chill.cur_indices(0)
- thread_idxs = chill.thread_indices()
- guard_idx = thread_idxs[-1] # last one
-
- print "cur indices",
- print_array(cur)
- print "thread indices",
- print_array(thread_idxs)
- print "guard_idx = %s" % guard_idx
-
- #print "thread_idxs = ",
- #print thread_idxs
- guard_idx = thread_idxs[-1]
- #print "guard_idx = %s" % guard_idx
-
- # HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS
- common_loops = []
- comm_loops_cnt = 0
- num_stmts = chill.num_statements()
- print "num statements %d" % num_stmts
-
- for stmt in range(num_stmts):
- sys.stdout.flush()
- print "\nSTMT %d" % stmt,
- cur_idxs = chill.cur_indices(stmt)
- print "Current Indices:",
- for c in cur_idxs[:-1]:
- print "%s," % c,
- print "%s" % cur_idxs[-1] # last one
- sys.stdout.flush()
- #print_code()
-
- if chk_cur_level(stmt, "tx") > 0:
-
- for ii in range(find_cur_level(stmt,"tx")-1):
- print "ii = %d\ncur_idxs[%d] = '%s'" % (ii+1, ii+1, cur_idxs[ii]) # print to match lua
- id = cur_idxs[ii]
- if id not in ["bx", "by", "", "tx", "ty"]:
-
- print "id %s is not in the list" % id
-
- for stmt1 in range(stmt+1, num_stmts):
- print "\nii %d stmt1 is %d" % (ii+1, stmt1) # print to match lua
- cur_idxs1 = chill.cur_indices(stmt1)
- print "\nstmt1 cur_idxs1 is ",
- for ind in cur_idxs1[:-1]:
- print "%s," % ind,
- print "%s" % cur_idxs1[-1]
-
- print "cur level(%d, %s) = %d" % (stmt, "tx", find_cur_level(stmt,"tx") )
- sys.stdout.flush()
-
- endrange = find_cur_level(stmt,"tx")-1
- print "for iii=1, %d do" % endrange
- sys.stdout.flush()
- for iii in range(endrange): # off by one? TODO
- print "stmt %d ii %d iii %d\n" % (stmt, ii+1, iii+1),
- sys.stdout.flush()
-
- if iii >= len(cur_idxs1):
- print "stmt %d ii %d iii %d cur_idxs1[%d] = NIL" % (stmt, ii+1, iii+1, iii+1, ) # print to match lua
- else:
- print "stmt %d ii %d iii %d cur_idxs1[%d] = '%s'" % (stmt, ii+1, iii+1, iii+1, cur_idxs1[iii]) # print to match lua
- sys.stdout.flush()
-
- # this will still probably die
- if iii < len(cur_idxs1) and [iii] not in ["bx", "by", "tx", "ty", ""]:
- if cur_idxs[ii] == cur_idxs1[iii]:
- print "\nfound idx:%s" % cur_idxs[ii]
- common_loops.append(cur_idxs[ii])
- print "cl[%d] = '%s'" % ( comm_loops_cnt, cur_idxs[ii] )
- comm_loops_cnt = len(common_loops)
-
- if len(common_loops) > 0:
- print "\n COMM LOOPS :TOTAL %d, and are " % comm_loops_cnt,
- print common_loops,
- print " this loop : %s" % common_loops[0]
- else:
- print "UNROLL can't unroll any loops?"
-
-
- while True: # break at bottom of loop (repeat in lua)
- old_num_statements = chill.num_statements()
- print "old_num_statements %d" % old_num_statements
-
- for stmt in range(old_num_statements):
- cur_idxs = chill.cur_indices(stmt)
- print "stmt %d cur_idxs =" % stmt,
- index = 0
- for i in cur_idxs:
- index +=1
- if index == len(cur_idxs):
- print "%s" %i
- else:
- print "%s," % i,
-
- if len(cur_idxs) > 0:
- guard_level = -1
- if chk_cur_level(stmt, guard_idx) > 0:
- guard_level = find_cur_level(stmt,guard_idx)
- print "guard_level(sp) = %d" % guard_level
- if guard_level > -1:
- level = next_clean_level(cur_idxs,guard_level)
- print "next clean level %d" % level
-
-
- #print "looking at %d" % stmt
- #print "comparing %d and %d in" % (guard_level, level),
- #index = 0
- #for i in cur_idxs:
- #index +=1
- #if index == len(cur_idxs):
- # print "%s" %i
- #else:
- # print "%s," % i,
-
- # need to handle max_depth
- num_unrolled = 0
- level_unroll_comm = level
- level_arr = []
-
- #print "before while, level = %d" % level
- while level >= 0:
- print "while: level = %d" % level
- if num_unrolled == max_depth:
- break
-
- print "Unrolling %d at level %d index %s" % ( stmt, level, cur_idxs[guard_level]) # ???
- level_arr.append(level)
-
- guard_level = find_cur_level(stmt,guard_idx)
- level = next_clean_level(cur_idxs,level+1)
-
- print "OK, NOW WE UNROLL"
- if level_unroll_comm >= 0:
- level_arr.reverse()
- for i,lev in enumerate(level_arr):
- print "\ni=%d" % i
- print "[Unroll]unroll(%d, %d, 0)" % (stmt, lev)
- chill.unroll(stmt, lev, 0)
-
-
- new_num_statements = chill.num_statements()
- if old_num_statements == new_num_statements:
- break # exit infinite loop
-
-
-# all other calls to C have a routine in this file (?)
-def unroll( statement, level, unroll_amount ):
- chill.unroll( statement, level, unroll_amount )
-
diff --git a/examples/cuda-chill/mm.c b/examples/cuda-chill/mm.c
deleted file mode 100644
index 0efbeeb..0000000
--- a/examples/cuda-chill/mm.c
+++ /dev/null
@@ -1,10 +0,0 @@
-#define N 1024
-
-void normalMM(float c[N][N], float a[N][N], float b[N][N]) {
- int i, j, k;
-
- for (i = 0; i < N; i++)
- for (j = 0; j < N; j++)
- for (k = 0; k < N; k++)
- c[j][i] = c[j][i] + a[k][i] * b[j][k];
-}
diff --git a/examples/cuda-chill/mm.lua b/examples/cuda-chill/mm.lua
deleted file mode 100644
index 5bde1b0..0000000
--- a/examples/cuda-chill/mm.lua
+++ /dev/null
@@ -1,38 +0,0 @@
-init("mm.c", "normalMM", 0)
-dofile("cudaize.lua")
-N=1024
-Ti=128
-Tj=64
-Tk=16
-Tii=16
-Tjj=16
-
-
-
-
-N=1024
-
-
-
-
-
-
-
-
-
-
-
-
-
-tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k"})CU=1
-
-tile_by_index({"k"},{Tk},{l1_control="kk"},{"ii","jj","kk","i","j","k"})CU=3
-
-tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","kk","i","iii","j","jjj","k"},1)CU=2
-
-cudaize("mm_GPU",{a=1048576,b=1048576,c=1048576},{block={"ii","jj"}, thread={"i","j"}})CU=2
-copy_to_shared("tx","a",-16)
-copy_to_shared("tx","b",-16)
-copy_to_registers("kk","c")
---print_code()
-unroll_to_depth(2)
diff --git a/examples/cuda-chill/mpeg4.c b/examples/cuda-chill/mpeg4.c
deleted file mode 100755
index 7f83bf7..0000000
--- a/examples/cuda-chill/mpeg4.c
+++ /dev/null
@@ -1,23 +0,0 @@
-#define N1 4096
-#define N2 4096
-#define WINDOW_SIZE 16
-
-void mpeg4_cpu(float result[N1][N2], float prev[N2+WINDOW_SIZE][N2+WINDOW_SIZE], float curr[WINDOW_SIZE*WINDOW_SIZE])
-{
- unsigned int i;
- unsigned int j;
- unsigned int k;
- unsigned int l;
-
- for ( i = 0; i < N1; ++i)
- for ( j = 0; j < N2; ++j)
- for ( k = 0; k < WINDOW_SIZE; ++k)
- for ( l = 0; l < WINDOW_SIZE; ++l)
- result[i][j] += prev[i+k][j+l] * curr[k*WINDOW_SIZE+l];
-
-
-
-
-
-}
-
diff --git a/examples/cuda-chill/mpeg4.lua b/examples/cuda-chill/mpeg4.lua
deleted file mode 100644
index f025dc0..0000000
--- a/examples/cuda-chill/mpeg4.lua
+++ /dev/null
@@ -1,45 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("mpeg4.c", "mpeg4_cpu", 0)
-
---dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods
-
-N=4096
-M=4096
-W=16
-
---TI 4ust be <= M
---TJ must be <=TI
-Ti=32
-Tj=32
-Tii=16
-Tjj=16
-Tk=4
---permute(0,{"j","i","k","l"})
-tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k","l"})
---tile_by_index({"k","l"},{Tk*2,Tk*2},{l1_control="kk",l2_control="ll"},{"ii","jj","kk","ll","i","j","k","l"})
---print_code()
---tile_by_index({"k","l"},{Tk,Tk},{l1_control="kk",l2_control="ll"},{"ii","jj","i","j","kk","k","ll","l"})
-tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","iii","i","jjj","j","k","l"})
---print_code()
---normalize_index("j")
---normalize_index("i")
---print_code()
-cudaize("kernel_GPU",{curr=W*W,prev=(N+W)*(M+W),result=N*M},{block={"ii","jj"}, thread={"i","j"}})
---print_code()
-copy_to_shared("iii","prev",16)
-
-copy_to_registers("jjj","result")
-
---print_code()
---copy_to_constant_no_tile("curr")
-unroll_to_depth(2)
-print_code()
-print_space()
-
-
diff --git a/examples/cuda-chill/mriq-fh.c b/examples/cuda-chill/mriq-fh.c
deleted file mode 100755
index 1e924b7..0000000
--- a/examples/cuda-chill/mriq-fh.c
+++ /dev/null
@@ -1,38 +0,0 @@
-#define X 32768
-#define K 256
-struct kValues {
- float Kx;
- float Ky;
- float Kz;
- float PhiMag;
-};
-extern float sin(float);
-extern float cos(float);
-
-void mriFH_cpu(float *rPhi,float *rRho,float *iRho, float *iPhi, float *rD, float *iD, float *kx, float *ky, float *kz, float *dx, float *dy, float *dz, float *rFHref, float *iFHref)
-{
-
- float rfh;
- float ifh;
- float exp;
- float cArg;
- float sArg;
- //float rRho[K];
- //float iRho[K];
- unsigned int k;
- unsigned int x;
-
-
- for (x = 0; x < X; ++x) {
- for (k = 0; k < K; ++k) {
-
- exp = 2 * 3.14159 * (kx[k]* dx[x] + ky[k]* dy[x] + kz[k]* dz[x]);
- cArg = cos(exp);
- sArg = sin(exp);
- rFHref[x] += rRho[k]* cArg - iRho[k]* sArg;
- iFHref[x] += iRho[k]*cArg + rRho[k]*sArg;
- }
-
- }
-}
-
diff --git a/examples/cuda-chill/mriq-fh.lua b/examples/cuda-chill/mriq-fh.lua
deleted file mode 100755
index 3277bac..0000000
--- a/examples/cuda-chill/mriq-fh.lua
+++ /dev/null
@@ -1,73 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("mriq-fh.c", "mriFH_cpu", 0)
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-N=32768
-M=256
-Tx=256
-
-
-print_code()
---permute(0,{"j","i"})
---tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
-tile_by_index({"x"},{Tx},{l1_control="xx"},{"xx","x","k"})
---tile_by_index({"x"},{16},{l1_control="xx1"},{"xx","x","xx1","k"})
---tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
-print_code()
-
-normalize_index("x")
---normalize_index("i")
-print_code()
---tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
---print_code()
---cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
-cudaize("kernel_GPU",{dx=N,dy=N,dz=N,iRho=M,kx=M,ky=M,kz=M,rFHref=N,iFHref=N,rRho=M},{block={"xx"}, thread={"x"}})
---copy_to_shared("tx","iRho",-16)
---copy_to_shared("tx","dz",1)
---copy_to_shared("tx","rRho",-16)
---copy_to_registers("tx","rFHref")
---copy_to_registers("tx","rRho")
---copy_to_registers("tx","iRho")
---copy_to_registers("tx","kx")
---copy_to_registers("tx","dx")
---copy_to_registers("tx","ky")
---copy_to_registers("tx","dy")
---copy_to_registers("tx","kz")
---copy_to_registers("tx","dz")
---copy_to_registers("tx","iFHref")
---copy_to_texture("rRho")
---copy_to_texture("kx")
---copy_to_texture("dx")
---copy_to_texture("ky")
---copy_to_texture("dy")
---copy_to_texture("kz")
---copy_to_texture("dz")
---copy_to_texture("iRho")
---print_code()--]]
---unroll(0,4,0)
---copy_to_constant_no_tile("kx")
---copy_to_constant_no_tile("ky")
---copy_to_constant_no_tile("kz")
---copy_to_constant_no_tile("rRho")
---copy_to_constant_no_tile("iRho")
-
---unroll_to_depth(1)
-print_code()
---[[
-copy_to_Texture("rRho")
-copy_to_Texture("kx")
-copy_to_Texture("dx")
-copy_to_Texture("ky")
-copy_to_Texture("dy")
-copy_to_Texture("kz")
-copy_to_Texture("dz")
-copy_to_Texture("iRho")
---unroll_to_depth(2)
---]]
diff --git a/examples/cuda-chill/mriq.c b/examples/cuda-chill/mriq.c
deleted file mode 100644
index ba4b87c..0000000
--- a/examples/cuda-chill/mriq.c
+++ /dev/null
@@ -1,33 +0,0 @@
-#define N 32768
-#define M 3072
-struct kValues {
- float Kx;
- float Ky;
- float Kz;
- float PhiMag;
-};
-extern float sinf(float);
-extern float cosf(float);
-
-void
-ComputeQCPU(int numK, int numX,struct kValues kVals[M],float x[N], float y[N], float z[N],float Qr[N], float Qi[N]) {
- float expArg;
- float cosArg;
- float sinArg;
- float phi;
- int i;
- int j;
- numK = M;
- numX = N;
- for ( i = 0; i < M; i++) {
- for ( j = 0; j < N; j++) {
- expArg = 6.2831853071795864769252867665590058f * (kVals[i].Kx * x[j] +kVals[i].Ky * y[j] +kVals[i].Kz * z[j]);
- cosArg = cosf(expArg);
- sinArg = sinf(expArg);
- phi = kVals[i].PhiMag;
- Qr[j] += phi * cosArg;
- Qi[j] += phi * sinArg;
- }
- }
-}
-
diff --git a/examples/cuda-chill/mriq.lua b/examples/cuda-chill/mriq.lua
deleted file mode 100644
index 1170111..0000000
--- a/examples/cuda-chill/mriq.lua
+++ /dev/null
@@ -1,55 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("mriq.c", "ComputeQCPU", 0)
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-N=32768
-M=3072
-TI=128
-TJ=128
-
-permute(0,{"j","i"})
---tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
-tile_by_index({"i"}, {TJ}, {l1_control="ii",l1_tile="i"}, {"ii", "j","i"})
-tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
-
-normalize_index("j")
-normalize_index("i")
---print_code()
---tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
---print_code()
-cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
-
-copy_to_shared("tx","kVals",1)
---copy_to_shared("tx","x",1)
---copy_to_shared("tx","y",1)
---copy_to_shared("tx","z",1)
-
---copy_to_texture("kVals")
---datacopy(0, 3, "kVals", {"tt","t"},false,0,1,-16,true)
---print_code()
---datacopy_privatized(0,"tx","kVals",{"tx"})
---copy_to_registers("tx","kVals")
-copy_to_registers("ii","x")
-copy_to_registers("ii","y")
-copy_to_registers("ii","z")
-copy_to_registers("ii","Qi")
-copy_to_registers("ii","Qr")
---[[datacopy_privatized(0,"tx","x",{"tx"})
-datacopy_privatized(0,"tx","y",{"tx"})
-datacopy_privatized(0,"tx","z",{"tx"})
-datacopy_privatized(0,"tx","Qi",{"tx"})
-datacopy_privatized(0,"tx","Qr",{"tx"})
-
-
-]]--
---unroll(0,5,64)
-print_code()
---unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
diff --git a/examples/cuda-chill/mv-shadow.c b/examples/cuda-chill/mv-shadow.c
deleted file mode 100644
index 582b187..0000000
--- a/examples/cuda-chill/mv-shadow.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
- int i, j;
-
- for (i = 0; i < N; i++)
- for (j = 0; j < N; j++)
- a[i] = a[i] + c[j][i] * b[j];
-}
diff --git a/examples/cuda-chill/mv-shadow.lua b/examples/cuda-chill/mv-shadow.lua
deleted file mode 100644
index 43e8491..0000000
--- a/examples/cuda-chill/mv-shadow.lua
+++ /dev/null
@@ -1,65 +0,0 @@
-init("mv-shadow.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-N=129
-TI=32
-TJ=64
-
-N=1024
-TI=16
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
---Tile the i and j loop, introducing "ii" as the control loop for the "i"
---tile, "k" for the control loop fo the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("ii")
-normalize_index("i")
-print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-
---copy_to_shared("tx", "b", 1)
---copy_to_shared("tx", "c", -16)
---print_code()
---copy_to_texture("b")
---copy_to_texture("c")
-copy_to_registers("k", "a")
---print_code()
-
-unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
---copy_to_texture("b")
---print_code()
---unroll(0,5,0)
---print_code()
diff --git a/examples/cuda-chill/mv.c b/examples/cuda-chill/mv.c
deleted file mode 100644
index 582b187..0000000
--- a/examples/cuda-chill/mv.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
- int i, j;
-
- for (i = 0; i < N; i++)
- for (j = 0; j < N; j++)
- a[i] = a[i] + c[j][i] * b[j];
-}
diff --git a/examples/cuda-chill/mv.lua b/examples/cuda-chill/mv.lua
deleted file mode 100644
index ca54501..0000000
--- a/examples/cuda-chill/mv.lua
+++ /dev/null
@@ -1,65 +0,0 @@
-init("mv.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-N=129
-TI=32
-TJ=64
-
-N=1024
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
---Tile the i and j loop, introducing "ii" as the control loop for the "i"
---tile, "k" for the control loop fo the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("ii")
-normalize_index("i")
-print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
-
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-
---copy_to_shared("tx", "b", 1)
---copy_to_shared("tx", "c", -16)
---print_code()
---copy_to_texture("b")
---copy_to_texture("c")
-copy_to_registers("k", "a")
---print_code()
-
-unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
---copy_to_texture("b")
---print_code()
---unroll(0,5,0)
---print_code()
diff --git a/examples/cuda-chill/mv_try.c b/examples/cuda-chill/mv_try.c
deleted file mode 100644
index 7781f3b..0000000
--- a/examples/cuda-chill/mv_try.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 4096
-
-void normalMV(int n, float c[N][N], float a[N], float b[N]) {
- int i, j;
-
- for (i = 0; i < n; i++)
- for (j = 0; j < n; j++)
- a[i] = a[i] + c[i][j] * b[j];
-}
diff --git a/examples/cuda-chill/mv_try.lua b/examples/cuda-chill/mv_try.lua
deleted file mode 100644
index db4d9ad..0000000
--- a/examples/cuda-chill/mv_try.lua
+++ /dev/null
@@ -1,14 +0,0 @@
-init("mv_try.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-TI=96
-
-N=4096
-
-
-tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
-cudaize("mv_GPU", {a=N, b=N, c=N*N},
- {block={"ii"}, thread={"i"}})
-
-print_code()
diff --git a/examples/cuda-chill/nbody.c b/examples/cuda-chill/nbody.c
deleted file mode 100644
index 57899b6..0000000
--- a/examples/cuda-chill/nbody.c
+++ /dev/null
@@ -1,66 +0,0 @@
-#define NBODIES 16384
-#define SOFTENINGSQUARED 0.01f
-#define DELTATIME 0.001f
-#define DAMPING 1.0f
-
-#define NBLOCKSY 1
-#define NBLOCKSX (NBODIES/NTHREADSX)
-#define NTHREADSY 1
-#define NTHREADSX 64
-
-#define BLOCKSIZE 128
-
-#define SHARED 1
-#define TIMER 1
-#define VERIFY 1
-
-extern float sqrtf(float);
-
-void nbody_cpu(float* oldpos,float* oldpos1, float *newpos, float *oldvel, float *newvel, float *force)
-{
- float r0,r1,r2;
- float invDist, invDistCube, mass, invMass;
- unsigned int i,j;
- for(i = 0; i < NBODIES; ++i) {
- //force[i*4 ] = 0;
- //force[i*4+1] = 0;
- //force[i*4+2] = 0;
- //force[i*4+3] = 0;
- for(j = 0; j < NBODIES; ++j) {
- r0 = oldpos[j*4]-oldpos1[i*4];
- r1 = oldpos[j*4+1]-oldpos1[i*4+1];
- r2 = oldpos[j*4+2]-oldpos1[i*4+2];
-
- invDist = 1.0/sqrtf(r0 * r0 + r1 * r1 + r2 * r2 + SOFTENINGSQUARED);
- invDistCube = invDist * invDist * invDist;
- mass = oldpos1[i*4+3];
-
- force[i*4] = force[i*4] + r0 * mass * invDistCube;
- force[i*4+1] = force[i*4+1] + r1 * mass * invDistCube;
- force[i*4+2] = force[i*4+2] + r2 * mass * invDistCube;
-
- }
- }
-
-/* for (i = 0; i < NBODIES; ++i) {
- invMass = oldvel[4*i+3];
-
- oldvel[4*i] += (force[4*i] * invMass) * DELTATIME * DAMPING;
- oldvel[4*i+1] += (force[4*i+1] * invMass) * DELTATIME * DAMPING;
- oldvel[4*i+2] += (force[4*i+2] * invMass) * DELTATIME * DAMPING;
-
- oldpos[4*i] += oldvel[4*i] * DELTATIME;
- oldpos[4*i+1] += oldvel[4*i+1] * DELTATIME;
- oldpos[4*i+2] += oldvel[4*i+2] * DELTATIME;
-
- newpos[4*i+0] = oldpos[4*i];
- newpos[4*i+1] = oldpos[4*i+1];
- newpos[4*i+2] = oldpos[4*i+2];
- newpos[4*i+3] = oldpos[4*i+3];
-
- newvel[4*i+0] = oldvel[4*i];
- newvel[4*i+1] = oldvel[4*i+1];
- newvel[4*i+2] = oldvel[4*i+2];
- newvel[4*i+3] = oldvel[4*i+3];
- }*/
-}
diff --git a/examples/cuda-chill/nbody.lua b/examples/cuda-chill/nbody.lua
deleted file mode 100644
index 08f88a9..0000000
--- a/examples/cuda-chill/nbody.lua
+++ /dev/null
@@ -1,53 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("nbody.c", "nbody_cpu" , 0)
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-NBODIES=16384
-
-
---Tj=128 CHANGE FOR BEST..... BEST IS 64BLOCKS 128THREADS
---Ti=256
-Tj=64
-Ti=32
-Tjjj=1
-Tiii=1
-Tn=0.1
---normalize_index("j")
---
---print_code()
---normalize_index("n")
--- TILE COMMANDS ZEROOOOOOOOOOO:3
---tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j"})--CU=-1
-tile_by_index({"i"},{Ti},{l1_control="ii"},{"ii","i","j"})--CU=-1
---normalize_index("i")
---tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1
-
---tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3
---tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"})
---tile_by_index({"i"}, {Ti/2}, {l1_control="iii"}, {"ii","iii", "jj","i","j"})
---print_code()
-cudaize("kernel_GPU",{oldpos=4*NBODIES,oldpos1=4*NBODIES,oldvel=4*NBODIES,force=4*NBODIES,newpos=4*NBODIES,newvel=4*NBODIES},{block={"ii"}, thread={"i"}})--CU=3
-print_code()
---tile(0,6,6)
---copy_to_shared("tx","oldpos",-16)
---copy_to_registers("j","oldpos")
---copy_to_registers("j","oldpos1")
---copy_to_registers("j","force")
-
---copy_to_texture("oldpos")
---tile(1,3,3)
---tile(2,3,3)
-
-print_code()
---unroll_to_depth(1)
---
---tile(2,3,3)
---unroll(2,3,0)
---unroll(0,5,0)
---print_code()
diff --git a/examples/cuda-chill/tmv-shadow.c b/examples/cuda-chill/tmv-shadow.c
deleted file mode 100644
index cb9ea8d..0000000
--- a/examples/cuda-chill/tmv-shadow.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
- int i, j;
-
- for (i = 0; i < N; i++)
- for (j = 0; j < N; j++)
- a[i] = a[i] + c[i][j] * b[j];
-}
diff --git a/examples/cuda-chill/tmv-shadow.lua b/examples/cuda-chill/tmv-shadow.lua
deleted file mode 100644
index 196b939..0000000
--- a/examples/cuda-chill/tmv-shadow.lua
+++ /dev/null
@@ -1,50 +0,0 @@
-init("tmv-shadow.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-N=1024
---N= 8209
---N=129
-TI=64
-N=1024
-TI=32
---tile, "k" for the control loop for the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
-
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("i")
---print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}})
-
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-copy_to_shared("tx", "b", 1)
---copy_to_texture("b")
---print_code()
-
-copy_to_shared("tx", "c", -16)
---copy_to_texture("c")
---print_code()
-
-copy_to_registers("k", "a")
-print_code()
---unroll(0,5,0)
---unroll(0,4,0)
---unroll(2,4,16)
-unroll_to_depth(1)
---print_code()
diff --git a/examples/cuda-chill/tmv.c b/examples/cuda-chill/tmv.c
deleted file mode 100644
index cb9ea8d..0000000
--- a/examples/cuda-chill/tmv.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
- int i, j;
-
- for (i = 0; i < N; i++)
- for (j = 0; j < N; j++)
- a[i] = a[i] + c[i][j] * b[j];
-}
diff --git a/examples/cuda-chill/tmv.lua b/examples/cuda-chill/tmv.lua
deleted file mode 100644
index 5071108..0000000
--- a/examples/cuda-chill/tmv.lua
+++ /dev/null
@@ -1,50 +0,0 @@
-init("tmv.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-N=1024
---N= 8209
---N=129
-TI=64
-N=1024
-TI=32
---tile, "k" for the control loop for the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
-
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("i")
---print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}})
-
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-copy_to_shared("tx", "b", 1)
---copy_to_texture("b")
---print_code()
-
-copy_to_shared("tx", "c", -16)
---copy_to_texture("c")
---print_code()
-
-copy_to_registers("k", "a")
-print_code()
---unroll(0,5,0)
---unroll(0,4,0)
---unroll(2,4,16)
-unroll_to_depth(1)
---print_code()
diff --git a/examples/fortran/README b/examples/fortran/README
deleted file mode 100644
index 4f23bee..0000000
--- a/examples/fortran/README
+++ /dev/null
@@ -1,10 +0,0 @@
-// Manu
-
-1) Fortran support added to permute, tile, unroll and datacopy. Tested these w.r.t gemm.c using gemm.script.
- There might be other issues (like fusion due to unroll, ...) that have not been tested.
-
-2) To incorporate Fortran support I had to modify certain values in omega (include/omega/omega_core/oc.h).
- To solve for large number of unknowns, these values have to be reverted back.
-
-3) Tested the existing chill scripts using Derick's python script.
- At least the existing chill scripts are not affected by the fortran related changes.
diff --git a/examples/fortran/ccd.f b/examples/fortran/ccd.f
deleted file mode 100644
index 12d834d..0000000
--- a/examples/fortran/ccd.f
+++ /dev/null
@@ -1,32 +0,0 @@
-c
-c These have been separated out from ccsd_t_singles_l.F and ccsd_t_doubles_l.F
-c
- subroutine clean_sd_t_s1_1(h3d,h2d,h1d,p6d,p5d,p4d,
- 2 triplesx,t1sub,v2sub)
- IMPLICIT NONE
- integer h3d,h2d,h1d,p6d,p5d,p4d
- integer h3,h2,h1,p6,p5,p4
- integer N
- double precision triplesx(16,16,16,16,16,16)
- double precision t1sub(16,16)
- double precision v2sub(16,16,16,16)
-
- N = 16
-
- do p4=1,10
- do p5=1,10
- do p6=1,10
- do h1=1,10
- do h2=1,10
- do h3=1,10
- triplesx(h3,h2,h1,p6,p5,p4)=triplesx(h3,h2,h1,p6,p5,p4)
- 1 + t1sub(p4,h1)*v2sub(h3,h2,p6,p5)
- enddo
- enddo
- enddo
- enddo
- enddo
- enddo
- return
- end
-
diff --git a/examples/fortran/ccd.script b/examples/fortran/ccd.script
deleted file mode 100644
index c2af500..0000000
--- a/examples/fortran/ccd.script
+++ /dev/null
@@ -1,18 +0,0 @@
-source: ccd.f
-procedure: clean_sd_t_s1_1
-format : rose
-loop: 0
-
-
-
-original()
-
-UN=4
-
-unroll(0,5,4)
-unroll(0,4,4)
-unroll(0,3,4)
-unroll(0,2,4)
-unroll(0,1,4)
-
-print
diff --git a/examples/fortran/gemm.f90 b/examples/fortran/gemm.f90
deleted file mode 100644
index b65bb58..0000000
--- a/examples/fortran/gemm.f90
+++ /dev/null
@@ -1,58 +0,0 @@
-program matmul
-
- integer N,i,j,k
- real*8 a(10,10), b(10,10), c(10,10), ct(10,10),mysum
-
- do i=1,10,1
- do j=1,10,1
- a(i,j) = i+j
- b(i,j) = i-j
- c(i,j) = 0.0
- ct(i,j) = 0.0
- end do
- b(i,i) = 1.0;
- end do
-
-
- DO j=1,10,1
- DO k=1,10,1
- DO i=1,10,1
- c(i,j) = c(i,j)+a(i,k)*b(k,j)
- end do
- end do
- end do
-
-
-
- call gemm(10,a,b,ct)
-
- mysum = 0.0
- do i=1,10,1
- do j=1,10,1
- mysum = c(i,j) - ct(i,j)
- end do
- end do
-
- if (abs(mysum) >= 0.00001) then
- write (*,*) "Something wrong"
- else
- write (*,*) "Output matches"
- end if
-
-end program matmul
-
- SUBROUTINE gemm(N,A,B,C)
- INTEGER N
- REAL*8 A(N,N), B(N,N), C(N,N)
-
- INTEGER I,J,K
-
- DO J=1,N,1
- DO K=1,N,1
- DO I=1,N,1
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
- end do
- end do
- end do
-
- END subroutine
diff --git a/examples/fortran/gemm.script b/examples/fortran/gemm.script
deleted file mode 100644
index 01eb859..0000000
--- a/examples/fortran/gemm.script
+++ /dev/null
@@ -1,30 +0,0 @@
-#matrix multiply large array size for intel machine
-source: gemm.f90
-procedure: gemm
-format: rose
-loop: 0
-
-TI = 128
-#TI = 4
-TJ = 8
-#TK = 3
-TK = 512
-UI = 2
-UJ = 2
-
-permute([3,1,2])
-tile(0,2,TJ)
-#print space
-tile(0,2,TI)
-#print space
-tile(0,5,TK)
-#print space
-
-
-datacopy(0,3,A,false,-1)
-#print space
-
-datacopy(0,4,B)
-unroll(0,4,UI)
-unroll(0,5,UJ)
-
diff --git a/examples/fortran/rose_gemm.f90 b/examples/fortran/rose_gemm.f90
deleted file mode 100644
index d150922..0000000
--- a/examples/fortran/rose_gemm.f90
+++ /dev/null
@@ -1,155 +0,0 @@
-PROGRAM matmul
-INTEGER :: N, i, j, k
-REAL(kind=8) :: a(10,10), b(10,10), c(10,10), ct(10,10), mysum
-DO i = 1, 10, 1
-DO j = 1, 10, 1
-a(i,j) = i + j
-b(i,j) = i - j
-c(i,j) = 0.0
-ct(i,j) = 0.0
-END DO
-b(i,i) = 1.0
-END DO
-DO j = 1, 10, 1
-DO k = 1, 10, 1
-DO i = 1, 10, 1
-c(i,j) = c(i,j) + a(i,k) * b(k,j)
-END DO
-END DO
-END DO
-CALL gemm(10,a,b,ct)
-mysum = 0.0
-DO i = 1, 10, 1
-DO j = 1, 10, 1
-mysum = c(i,j) - ct(i,j)
-END DO
-END DO
-IF (abs(mysum) >= 0.00001) THEN
-WRITE (*, FMT=*) "Something wrong"
-ELSE
-WRITE (*, FMT=*) "Output matches"
-END IF
-END PROGRAM matmul
-
-SUBROUTINE gemm(N,A,B,C)
-INTEGER :: t12
-INTEGER :: t10
-INTEGER :: t8
-INTEGER :: t6
-INTEGER :: t4
-INTEGER :: t2
-INTEGER :: chill_t64
-INTEGER :: chill_t63
-INTEGER :: chill_t62
-INTEGER :: chill_t61
-INTEGER :: chill_t60
-INTEGER :: chill_t59
-INTEGER :: chill_t58
-INTEGER :: chill_t57
-INTEGER :: chill_t56
-INTEGER :: chill_t55
-INTEGER :: chill_t54
-INTEGER :: chill_t53
-INTEGER :: chill_t52
-INTEGER :: chill_t51
-INTEGER :: chill_t50
-INTEGER :: chill_t49
-INTEGER :: chill_t48
-INTEGER :: chill_t47
-INTEGER :: over2
-INTEGER :: chill_t46
-INTEGER :: chill_t45
-INTEGER :: chill_t44
-INTEGER :: chill_t43
-INTEGER :: chill_t42
-INTEGER :: chill_t41
-INTEGER :: chill_t40
-INTEGER :: chill_t39
-INTEGER :: chill_t38
-INTEGER :: chill_t37
-INTEGER :: chill_t36
-INTEGER :: chill_t35
-INTEGER :: chill_t34
-INTEGER :: chill_t33
-INTEGER :: chill_t32
-INTEGER :: chill_t31
-INTEGER :: chill_t30
-INTEGER :: chill_t29
-INTEGER :: chill_t28
-INTEGER :: chill_t27
-INTEGER :: chill_t26
-INTEGER :: chill_t25
-INTEGER :: chill_t24
-INTEGER :: chill_t23
-INTEGER :: over1
-INTEGER :: chill_t22
-INTEGER :: chill_t21
-INTEGER :: chill_t20
-INTEGER :: chill_t19
-INTEGER :: chill_t18
-INTEGER :: chill_t17
-INTEGER :: chill_t16
-INTEGER :: chill_t15
-REAL(kind=8), DIMENSION(8,512) :: f_P2
-INTEGER :: chill_t14
-INTEGER :: chill_t13
-INTEGER :: chill_t12
-INTEGER :: chill_t11
-INTEGER :: chill_t10
-INTEGER :: chill_t9
-INTEGER :: chill_t8
-INTEGER :: chill_t7
-REAL(kind=8), DIMENSION(512,128) :: f_P1
-INTEGER :: chill_t1
-INTEGER :: chill_t2
-INTEGER :: chill_t4
-INTEGER :: chill_t6
-INTEGER :: chill_t5
-INTEGER :: N
-REAL(kind=8) :: A(N,N), B(N,N), C(N,N)
-INTEGER :: I, J, K
-over1 = 0
-over2 = 0
-DO t2 = 1, N, 512
-DO t4 = 1, N, 128
-DO t6 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
-DO t8 = t4, merge(t4 + 127,N,t4 + 127 <= N), 1
-f_P1(t8 - t4 + 1,t6 - t2 + 1) = A(t8,t6)
-END DO
-END DO
-DO t6 = 1, N, 8
-DO t8 = t6, merge(N,t6 + 7,N <= t6 + 7), 1
-DO t10 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
-f_P2(t10 - t2 + 1,t8 - t6 + 1) = B(t10,t8)
-END DO
-END DO
-over1 = MOD(N,2)
-DO t8 = t4, merge(-over1 + N,t4 + 126,-over1 + N <= t4 + 126), 2
-over2 = MOD(N,2)
-DO t10 = t6, merge(t6 + 6,N - over2,t6 + 6 <= N - over2), 2
-DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1
-C(t8,t10) = C(t8,t10) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
-C(t8 + 1,t10) = C(t8 + 1,t10) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
-C(t8,t10 + 1) = C(t8,t10 + 1) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1)
-C(t8 + 1,t10 + 1) = C(t8 + 1,t10 + 1) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1)
-END DO
-END DO
-IF (N - 7 <= t6 .AND. 1 <= over2) THEN
-DO t12 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
-C(t8,N) = C(t8,N) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1)
-C(t8 + 1,N) = C(t8 + 1,N) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1)
-END DO
-END IF
-END DO
-IF (N - 127 <= t4 .AND. 1 <= over1) THEN
-DO t10 = t6, merge(t6 + 7,N,t6 + 7 <= N), 1
-DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1
-C(N,t10) = C(N,t10) + f_P1(N - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
-END DO
-END DO
-END IF
-END DO
-END DO
-END DO
-END SUBROUTINE
-