diff options
author | dhuth <derickhuth@gmail.com> | 2014-08-27 09:52:06 -0600 |
---|---|---|
committer | dhuth <derickhuth@gmail.com> | 2014-08-27 09:52:06 -0600 |
commit | bff810cc371a38f493d688c54f71013f5a7d53bf (patch) | |
tree | fbe86954bb3c01deb21da9e41ebff5baa2889a45 /examples | |
download | chill-bff810cc371a38f493d688c54f71013f5a7d53bf.tar.gz chill-bff810cc371a38f493d688c54f71013f5a7d53bf.tar.bz2 chill-bff810cc371a38f493d688c54f71013f5a7d53bf.zip |
Initial commit
Diffstat (limited to 'examples')
39 files changed, 2308 insertions, 0 deletions
diff --git a/examples/chill/gemm.c b/examples/chill/gemm.c new file mode 100644 index 0000000..355bafe --- /dev/null +++ b/examples/chill/gemm.c @@ -0,0 +1,15 @@ +int main() { + + float a[512][512], b[512][512], c[512][512]; + + int i, j, k; + int n; + for (j = 0; j < n; j++) + for (k = 0; k < n; k++) + for (i = 0; i < n; i++) { + c[i][j] = c[i][j] + a[i][k] * b[k][j]; + } + + return 0; +} + diff --git a/examples/chill/gemm.script b/examples/chill/gemm.script new file mode 100644 index 0000000..ed91567 --- /dev/null +++ b/examples/chill/gemm.script @@ -0,0 +1,31 @@ +#matrix multiply large array size for intel machine +source: gemm.c +procedure: main +format: rose +loop: 0 + +TI = 128 +TJ = 8 +TK = 512 +UI = 2 +UJ = 2 + +permute([3,1,2]) +tile(0,2,TJ) +#print space +tile(0,2,TI) +#print space +tile(0,5,TK) +#print space + +datacopy(0,3,a,false,1) +#print space + +datacopy(0,4,b) +print +unroll(0,4,UI)#print space +print +unroll(0,5,UJ) +#print space +print + diff --git a/examples/chill/gemv.c b/examples/chill/gemv.c new file mode 100644 index 0000000..610d4cb --- /dev/null +++ b/examples/chill/gemv.c @@ -0,0 +1,15 @@ +#define N 10 + +int main() { + // int n; + float a[N]; + float b[N]; + float c[N][N]; + + int i, j; + + for (i = 1; i < N; i++) + for (j = 1; j < N; j++) + a[i] = a[i] + c[i][j] * b[j]; + +} diff --git a/examples/chill/gemv.script b/examples/chill/gemv.script new file mode 100644 index 0000000..f1d5f89 --- /dev/null +++ b/examples/chill/gemv.script @@ -0,0 +1,9 @@ +source: gemv.c # matrix-vector multiply +procedure: main +format : rose +loop: 0 + + + +original() +print diff --git a/examples/chill/jacobi1.c b/examples/chill/jacobi1.c new file mode 100644 index 0000000..0fcaee4 --- /dev/null +++ b/examples/chill/jacobi1.c @@ -0,0 +1,13 @@ +#define N 512 + +int main() { + int i, t; + + float a[N][N]; + + for (t = 2; t <= 100; t++) + for (i = 2; i <= N - 1; i++) + a[t][i] = a[t - 1][i - 1] + a[t - 1][i] + a[t - 1][i + 1]; + + return 0; +} diff --git 
a/examples/chill/jacobi1.script b/examples/chill/jacobi1.script new file mode 100644 index 0000000..c0dec8d --- /dev/null +++ b/examples/chill/jacobi1.script @@ -0,0 +1,18 @@ +# +# tiling perfect jacobi loop nest with time step, use +# unimodular transformation first (only applicable to the +# perfect loop nest) to make tiling legal. +# + +source: jacobi1.c +procedure: main +format : rose +loop: 0 + +print dep + +nonsingular([[1,0],[1,1]]) # unimodular matrix, determinant is one +tile(0,2,64) + +print dep +print diff --git a/examples/chill/jacobi2.c b/examples/chill/jacobi2.c new file mode 100644 index 0000000..b8d8d7b --- /dev/null +++ b/examples/chill/jacobi2.c @@ -0,0 +1,15 @@ +#define N 512 + +int main() { + double a[N]; + double b[N]; + int t, i; + for (t = 1; t <= 100; t++) { + for (i = 2; i <= N - 1; i++) + b[i] = (double) 0.25 * (a[i - 1] + a[i + 1]) + (double) 0.5 * a[i]; + + for (i = 2; i <= N - 1; i++) + a[i] = b[i]; + } + return 0; +} diff --git a/examples/chill/jacobi2.script b/examples/chill/jacobi2.script new file mode 100644 index 0000000..afe14c6 --- /dev/null +++ b/examples/chill/jacobi2.script @@ -0,0 +1,21 @@ +# +# tiling imperfect jacobi loop nest, more details in the paper +# "Automatic Tiling of Iterative Stencil Loops" by Zhiyuan Li and +# Yonghong Song, TOPLAS, 2004. 
+# + +source: jacobi2.c +procedure: main +format: rose +loop: 0 + +print dep + +original() +shift([1], 2, 1) +fuse([0,1], 2) # optional +skew([0,1], 2, [2,1]) +tile(0, 2, 32, 1) + +print dep +print diff --git a/examples/chill/unroll.c b/examples/chill/unroll.c new file mode 100644 index 0000000..68f4633 --- /dev/null +++ b/examples/chill/unroll.c @@ -0,0 +1,31 @@ +#define N 14 +void foo(int n, float* x, float* y, float* z, float* f3, float* f1, float* w) { + int dt; + + int i, j; + + for (i = 1; i <= 14; i++) + x[i] = 1.0; + + for (i = 1; i <= 14; i += 3) + y[i] = 1.0; + + for (i = N + 1; i <= N + 20; i += 3) + z[i] = 1.0; + + for (i = 0; i <= N; i++) { + for (j = i; j <= i + N; j++) + f3[i] = f3[i] + f1[j] * w[j - i]; + f3[i] = f3[i] * dt; + } + + return 0; +} + +int main() { + float x[N], y[N], z[N], f3[N], f1[N], w[N]; + + foo(N, x, y, z, f3, f1, w); + return 0; +} + diff --git a/examples/chill/unroll.script b/examples/chill/unroll.script new file mode 100644 index 0000000..e64acb6 --- /dev/null +++ b/examples/chill/unroll.script @@ -0,0 +1,35 @@ +# +# Test unroll-and-jam. The last loop adapted from the simple +# convolution example from p463 of "Optimizing Compilers for +# Modern Architectures", by Randy Allen and Ken Kennedy. 
+# + +source: unroll.c +procedure: foo +format: rose +# fully unroll a loop with known iteration count +loop: 0 +original() +unroll(0,1,3) +print +print space + + +# a strided loop +loop: 1 +original() +unroll(0,1,2) +print +print space + +# lower and upper bounds are not constant +loop: 2 +original() +unroll(0,1,20) +print + +# parallelogram iteration space +loop: 3 +original() +unroll(0,1,2) +print diff --git a/examples/cuda-chill/cp.c b/examples/cuda-chill/cp.c new file mode 100644 index 0000000..837d7a6 --- /dev/null +++ b/examples/cuda-chill/cp.c @@ -0,0 +1,29 @@ +#define N 1 + +#define VOLSIZEY 512 +#define VOLSIZEX 512 +#define VOLSIZEZ 1 +#define ATOMCOUNT 4000 +#define GRIDSPACING 0.1 +#define zDim 0 + +extern float sqrtf(float); + +void cenergy_cpu(float atoms[ATOMCOUNT*4],float *energy,float z) +{ +int i,j,n;float dx,dy,dz; + + for (j=0; j<VOLSIZEY; j++) { + for (i=0; i<VOLSIZEX; i++) { + for (n=0;n<ATOMCOUNT;n+=4) { + dx = (GRIDSPACING * i) - atoms[n]; + dy = (GRIDSPACING * j) - atoms[n+1]; + dz = z - atoms[n+2]; + energy[(j*VOLSIZEX + i)+VOLSIZEX*VOLSIZEY*zDim] += atoms[n+3]/sqrtf( (dx*dx) + (dy*dy)+ (dz*dz) ) ; + } + + + } + } +} + diff --git a/examples/cuda-chill/cp.lua b/examples/cuda-chill/cp.lua new file mode 100644 index 0000000..1ef2264 --- /dev/null +++ b/examples/cuda-chill/cp.lua @@ -0,0 +1,46 @@ +--CUBLAS 2 MM Multiply + +--This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you +--call init() and use global variables to specify procedure and loop + +--Second parameter is procedure # and third is loop # +init("cp.c", "cenergy_cpu", 0) + +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, + --copy_to_shared methods +V=512 +N=4000 +N=1 + +Tj=32 +Ti=16 +Tii=16 +Tjj=16 + +--normalize_index("j") +--normalize_index("i") +print_code() +normalize_index("n") +-- TILE COMMANDS ZEROOOOOOOOOOO:3 +--permute(0,{"i","j","n"}) 
+--tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","n"})--CU=-1 +tile_by_index({"j","i"},{Tj,Ti},{l1_control="jj",l2_control="ii"},{"jj","ii","j","i","n"})--CU=-1 +--tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1 + +--tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3 +--tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","i","iii","j","jjj","n"})--CU=3 +--tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"}) +--tile_by_index({"i"}, {Tii}, {l1_control="iii",l1_tile="i"}, {"ii", "jj", "iii","i","j","n"}) +print_code() +cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"jj","ii"}, thread={"j","i"}})--CU=3 +--cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"ii","jj"}, thread={"i","j"}})--CU=3 +print_code() +copy_to_shared("tx","atoms",-16) +copy_to_registers("tx","energy") +--copy_to_texture("atoms") +--unroll_to_depth(1) +--unroll(0,9,0) +--unroll(0,5,0) + +--unroll(0,8,256) +print_code() diff --git a/examples/cuda-chill/cudaize.lua b/examples/cuda-chill/cudaize.lua new file mode 100644 index 0000000..7359cca --- /dev/null +++ b/examples/cuda-chill/cudaize.lua @@ -0,0 +1,1004 @@ + +-- THIS IS CUDAIZE.LUA + +function table.contains_key(table, key) + for k in pairs(table) do + if k == key then + return true + end + end + return false +end + +function valid_indices(stmt, indices) + --print( "valid_indices() lua calling C cur_indices") + --io.flush() + cur = cur_indices(stmt) + --print("Cur indices "..list_to_string(cur)) + for idx in pairs(indices) do + if not table.contains_key(cur,idx) then + return false + end + end + return true +end + +function next_clean_level(cur_idxs,level) + --print("next_clean_level( ..., "..level.." 
)") + --print(string.format("indices_at_each_level %s ",list_to_string(cur_idxs) )) + + --print("loop to "..#cur_idxs) + for i=level+1,#cur_idxs do + --print("Checking level "..i.." = '"..cur_idxs[i].."'") + if (# cur_idxs[i] > 0) then + --print("Good enough"..(# cur_idxs[i])) + --print("returning "..i) + return i + end + end + return -1 --sentinal that there were no non-dummy indices left +end + +function build_order(final_order, tile_idx_names, ctrl_idx_names, tile_idx_map, cur_level) + order = {} + --print("\nbuild_order()") + --print("build_order(): final_order = ( "..list_to_string(final_order).." )") + --print("build_order(): ctrl_idx_names = ("..list_to_string(ctrl_idx_names).." )") + --print("cur_level "..cur_level.."") + --io.flush() + + for i,k in ipairs(final_order) do + skip = false + cur = final_order[i] + --print("\ncur "..cur.." = final_order["..i.."] = "..final_order[i].." ") + --control loops below our current level should not be in the current order + for j=cur_level+2,# ctrl_idx_names do + --print("j "..j.." final_order["..i.."] = "..final_order[i].." ") + if ctrl_idx_names[j] == final_order[i] then + skip = true + --print("SKIP "..final_order[i].." ") + --io.flush() + end + end + --possibly substitute tile indices ifn necessar + if table.contains_key(tile_idx_map,final_order[i]) then + approved_sub = false + sub_string = tile_idx_map[final_order[i]] + for j=cur_level+2,# tile_idx_names do + if tile_idx_names[j] == sub_string then + approved_sub = true + end + end + if approved_sub then + cur = sub_string + end + end + if not skip then + table.insert(order,cur) + end + end + return order +end + +function list_to_string(str_list) + --Helpful debug output + l = "" + for i,str in ipairs(str_list) do + if i > 1 then + l = l .. ", " .. 
str + else + l = str + end + end + return l +end + + +function find_cur_level(stmt,idx) + --Search cur_indices for a idx at stmt + cur = cur_indices(stmt) + --print(string.format("find_cur_level(stmt %d, idx %s) Cur indices %s", stmt, idx, list_to_string(cur))) + for i,cidx in ipairs(cur) do + if cidx == idx then + --print(string.format("found it at index %d", i)) + return i + end + end + error("Unable to find "..idx.." in current list of indices") +end + + +function chk_cur_level(stmt,idx) + --Search cur_indices for a idx at stmt + cur = cur_indices(stmt) + for i,cidx in ipairs(cur) do + if cidx == idx then + return i + end + end + return -1 +end + + +function find_offset(cur_order, tile, control) + --print("Looking for tile '"..tile.."' and control '"..control.."' in ( "..list_to_string(cur_order)..", )") + idx1 = -1 + idx2 = -1 + for i,cur in ipairs(cur_order) do + if(cur == tile) then + idx1 = i + end + if(cur == control) then + idx2 = i + end + end + if(idx1 < 0) then + error("Unable to find tile " .. tile .. " in current list of indices") + end + if(idx2 < 0) then + error("Unable to find control " .. control .. " in current list of indices") + end + --print("found at level " .. idx2 .. " and " .. idx1) + if(idx2 < idx1) then + return idx2-idx1+1 + else + return idx2-idx1 + end +end + +function tile_by_index(tile_indices, sizes, index_names, final_order, tile_method) + --print "STARTING TILE BY INDEX" + --io.flush() + stmt = 0 --assume stmt 0 + cur = cur_indices(stmt) + --print("Cur indices "..list_to_string(cur)) + if not valid_indices(stmt,tile_indices) then + error('One of the indices in the first parameter were not '.. 
+ 'found in the current set of indices.') + end + if not tile_method then tile_method = counted end + tile_idx_names = {} + for i,s in ipairs(tile_indices) do tile_idx_names[i]=s end --shallow copy + --print("tile_index_names: ['"..list_to_string(tile_indices).."']") + + --print("index_names: ") + --for k,v in pairs(index_names) do print(k,v) end + + --io.flush() + + ctrl_idx_names = {} + tile_idx_map = {} + for k,v in pairs(index_names) do + valid = false + if(string.sub(k,1,1) == "l") then + if string.sub(k,-8) == "_control" then + i = tonumber(string.sub(k,2,-9)) + if i and i >= 1 and i <= (# tile_indices) then + ctrl_idx_names[i] = v + --print(string.format("Handling control %s for loop level %d",v,i)) + --print("control "..k.." name "..v.." ") + valid = true + end + elseif string.sub(k,-5) == "_tile" then + i = tonumber(string.sub(k,2,-6)) + if i and i >= 1 and i <= (# tile_indices) then + --print(string.format("tile %s -> %s",tile_indices[i], v)) + tile_idx_names[i] = v + tile_idx_map[v] = tile_indices[i] + --print(string.format("tile %s -> %s",tile_indices[i], v)) + valid = true + end + end + end + if not valid then error(string.format("%s is not a proper key for specifying ".. 
+ "tile or control loop indices\n", k)) end + end + + --filter out control indices (and do name substitution of unprocessed tile indices) for a given level + cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, -1) + permute(stmt, cur_order) + + for i,cur_idx in ipairs(tile_indices) do + --print(string.format("i %d cur_idx %s calling build order ********", i-1, cur_idx)) + cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1) + --Find a offset between tile loop and control loop + -- 0 = control loop one level above tile loop + -- -1 = control loop two levels above tile loop + -- > 0 = tile loop above control loop + -- In the last case, we do two extra tile commands to get the control + -- above the tile and then rely on the final permute to handle the + -- rest + level = find_cur_level(stmt,cur_idx) + offset = find_offset(cur_order, tile_idx_names[i], ctrl_idx_names[i]) + --print(string.format("offset %d", offset)) + + if (offset <= 0) then + --print(string.format("[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %s)",stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method)) + tile(stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method) + else + --print(string.format("2tile(%d, %d, %d, %d, %s, %s, %s)", stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method)) + tile(stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method);--regular level + --flip tile and control loop + --print(string.format("3tile(%d, %d, %d)",stmt, level+1, level+1)) + tile(stmt, level+1, level+1); + --print(string.format("4tile(%d, %d, %d)",stmt, level+1, level)) + tile(stmt, level+1, level); + --print(string.format("\n[offset>0]tile(%d, %d, %d, %d,%s,%s,%s)",stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method)) + --print_code() + + end + + --Do permutation based on cur_order + --print "permute based on 
build order calling build_order()" + --print "cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)" + cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1) + --print "permute(stmt, cur_order);" + permute(stmt, cur_order); + --print "\nafter permute(), code is:" + --print_code() + end + --print "ENDING TILE BY INDEX" + --print_code() +end + +function normalize_index(index) + stmt = 0 --assume stmt 0cur = cur_indices(stmt) + --print("Cur indices "..list_to_string(cur)) + l = find_cur_level(stmt, index) + tile(stmt, l, l) + --print(string.format("\n[Normalize]tile(%d, %d, %d)",stmt, l,l)) +end + +function is_in_indices(stmt, idx) + cur = cur_indices(stmt) + for i=0,#cur,1 do + if(cur[i]==idx) then + return true + end + end + return false + +end + + +function copy_to_registers(start_loop, array_name) + + --print("\n\n****** starting copy to registers") + io.flush() + + stmt = 0 --assume stmt 0 + + -- [Malik] first we make sure that tx and ty are consecutive loops in the 2D thread setup, otherwise all levels for subsequent operations are messed up. Start logic. 
+ cur = cur_indices(stmt) + table_Size = table.getn(cur) + + --print(string.format("Cur indices %s,",list_to_string(cur))) + --print(string.format("The table size is %d", table_Size)) + --table.foreach(cur, print) + --print_code() + + level_tx = -1 + level_ty = -1 + if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end + if is_in_indices(stmt,"ty") then level_ty = find_cur_level(stmt,"ty") end + --print(string.format("level_tx %d level_ty %d", level_tx, level_ty)) + + ty_lookup_idx = "" + org_level_ty = level_ty + + --if(cur[level_tx+1]~=nil and cur[level_tx+1]~="") then ty_lookup = ty_lookup+1 end + if(cur[level_ty+1]~=nil and cur[level_ty+1]~="") then + --print(string.format("IF cur[%d] = %s", level_ty+1, cur[level_ty+1])) + ty_lookup_idx = cur[level_ty+1] + else + --if cur[level_ty] ~= nil then print(string.format("ELSE ty_lookup_idx = cur[%d] = %s", level_ty, cur[level_ty])) -- TODO + --else print "ELSE (dangerous)" end + ty_lookup_idx = cur[level_ty] -- may assign nil !? 
+ end + --if ty_lookup_idx ~= nil then print(string.format("ty_lookup_idx '%s'", ty_lookup_idx)) -- TODO + --else print "ty_lookup_idx is NIL" + --end + + if level_ty > 0 then + --print(string.format("\ntile3(%d,%d,%d)",stmt,level_ty,level_tx+1)) + tile(stmt,level_ty,level_tx+1) + end + --print_code() + + --print("\ntylookup is %d",ty_lookup) + --exit(0) + -- + cur = cur_indices(stmt) + table_Size = table.getn(cur) + --print(string.format("Cur indices %s,",list_to_string(cur))) + --print("The table size is "..table.getn(cur)) + --table.foreach(cur, print) + + if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end + if ty_lookup_idx then + if is_in_indices(stmt,ty_lookup_idx) then level_ty = find_cur_level(stmt,ty_lookup_idx) end + end + + ty_lookup = 1 + idx_flag = -1 + -- find the level of the next valid index after ty+1 + --print(string.format("\nlevel_ty %d", level_ty)) + if level_ty > 0 then + --print(string.format("table_Size %d", table_Size)) + for num= level_ty+ty_lookup,table_Size do + --print(string.format("num=%d cur[num] = '%s'",num, cur[num])) + if(cur[num] ~= "") then + idx_flag = find_cur_level(stmt,cur[num]) + --print (string.format("idx_flag = %d", idx_flag)) + break + end + end + end + + --print(string.format("\n(first) I am checking all indexes after ty+1 %s",idx_flag)) + --print_code() + --print "" + + how_many_levels = 1 + startat = idx_flag + 1 + if startat == 0 then startat = 1 end -- avoid attempt to examine an illegal array offset + --print(string.format("idx_flag = %d I will check levels starting with %d", idx_flag, idx_flag+1)) + + for ch_lev = startat,table_Size,1 do -- was for ch_lev = idx_flag+1,table_Size,1 do + --print(string.format("ch_lev %d", ch_lev)) + if(cur[ch_lev] ~= nil and cur[ch_lev] ~= "") then + --print(string.format("cur[%d] = '%s'", ch_lev, cur[ch_lev])) + how_many_levels = how_many_levels+1 + end + end + --print("\nHow Many Levels",how_many_levels) + + -- change this all to reflect the real logic 
which is to normalize all loops inside the thread loops. + if(how_many_levels <2) then + while( idx_flag >= 0) do + for num = level_ty+ty_lookup,(table_Size) do + --print(string.format("at top of loop, num is %d", num)) + --print(string.format("num %d", num)) + --print(string.format("cur[num] = '%s'", cur[num])) + if(cur[num] ~= "") then + idx=cur[num] + --print(string.format("idx '%s'", idx)) + + curlev = find_cur_level(stmt,idx) + --print(string.format("curlev %d", curlev)) + + --print_code() + --print(string.format("\n[COPYTOREG]tile(%d,%d,%d)",stmt,find_cur_level(stmt,idx),level_tx)) + tile(stmt,find_cur_level(stmt,idx),find_cur_level(stmt,idx)) + curlev = find_cur_level(stmt,idx) + --print(string.format("curlev %d", curlev)) + tile(stmt,find_cur_level(stmt,idx),level_tx) + --print(string.format("hehe '%s'",cur[num])) + + cur = cur_indices(stmt) + --print("Cur indices INSIDE"..list_to_string(cur)) + table_Size = table.getn(cur) + --print(string.format("Table Size is: %d",table_Size)) + level_tx = find_cur_level(stmt,"tx") + --print(string.format("\n level TX is: %d",level_tx)) + level_ty = find_cur_level(stmt,ty_lookup_idx) + --print(string.format("\n level TY is: %d",level_ty)) + idx_flag = -1 + --print "idx_flag = -1" + + -- find the level of the next valid index after ty+1 + + -- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?) + for num= level_ty+ty_lookup,table_Size do + --print(string.format("num mucking num = %d", num)) + if(cur[num] ~= nil and cur[num] ~= "") then + idx_flag = find_cur_level(stmt,cur[num]) + --print("\n(second) I am checking all indexes after ty+1 %s",cur[num]) + break + end + end + --print(string.format("num mucked to %d idx_flag = %d", num, idx_flag)) + + end + --print(string.format("at bottom of loop, num is %d", num)) + end + end + end + --print "done with levels" + + + + + --print "ARE WE SYNCED HERE?" 
+ --print_code() + --print("\ntile(%d,%d,%d)",stmt,level_k,level_k) + --tile(stmt,level_k,level_k) + + -- [Malik] end logic + --print_code() + start_level = find_cur_level(stmt, start_loop) + --We should hold contant any block or tile loop + block_idxs = block_indices() + thread_idxs = thread_indices() + --print("\nblock indices are") + --table.foreach(block_idxs, print) + --print("\nthread indices are") + --table.foreach(thread_idxs, print) + --print(string.format("\nStart Level: %d",start_level)) + + hold_constant = {} + --print("\n Now in Blocks") + for i,idx in ipairs(block_idxs) do + --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx))) + if find_cur_level(stmt,idx) >= start_level then + table.insert(hold_constant, idx) + --print(string.format("\nJust inserted block %s in hold_constant",idx)) + end + end + + + --print("\n Now in Threads") + for i,idx in ipairs(thread_idxs) do + --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx))) + if find_cur_level(stmt,idx) >= start_level then + table.insert(hold_constant, idx) + --print(string.format("\nJust inserted thread %s in hold_constant",idx)) + end + end + + --print "\nhold constant table is: " + --table.foreach(hold_constant, print) + + --print("\nbefore datacopy pvt") + old_num_stmts = num_statements() + --print_code() + --print(string.format("\n[DataCopy]datacopy_privatized(%d, %s, %s, vector having privatized levels)",stmt, start_loop, array_name)) + --table.foreach(hold_constant, print) + datacopy_privatized(stmt, start_loop, array_name, hold_constant) + + --print(hold_constant) + new_num_stmts = num_statements() + --print("\nthe num of statements:%d\n",new_num_stmt) + --print_code() + --exit(0) + -- [Malik] normalize the copy loops created. 
+ cur = cur_indices(old_num_stmts) + --print("Cur indices "..list_to_string(cur)) + for cidx,i in ipairs(cur) do + if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then + --tile(old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i)) + --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i)) + end + end + --print_code() + --print("\nthe num of statements OLD+1 :",(old_num_stmts+1)) + + +--[[ + is this commented out? why yes, yes it is block comment + if( (old_num_stmts+1) <= new_num_stmts) then + cur = cur_indices(old_num_stmts+1) + --print("Cur indices+1 "..list_to_string(cur)) + for cidx,i in ipairs(cur) do + if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then + tile(old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i)) + --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i)) + end + end + end +--]] + + + --Unroll to the last thread level + --for stmt=old_num_stmts,new_num_stmts-1 do + -- level = find_cur_level(stmt,thread_idxs[#thread_idxs])--get last thread level + --if level < #cur_indices(stmt) then + -- unroll(stmt,level+1,0) + --print(string.format("\n[Unroll]unroll(%d, %d, 0)",stmt, level+1)) + ----print_code() + --end + --end + io.flush() + --print("****** ending copy to registers\n\n") + --io.flush() +end + +function copy_to_shared(start_loop, array_name, alignment) + --print(string.format("\nstarting copy to shared(%s, %s, %d )",start_loop,array_name,alignment)) + stmt = 0 --assume stmt 0 + cur = cur_indices(stmt) + --print("Cur indices "..list_to_string(cur)) + + start_level = find_cur_level(stmt, start_loop) + --print(string.format("start_level %d", start_level)) + + old_num_stmts = num_statements() + --print(string.format("old_num_statements %d", old_num_stmts)) + + --Now, we give it indices for up to two dimentions for copy loop + copy_loop_idxs = {"tmp1","tmp2"} + 
--print(string.format("\n[DataCopy]datacopy(%d, %d, %s, {\"tmp1\",\"tmp2\"},false,0,1,%d,true)",stmt, start_level, array_name, alignment)) + datacopy(stmt, start_level, array_name, copy_loop_idxs, false, 0, 1, alignment,true) + + add_sync(stmt,start_loop) + new_num_stmts = num_statements() + + --This is fairly CUBLAS2 specific, not sure how well it generalizes, + --but for a 2D copy, what we want to do is "normalize" the first loop + --"tmp1" then get its hard upper bound. We then want to tile it to + --make the control loop of that tile "ty". We then tile "tmp2" with a + --size of 1 and make it "tx". + --print(string.format("fairly CUBLAS2 specific, OLD %d NEW %d", old_num_stmts, new_num_stmts )) + + for stmt=old_num_stmts,new_num_stmts-1 do + --print(string.format("for stmt = %d", stmt)) + was_no_error, level = pcall(find_cur_level, stmt, "tmp2") + + if was_no_error then + --print_code() + --print("\nCopy to shared: [If was no error]\n") + find_cur_level(stmt,"tmp2") + tile(stmt, level, level) + + lower,upper = hard_loop_bounds(stmt, level) + upper = upper + 1 + --print(string.format("lower %d upper %d", lower, upper)) + + tx,ty = thread_dims() + --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx) + + level = find_cur_level(stmt,"tmp1") + --print(string.format("level %d", level)) + + if tx == upper and ty == 1 then + --print(string.format("tx = %d upper = %d ty = %d", tx, upper, ty)) + --print "Don't need" + + --Don't need an extra tile level, just move this loop up + second_level = find_cur_level(stmt,"tmp2") + --print(string.format("\n[Tile0]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx")) + tile(stmt, second_level, 1, level, "tx", "tx", counted) + else + --print "DO need?" 
+ --print_code() + if(ty == 1) then new_ctrl = "tmp3" else new_ctrl = "ty" end + + +--[[ Commenting out a block of Gabe's code in this control flow + -- level = find_cur_level(stmt,"tmp1") + tile(stmt, level, level) + + lower,upper = hard_loop_bounds(stmt, level) + upper = upper + 1 + --print_code() + --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx..", level: "..level) + if(math.ceil(upper/ty) > 1)then + tile(stmt, level, math.ceil(upper/ty), level, "tmp", new_ctrl, counted) + --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level, math.ceil(upper/ty),upper,ty, level, "tmp", new_ctrl)) + else + tile(stmt, level, math.ceil(upper/ty), level, "ty", new_ctrl, counted) + --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level, math.ceil(upper/ty),upper,ty, level, "tx", new_ctrl)) + end + + --print_code() + -- [Malik] If here we have the loop upper bound > tx, then we should tile once more after the next tile, to carve out the correct tx. + lower1,upper1 = hard_loop_bounds(stmt,level) + level1 = level + stmt1 = stmt + -- [Malik] Do the tile after the second level tile with if condition. Just to keep the original order, the tile is being pushed to the end. 
+ + --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1) + + --print_code() + --level = find_cur_level(stmt,"tmp") + --tile(stmt,level,level) + --print_code() + + --[Malik] if you are moving the loop above the level1, you need to update level1 with new position which would be level1+2 or second_level + if(level <= level1) then level1 = level1+2 end + --print(string.format("\n[Tile2]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx")) + --print("\n----------------------------------") + --print_code() + --print("\n**********************************") + --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1) + -- [Malik] If the upper bound > tx, we do another tile to carve out the correct tx from a bigger loop. Else just normalize the bounds. + if( upper1 > ty) then + third_level = find_cur_level(stmt1,"tmp") + --print("\n\n\n\t\t\t\tthirdlevel:"..third_level) + tile(stmt1, third_level, ty, third_level, "ty", "tmp", counted) + --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt1, third_level, ty,third_level, "ty", "tmp")) + tile(stmt1,third_level+1,third_level+1) + --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level+1)) + tile(stmt1,third_level+1,third_level) + --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level)) + else + tile(stmt1,level1,level1) + --print(string.format("\n[Tile3ELSE]tile(%d, %d, %d)",stmt1,level1,level1)) + end + + --print("\nStarting tmp2\n");--print_code(); + second_level = find_cur_level(stmt,"tmp2") + lower,upper = hard_loop_bounds(stmt,second_level) + level = second_level + --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..", level:"..level) + + if(math.ceil(upper/tx) > 1)then + tile(stmt, second_level,math.ceil(upper/tx), level, "tmp", "tx", counted) + --print(string.format("\n[Tile2]tile(%d, %d, 
%d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tmp", "tx")) + else + tile(stmt, second_level,math.ceil(upper/tx), level, "tx", "tx", counted) + --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tx", "tx")) + end + --print_code() + lower2,upper2 = hard_loop_bounds(stmt,level) + level2 = level + stmt2 = stmt + --print("[Malik]-loop cleanup@tmp2: lower2, upper2: "..lower2..", "..upper2..", tx: "..tx..", level:"..level2) + -- now for the second level. + if( upper2 > tx) then + forth_level = find_cur_level(stmt2,"tmp") + --print("\n\n\n\t\t\t\tforthlevel:"..forth_level) + --print_code() + tile(stmt2, forth_level, 1, forth_level, "tx", "tmp", counted) + --print(string.format("\n[Tile3B]tile(%d, %d, %d,%d,%s,%s,counted)",stmt2, forth_level, tx,forth_level, "ty", "tmp")) + --print_code() + --tile(stmt2,forth_level+1,forth_level+1) + --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level+1)) + --tile(stmt2,forth_level+1,forth_level) + --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level)) + else + new_level = find_cur_level(stmt2,"ty") + tile(stmt2,level2,1,new_level,"tx","tx",counted) + --print(string.format("\n[Tile3BELSE]tile(%d, %d, %d)",stmt2,level2,level2)) + tmp_level = find_cur_level(stmt2,"tmp") + tile(stmt2,tmp_level,tmp_level) + end + + --print_code() + --print("\n----------------------------------") +--]] + + --print_code() + --print("\nStarting tmp2\n");--print_code(); + first_level = find_cur_level(stmt,"tmp1") + second_level = find_cur_level(stmt,"tmp2") + lower,upper = hard_loop_bounds(stmt,second_level) + + --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..",first level:"..first_level..",second_level:"..second_level) + + -- Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx. 
+ --print(string.format("\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,1,first_level, "tx", "tx")) + tile(stmt,second_level,1,first_level,"tx","tx",counted) + --print_code() + + first_level = find_cur_level(stmt,"tmp1") + lower_1,upper_1 = hard_loop_bounds(stmt,first_level) + tx_level = find_cur_level(stmt,"tx") + lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level) + --print(string.format("UL_1 %d %d UL_tx %d %d", lower_1, upper_1, lower_tx, upper_tx)) + + if(math.ceil(upper_tx/tx) > 1)then + --print "ceil I say" + --print(string.format("\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,tx,tx_level, "tx", "tmp1")) + tile(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted) + --print_code() + + peat = find_cur_level(stmt,"tx") + --print(string.format("\n[Tile1]tile(%d, %d, %d)",stmt, peat, peat)) + tile(stmt, peat, peat ) --find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx")) + --print_code() + + if (find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx")) then + --print(string.format("\nagain [Tile1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))) + tile(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) + --print_code() + end + --else + --tile(stmt, tx_level,1, tx_level, "tx", "tx", counted) + --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,1,tx_level, "tx", "tx")) + end + --print_code() + --]] -- this apparently is NOT the end of a block comment + + --print("\nStarting tmp1\n") + -- Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty". 
+ tile(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1")) + --print_code() + + ty_level = find_cur_level(stmt,"tmp1") + lower_ty,upper_ty = hard_loop_bounds(stmt,ty_level) + + tx_level = find_cur_level(stmt,"tx") + lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level) + --print("[Malik]-loop cleanup@tmp1: lowerty, upperty: "..lower_ty..", "..upper_ty..", ty: "..ty..",ty level:"..ty_level..",tx_level:"..tx_level..", stmt: "..stmt) + + --print "before ceil" + if(math.ceil(upper_ty/ty) > 1)then + --print "CEIL IF" + --print("\n Inside upper_ty/ty > 1\n"); + + --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,ty,ty_level, "ty", "tmp_ty")) + tile(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted) + --print_code() + + --print(string.format("\n[Tile2-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt ,"ty"),find_cur_level(stmt,"ty"))) + tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty")) + --print_code() + + ----------------------------------------------------------------------- + ---------------------------------------------------------------------- + cur_idxs = cur_indices(stmt) + --print("\n cur indexes are "..list_to_string(cur_idxs)) + + -- Putting ty before any tmp_tx + idx_flag = -1 + for num= 0,table.getn(cur_idxs) do + if(cur[num] == "tmp_tx") then + idx_flag = find_cur_level(stmt,cur[num]) + break + end + end + --print(string.format("\n (1) so i have found out the value of idx flag as %d",idx_flag) ) + + if(idx_flag >=0 ) then + if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then + --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) + tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) + --print_code() + end + end + + -- Now Putting ty before any tmp_ty + idx_flag = -1 + for num= 0,table.getn(cur_idxs) do + if(cur[num] == "tmp_ty") then + idx_flag = find_cur_level(stmt,cur[num]) + break + end + end + --print(string.format("\n IF 
so i have found out the value of idx flag as %d",idx_flag) ) + if(idx_flag >=0 ) then + --print "one more test" + if ((find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"))) then + --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) + tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) + --print_code() + end + end + else + --print "CEIL ELSE" + --cur_idxs = cur_indices(stmt) + --print("\n Inside upper_ty/ty <= 1\n"); + + --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,1,ty_level, "ty", "ty")) + tile(stmt, ty_level,1, ty_level, "ty", "ty", counted) + --print_code() + + --print(string.format("\n[Tile3-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)) + tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) + --print_code() + + idx_flag = -1 + if(cur_idxs) then + --print "CAN NEVER GET HERE? cur_idxs" + for num= 0,table.getn(cur_idxs) do + if(cur[num] == "tmp_ty") then + idx_flag = find_cur_level(stmt,cur[num]) + break + end + end + end + --print(string.format("\n ELSE so i have found out the value of idx flag as %d",idx_flag) ) + if(idx_flag >=0 ) then + if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then + --print(string.format("tile( stmt %d, level ty %d, level ty %d",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) + tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) + --print(string.format("\n[Tile3-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) + end + end + end + + --print_code() + end + + + --print "\n\n *** at bottom of if in copy to shared, " + --print_code() + --print "end of if" + + else + --copy to shared only created one level, not two, so we use a different approach (MV & TMV) + --print("\nCopy to shared: [If was error]\n") + level = find_cur_level(stmt,"tmp1") + tile(stmt, level, level) + + 
--print(string.format("\n[Tile]tile(%d, %d, %d)",stmt, level, level)) + tx,ty = thread_dims() + lower,upper = hard_loop_bounds(stmt, level) + upper = upper+1 --upper bound given as <=, compare to dimensions tx which is < + --print("upper "..upper.." tx "..tx) + if upper == tx then + rename_index(stmt, "tmp1", "tx") + else + --print("upper is not tx") + --TODO: Don't know, maybe do some tileing etc + --print_code() + --print("upper "..upper.." tx "..tx.." stmt: "..stmt.." level: "..level) + tile(stmt, level,tx,level, "tx", "tmp_tx", counted) + --print_code() + + --print("stmt:"..stmt.." level+1: "..level+1) + --print("TILE 7") + tile(stmt, level+1,1,level+1,"tx", "tx",counted) + --print("TILE 3") + tile(stmt,level+1,level) + --print_code() + + if(ty > 1) then + --print_code() + --print("GOING IN") + lower,upper = hard_loop_bounds(stmt, level+1) + --print(string.format("ty %d lower %d upper %d", ty, lower, upper)) + --upper=125 + --print("NOW FOR Y: upper "..upper.." ty "..ty.." stmt: "..stmt.." level: "..(level+1).." 
bound:"..math.ceil(upper/ty)) + tile(stmt, level+1,math.ceil(upper/ty),level+1, "tmp_ty", "ty", counted) + --tile(stmt, level+2,math.ceil(upper/ty),level+2, "tmp_ty", "ty", counted) + end + --print_code() + --rename_index(stmt, "tmp1", "tx") + --print("Warning: Need to implement some logic here to tile the single level shared copy loop to match thread dimensions") + end + end + --Always add sync + add_sync(stmt,start_loop) + + end + --print("ending copy to shared\n") + --print_code() +end + +-- unroll_to_depth(max_depth): for every statement whose loop nest contains the guard index (the last entry of thread_indices()), record up to max_depth levels reported by next_clean_level() starting from the guard level, then call unroll(stmt, level, 0) on each recorded level, highest array index first. +-- The whole sweep is repeated until num_statements() is unchanged across a pass. +-- No variable in this function is declared local, so every name assigned below is a Lua global. +function unroll_to_depth(max_depth) + --print(string.format("\n\nunroll_to_depth(%d)", max_depth )) + --print "SYNC UP" + + cur = cur_indices(0) + thread_idxs = thread_indices() + -- guard_idx: name of the last thread index; its loop level is where the unroll search below starts. + guard_idx = thread_idxs[#thread_idxs] + + --print(string.format("cur indices %s",list_to_string(cur))) + --print(string.format("thread indices %s",list_to_string(thread_idxs))) + --print(string.format("#thread_idxs = %d", #thread_idxs)) + --print(string.format("guard_idx = %s", guard_idx)) + + ---- HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS + -- For each statement that has a "tx" level, the indices located before that level (other than bx/by/tx/ty, nil or "") are compared with the indices of every later statement; matches are stored in common_loops starting at index 0. + -- NOTE(review): common_loops and comm_loops_cnt are only read by commented-out prints in this function; confirm whether anything else relies on these globals. + common_loops = {} + comm_loops_cnt = 0 + num_stmts = num_statements() + --print(string.format("num statements %d", num_stmts)) + + for stmt=0,num_stmts-1 do + cur_idxs = cur_indices(stmt) + + --print(string.format("\nSTMT %d Current Indices: %s",stmt,list_to_string(cur_idxs))) + + if(chk_cur_level(stmt,"tx")>0) then + for ii=1,find_cur_level(stmt,"tx")-1 do -- started at 0 + --print(string.format("ii = %d", ii)) -- index starts at 1, what does index 0 do? + --if cur_idxs[ii] == nil then print "cur_idxs[i]] is NIL" + --else print(string.format("cur_idxs[%d] = '%s'", ii, cur_idxs[ii])) -- index starts at 1, what does index 0 do? 
+ --end + + if(cur_idxs[ii] ~= "bx" and cur_idxs[ii] ~= "by" and cur_idxs[ii] ~= nil and cur_idxs[ii] ~= "tx" and cur_idxs[ii] ~= "ty" and cur_idxs[ii] ~= "") then + + --print(string.format("id %s is not in the list", cur_idxs[ii] )) + + for stmt1=stmt+1,num_stmts-1 do + --print(string.format("\nii %d stmt1 is %d", ii, stmt1)) + cur_idxs1 = cur_indices(stmt1) + --print("\nstmt1 cur_idxs1 is "..list_to_string(cur_idxs1)) + + --print(string.format("cur level(%d, %s) = %d", stmt, "tx", find_cur_level(stmt,"tx"))) + + -- NOTE(review): endrange is only referenced by the commented-out print below; the inner loop recomputes the same bound from stmt (not stmt1) when scanning cur_idxs1. + endrange = find_cur_level(stmt,"tx")-1 + --print(string.format("for iii=1, %d do", endrange)) + + for iii=1,find_cur_level(stmt,"tx")-1 do -- started at 0 + --print(string.format("stmt %d ii %d iii %d ", stmt, ii, iii)) + --if(cur_idxs1[iii] ~= nil) then + -- print(string.format("stmt %d ii %d iii %d cur_idxs1[%d] = '%s'", stmt, ii, iii, iii, cur_idxs1[iii])) + --else + -- print(string.format("stmt %d ii %d iii %d cur_idxs1[%d] = NIL", stmt, ii, iii, iii)) + --end + + if(cur_idxs1[iii] ~= "bx" and cur_idxs1[iii] ~= "by" and cur_idxs1[iii] ~= nil and cur_idxs1[iii] ~= "tx" and cur_idxs1[iii] ~= "ty" and cur_idxs1[iii] ~= "") then + if(cur_idxs[ii] == cur_idxs1[iii]) then + --print("\nfound idx:"..cur_idxs[ii]) + --if(comm_loops_cnt == 0) then print "\n\n*** WARNING *** assigning to array index ZERO in Lua" end + common_loops[comm_loops_cnt] = cur_idxs[ii] + --print(string.format("cl[%d] = '%s'", comm_loops_cnt, common_loops[comm_loops_cnt])) + comm_loops_cnt = comm_loops_cnt + 1 + end + end + end + end + end + end + end + end + ---- + --if(comm_loops_cnt>0) then + -- print("\n COMM LOOPS :TOTAL "..comm_loops_cnt..", and are "..list_to_string(common_loops).." this loop :"..common_loops[0]) + --else + -- print "UNROLL can't unroll any loops?" 
+ --end + + + + + -- Unroll sweep. It is repeated until a full pass leaves num_statements() unchanged (presumably because unroll() can add statements; confirm). + repeat + old_num_stmts = num_statements() + --print(string.format("old_num_statements %d", old_num_stmts)) + + for stmt=0,old_num_stmts-1 do + cur_idxs = cur_indices(stmt) + --print(string.format("stmt %d cur_idxs = %s", stmt, list_to_string(cur_idxs))) + if(#cur_idxs > 0) then + -- gaurd_level (sic) stays -1 when chk_cur_level() does not report guard_idx for this statement, which skips the statement. + gaurd_level = -1 + if(chk_cur_level(stmt,guard_idx)>0) then + gaurd_level = find_cur_level(stmt,guard_idx) + end + --print(string.format("guard_level(sp) = %d", gaurd_level)) + + if(gaurd_level>-1) then + level = next_clean_level(cur_idxs,gaurd_level) + --print(string.format("next clean level %d", level)) + + --need to handle max_depth + num_unrolled = 0 + level_unroll_comm = level + level_arr = {} + -- Record levels in level_arr (filled from index 0) until next_clean_level() yields a value below 0 or max_depth levels have been recorded. + while level >= 0 do + --print(string.format("while: level = %d", level)) + + if num_unrolled == max_depth then break end + --print("Unrolling "..stmt.." at level "..(level).." index ".. cur_idxs[gaurd_level+1]) + + level_arr[num_unrolled] = level + num_unrolled = num_unrolled + 1 + + -- NOTE(review): this assigns guard_level, a different global from gaurd_level used above; the value is not read anywhere in this function. + guard_level = find_cur_level(stmt,guard_idx) + level = next_clean_level(cur_idxs,level+1) + end + --dies print("How many levels for unroll commands"..table.getn(level_arr).." which is "..level_arr[0].." 
and "..level_arr[#level_arr]) + --if(table.getn(level_arr) ~= nil) then + + --print "OK, NOW WE UNROLL" + + -- Walk level_arr from the index given by table.getn() down to 0 and unroll each recorded level; 0 is passed as the third argument of unroll(). + -- NOTE(review): level_arr is filled starting at index 0; confirm table.getn() yields the last filled index here, and that level_arr[0] is set when max_depth is 0. + if(level_unroll_comm >= 0)then + for i = table.getn(level_arr),0,-1 do + --print(string.format("\ni=%d", i)) + --print(string.format("[Unroll]unroll(%d, %d, 0)",stmt, level_arr[i])) + + unroll(stmt,level_arr[i],0) + --print("finished unroll]]\n") + --print_code() + end + end +------ + end +--[[ + +THERE WAS A BIG BLOCK OF COMMENTED OUT CODE HERE + + +--]] +------ + end + end + new_num_stmts = num_statements() + + until old_num_stmts == new_num_stmts + +end + + diff --git a/examples/cuda-chill/mm.c b/examples/cuda-chill/mm.c new file mode 100644 index 0000000..0efbeeb --- /dev/null +++ b/examples/cuda-chill/mm.c @@ -0,0 +1,10 @@ +#define N 1024 + +void normalMM(float c[N][N], float a[N][N], float b[N][N]) { + int i, j, k; + + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + for (k = 0; k < N; k++) + c[j][i] = c[j][i] + a[k][i] * b[j][k]; +} diff --git a/examples/cuda-chill/mm.lua b/examples/cuda-chill/mm.lua new file mode 100644 index 0000000..5bde1b0 --- /dev/null +++ b/examples/cuda-chill/mm.lua @@ -0,0 +1,38 @@ +init("mm.c", "normalMM", 0) +dofile("cudaize.lua") +N=1024 +Ti=128 +Tj=64 +Tk=16 +Tii=16 +Tjj=16 + + + + +N=1024 + + + + + + + + + + + + + +tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k"})CU=1 + +tile_by_index({"k"},{Tk},{l1_control="kk"},{"ii","jj","kk","i","j","k"})CU=3 + +tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","kk","i","iii","j","jjj","k"},1)CU=2 + +cudaize("mm_GPU",{a=1048576,b=1048576,c=1048576},{block={"ii","jj"}, thread={"i","j"}})CU=2 +copy_to_shared("tx","a",-16) +copy_to_shared("tx","b",-16) +copy_to_registers("kk","c") +--print_code() +unroll_to_depth(2) diff --git a/examples/cuda-chill/mpeg4.c b/examples/cuda-chill/mpeg4.c new file mode 100755 index 0000000..7f83bf7 --- /dev/null +++ b/examples/cuda-chill/mpeg4.c @@ -0,0 +1,23 @@ +#define N1 4096 
+#define N2 4096 +#define WINDOW_SIZE 16 + +void mpeg4_cpu(float result[N1][N2], float prev[N2+WINDOW_SIZE][N2+WINDOW_SIZE], float curr[WINDOW_SIZE*WINDOW_SIZE]) +{ + unsigned int i; + unsigned int j; + unsigned int k; + unsigned int l; + + for ( i = 0; i < N1; ++i) + for ( j = 0; j < N2; ++j) + for ( k = 0; k < WINDOW_SIZE; ++k) + for ( l = 0; l < WINDOW_SIZE; ++l) + result[i][j] += prev[i+k][j+l] * curr[k*WINDOW_SIZE+l]; + + + + + +} + diff --git a/examples/cuda-chill/mpeg4.lua b/examples/cuda-chill/mpeg4.lua new file mode 100644 index 0000000..f025dc0 --- /dev/null +++ b/examples/cuda-chill/mpeg4.lua @@ -0,0 +1,45 @@ +--CUBLAS 2 MM Multiply + +--This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you +--call init() and use global variables to specify procedure and loop + +--Second parameter is procedure # and third is loop # +init("mpeg4.c", "mpeg4_cpu", 0) + +--dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods + +N=4096 +M=4096 +W=16 + +--TI must be <= M +--TJ must be <=TI +Ti=32 +Tj=32 +Tii=16 +Tjj=16 +Tk=4 +--permute(0,{"j","i","k","l"}) +tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k","l"}) +--tile_by_index({"k","l"},{Tk*2,Tk*2},{l1_control="kk",l2_control="ll"},{"ii","jj","kk","ll","i","j","k","l"}) +--print_code() +--tile_by_index({"k","l"},{Tk,Tk},{l1_control="kk",l2_control="ll"},{"ii","jj","i","j","kk","k","ll","l"}) +tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","iii","i","jjj","j","k","l"}) +--print_code() +--normalize_index("j") +--normalize_index("i") +--print_code() +cudaize("kernel_GPU",{curr=W*W,prev=(N+W)*(M+W),result=N*M},{block={"ii","jj"}, thread={"i","j"}}) +--print_code() +copy_to_shared("iii","prev",16) + +copy_to_registers("jjj","result") + +--print_code() +--copy_to_constant_no_tile("curr") +unroll_to_depth(2) 
+print_code() +print_space() + + diff --git a/examples/cuda-chill/mriq-fh.c b/examples/cuda-chill/mriq-fh.c new file mode 100755 index 0000000..1e924b7 --- /dev/null +++ b/examples/cuda-chill/mriq-fh.c @@ -0,0 +1,38 @@ +#define X 32768 +#define K 256 +struct kValues { + float Kx; + float Ky; + float Kz; + float PhiMag; +}; +extern float sin(float); +extern float cos(float); + +void mriFH_cpu(float *rPhi,float *rRho,float *iRho, float *iPhi, float *rD, float *iD, float *kx, float *ky, float *kz, float *dx, float *dy, float *dz, float *rFHref, float *iFHref) +{ + + float rfh; + float ifh; + float exp; + float cArg; + float sArg; + //float rRho[K]; + //float iRho[K]; + unsigned int k; + unsigned int x; + + + for (x = 0; x < X; ++x) { + for (k = 0; k < K; ++k) { + + exp = 2 * 3.14159 * (kx[k]* dx[x] + ky[k]* dy[x] + kz[k]* dz[x]); + cArg = cos(exp); + sArg = sin(exp); + rFHref[x] += rRho[k]* cArg - iRho[k]* sArg; + iFHref[x] += iRho[k]*cArg + rRho[k]*sArg; + } + + } +} + diff --git a/examples/cuda-chill/mriq-fh.lua b/examples/cuda-chill/mriq-fh.lua new file mode 100755 index 0000000..3277bac --- /dev/null +++ b/examples/cuda-chill/mriq-fh.lua @@ -0,0 +1,73 @@ +--CUBLAS 2 MM Multiply + +--This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you +--call init() and use global variables to specify procedure and loop + +--Second parameter is procedure # and third is loop # +init("mriq-fh.c", "mriFH_cpu", 0) + +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, + --copy_to_shared methods +N=32768 +M=256 +Tx=256 + + +print_code() +--permute(0,{"j","i"}) +--tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"}) +tile_by_index({"x"},{Tx},{l1_control="xx"},{"xx","x","k"}) +--tile_by_index({"x"},{16},{l1_control="xx1"},{"xx","x","xx1","k"}) +--tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +print_code() + 
+normalize_index("x") +--normalize_index("i") +print_code() +--tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"}) +--print_code() +--cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}}) +cudaize("kernel_GPU",{dx=N,dy=N,dz=N,iRho=M,kx=M,ky=M,kz=M,rFHref=N,iFHref=N,rRho=M},{block={"xx"}, thread={"x"}}) +--copy_to_shared("tx","iRho",-16) +--copy_to_shared("tx","dz",1) +--copy_to_shared("tx","rRho",-16) +--copy_to_registers("tx","rFHref") +--copy_to_registers("tx","rRho") +--copy_to_registers("tx","iRho") +--copy_to_registers("tx","kx") +--copy_to_registers("tx","dx") +--copy_to_registers("tx","ky") +--copy_to_registers("tx","dy") +--copy_to_registers("tx","kz") +--copy_to_registers("tx","dz") +--copy_to_registers("tx","iFHref") +--copy_to_texture("rRho") +--copy_to_texture("kx") +--copy_to_texture("dx") +--copy_to_texture("ky") +--copy_to_texture("dy") +--copy_to_texture("kz") +--copy_to_texture("dz") +--copy_to_texture("iRho") +--print_code()--]] +--unroll(0,4,0) +--copy_to_constant_no_tile("kx") +--copy_to_constant_no_tile("ky") +--copy_to_constant_no_tile("kz") +--copy_to_constant_no_tile("rRho") +--copy_to_constant_no_tile("iRho") + +--unroll_to_depth(1) +print_code() +--[[ +copy_to_Texture("rRho") +copy_to_Texture("kx") +copy_to_Texture("dx") +copy_to_Texture("ky") +copy_to_Texture("dy") +copy_to_Texture("kz") +copy_to_Texture("dz") +copy_to_Texture("iRho") +--unroll_to_depth(2) +--]] diff --git a/examples/cuda-chill/mriq.c b/examples/cuda-chill/mriq.c new file mode 100644 index 0000000..ba4b87c --- /dev/null +++ b/examples/cuda-chill/mriq.c @@ -0,0 +1,33 @@ +#define N 32768 +#define M 3072 +struct kValues { + float Kx; + float Ky; + float Kz; + float PhiMag; +}; +extern float sinf(float); +extern float cosf(float); + +void +ComputeQCPU(int numK, int numX,struct kValues kVals[M],float x[N], float y[N], float z[N],float Qr[N], float Qi[N]) { + float expArg; + float cosArg; + float sinArg; + float 
phi; + int i; + int j; + numK = M; + numX = N; + for ( i = 0; i < M; i++) { + for ( j = 0; j < N; j++) { + expArg = 6.2831853071795864769252867665590058f * (kVals[i].Kx * x[j] +kVals[i].Ky * y[j] +kVals[i].Kz * z[j]); + cosArg = cosf(expArg); + sinArg = sinf(expArg); + phi = kVals[i].PhiMag; + Qr[j] += phi * cosArg; + Qi[j] += phi * sinArg; + } + } +} + diff --git a/examples/cuda-chill/mriq.lua b/examples/cuda-chill/mriq.lua new file mode 100644 index 0000000..1170111 --- /dev/null +++ b/examples/cuda-chill/mriq.lua @@ -0,0 +1,55 @@ +--CUBLAS 2 MM Multiply + +--This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you +--call init() and use global variables to specify procedure and loop + +--Second parameter is procedure # and third is loop # +init("mriq.c", "ComputeQCPU", 0) + +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, + --copy_to_shared methods +N=32768 +M=3072 +TI=128 +TJ=128 + +permute(0,{"j","i"}) +--tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"}) +tile_by_index({"i"}, {TJ}, {l1_control="ii",l1_tile="i"}, {"ii", "j","i"}) +tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +--print_code() + +normalize_index("j") +normalize_index("i") +--print_code() +--tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"}) +--print_code() +cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}}) + +copy_to_shared("tx","kVals",1) +--copy_to_shared("tx","x",1) +--copy_to_shared("tx","y",1) +--copy_to_shared("tx","z",1) + +--copy_to_texture("kVals") +--datacopy(0, 3, "kVals", {"tt","t"},false,0,1,-16,true) +--print_code() +--datacopy_privatized(0,"tx","kVals",{"tx"}) +--copy_to_registers("tx","kVals") +copy_to_registers("ii","x") +copy_to_registers("ii","y") +copy_to_registers("ii","z") +copy_to_registers("ii","Qi") +copy_to_registers("ii","Qr") 
+--[[datacopy_privatized(0,"tx","x",{"tx"}) +datacopy_privatized(0,"tx","y",{"tx"}) +datacopy_privatized(0,"tx","z",{"tx"}) +datacopy_privatized(0,"tx","Qi",{"tx"}) +datacopy_privatized(0,"tx","Qr",{"tx"}) + + +]]-- +--unroll(0,5,64) +print_code() +--unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels diff --git a/examples/cuda-chill/mv-shadow.c b/examples/cuda-chill/mv-shadow.c new file mode 100644 index 0000000..582b187 --- /dev/null +++ b/examples/cuda-chill/mv-shadow.c @@ -0,0 +1,9 @@ +#define N 1024 + +void normalMV(float c[N][N], float a[N], float b[N]) { + int i, j; + + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + a[i] = a[i] + c[j][i] * b[j]; +} diff --git a/examples/cuda-chill/mv-shadow.lua b/examples/cuda-chill/mv-shadow.lua new file mode 100644 index 0000000..43e8491 --- /dev/null +++ b/examples/cuda-chill/mv-shadow.lua @@ -0,0 +1,65 @@ +init("mv-shadow.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, + --copy_to_shared methods + +N=129 +TI=32 +TJ=64 + +N=1024 +TI=16 + + + + + + + + + + + + + + + + +--Tile the i and j loop, introducing "ii" as the control loop for the "i" +--tile, "k" for the control loop fo the "j" tile, with the final order +--of {"ii", "k", "i", "j"} +tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) +--tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +--print_code() +--Normalize indx will do a tile size of one over the loop level specified +--by the input index. This is useful to get a zero lower bound and hard +--upper bound on a loop instead of it being relative to previous loop +--levels. 
+--normalize_index("ii") +normalize_index("i") +print_code() + +--Cudaize now determines the grid dimensions from the loops themselves +--(the upper bounds of the block and thread loops). It also renames the +--given block and thread loops' indexes to the appropriate values from +--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the +--size of the arrays to be copied in the CUDA scaffolding. +cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}}) +--print_code() + +--Does a datacopy, tile, and add_sync to get a shared memory copy + +--copy_to_shared("tx", "b", 1) +--copy_to_shared("tx", "c", -16) +--print_code() +--copy_to_texture("b") +--copy_to_texture("c") +copy_to_registers("k", "a") +--print_code() + +unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels +--copy_to_texture("b") +--print_code() +--unroll(0,5,0) +--print_code() diff --git a/examples/cuda-chill/mv.c b/examples/cuda-chill/mv.c new file mode 100644 index 0000000..582b187 --- /dev/null +++ b/examples/cuda-chill/mv.c @@ -0,0 +1,9 @@ +#define N 1024 + +void normalMV(float c[N][N], float a[N], float b[N]) { + int i, j; + + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + a[i] = a[i] + c[j][i] * b[j]; +} diff --git a/examples/cuda-chill/mv.lua b/examples/cuda-chill/mv.lua new file mode 100644 index 0000000..ca54501 --- /dev/null +++ b/examples/cuda-chill/mv.lua @@ -0,0 +1,65 @@ +init("mv.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, + --copy_to_shared methods + +N=129 +TI=32 +TJ=64 + +N=1024 + + + + + + + + + + + + + + + + +--Tile the i and j loop, introducing "ii" as the control loop for the "i" +--tile, "k" for the control loop for the "j" tile, with the final order +--of {"ii", "k", "i", "j"} +tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) +--tile_by_index({"j"}, 
{TI}, {l2_control="k"}, { "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +--print_code() +--Normalize index will do a tile size of one over the loop level specified +--by the input index. This is useful to get a zero lower bound and hard +--upper bound on a loop instead of it being relative to previous loop +--levels. +--normalize_index("ii") +normalize_index("i") +print_code() + +--Cudaize now determines the grid dimensions from the loops themselves +--(the upper bounds of the block and thread loops). It also renames the +--given block and thread loops' indexes to the appropriate values from +--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the +--size of the arrays to be copied in the CUDA scaffolding. +cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}}) + +--print_code() + +--Does a datacopy, tile, and add_sync to get a shared memory copy + +--copy_to_shared("tx", "b", 1) +--copy_to_shared("tx", "c", -16) +--print_code() +--copy_to_texture("b") +--copy_to_texture("c") +copy_to_registers("k", "a") +--print_code() + +unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels +--copy_to_texture("b") +--print_code() +--unroll(0,5,0) +--print_code() diff --git a/examples/cuda-chill/mv_try.c b/examples/cuda-chill/mv_try.c new file mode 100644 index 0000000..7781f3b --- /dev/null +++ b/examples/cuda-chill/mv_try.c @@ -0,0 +1,9 @@ +#define N 4096 + +void normalMV(int n, float c[N][N], float a[N], float b[N]) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + a[i] = a[i] + c[i][j] * b[j]; +} diff --git a/examples/cuda-chill/mv_try.lua b/examples/cuda-chill/mv_try.lua new file mode 100644 index 0000000..db4d9ad --- /dev/null +++ b/examples/cuda-chill/mv_try.lua @@ -0,0 +1,14 @@ +init("mv_try.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, + --copy_to_shared methods + +TI=96 + +N=4096 + + +tile_by_index({"i"}, 
{TI}, {l1_control="ii"}, {"ii", "i", "j"}) +cudaize("mv_GPU", {a=N, b=N, c=N*N}, + {block={"ii"}, thread={"i"}}) + +print_code() diff --git a/examples/cuda-chill/nbody.c b/examples/cuda-chill/nbody.c new file mode 100644 index 0000000..57899b6 --- /dev/null +++ b/examples/cuda-chill/nbody.c @@ -0,0 +1,66 @@ +#define NBODIES 16384 +#define SOFTENINGSQUARED 0.01f +#define DELTATIME 0.001f +#define DAMPING 1.0f + +#define NBLOCKSY 1 +#define NBLOCKSX (NBODIES/NTHREADSX) +#define NTHREADSY 1 +#define NTHREADSX 64 + +#define BLOCKSIZE 128 + +#define SHARED 1 +#define TIMER 1 +#define VERIFY 1 + +extern float sqrtf(float); + +void nbody_cpu(float* oldpos,float* oldpos1, float *newpos, float *oldvel, float *newvel, float *force) +{ + float r0,r1,r2; + float invDist, invDistCube, mass, invMass; + unsigned int i,j; + for(i = 0; i < NBODIES; ++i) { + //force[i*4 ] = 0; + //force[i*4+1] = 0; + //force[i*4+2] = 0; + //force[i*4+3] = 0; + for(j = 0; j < NBODIES; ++j) { + r0 = oldpos[j*4]-oldpos1[i*4]; + r1 = oldpos[j*4+1]-oldpos1[i*4+1]; + r2 = oldpos[j*4+2]-oldpos1[i*4+2]; + + invDist = 1.0/sqrtf(r0 * r0 + r1 * r1 + r2 * r2 + SOFTENINGSQUARED); + invDistCube = invDist * invDist * invDist; + mass = oldpos1[i*4+3]; + + force[i*4] = force[i*4] + r0 * mass * invDistCube; + force[i*4+1] = force[i*4+1] + r1 * mass * invDistCube; + force[i*4+2] = force[i*4+2] + r2 * mass * invDistCube; + + } + } + +/* for (i = 0; i < NBODIES; ++i) { + invMass = oldvel[4*i+3]; + + oldvel[4*i] += (force[4*i] * invMass) * DELTATIME * DAMPING; + oldvel[4*i+1] += (force[4*i+1] * invMass) * DELTATIME * DAMPING; + oldvel[4*i+2] += (force[4*i+2] * invMass) * DELTATIME * DAMPING; + + oldpos[4*i] += oldvel[4*i] * DELTATIME; + oldpos[4*i+1] += oldvel[4*i+1] * DELTATIME; + oldpos[4*i+2] += oldvel[4*i+2] * DELTATIME; + + newpos[4*i+0] = oldpos[4*i]; + newpos[4*i+1] = oldpos[4*i+1]; + newpos[4*i+2] = oldpos[4*i+2]; + newpos[4*i+3] = oldpos[4*i+3]; + + newvel[4*i+0] = oldvel[4*i]; + newvel[4*i+1] = oldvel[4*i+1]; + 
newvel[4*i+2] = oldvel[4*i+2]; + newvel[4*i+3] = oldvel[4*i+3]; + }*/ +} diff --git a/examples/cuda-chill/nbody.lua b/examples/cuda-chill/nbody.lua new file mode 100644 index 0000000..08f88a9 --- /dev/null +++ b/examples/cuda-chill/nbody.lua @@ -0,0 +1,53 @@ +--CUBLAS 2 MM Multiply + +--This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you +--call init() and use global variables to specify procedure and loop + +--Second parameter is procedure # and third is loop # +init("nbody.c", "nbody_cpu" , 0) + +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, + --copy_to_shared methods +NBODIES=16384 + + +--Tj=128 CHANGE FOR BEST..... BEST IS 64BLOCKS 128THREADS +--Ti=256 +Tj=64 +Ti=32 +Tjjj=1 +Tiii=1 +Tn=0.1 +--normalize_index("j") +-- +--print_code() +--normalize_index("n") +-- TILE COMMANDS ZEROOOOOOOOOOO:3 +--tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j"})--CU=-1 +tile_by_index({"i"},{Ti},{l1_control="ii"},{"ii","i","j"})--CU=-1 +--normalize_index("i") +--tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1 + +--tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3 +--tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"}) +--tile_by_index({"i"}, {Ti/2}, {l1_control="iii"}, {"ii","iii", "jj","i","j"}) +--print_code() +cudaize("kernel_GPU",{oldpos=4*NBODIES,oldpos1=4*NBODIES,oldvel=4*NBODIES,force=4*NBODIES,newpos=4*NBODIES,newvel=4*NBODIES},{block={"ii"}, thread={"i"}})--CU=3 +print_code() +--tile(0,6,6) +--copy_to_shared("tx","oldpos",-16) +--copy_to_registers("j","oldpos") +--copy_to_registers("j","oldpos1") +--copy_to_registers("j","force") + +--copy_to_texture("oldpos") +--tile(1,3,3) +--tile(2,3,3) + +print_code() +--unroll_to_depth(1) +-- +--tile(2,3,3) +--unroll(2,3,0) +--unroll(0,5,0) +--print_code() diff --git a/examples/cuda-chill/tmv-shadow.c 
b/examples/cuda-chill/tmv-shadow.c new file mode 100644 index 0000000..cb9ea8d --- /dev/null +++ b/examples/cuda-chill/tmv-shadow.c @@ -0,0 +1,9 @@ +#define N 1024 + +void normalMV(float c[N][N], float a[N], float b[N]) { + int i, j; + + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + a[i] = a[i] + c[i][j] * b[j]; +} diff --git a/examples/cuda-chill/tmv-shadow.lua b/examples/cuda-chill/tmv-shadow.lua new file mode 100644 index 0000000..196b939 --- /dev/null +++ b/examples/cuda-chill/tmv-shadow.lua @@ -0,0 +1,50 @@ +init("tmv-shadow.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, + --copy_to_shared methods + +N=1024 +--N= 8209 +--N=129 +TI=64 +N=1024 +TI=32 +--tile, "k" for the control loop for the "j" tile, with the final order +--of {"ii", "k", "i", "j"} +tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +--print_code() +--tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) + +--print_code() +--Normalize index will do a tile size of one over the loop level specified +--by the input index. This is useful to get a zero lower bound and hard +--upper bound on a loop instead of it being relative to previous loop +--levels. +--normalize_index("i") +--print_code() + +--Cudaize now determines the grid dimensions from the loops themselves +--(the upper bounds of the block and thread loops). It also renames the +--given block and thread loops' indexes to the appropriate values from +--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the +--size of the arrays to be copied in the CUDA scaffolding. 
+cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}}) + +--print_code() + +--Does a datacopy, tile, and add_sync to get a shared memory copy +copy_to_shared("tx", "b", 1) +--copy_to_texture("b") +--print_code() + +copy_to_shared("tx", "c", -16) +--copy_to_texture("c") +--print_code() + +copy_to_registers("k", "a") +print_code() +--unroll(0,5,0) +--unroll(0,4,0) +--unroll(2,4,16) +unroll_to_depth(1) +--print_code() diff --git a/examples/cuda-chill/tmv.c b/examples/cuda-chill/tmv.c new file mode 100644 index 0000000..cb9ea8d --- /dev/null +++ b/examples/cuda-chill/tmv.c @@ -0,0 +1,9 @@ +#define N 1024 + +void normalMV(float c[N][N], float a[N], float b[N]) { + int i, j; + + for (i = 0; i < N; i++) + for (j = 0; j < N; j++) + a[i] = a[i] + c[i][j] * b[j]; +} diff --git a/examples/cuda-chill/tmv.lua b/examples/cuda-chill/tmv.lua new file mode 100644 index 0000000..5071108 --- /dev/null +++ b/examples/cuda-chill/tmv.lua @@ -0,0 +1,50 @@ +init("tmv.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, + --copy_to_shared methods + +N=1024 +--N= 8209 +--N=129 +TI=64 +N=1024 +TI=32 +--tile, "k" for the control loop for the "j" tile, with the final order +--of {"ii", "k", "i", "j"} +tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +--print_code() +--tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) + +--print_code() +--Normalize index will do a tile size of one over the loop level specified +--by the input index. This is useful to get a zero lower bound and hard +--upper bound on a loop instead of it being relative to previous loop +--levels. +--normalize_index("i") +--print_code() + +--Cudaize now determines the grid dimensions from the loops themselves +--(the upper bounds of the block and thread loops). 
It also renames the +--given block and thread loops' indexes to the appropriate values from +--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the +--size of the arrays to be copied in the CUDA scaffolding. +cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}}) + +--print_code() + +--Does a datacopy, tile, and add_sync to get a shared memory copy +copy_to_shared("tx", "b", 1) +--copy_to_texture("b") +--print_code() + +copy_to_shared("tx", "c", -16) +--copy_to_texture("c") +--print_code() + +copy_to_registers("k", "a") +print_code() +--unroll(0,5,0) +--unroll(0,4,0) +--unroll(2,4,16) +unroll_to_depth(1) +--print_code() diff --git a/examples/fortran/README b/examples/fortran/README new file mode 100644 index 0000000..4f23bee --- /dev/null +++ b/examples/fortran/README @@ -0,0 +1,10 @@ +// Manu + +1) Fortran support added to permute, tile, unroll and datacopy. Tested these w.r.t gemm.c using gemm.script. + There might be other issues (like fusion due to unroll, ...) that have not been tested. + +2) To incorporate Fortran support I had to modify certain values in omega (include/omega/omega_core/oc.h). + To solve for a large number of unknowns, these values have to be reverted back. + +3) Tested the existing chill scripts using Derick's python script. + At least the existing chill scripts are not affected by the fortran related changes. 
diff --git a/examples/fortran/ccd.f b/examples/fortran/ccd.f new file mode 100644 index 0000000..12d834d --- /dev/null +++ b/examples/fortran/ccd.f @@ -0,0 +1,32 @@ +c +c These have been separated out from ccsd_t_singles_l.F and ccsd_t_doubles_l.F +c + subroutine clean_sd_t_s1_1(h3d,h2d,h1d,p6d,p5d,p4d, + 2 triplesx,t1sub,v2sub) + IMPLICIT NONE + integer h3d,h2d,h1d,p6d,p5d,p4d + integer h3,h2,h1,p6,p5,p4 + integer N + double precision triplesx(16,16,16,16,16,16) + double precision t1sub(16,16) + double precision v2sub(16,16,16,16) + + N = 16 + + do p4=1,10 + do p5=1,10 + do p6=1,10 + do h1=1,10 + do h2=1,10 + do h3=1,10 + triplesx(h3,h2,h1,p6,p5,p4)=triplesx(h3,h2,h1,p6,p5,p4) + 1 + t1sub(p4,h1)*v2sub(h3,h2,p6,p5) + enddo + enddo + enddo + enddo + enddo + enddo + return + end + diff --git a/examples/fortran/ccd.script b/examples/fortran/ccd.script new file mode 100644 index 0000000..c2af500 --- /dev/null +++ b/examples/fortran/ccd.script @@ -0,0 +1,18 @@ +source: ccd.f +procedure: clean_sd_t_s1_1 +format : rose +loop: 0 + + + +original() + +UN=4 + +unroll(0,5,4) +unroll(0,4,4) +unroll(0,3,4) +unroll(0,2,4) +unroll(0,1,4) + +print diff --git a/examples/fortran/gemm.f90 b/examples/fortran/gemm.f90 new file mode 100644 index 0000000..b65bb58 --- /dev/null +++ b/examples/fortran/gemm.f90 @@ -0,0 +1,58 @@ +program matmul + + integer N,i,j,k + real*8 a(10,10), b(10,10), c(10,10), ct(10,10),mysum + + do i=1,10,1 + do j=1,10,1 + a(i,j) = i+j + b(i,j) = i-j + c(i,j) = 0.0 + ct(i,j) = 0.0 + end do + b(i,i) = 1.0; + end do + + + DO j=1,10,1 + DO k=1,10,1 + DO i=1,10,1 + c(i,j) = c(i,j)+a(i,k)*b(k,j) + end do + end do + end do + + + + call gemm(10,a,b,ct) + + mysum = 0.0 + do i=1,10,1 + do j=1,10,1 + mysum = mysum+abs(c(i,j) - ct(i,j)) ! sum the absolute mismatch over every element, not just the last one visited + end do + end do + + if (abs(mysum) >= 0.00001) then + write (*,*) "Something wrong" + else + write (*,*) "Output matches" + end if + +end program matmul + + SUBROUTINE gemm(N,A,B,C) + INTEGER N + REAL*8 A(N,N), B(N,N), C(N,N) + + INTEGER I,J,K + + DO 
J=1,N,1 + DO K=1,N,1 + DO I=1,N,1 + C(I,J) = C(I,J)+A(I,K)*B(K,J) + end do + end do + end do + + END subroutine diff --git a/examples/fortran/gemm.script b/examples/fortran/gemm.script new file mode 100644 index 0000000..01eb859 --- /dev/null +++ b/examples/fortran/gemm.script @@ -0,0 +1,30 @@ +#matrix multiply large array size for intel machine +source: gemm.f90 +procedure: gemm +format: rose +loop: 0 + +TI = 128 +#TI = 4 +TJ = 8 +#TK = 3 +TK = 512 +UI = 2 +UJ = 2 + +permute([3,1,2]) +tile(0,2,TJ) +#print space +tile(0,2,TI) +#print space +tile(0,5,TK) +#print space + + +datacopy(0,3,A,false,-1) +#print space + +datacopy(0,4,B) +unroll(0,4,UI) +unroll(0,5,UJ) + diff --git a/examples/fortran/rose_gemm.f90 b/examples/fortran/rose_gemm.f90 new file mode 100644 index 0000000..d150922 --- /dev/null +++ b/examples/fortran/rose_gemm.f90 @@ -0,0 +1,155 @@ +PROGRAM matmul +INTEGER :: N, i, j, k +REAL(kind=8) :: a(10,10), b(10,10), c(10,10), ct(10,10), mysum +DO i = 1, 10, 1 +DO j = 1, 10, 1 +a(i,j) = i + j +b(i,j) = i - j +c(i,j) = 0.0 +ct(i,j) = 0.0 +END DO +b(i,i) = 1.0 +END DO +DO j = 1, 10, 1 +DO k = 1, 10, 1 +DO i = 1, 10, 1 +c(i,j) = c(i,j) + a(i,k) * b(k,j) +END DO +END DO +END DO +CALL gemm(10,a,b,ct) +mysum = 0.0 +DO i = 1, 10, 1 +DO j = 1, 10, 1 +mysum = c(i,j) - ct(i,j) +END DO +END DO +IF (abs(mysum) >= 0.00001) THEN +WRITE (*, FMT=*) "Something wrong" +ELSE +WRITE (*, FMT=*) "Output matches" +END IF +END PROGRAM matmul + +SUBROUTINE gemm(N,A,B,C) +INTEGER :: t12 +INTEGER :: t10 +INTEGER :: t8 +INTEGER :: t6 +INTEGER :: t4 +INTEGER :: t2 +INTEGER :: chill_t64 +INTEGER :: chill_t63 +INTEGER :: chill_t62 +INTEGER :: chill_t61 +INTEGER :: chill_t60 +INTEGER :: chill_t59 +INTEGER :: chill_t58 +INTEGER :: chill_t57 +INTEGER :: chill_t56 +INTEGER :: chill_t55 +INTEGER :: chill_t54 +INTEGER :: chill_t53 +INTEGER :: chill_t52 +INTEGER :: chill_t51 +INTEGER :: chill_t50 +INTEGER :: chill_t49 +INTEGER :: chill_t48 +INTEGER :: chill_t47 +INTEGER :: over2 +INTEGER :: 
chill_t46 +INTEGER :: chill_t45 +INTEGER :: chill_t44 +INTEGER :: chill_t43 +INTEGER :: chill_t42 +INTEGER :: chill_t41 +INTEGER :: chill_t40 +INTEGER :: chill_t39 +INTEGER :: chill_t38 +INTEGER :: chill_t37 +INTEGER :: chill_t36 +INTEGER :: chill_t35 +INTEGER :: chill_t34 +INTEGER :: chill_t33 +INTEGER :: chill_t32 +INTEGER :: chill_t31 +INTEGER :: chill_t30 +INTEGER :: chill_t29 +INTEGER :: chill_t28 +INTEGER :: chill_t27 +INTEGER :: chill_t26 +INTEGER :: chill_t25 +INTEGER :: chill_t24 +INTEGER :: chill_t23 +INTEGER :: over1 +INTEGER :: chill_t22 +INTEGER :: chill_t21 +INTEGER :: chill_t20 +INTEGER :: chill_t19 +INTEGER :: chill_t18 +INTEGER :: chill_t17 +INTEGER :: chill_t16 +INTEGER :: chill_t15 +REAL(kind=8), DIMENSION(8,512) :: f_P2 +INTEGER :: chill_t14 +INTEGER :: chill_t13 +INTEGER :: chill_t12 +INTEGER :: chill_t11 +INTEGER :: chill_t10 +INTEGER :: chill_t9 +INTEGER :: chill_t8 +INTEGER :: chill_t7 +REAL(kind=8), DIMENSION(512,128) :: f_P1 +INTEGER :: chill_t1 +INTEGER :: chill_t2 +INTEGER :: chill_t4 +INTEGER :: chill_t6 +INTEGER :: chill_t5 +INTEGER :: N +REAL(kind=8) :: A(N,N), B(N,N), C(N,N) +INTEGER :: I, J, K +over1 = 0 +over2 = 0 +DO t2 = 1, N, 512 +DO t4 = 1, N, 128 +DO t6 = t2, merge(N,t2 + 511,N <= t2 + 511), 1 +DO t8 = t4, merge(t4 + 127,N,t4 + 127 <= N), 1 +f_P1(t8 - t4 + 1,t6 - t2 + 1) = A(t8,t6) +END DO +END DO +DO t6 = 1, N, 8 +DO t8 = t6, merge(N,t6 + 7,N <= t6 + 7), 1 +DO t10 = t2, merge(N,t2 + 511,N <= t2 + 511), 1 +f_P2(t10 - t2 + 1,t8 - t6 + 1) = B(t10,t8) +END DO +END DO +over1 = MOD(N,2) +DO t8 = t4, merge(-over1 + N,t4 + 126,-over1 + N <= t4 + 126), 2 +over2 = MOD(N,2) +DO t10 = t6, merge(t6 + 6,N - over2,t6 + 6 <= N - over2), 2 +DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1 +C(t8,t10) = C(t8,t10) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1) +C(t8 + 1,t10) = C(t8 + 1,t10) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1) +C(t8,t10 + 1) = C(t8,t10 + 1) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * 
f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1) +C(t8 + 1,t10 + 1) = C(t8 + 1,t10 + 1) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1) +END DO +END DO +IF (N - 7 <= t6 .AND. 1 <= over2) THEN +DO t12 = t2, merge(N,t2 + 511,N <= t2 + 511), 1 +C(t8,N) = C(t8,N) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1) +C(t8 + 1,N) = C(t8 + 1,N) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1) +END DO +END IF +END DO +IF (N - 127 <= t4 .AND. 1 <= over1) THEN +DO t10 = t6, merge(t6 + 7,N,t6 + 7 <= N), 1 +DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1 +C(N,t10) = C(N,t10) + f_P1(N - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1) +END DO +END DO +END IF +END DO +END DO +END DO +END SUBROUTINE + |