diff options
Diffstat (limited to 'test-chill/test-cases/examples/cuda-chill')
24 files changed, 0 insertions, 2849 deletions
| diff --git a/test-chill/test-cases/examples/cuda-chill/cp.c b/test-chill/test-cases/examples/cuda-chill/cp.c deleted file mode 100644 index 837d7a6..0000000 --- a/test-chill/test-cases/examples/cuda-chill/cp.c +++ /dev/null @@ -1,29 +0,0 @@ -#define N 1 - -#define VOLSIZEY 512 -#define VOLSIZEX 512 -#define VOLSIZEZ 1 -#define ATOMCOUNT 4000 -#define GRIDSPACING 0.1 -#define zDim 0 - -extern float sqrtf(float); - -void cenergy_cpu(float atoms[ATOMCOUNT*4],float *energy,float z) -{ -int i,j,n;float dx,dy,dz;  -    -    for (j=0; j<VOLSIZEY; j++) { -        for (i=0; i<VOLSIZEX; i++) { -            	  for (n=0;n<ATOMCOUNT;n+=4) { -				dx = (GRIDSPACING * i) - atoms[n]; -				dy = (GRIDSPACING * j) - atoms[n+1]; -				dz = z - atoms[n+2]; -        		        energy[(j*VOLSIZEX + i)+VOLSIZEX*VOLSIZEY*zDim] += atoms[n+3]/sqrtf( (dx*dx) + (dy*dy)+ (dz*dz) ) ; -            } -               - -        } -    } -} - diff --git a/test-chill/test-cases/examples/cuda-chill/cp.lua b/test-chill/test-cases/examples/cuda-chill/cp.lua deleted file mode 100644 index 1ef2264..0000000 --- a/test-chill/test-cases/examples/cuda-chill/cp.lua +++ /dev/null @@ -1,46 +0,0 @@ ---CUBLAS 2 MM Multiply - ---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you ---call init() and use global variables to specify procedure and loop - ---Second parameter is procedure # and third is loop # -init("cp.c", "cenergy_cpu", 0)  - -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, -                     --copy_to_shared methods -V=512 -N=4000 -N=1 - -Tj=32 -Ti=16 -Tii=16 -Tjj=16 - ---normalize_index("j") ---normalize_index("i") -print_code() -normalize_index("n") --- TILE COMMANDS ZEROOOOOOOOOOO:3 ---permute(0,{"i","j","n"}) ---tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","n"})--CU=-1 -tile_by_index({"j","i"},{Tj,Ti},{l1_control="jj",l2_control="ii"},{"jj","ii","j","i","n"})--CU=-1 ---tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1 - ---tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3 ---tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","i","iii","j","jjj","n"})--CU=3 ---tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"}) ---tile_by_index({"i"}, {Tii}, {l1_control="iii",l1_tile="i"}, {"ii", "jj", "iii","i","j","n"}) -print_code() -cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"jj","ii"}, thread={"j","i"}})--CU=3 ---cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"ii","jj"}, thread={"i","j"}})--CU=3 -print_code() -copy_to_shared("tx","atoms",-16) -copy_to_registers("tx","energy") ---copy_to_texture("atoms") ---unroll_to_depth(1) ---unroll(0,9,0) ---unroll(0,5,0) - ---unroll(0,8,256) -print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/cudaize.lua b/test-chill/test-cases/examples/cuda-chill/cudaize.lua deleted file mode 100644 index 7359cca..0000000 --- a/test-chill/test-cases/examples/cuda-chill/cudaize.lua +++ /dev/null @@ -1,1004 +0,0 @@ - --- THIS IS CUDAIZE.LUA - -function table.contains_key(table, key) -   for k in pairs(table) do -      if k == key then -         return true -      end -   end -   return false -end - -function valid_indices(stmt, indices) -   --print( "valid_indices() lua calling C cur_indices") -   --io.flush() -   cur = cur_indices(stmt)  -   --print("Cur indices "..list_to_string(cur)) -   for idx in pairs(indices) do -      if not table.contains_key(cur,idx) then -         return false -      end -   end -   return true -end - -function next_clean_level(cur_idxs,level) -   --print("next_clean_level( ..., "..level.." )") -   --print(string.format("indices_at_each_level %s ",list_to_string(cur_idxs) )) -    -   --print("loop to "..#cur_idxs) -   for i=level+1,#cur_idxs do -      --print("Checking level "..i.." = '"..cur_idxs[i].."'") -      if (# cur_idxs[i] > 0) then -         --print("Good enough"..(# cur_idxs[i])) -         --print("returning "..i) -         return i -      end -   end -   return -1 --sentinal that there were no non-dummy indices left -end - -function build_order(final_order, tile_idx_names, ctrl_idx_names, tile_idx_map, cur_level) -   order = {} -   --print("\nbuild_order()") -   --print("build_order(): final_order = ( "..list_to_string(final_order).." )") -   --print("build_order(): ctrl_idx_names = ("..list_to_string(ctrl_idx_names).." )") -   --print("cur_level "..cur_level.."") -   --io.flush() -    -   for i,k in ipairs(final_order) do -      skip = false -      cur = final_order[i] -      --print("\ncur "..cur.." = final_order["..i.."] = "..final_order[i].."  ") -      --control loops below our current level should not be in the current order -      for j=cur_level+2,# ctrl_idx_names do -         --print("j "..j.." final_order["..i.."] = "..final_order[i].."  ") -         if ctrl_idx_names[j] == final_order[i] then -            skip = true -            --print("SKIP "..final_order[i].."  ") -            --io.flush() -         end -      end -      --possibly substitute tile indices ifn necessar -      if table.contains_key(tile_idx_map,final_order[i]) then -         approved_sub = false -         sub_string = tile_idx_map[final_order[i]] -         for j=cur_level+2,# tile_idx_names do -            if tile_idx_names[j] == sub_string then -               approved_sub = true -            end -         end -         if approved_sub then -            cur = sub_string -         end -      end -      if not skip then -         table.insert(order,cur) -      end -   end -   return order -end - -function list_to_string(str_list) -   --Helpful debug output -   l = "" -   for i,str in ipairs(str_list) do -      if i > 1 then -         l = l .. ", " .. str -      else -         l = str -      end -   end -   return l -end - - -function find_cur_level(stmt,idx) -   --Search cur_indices for a idx at stmt -   cur = cur_indices(stmt) -   --print(string.format("find_cur_level(stmt %d, idx %s)  Cur indices %s", stmt, idx, list_to_string(cur))) -   for i,cidx in ipairs(cur) do -      if cidx == idx then -         --print(string.format("found it at index %d", i)) -         return i -      end -   end -   error("Unable to find "..idx.." in current list of indices") -end - - -function chk_cur_level(stmt,idx) -   --Search cur_indices for a idx at stmt -   cur = cur_indices(stmt) -   for i,cidx in ipairs(cur) do -      if cidx == idx then -         return i -      end -   end -   return -1 -end - - -function find_offset(cur_order, tile, control) -   --print("Looking for tile '"..tile.."' and control '"..control.."' in ( "..list_to_string(cur_order)..", )") -   idx1 = -1 -   idx2 = -1 -   for i,cur in ipairs(cur_order) do -      if(cur == tile) then -         idx1 = i -      end -      if(cur == control) then -         idx2 = i -      end -   end -   if(idx1 < 0) then -      error("Unable to find tile " .. tile .. " in current list of indices") -   end -   if(idx2 < 0) then -      error("Unable to find control " .. control .. " in current list of indices") -   end -   --print("found at level " .. idx2 .. " and " .. idx1) -   if(idx2 < idx1) then -      return idx2-idx1+1 -   else -      return idx2-idx1 -   end -end - -function tile_by_index(tile_indices, sizes, index_names, final_order, tile_method) -   --print "STARTING TILE BY INDEX" -   --io.flush() -   stmt = 0 --assume stmt 0 -   cur = cur_indices(stmt) -   --print("Cur indices "..list_to_string(cur)) -   if not valid_indices(stmt,tile_indices) then -      error('One of the indices in the first parameter were not '.. -            'found in the current set of indices.') -   end -   if not tile_method then tile_method = counted end -   tile_idx_names = {} -   for i,s in ipairs(tile_indices) do tile_idx_names[i]=s end --shallow copy -   --print("tile_index_names: ['"..list_to_string(tile_indices).."']") -    -   --print("index_names:  ")  -   --for k,v in pairs(index_names) do print(k,v) end -    -   --io.flush() -    -   ctrl_idx_names = {} -   tile_idx_map = {} -   for k,v in pairs(index_names) do -      valid = false -      if(string.sub(k,1,1) == "l") then -         if string.sub(k,-8) == "_control" then -            i = tonumber(string.sub(k,2,-9)) -            if i and i >= 1 and i <= (# tile_indices) then -               ctrl_idx_names[i] = v -               --print(string.format("Handling control %s for loop level %d",v,i)) -               --print("control "..k.."   name  "..v.." ") -               valid = true -            end -         elseif string.sub(k,-5) == "_tile" then -            i = tonumber(string.sub(k,2,-6)) -            if i and i >= 1 and i <= (# tile_indices) then -               --print(string.format("tile %s -> %s",tile_indices[i], v)) -               tile_idx_names[i] = v -               tile_idx_map[v] = tile_indices[i] -               --print(string.format("tile %s -> %s",tile_indices[i], v)) -               valid = true -            end -         end -      end -      if not valid then error(string.format("%s is not a proper key for specifying ".. -                                            "tile or control loop indices\n", k)) end -   end -    -   --filter out control indices (and do name substitution of unprocessed tile indices) for a given level -   cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, -1) -   permute(stmt, cur_order) -    -   for i,cur_idx in ipairs(tile_indices) do -      --print(string.format("i %d  cur_idx %s calling build order ********", i-1, cur_idx)) -      cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1) -      --Find a offset between tile loop and control loop -      -- 0   = control loop one level above tile loop -      -- -1  = control loop two levels above tile loop -      -- > 0 = tile loop above control loop -      -- In the last case, we do two extra tile commands to get the control -      -- above the tile and then rely on the final permute to handle the -      -- rest -      level = find_cur_level(stmt,cur_idx) -      offset = find_offset(cur_order, tile_idx_names[i], ctrl_idx_names[i]) -      --print(string.format("offset %d", offset)) -       -      if (offset <= 0) then -         --print(string.format("[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %s)",stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method))  -         tile(stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method) -      else -         --print(string.format("2tile(%d, %d, %d, %d, %s, %s, %s)", stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method)) -         tile(stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method);--regular level -         --flip tile and control loop -         --print(string.format("3tile(%d, %d, %d)",stmt, level+1, level+1)) -         tile(stmt, level+1, level+1); -         --print(string.format("4tile(%d, %d, %d)",stmt, level+1, level)) -         tile(stmt, level+1, level); -         --print(string.format("\n[offset>0]tile(%d, %d, %d, %d,%s,%s,%s)",stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method))  -	 --print_code() -          -      end -       -      --Do permutation based on cur_order -      --print "permute based on build order calling build_order()" -      --print "cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)" -      cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1) -      --print "permute(stmt, cur_order);" -      permute(stmt, cur_order); -      --print "\nafter permute(), code is:" -      --print_code() -   end -   --print "ENDING TILE BY INDEX" -   --print_code() -end - -function normalize_index(index) -   stmt = 0 --assume stmt 0cur = cur_indices(stmt) -   --print("Cur indices "..list_to_string(cur)) -   l = find_cur_level(stmt, index) -   tile(stmt, l, l) -   --print(string.format("\n[Normalize]tile(%d, %d, %d)",stmt, l,l))  -end - -function is_in_indices(stmt, idx) -   cur = cur_indices(stmt) -   for i=0,#cur,1 do -      if(cur[i]==idx) then -         return true -      end -   end -   return false -    -end - - -function copy_to_registers(start_loop, array_name) -    -   --print("\n\n****** starting copy to registers") -   io.flush() - -   stmt = 0 --assume stmt 0 -    -   -- [Malik] first we make sure that tx and ty are consecutive loops in the 2D thread setup, otherwise all levels for subsequent operations are messed up. Start logic. -   cur = cur_indices(stmt) -   table_Size = table.getn(cur) -    -   --print(string.format("Cur indices %s,",list_to_string(cur))) -   --print(string.format("The table size is %d", table_Size)) -   --table.foreach(cur, print) -   --print_code() -    -   level_tx = -1 -   level_ty = -1 -   if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end -   if is_in_indices(stmt,"ty") then level_ty = find_cur_level(stmt,"ty") end -   --print(string.format("level_tx %d  level_ty %d", level_tx, level_ty)) -    -   ty_lookup_idx = ""  -   org_level_ty = level_ty -    -   --if(cur[level_tx+1]~=nil and cur[level_tx+1]~="") then ty_lookup = ty_lookup+1 end -   if(cur[level_ty+1]~=nil and cur[level_ty+1]~="") then  -      --print(string.format("IF  cur[%d] = %s", level_ty+1, cur[level_ty+1])) -      ty_lookup_idx = cur[level_ty+1]  -   else -      --if cur[level_ty]  ~= nil then print(string.format("ELSE ty_lookup_idx = cur[%d] = %s", level_ty, cur[level_ty])) --   TODO  -      --else print "ELSE (dangerous)" end -      ty_lookup_idx = cur[level_ty]  -- may assign nil !? -   end -   --if ty_lookup_idx ~= nil then print(string.format("ty_lookup_idx '%s'", ty_lookup_idx))  --  TODO  -   --else print "ty_lookup_idx is NIL" -   --end -    -   if level_ty > 0 then -      --print(string.format("\ntile3(%d,%d,%d)",stmt,level_ty,level_tx+1)) -      tile(stmt,level_ty,level_tx+1)  -   end -   --print_code() -    -   --print("\ntylookup is %d",ty_lookup) -   --exit(0) -   -- -   cur = cur_indices(stmt) -   table_Size = table.getn(cur) -   --print(string.format("Cur indices %s,",list_to_string(cur))) -   --print("The table size is "..table.getn(cur)) -   --table.foreach(cur, print) -    -   if is_in_indices(stmt,"tx") then   level_tx = find_cur_level(stmt,"tx") end -   if ty_lookup_idx then -      if is_in_indices(stmt,ty_lookup_idx) then level_ty = find_cur_level(stmt,ty_lookup_idx) end -   end -    -   ty_lookup = 1 -   idx_flag = -1 -   -- find the level of the next valid index after ty+1 -   --print(string.format("\nlevel_ty %d", level_ty)) -   if level_ty > 0 then -      --print(string.format("table_Size %d", table_Size)) -      for num= level_ty+ty_lookup,table_Size do -         --print(string.format("num=%d   cur[num] = '%s'",num, cur[num])) -         if(cur[num] ~= "") then -            idx_flag = find_cur_level(stmt,cur[num]) -            --print (string.format("idx_flag = %d", idx_flag)) -            break -         end -      end -   end -    -   --print(string.format("\n(first) I am checking all indexes after ty+1 %s",idx_flag)) -   --print_code() -   --print "" -    -   how_many_levels = 1 -   startat = idx_flag + 1 -   if startat == 0 then startat = 1 end  -- avoid attempt to examine an illegal array offset -   --print(string.format("idx_flag = %d   I will check levels starting with %d", idx_flag, idx_flag+1)) -    -   for ch_lev = startat,table_Size,1 do    -- was for ch_lev = idx_flag+1,table_Size,1 do -      --print(string.format("ch_lev %d", ch_lev)) -      if(cur[ch_lev] ~= nil and cur[ch_lev] ~= "") then -         --print(string.format("cur[%d] = '%s'", ch_lev, cur[ch_lev]))  -         how_many_levels = how_many_levels+1 -      end -   end -   --print("\nHow Many Levels",how_many_levels) -    -   -- change this all to reflect the real logic which is to normalize all loops inside the thread loops.  -   if(how_many_levels <2) then -      while( idx_flag >= 0) do -         for num = level_ty+ty_lookup,(table_Size) do -            --print(string.format("at top of loop, num is %d", num)) -            --print(string.format("num %d", num)) -            --print(string.format("cur[num] = '%s'", cur[num])) -            if(cur[num] ~= "") then -               idx=cur[num] -               --print(string.format("idx '%s'", idx)) -                -               curlev = find_cur_level(stmt,idx) -               --print(string.format("curlev %d", curlev)) -                -               --print_code() -               --print(string.format("\n[COPYTOREG]tile(%d,%d,%d)",stmt,find_cur_level(stmt,idx),level_tx)) -               tile(stmt,find_cur_level(stmt,idx),find_cur_level(stmt,idx)) -               curlev = find_cur_level(stmt,idx) -               --print(string.format("curlev %d", curlev)) -               tile(stmt,find_cur_level(stmt,idx),level_tx) -               --print(string.format("hehe '%s'",cur[num])) -                -               cur = cur_indices(stmt) -               --print("Cur indices INSIDE"..list_to_string(cur)) -               table_Size = table.getn(cur) -               --print(string.format("Table Size is: %d",table_Size)) -               level_tx = find_cur_level(stmt,"tx") -               --print(string.format("\n level TX is: %d",level_tx)) -               level_ty = find_cur_level(stmt,ty_lookup_idx) -               --print(string.format("\n level TY is: %d",level_ty)) -               idx_flag = -1 -               --print "idx_flag = -1" -                -               -- find the level of the next valid index after ty+1 -                -               -- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?) -               for num= level_ty+ty_lookup,table_Size do -                  --print(string.format("num mucking num = %d", num)) -                  if(cur[num] ~= nil and cur[num] ~= "") then -                     idx_flag = find_cur_level(stmt,cur[num]) -                     --print("\n(second) I am checking all indexes after ty+1 %s",cur[num]) -                     break -                  end -               end -               --print(string.format("num mucked to %d     idx_flag = %d", num, idx_flag)) -                -            end -            --print(string.format("at bottom of loop, num is %d", num)) -         end -      end -   end -   --print "done with levels" -    -    -    -    -   --print "ARE WE SYNCED HERE?" -   --print_code() -   --print("\ntile(%d,%d,%d)",stmt,level_k,level_k) -   --tile(stmt,level_k,level_k) -    -   -- [Malik] end logic -   --print_code() -   start_level = find_cur_level(stmt, start_loop) -   --We should hold contant any block or tile loop -   block_idxs = block_indices() -   thread_idxs = thread_indices() -   --print("\nblock indices are") -   --table.foreach(block_idxs, print) -   --print("\nthread indices are") -   --table.foreach(thread_idxs, print) -   --print(string.format("\nStart Level: %d",start_level)) -    -   hold_constant = {} -   --print("\n Now in Blocks") -   for i,idx in ipairs(block_idxs) do -      --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx))) -      if find_cur_level(stmt,idx) >= start_level then -         table.insert(hold_constant, idx) -         --print(string.format("\nJust inserted block %s in hold_constant",idx)) -      end -   end -    -    -   --print("\n Now in Threads") -   for i,idx in ipairs(thread_idxs) do -      --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx))) -      if find_cur_level(stmt,idx) >= start_level then -         table.insert(hold_constant, idx) -         --print(string.format("\nJust inserted thread %s in hold_constant",idx)) -      end -   end -    -   --print "\nhold constant table is: " -   --table.foreach(hold_constant, print) -    -   --print("\nbefore datacopy pvt") -   old_num_stmts = num_statements() -   --print_code() -   --print(string.format("\n[DataCopy]datacopy_privatized(%d, %s, %s, vector having privatized levels)",stmt, start_loop, array_name))  -   --table.foreach(hold_constant, print) -   datacopy_privatized(stmt, start_loop, array_name, hold_constant) -    -   --print(hold_constant) -   new_num_stmts = num_statements() -   --print("\nthe num of statements:%d\n",new_num_stmt) -   --print_code() -   --exit(0) -   -- [Malik] normalize the copy loops created. -   cur = cur_indices(old_num_stmts) -   --print("Cur indices "..list_to_string(cur)) -   for cidx,i in ipairs(cur) do -      if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then -         --tile(old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i)) -         --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i)) -      end -   end -   --print_code() -   --print("\nthe num of statements OLD+1 :",(old_num_stmts+1))   - - ---[[  -   is this commented out? why yes, yes it is   block comment  -   if( (old_num_stmts+1) <= new_num_stmts) then -      cur = cur_indices(old_num_stmts+1) -      --print("Cur indices+1 "..list_to_string(cur)) -      for cidx,i in ipairs(cur) do -         if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then -            tile(old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i)) -	    --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i)) -         end -      end -   end ---]] - - -   --Unroll to the last thread level -   --for stmt=old_num_stmts,new_num_stmts-1 do -   -- level = find_cur_level(stmt,thread_idxs[#thread_idxs])--get last thread level -   --if level < #cur_indices(stmt) then -   -- unroll(stmt,level+1,0) -   --print(string.format("\n[Unroll]unroll(%d, %d, 0)",stmt, level+1))  -   ----print_code() -   --end -   --end -   io.flush() -   --print("****** ending copy to registers\n\n") -   --io.flush() -end - -function copy_to_shared(start_loop, array_name, alignment) -   --print(string.format("\nstarting copy to shared(%s, %s, %d )",start_loop,array_name,alignment)) -   stmt = 0 --assume stmt 0 -   cur = cur_indices(stmt) -   --print("Cur indices "..list_to_string(cur)) -    -   start_level = find_cur_level(stmt, start_loop) -   --print(string.format("start_level %d", start_level)) -    -   old_num_stmts = num_statements() -   --print(string.format("old_num_statements %d", old_num_stmts)) -    -   --Now, we give it indices for up to two dimentions for copy loop -   copy_loop_idxs = {"tmp1","tmp2"} -   --print(string.format("\n[DataCopy]datacopy(%d, %d, %s, {\"tmp1\",\"tmp2\"},false,0,1,%d,true)",stmt, start_level, array_name, alignment))  -   datacopy(stmt, start_level, array_name, copy_loop_idxs, false, 0, 1, alignment,true) -    -   add_sync(stmt,start_loop) -   new_num_stmts = num_statements() -    -   --This is fairly CUBLAS2 specific, not sure how well it generalizes, -   --but for a 2D copy, what we want to do is "normalize" the first loop -   --"tmp1" then get its hard upper bound. We then want to tile it to -   --make the control loop of that tile "ty". We then tile "tmp2" with a -   --size of 1 and make it "tx". -   --print(string.format("fairly CUBLAS2 specific, OLD %d  NEW %d",  old_num_stmts, new_num_stmts )) -    -   for stmt=old_num_stmts,new_num_stmts-1 do -      --print(string.format("for stmt = %d", stmt)) -      was_no_error, level = pcall(find_cur_level, stmt, "tmp2") -       -      if was_no_error then  -         --print_code()  -         --print("\nCopy to shared: [If was no error]\n") -         find_cur_level(stmt,"tmp2") -         tile(stmt, level, level) -          -         lower,upper = hard_loop_bounds(stmt, level) -         upper = upper + 1 -         --print(string.format("lower %d  upper %d", lower, upper)) -          -         tx,ty = thread_dims() -         --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx) -          -         level = find_cur_level(stmt,"tmp1") -         --print(string.format("level %d", level)) -          -         if tx == upper and ty == 1 then -            --print(string.format("tx = %d    upper = %d     ty = %d", tx, upper, ty)) -            --print "Don't need" -             -            --Don't need an extra tile level, just move this loop up -            second_level = find_cur_level(stmt,"tmp2") -            --print(string.format("\n[Tile0]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx"))  -            tile(stmt, second_level, 1, level, "tx", "tx", counted) -         else -            --print "DO need?" -            --print_code() -            if(ty == 1) then new_ctrl = "tmp3" else new_ctrl = "ty" end - - ---[[ Commenting out a block of Gabe's code in this control flow -               -- level = find_cur_level(stmt,"tmp1") -               tile(stmt, level, level) - -               lower,upper = hard_loop_bounds(stmt, level) -               upper = upper + 1 -               --print_code() -               --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx..", level: "..level) -               if(math.ceil(upper/ty) > 1)then -                  tile(stmt, level, math.ceil(upper/ty), level, "tmp", new_ctrl, counted) -                  --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level,  math.ceil(upper/ty),upper,ty, level, "tmp", new_ctrl))  -               else -                  tile(stmt, level, math.ceil(upper/ty), level, "ty", new_ctrl, counted) -		  --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level,  math.ceil(upper/ty),upper,ty, level, "tx", new_ctrl)) -               end -                -               --print_code()     -               -- [Malik] If here we have the loop upper bound > tx, then we should tile once more after the next tile, to carve out the correct tx.  -               lower1,upper1 = hard_loop_bounds(stmt,level) -               level1 = level -               stmt1 = stmt -               -- [Malik] Do the tile after the second level tile with if condition. Just to keep the original order, the tile is being pushed to the end.  -                -               --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1) - -               --print_code() -               --level = find_cur_level(stmt,"tmp") -               --tile(stmt,level,level) -               --print_code()  -                -               --[Malik] if you are moving the loop above the level1, you need to update level1 with new position which would be level1+2 or second_level -               if(level <= level1) then level1 = level1+2 end - 	       --print(string.format("\n[Tile2]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx"))  -               --print("\n----------------------------------") -               --print_code() -               --print("\n**********************************") -               --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1) -               -- [Malik] If the upper bound > tx, we do another tile to carve out the correct tx from a bigger loop. Else just normalize the bounds.  -               if( upper1 > ty) then -                  third_level = find_cur_level(stmt1,"tmp") -                  --print("\n\n\n\t\t\t\tthirdlevel:"..third_level) -                  tile(stmt1, third_level, ty, third_level, "ty", "tmp", counted) -                  --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt1, third_level, ty,third_level, "ty", "tmp")) -                  tile(stmt1,third_level+1,third_level+1) -                  --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level+1)) -                  tile(stmt1,third_level+1,third_level) -                  --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level)) -               else -                  tile(stmt1,level1,level1) -                  --print(string.format("\n[Tile3ELSE]tile(%d, %d, %d)",stmt1,level1,level1)) -               end -                -               --print("\nStarting tmp2\n");--print_code(); -               second_level = find_cur_level(stmt,"tmp2") -               lower,upper = hard_loop_bounds(stmt,second_level) -               level = second_level -               --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..", level:"..level) -                -               if(math.ceil(upper/tx) > 1)then -                  tile(stmt, second_level,math.ceil(upper/tx), level, "tmp", "tx", counted) -                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tmp", "tx")) -               else -                  tile(stmt, second_level,math.ceil(upper/tx), level, "tx", "tx", counted) -                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tx", "tx")) -               end -               --print_code() -               lower2,upper2 = hard_loop_bounds(stmt,level) -               level2 = level -               stmt2 = stmt -               --print("[Malik]-loop cleanup@tmp2: lower2, upper2: "..lower2..", "..upper2..", tx: "..tx..", level:"..level2) -               -- now for the second level. -               if( upper2 > tx) then -                  forth_level = find_cur_level(stmt2,"tmp") -                  --print("\n\n\n\t\t\t\tforthlevel:"..forth_level) -                  --print_code() -                  tile(stmt2, forth_level, 1, forth_level, "tx", "tmp", counted) -                  --print(string.format("\n[Tile3B]tile(%d, %d, %d,%d,%s,%s,counted)",stmt2, forth_level, tx,forth_level, "ty", "tmp")) -                  --print_code() -                  --tile(stmt2,forth_level+1,forth_level+1) -                  --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level+1)) -                  --tile(stmt2,forth_level+1,forth_level) -                  --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level)) -               else -                  new_level = find_cur_level(stmt2,"ty") -                  tile(stmt2,level2,1,new_level,"tx","tx",counted) -                  --print(string.format("\n[Tile3BELSE]tile(%d, %d, %d)",stmt2,level2,level2)) -                  tmp_level = find_cur_level(stmt2,"tmp") -                  tile(stmt2,tmp_level,tmp_level) -               end -                -               --print_code() -               --print("\n----------------------------------") ---]] -                -               --print_code()  -               --print("\nStarting tmp2\n");--print_code(); -               first_level = find_cur_level(stmt,"tmp1") -               second_level = find_cur_level(stmt,"tmp2") -               lower,upper = hard_loop_bounds(stmt,second_level) -                -               --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..",first level:"..first_level..",second_level:"..second_level) -                -               -- Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx. -               --print(string.format("\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,1,first_level, "tx", "tx")) -               tile(stmt,second_level,1,first_level,"tx","tx",counted) -               --print_code() -                -               first_level = find_cur_level(stmt,"tmp1") -               lower_1,upper_1 = hard_loop_bounds(stmt,first_level) -               tx_level = find_cur_level(stmt,"tx") -               lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level) -               --print(string.format("UL_1 %d %d     UL_tx %d %d", lower_1, upper_1, lower_tx, upper_tx)) -                -               if(math.ceil(upper_tx/tx) > 1)then -                  --print "ceil I say" -                  --print(string.format("\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,tx,tx_level, "tx", "tmp1")) -                  tile(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted) -                  --print_code() -                   -                  peat = find_cur_level(stmt,"tx") -                  --print(string.format("\n[Tile1]tile(%d, %d, %d)",stmt, peat, peat)) -                  tile(stmt, peat, peat )  --find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx")) -                  --print_code() -                   -                  if (find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx")) then -                     --print(string.format("\nagain [Tile1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))) -                     tile(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) -                     --print_code() -                  end -                  --else -                  --tile(stmt, tx_level,1, tx_level, "tx", "tx", counted) -                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,1,tx_level, "tx", "tx")) -               end -               --print_code() -               --]]  -- this apparently is NOT the end of a block comment -                -               --print("\nStarting tmp1\n") -               -- Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty". -               tile(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))      -               --print_code()   -                -               ty_level = find_cur_level(stmt,"tmp1") -               lower_ty,upper_ty = hard_loop_bounds(stmt,ty_level) -                -               tx_level = find_cur_level(stmt,"tx") -               lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level) -               --print("[Malik]-loop cleanup@tmp1: lowerty, upperty: "..lower_ty..", "..upper_ty..", ty: "..ty..",ty level:"..ty_level..",tx_level:"..tx_level..", stmt: "..stmt) -                -               --print "before ceil" -               if(math.ceil(upper_ty/ty) > 1)then -                  --print "CEIL IF" -                  --print("\n Inside upper_ty/ty > 1\n"); -                   -                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,ty,ty_level, "ty", "tmp_ty")) -                  tile(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted) -                  --print_code() -                   -                  --print(string.format("\n[Tile2-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt  ,"ty"),find_cur_level(stmt,"ty"))) -                  tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty")) -                  --print_code() -                   -                  ----------------------------------------------------------------------- -                  ---------------------------------------------------------------------- -                  cur_idxs = cur_indices(stmt) -                  --print("\n cur indexes are "..list_to_string(cur_idxs)) -                   -                  -- Putting ty before any tmp_tx    -                  idx_flag = -1 -                  for num= 0,table.getn(cur_idxs) do -                     if(cur[num] == "tmp_tx") then -                        idx_flag = find_cur_level(stmt,cur[num]) -                        break -                     end -                  end -                  --print(string.format("\n (1) so i have found out the value of idx flag as %d",idx_flag) ) -                   -                  if(idx_flag >=0 ) then   -                     if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then -                        --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) -                        tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) -                        --print_code() -                     end -                  end -                   -                  -- Now Putting ty before any tmp_ty -                  idx_flag = -1 -                  for num= 0,table.getn(cur_idxs) do -                     if(cur[num] == "tmp_ty") then -                        idx_flag = find_cur_level(stmt,cur[num]) -                        break -                     end -                  end -		  --print(string.format("\n IF  so i have found out the value of idx flag as %d",idx_flag) ) -                  if(idx_flag >=0 ) then   -                     --print "one more test" -                     if ((find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"))) then -                        --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) -                        tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) -                        --print_code() -                     end -                  end -               else -                  --print "CEIL ELSE" -                  --cur_idxs = cur_indices(stmt) -                  --print("\n Inside upper_ty/ty <= 1\n"); -                   -                  --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,1,ty_level, "ty", "ty")) -                  tile(stmt, ty_level,1, ty_level, "ty", "ty", counted) -                  --print_code() -                   -                  --print(string.format("\n[Tile3-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)) -                  tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) -                  --print_code() -                   -                  idx_flag = -1 -                  if(cur_idxs) then -                     --print "CAN NEVER GET HERE?  cur_idxs" -                     for num= 0,table.getn(cur_idxs) do -                        if(cur[num] == "tmp_ty") then -                           idx_flag = find_cur_level(stmt,cur[num]) -                           break -                        end -                     end -                  end -                  --print(string.format("\n ELSE so i have found out the value of idx flag as %d",idx_flag) ) -                  if(idx_flag >=0 ) then   -                     if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then -                        --print(string.format("tile( stmt %d, level ty %d, level ty %d",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))  -                        tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) -                        --print(string.format("\n[Tile3-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) -                     end -                  end -               end -                -               --print_code() -         end -          -          -         --print "\n\n *** at bottom of if in copy to shared, " -         --print_code() -         --print "end of if" -          -      else -         --copy to shared only created one level, not two, so we use a different approach (MV & TMV) -         --print("\nCopy to shared: [If was error]\n") -         level = find_cur_level(stmt,"tmp1") -         tile(stmt, level, level) -          -         --print(string.format("\n[Tile]tile(%d, %d, %d)",stmt, level, level))  -         tx,ty = thread_dims() -         lower,upper = hard_loop_bounds(stmt, level) -         upper = upper+1 --upper bound given as <=, compare to dimensions tx which is < -         --print("upper "..upper.." tx "..tx) -         if upper == tx then -            rename_index(stmt, "tmp1", "tx") -         else -            --print("upper is not tx") -            --TODO: Don't know, maybe do some tileing etc -            --print_code() -            --print("upper "..upper.." tx "..tx.." stmt: "..stmt.." level: "..level) -            tile(stmt, level,tx,level, "tx", "tmp_tx", counted) -            --print_code() -             -            --print("stmt:"..stmt.." level+1: "..level+1) -            --print("TILE 7") -            tile(stmt, level+1,1,level+1,"tx", "tx",counted) -            --print("TILE 3") -            tile(stmt,level+1,level) -            --print_code() -             -            if(ty > 1) then -               --print_code() -               --print("GOING IN") -               lower,upper = hard_loop_bounds(stmt, level+1) -               --print(string.format("ty %d  lower %d  upper %d", ty, lower, upper)) -               --upper=125 -               --print("NOW FOR Y: upper "..upper.." ty "..ty.." stmt: "..stmt.." level: "..(level+1).." bound:"..math.ceil(upper/ty)) -               tile(stmt, level+1,math.ceil(upper/ty),level+1, "tmp_ty", "ty", counted) -               --tile(stmt, level+2,math.ceil(upper/ty),level+2, "tmp_ty", "ty", counted) -            end -            --print_code() -            --rename_index(stmt, "tmp1", "tx") -            --print("Warning: Need to implement some logic here to tile the single level shared copy loop to match thread dimensions") -         end -      end -      --Always add sync -      add_sync(stmt,start_loop) -       -   end -   --print("ending copy to shared\n") -   --print_code() -end - -function unroll_to_depth(max_depth) -   --print(string.format("\n\nunroll_to_depth(%d)", max_depth )) -   --print "SYNC UP" -    -   cur = cur_indices(0) -   thread_idxs = thread_indices() -   guard_idx = thread_idxs[#thread_idxs] -    -   --print(string.format("cur    indices %s",list_to_string(cur))) -   --print(string.format("thread indices %s",list_to_string(thread_idxs))) -   --print(string.format("#thread_idxs = %d", #thread_idxs)) -   --print(string.format("guard_idx = %s", guard_idx)) -    -   ---- HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS    -   common_loops = {} -   comm_loops_cnt = 0 -   num_stmts = num_statements() -   --print(string.format("num statements %d", num_stmts)) -    -   for stmt=0,num_stmts-1 do -      cur_idxs = cur_indices(stmt) -       -      --print(string.format("\nSTMT %d Current Indices: %s",stmt,list_to_string(cur_idxs))) -       -      if(chk_cur_level(stmt,"tx")>0) then -         for ii=1,find_cur_level(stmt,"tx")-1 do    -- started at 0 -            --print(string.format("ii = %d", ii)) -- index starts at 1, what does index 0 do? -            --if cur_idxs[ii] == nil then print "cur_idxs[i]] is NIL"  -            --else print(string.format("cur_idxs[%d] = '%s'", ii, cur_idxs[ii])) -- index starts at 1, what does index 0 do? -            --end -             -            if(cur_idxs[ii] ~= "bx" and cur_idxs[ii] ~= "by" and cur_idxs[ii] ~= nil and cur_idxs[ii] ~= "tx" and cur_idxs[ii] ~= "ty" and cur_idxs[ii] ~= "") then  -                -               --print(string.format("id %s is not in the list", cur_idxs[ii] )) -                -               for stmt1=stmt+1,num_stmts-1 do -                  --print(string.format("\nii %d stmt1 is %d", ii, stmt1))           -                  cur_idxs1 = cur_indices(stmt1) -                  --print("\nstmt1 cur_idxs1 is "..list_to_string(cur_idxs1))    -                   -                  --print(string.format("cur level(%d, %s) = %d", stmt, "tx",  find_cur_level(stmt,"tx")))     -                   -                  endrange = find_cur_level(stmt,"tx")-1 -                  --print(string.format("for iii=1, %d do", endrange)) -                   -                  for iii=1,find_cur_level(stmt,"tx")-1 do  -- started at 0 -                     --print(string.format("stmt %d   ii %d   iii %d ", stmt, ii, iii)) -                     --if(cur_idxs1[iii] ~= nil) then  -                     --   print(string.format("stmt %d   ii %d   iii %d  cur_idxs1[%d] = '%s'", stmt, ii, iii, iii, cur_idxs1[iii]))   -                     --else  -                     --   print(string.format("stmt %d   ii %d   iii %d  cur_idxs1[%d] = NIL", stmt, ii, iii, iii))   -                     --end -                      -                     if(cur_idxs1[iii] ~= "bx" and cur_idxs1[iii] ~= "by" and cur_idxs1[iii] ~= nil and cur_idxs1[iii] ~= "tx" and cur_idxs1[iii] ~= "ty" and cur_idxs1[iii] ~= "") then   -                        if(cur_idxs[ii] == cur_idxs1[iii]) then -                           --print("\nfound idx:"..cur_idxs[ii]) -			   --if(comm_loops_cnt == 0) then print "\n\n*** WARNING *** assigning to array index ZERO in Lua" end -                           common_loops[comm_loops_cnt] = cur_idxs[ii] -                           --print(string.format("cl[%d] = '%s'", comm_loops_cnt,   common_loops[comm_loops_cnt])) -                           comm_loops_cnt = comm_loops_cnt + 1 -                        end -                     end   -                  end -               end   -            end -         end -      end -   end -   ---- -   --if(comm_loops_cnt>0) then  -   --   print("\n COMM LOOPS :TOTAL "..comm_loops_cnt..", and are "..list_to_string(common_loops).." this loop :"..common_loops[0]) -   --else -   --   print "UNROLL can't unroll any loops?" -   --end -    -    -    -    -   repeat -      old_num_stmts = num_statements() -      --print(string.format("old_num_statements %d", old_num_stmts)) -       -      for stmt=0,old_num_stmts-1 do -         cur_idxs = cur_indices(stmt) -         --print(string.format("stmt %d    cur_idxs = %s", stmt, list_to_string(cur_idxs))) -         if(#cur_idxs > 0) then  -            gaurd_level = -1 -            if(chk_cur_level(stmt,guard_idx)>0) then -               gaurd_level = find_cur_level(stmt,guard_idx) -            end -            --print(string.format("guard_level(sp) = %d", gaurd_level)) -             -            if(gaurd_level>-1) then -               level = next_clean_level(cur_idxs,gaurd_level) -               --print(string.format("next clean level %d", level)) -                -               --need to handle max_depth -               num_unrolled = 0 -               level_unroll_comm = level -               level_arr = {} -               while level >= 0 do -                  --print(string.format("while: level = %d", level)) -                   -                  if num_unrolled == max_depth then break end -                  --print("Unrolling "..stmt.." at level "..(level).." index ".. cur_idxs[gaurd_level+1]) -                   -                  level_arr[num_unrolled] = level -                  num_unrolled = num_unrolled + 1 -                   -                  guard_level = find_cur_level(stmt,guard_idx) -                  level = next_clean_level(cur_idxs,level+1) -               end -               --dies print("How many levels for unroll commands"..table.getn(level_arr).." which is "..level_arr[0].." and "..level_arr[#level_arr]) -               --if(table.getn(level_arr) ~= nil) then -                -               --print "OK, NOW WE UNROLL" -                -               if(level_unroll_comm >= 0)then -                  for i = table.getn(level_arr),0,-1 do -                     --print(string.format("\ni=%d", i)) -                     --print(string.format("[Unroll]unroll(%d, %d, 0)",stmt, level_arr[i]))      -                      -                     unroll(stmt,level_arr[i],0) -                     --print("finished unroll]]\n") -                     --print_code() -                  end -               end ------- -            end     ---[[ - -THERE WAS A BIG BLOCK OF COMMENTED OUT CODE HERE  - - ---]] ------- -         end -      end -      new_num_stmts = num_statements() - -   until old_num_stmts == new_num_stmts - -end - - diff --git a/test-chill/test-cases/examples/cuda-chill/cudaize.py b/test-chill/test-cases/examples/cuda-chill/cudaize.py deleted file mode 100755 index ffef009..0000000 --- a/test-chill/test-cases/examples/cuda-chill/cudaize.py +++ /dev/null @@ -1,1047 +0,0 @@ -#! /usr/bin/python - -# THIS IS CUDAIZE.PY - -import chill -import sys -import math  - -strided = 0 -counted = 1 - -def print_code(): -    chill.print_code() -    print "" -    sys.stdout.flush() - -     -def table_contains_key( table, key ):  # use a dict for the 'table'? -    return table.has_key(key) # (key in table)? - -def print_array( arr ):  # a useful function to mimic lua output  -    for a in arr[:-1]: -        print "%s," % a, -    print "%s" % arr[-1] -    sys.stdout.flush() - -def valid_indices( statement, indices ): -    #print "valid_indices() python calling C cur_indices" -    #print statement -    cur = chill.cur_indices(statement) # calls C -    #print "python valid_indices(), cur = ", -    #print cur -    #print "indices = ", -    #print indices - -    for index in indices: -        if not index in cur: -            return False -    return True - -def next_clean_level( indices_at_each_level, level): -    #print "next_clean_level( ..., %d )" % level  -    #print "indices_at_each_level ", -    print_array( indices_at_each_level ) - -    numlevels = len(indices_at_each_level) -    #print "loop to %d" % numlevels -    for i in range(level+1, numlevels+1): -        pythoni = i-1 # LUA index starts at 1 -        #print "Checking level %d = '%s'" % (i, indices_at_each_level[pythoni]) -        sys.stdout.flush() -        if len(indices_at_each_level[pythoni]) > 0: # LUA INDEX STARTS AT 1 -            #print "returning %d" % i -            return i  # MATCH lua return value, LUA index starts at one -    return -1  # no non-dummy indices - - - - -def build_order(  final_order, tile_index_names, control_index_names, tile_index_map, current_level): -    order = []    -    #print "\nbuild_order()" -    #print "build_order(): final_order = (", -    count = 0 -    for f in final_order: -        #if count+1 == len(final_order): -        #    print "%s )" % f -        #else: -        #    print "%s," % f , -        count += 1 - -        keys = control_index_names.keys() -        keys.sort() -        #if (2 == len(keys)): -        #    print "build_order(): ctrl_idx_names = (%s, %s)" % (control_index_names[0], control_index_names[1]) -        #else: -        #    print "build_order(): ctrl_idx_names = (%s" % control_index_names[0], -        #    for k in keys[1:]: -        #        print ", %s" % control_index_names[k], -        #    print ")" - -    #print control_index_names -    #print "cur_level %d" % current_level -     -    #print "tile index map: ", -    #print tile_index_map - - -    for i in range(len(final_order)): -        k = final_order[i]  # not used? -        skip = False -        cur = final_order[i]   -        # control loops below our current level should not be in the current order - -        # skip = cur in control_index_names[current_level+2:]  -        #print "\n%d control_index_names, " % len(control_index_names) -        #print control_index_names - -        for j in range(current_level+1, len(control_index_names)): -            #print "comparing cur %s with cin[%d] %s" % ( cur, j, control_index_names[j]) -            if control_index_names[j] == cur: -                skip = True  -                #print "SKIP %s  " % cur - -        # possibly substitute tile indices if necessary -        if tile_index_map.has_key(cur): -            approved_sub = False -            sub_string = tile_index_map[cur] -            #print "sub_string = ", -            #print sub_string - -            # approved_sub = sub_string in tile_index_names[current_level+2:] -            for j in range(current_level+1, len(tile_index_names)): -                if tile_index_names[j] == sub_string: -                    approved_sub = True -            if approved_sub: -                cur = sub_string - -        if not skip: -            order.append( cur)   -    #print "build_order() returning order (", -    #print order -    #for o in order: -    #    print "%s," % o, -    #print ")" -    return order - -def find_cur_level( stmt, idx ): -    #print "find_cur_level(stmt %d, idx %s)  Cur indices" % ( stmt, idx ), -     -    cur = chill.cur_indices(stmt) -    #for c in cur[:-1]: -    #    print "%s," % c, -    #print "%s" % cur[ -1 ]  - -    index = 1 # lua starts indices at 1 !!   -    for c in cur: -        if c == idx: -            #print "found it at index %d" % index -            #sys.stdout.flush() -            #print "in find_cur_level, returning ", -            #print index -            return index -        index += 1 -    #print "find_cur_level(), Unable to find index %s in" % idx, -    #print cur -    #print "in find_cur_level, returning -1" -    return -1  # special meaning "it's not there" - -def chk_cur_level( stmt, idx ): -    # search cur_indices for a ind at stmt -    cur = chill.cur_indices(stmt) -    if idx in cur: -       return 1 + cur.index(idx)  # lua index starts at 1 ! -    return -1 - -def find_offset( cur_order, tile, control): -    #print "Looking for tile '%s' and control '%s' in (" % (tile, control), -    #print cur_order -    #for o in cur_order: -    #    print "%s," % o, -    #print ")" - -    idx1 = -1 -    idx2 = -1 -    if tile in cur_order:  -        idx1 = 1 + cur_order.index(tile) # lua indexes from 1! -    else: -        print "find_offset(), unable to find tile %s in current list of indices" % tile -        sys.exit(-1) - -    if control in cur_order: -        idx2 = 1 + cur_order.index(control) # lua indexes from 1! -    else: -        print "find_offset(), unable to find control %s in current list of indices" % control -        sys.exit(-1) - -    #print "found at level %d and %d" % ( idx2, idx1 ) -    # this appears horrible -    if idx2 < idx1: -        return idx2-idx1+1 # bad ordering -    else: -        return idx2-idx1 - - - -def tile_by_index( tile_indices, sizes, index_names, final_order, tile_method): -    #print "STARTING TILE BY INDEX" -    #print "tile_by_index() tile_method ", -    #print tile_method -    #print "index_names: ", -    #print index_names - -    stmt = 0 # assume statement 0 -    if not valid_indices( stmt, tile_indices): -        print "python tile_by_index() one or more of ", -        print tile_indices, -        print " is not valid" -        sys.exit(-1) - -    if tile_method == None: -        #print "CREATING tile_method = 1" -        tile_method = 1 # "counted" - -    tile_index_names = [] -    for ti in tile_indices: -        tile_index_names.append( ti )  # make a copy?  -    #print "tile_index_names:", -    #print tile_index_names - -    control_index_names = {} # a dictionary? -    tile_index_map =  {} -     -    #print "index_names: " -    #print index_names - -    for pair in index_names: -        valid = False -        control = pair[0] -        name    = pair[1] -        #print "control %s   name  %s" % ( control, name ) -         -        if control[0] == "l" and control[1].isdigit(): -            if control.endswith("_control"): -                index = int(control[1: -8]) -                control_index_names[index-1] = name -                valid = True - -            elif control.endswith("_tile"): -                index = int(control[1: -5]) -                #print "index %d" % index -                tile_index_names[index-1] = name # ??  -                tile_index_map[name] = tile_indices[index-1] -                valid = True -        if not valid: -            print "%s is not a proper key for specifying tile or control loop indices\n" % control - -    #print "control_index_names = ", -    #print control_index_names - -    #print "tile_index_names = ", -    #print tile_index_names - -    #print "before call to build_order(), tile_index_map = ", -    #print tile_index_map - - -    # filter out control indices (and do name substitution of unprocessed tile indices) for a given level -    cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1) - -    #print "returned from build_order python\n\n" - -    # print("permute("..stmt..", {"..list_to_string(cur_order).."})") -    #print "permute(%d, {" % stmt, -    #print "cur_order = ", -    #print cur_order, -    #print "})" - -    cur_order.insert(0, stmt) -    #print cur_order -    chill.permute( tuple( cur_order))  -    #print "in cudaize.py, returned from C code chill.permute()\n" - -    for i in range(len(tile_indices)): -        cur_idx = tile_indices[i] -        #print "i %d  cur_idx %s calling build order ********" % (i, cur_idx) -        cur_order = build_order( final_order, tile_indices, control_index_names, tile_index_map, i) -        #print "cur_idx %s return from build order" % cur_idx -         -        # Find an offset between tile loop and control loop -        #  0   = control loop one level above tile loop -        #  -1  = control loop two levels above tile loop -        #  > 0 = tile loop above control loop -        #  In the last case, we do two extra tile commands to get the control -        #  above the tile and then rely on the final permute to handle the -        #  rest -        level = find_cur_level(stmt,cur_idx) -        #print "level %d\n" % level      - -        offset = find_offset(cur_order, tile_index_names[i], control_index_names[i]) -        #print "offset %d" % offset - -        if offset <= 0: -            #print "[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method  ) -            chill.tile7( stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method  ) -            #print "in cudaize.py, returned from C code chill.tile7\n" - -        else: -            #print "2tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method  ) -            chill.tile7( stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method  ) # regular level - -            # flip and tile control loop -            #print "3tile(%d, %d, %d)" % ( stmt, level+1, level+1) -            chill.tile3( stmt, level+1, level+1) - -            #print "4tile(%d, %d, %d)" % ( stmt, level+1, level) -            chill.tile3( stmt, level+1, level) - -            #print_code() - -        # Do permutation based on cur_order -        #print("permute based on build order calling build_order()") -        cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i) - -        #print("permute based on build order return from build_order()") - -        #  print("permute("..stmt..", {"..list_to_string(cur_order).."})") -        topermute = cur_order -        topermute.insert(0, stmt) -        chill.permute( tuple(topermute) )  -        #print "\nafter permute(), code is:" -        #print_code() - -def normalize_index( index ): -    #print "in cudaize.py, normalize_index( %s )" % index -    stmt = 0  # assume stmt 0 -    l = find_cur_level( stmt, index ) -    chill.tile3( stmt, l, l ) - -def is_in_indices( stmt, idx): -    cur = chill.cur_indices(stmt) -    return idx in cur - -def copy_to_registers( start_loop, array_name ): -    #print "\n\n****** starting copy to registers" -    #sys.stdout.flush() - -    stmt = 0    # assume stmt 0 -    cur = chill.cur_indices(stmt) # calls C     -    table_Size = len(cur) - -    #print "Cur indices", -    #print_array(cur) -    #print "\nThe table size is %d" % table_Size -    #count=1 -    #for c in cur: -    #    print "%d\t%s" % (count,c) -    #    count += 1 - -    #print_code() - -    # would be much cleaner if not translating this code from lua! -    level_tx = -1 -    level_ty = -1    -    if is_in_indices(stmt,"tx"): -        level_tx = find_cur_level(stmt,"tx") -    if is_in_indices(stmt,"ty"): -        level_ty = find_cur_level(stmt,"ty") -    #print "level_tx %d  level_ty %d" % ( level_tx, level_ty ) -    #sys.stdout.flush() - -    ty_lookup_idx = ""  -    org_level_ty = level_ty - -    # UGLY logic. Lua index starts at 1, so all tests etc here are off by 1 from the lua code -    # level_ty initializes to -1 , which is not a valid index, and so there is added code to  -    # make it not try to acccess offset -1.   -1 IS a valid python array index -    # to top it off, the else below can assign a NIL to ty_lookup_idx!  -    if level_ty != -1 and cur[level_ty] != "": -        #print "IF  cur[%d] = %s" % ( level_ty, cur[level_ty] ) -        ty_lookup_idx = cur[level_ty]  -    else: -        #print "ELSE ty_lookup_idx = cur[%d] = %s" % ( level_ty, cur[level_ty-1])  -        ty_lookup_idx = cur[level_ty-1]  -    #print "ty_lookup_idx '%s'" % ty_lookup_idx - -    if level_ty > -1: -        #print "\ntile3(%d,%d,%d)" % (stmt,level_ty,level_tx+1) -        chill.tile3(stmt,level_ty,level_tx+1)  -    #print_code()    - -    cur = chill.cur_indices(stmt) # calls C  -    table_Size = len(cur) -    #print "Cur indices ", -    #for c in cur: -    #    print "%s," % c, -    #print "\nThe table size is %d" % len(cur) -    #count=1 -    #for c in cur: -    #    print "%d\t%s" % (count,c) -    #    count += 1 -    #sys.stdout.flush() - -    if is_in_indices(stmt,"tx"): -        level_tx = find_cur_level(stmt,"tx") -    if ty_lookup_idx != "":                      # perhaps incorrect test  -        if is_in_indices(stmt,ty_lookup_idx): -           level_ty = find_cur_level(stmt,ty_lookup_idx) -            -    ty_lookup = 1 -    idx_flag = -1 -    # find the level of the next valid index after ty+1 -    #print "\nlevel_ty %d" % level_ty -    if level_ty > -1: -       #print "table_Size %d" % table_Size -       for num in range(-1 + level_ty+ty_lookup,table_Size):   # ??  off by one? -           #print "num=%d   cur[num] = '%s'" % (num+1, cur[num]) # num+1 is lua index ???? -           sys.stdout.flush() -           if cur[num] != "": -               idx_flag = find_cur_level(stmt,cur[num]) -               #print "idx_flag = %d" % idx_flag -               break -                -    #print "\n(first) I am checking all indexes after ty+1 %s" % idx_flag -    #print_code()    -    #print ""  - -    how_many_levels = 1 -     -    #print "idx_flag = %d   I will check levels starting with %d" % (idx_flag, idx_flag+1) -    # lua arrays start at index 1. the next loop in lua starts at offset 0, since idx_flag can be -1 -    # thus the check for "not equal nil" in lua (bad idea) -    # python arrays start at 0, so will check for things that lua doesn't (?) -    startat = idx_flag + 1 -    if idx_flag == -1: -        startat = 1  # pretend we're lua for now.   TODO: fix the logic - -    for ch_lev in range(startat,table_Size+1):       # logic may be wrong (off by one) -        #print "ch_lev %d" % ch_lev -        if ch_lev <= table_Size and cur[ch_lev-1] != "": -           #print "cur[%d] = '%s'" % ( ch_lev, cur[ch_lev-1] ) -           how_many_levels += 1 - -    #print "\nHow Many Levels %d" % how_many_levels -    sys.stdout.flush() -    sys.stdout.flush() - -    if how_many_levels< 2: -        while( idx_flag >= 0): -            for num in range(level_ty+ty_lookup,table_Size+1): -                #print "at top of loop, num is %d" % num -                #print "cur[num] = '%s'" % cur[num-1] -                if cur[num-1] != "": -                    idx = cur[num-1] -                    #print "idx '%s'" % idx -                    sys.stdout.flush() -                    curlev = find_cur_level(stmt,idx) -                    #print "curlev %d" % curlev - -                    #print "\n[COPYTOREG]tile(%d,%d,%d)"%(stmt,curlev,level_tx) - -                    chill.tile3(stmt, curlev, curlev) -                    curlev = find_cur_level(stmt,idx) -                    #print "curlev %d" % curlev -                    chill.tile3(stmt,curlev,level_tx) -                    #print "hehe '%s'" % cur[num-1] -                     -                    cur = chill.cur_indices(stmt) -                    #print "Cur indices INSIDE", -                    #for c in cur: -                    #    print "%s," % c, -                    table_Size = len(cur) -                    #print "\nTable Size is: %d" % len(cur) - -                    level_tx = find_cur_level(stmt,"tx") -                    #print "\n level TX is: %d" % level_tx -                    level_ty = find_cur_level(stmt,ty_lookup_idx) -                    #print "\n level TY is: %d" %level_ty -                    idx_flag = -1 -                    #print "idx_flag = -1" - - -                    #- find the level of the next valid index after ty+1 -                    #- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?) -                    for num2 in range( -1 + level_ty+ty_lookup ,table_Size): # lua starts index at one -                        #print "num mucking num = %d" % num2 -                        if(cur[num2] != ""): -                            #print "cur[%d] = '%s'" % ( num2, cur[num2] ) -                            idx_flag = find_cur_level(stmt,cur[num2]) -                            #print("\n(second) I am checking all indexes after ty+1 %s",cur[num2]) -                            break - -                    #print "num mucked to %d     idx_flag = %d" % (num, idx_flag) - -                #print "at bottom of loop, num is %d" % num -           -    #print "done with levels" - -    # this was a block comment ??? - -#    for num in range(level_ty+1, table_Size+1): -#        print "num %d" % num -#        if cur[num-1] != "": -#            idx_flag = find_cur_level(stmt,cur[num-1])  ## ugly  -#    print "idx_flag = %d" % idx_flag - -    # change this all to reflect the real logic which is to normalize all loops inside the thread loops.  -#    print "change this all ...\n" -#    print "level_ty+1 %d  table_Size-1 %d     idx_flag %d" %( level_ty+1, table_Size-1, idx_flag) -#    sys.stdout.flush() -#    sys.stdout.flush() - -#    while level_ty+1 < (table_Size-1) and idx_flag >= 0: -#        print "*** level_ty %d" %  level_ty -#        for num in range(level_ty+2,table_Size+1):  # lua for includes second value -#            print "num %d   cur[num] %s" % (num, cur[num]) -#            if cur[num] != "": -#                idx = cur[num] -#                print "idx='%s'" % idx -#                #print_code() -                 -                 -             - -    #print "ARE WE SYNCED HERE?" -    #print_code() - -    #  [Malik] end logic -    start_level = find_cur_level(stmt, start_loop) # start_loop was passed parameter! - -    # We should hold constant any block or tile loop -    block_idxs  = chill.block_indices() -    thread_idxs = chill.thread_indices() -    #print"\nblock indices are" -    #for index, val in enumerate(block_idxs): -    #    print "%d\t%s" % ( int(index)+1 , val ) -    #print"\nthread indices are" -    #for index, val in enumerate(thread_idxs): -    #    print "%d\t%s" % ( int(index)+1 , val ) -    #print "\nStart Level: %d" % start_level - -    hold_constant = [] -    #print("\n Now in Blocks") -    for idx in block_idxs: -        blocklevel = find_cur_level(stmt,idx) -        if blocklevel >= start_level: -           hold_constant.append(idx) -           #print "\nJust inserted block %s in hold_constant" %idx - -    #print("\n Now in Threads") -    for idx in thread_idxs: -        blocklevel = find_cur_level(stmt,idx) -        if blocklevel >= start_level: -            hold_constant.append(idx) -            #print "\nJust inserted thread %s in hold_constant" %idx -    #print "\nhold constant table is: " -    #for index, val in enumerate(hold_constant): -    #    print "%d\t%s" % ( int(index)+1 , val ) -     -    #print("\nbefore datacopy pvt") -    old_num_stmts = chill.num_statements() -    #sys.stdout.flush() - -    #print "\n[DataCopy]datacopy_privatized(%d, %s, %s, " % (stmt, start_loop, array_name), -    #print hold_constant, -    #print ")" -    passtoC = [stmt, start_loop, array_name ] # a list -    passtoC.append( len(hold_constant ) ) -    for h in hold_constant: -        passtoC.append( h ) -    chill.datacopy_privatized( tuple( passtoC )) -    sys.stdout.flush() -    sys.stdout.flush() -     -    new_num_statements = chill.num_statements() -    #print "new num statements %d" % new_num_statements     - -    # Unroll to the last thread level -#    for stmt in range(old_num_statements, new_num_statements): -#        print "unrolling statement %d" % stmt -#        level = find_cur_level(stmt,thread_idxs[-1]) #get last thread level -#        print "level is %d" % level -#        idxs = chill.cur_indices(stmt) -#        if level < len(idxs): -#            chill.unroll(stmt,level+1,0) - - - -def copy_to_shared( start_loop, array_name, alignment ): -    #print "\nstarting copy to shared( %s, %s, %d)" % (start_loop, array_name, alignment )  -    #print "copy_to_shared( %s, %s, %d) in cudaize.py" % ( start_loop, array_name, alignment ) -    stmt = 0 # assume statement 0 - -    cur = chill.cur_indices(stmt) -    #print "Cur indices ", -    #print_array( cur ) - -    start_level = find_cur_level( stmt, start_loop ) -    #print "start_level %d" % start_level - -    old_num_statements = chill.num_statements() -    #print "old_num_statements %d" % old_num_statements -     - -    # Now, we give it indices for up to two dimensions for copy loop -    copy_loop_idxs = ["tmp1","tmp2"] -    #chill.datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs, False, 0, 1, alignment,True) -    passtoC = [stmt, start_level, array_name]   # a list -    passtoC.append( len(copy_loop_idxs)) -    for i in copy_loop_idxs: -        passtoC.append(i) -    passtoC.append( 0 ) # False -    passtoC.append( 0 ) -    passtoC.append( 1 ) -    passtoC.append( alignment ) -    passtoC.append( 1 )   # True -    #print "\n[DataCopy]datacopy( ", -    #print passtoC, -    #print ")" - -    #if array_name == "b": -    #    chill.cheat(1) -    #if array_name == "c": -    #    chill.cheat(2) -     -    chill.datacopy_9arg( tuple( passtoC )) - -    #print "back from datacopy_9arg\n\n\n" -    #sys.stdout.flush() - - -    #print "calling add_sync( %d, %s )" % ( stmt, start_loop ) -    chill.add_sync( stmt, start_loop ) -    #print "back from add_sync()\n\n" - -    new_num_statements = chill.num_statements() -     -    #  This is fairly CUBLAS2 specific, not sure how well it generalizes, -    #  but for a 2D copy, what we want to do is "normalize" the first loop -    #  "tmp1" then get its hard upper bound. We then want to tile it to -    #  make the control loop of that tile "ty". We then tile "tmp2" with a -    #  size of 1 and make it "tx". - -    #print "fairly CUBLAS2 specific, OLD %d  NEW %d" % ( old_num_statements, new_num_statements) -    sys.stdout.flush() -    sys.stdout.flush() - -    for stmt in range(old_num_statements, new_num_statements): -        #print "for stmt = %d" % stmt -        level = find_cur_level( stmt, "tmp2") -        #print "FOUND CUR LEVEL?  level '", -        #print level, -        #print "'" - -        #print "in loop, stmt %d   level %d" % ( stmt, level ) -        if level != -1: -            #print "\nCopy to shared: [If was no error]\n" -            find_cur_level(stmt,"tmp2") -            chill.tile3( stmt, level, level ) -             -            #print "hard_loop_bounds( %d, %d )" % (stmt, level) -            bounds = chill.hard_loop_bounds(stmt, level) -            lower = bounds[0] -            upper = 1+ bounds[1] -            #print "lower %d  upper %d" % ( lower, upper ) - -            dims = chill.thread_dims() -            #print "in cudaize.py copy_to_shared, dims =", -            #print dims -            tx = dims[0] -            ty = dims[1] -            #print "2-loop cleanup: lower, upper: %d, %d,  tx: %d" % ( lower, upper, tx) - -            level = find_cur_level(stmt,"tmp1") -            #print "level %d" % level -            if tx == upper and ty == 1: -                #print "tx = %d    upper = %d     ty = %d"% (tx, upper, ty) -                #print "Don't need" - -                # Don't need an extra tile level, just move this loop up -                second_level = find_cur_level(stmt,"tmp2") -                chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted) - -            else: -                #print "DO need?" -                if ty == 1: -                    new_ctrl = "tmp3"  -                else: -                    new_ctrl = "ty" - -                # LOTS of commented out code here in cudaize.lua  - -                #print_code() -                #print "\nStarting tmp2\n" -                first_level  = find_cur_level(stmt,"tmp1") -                second_level = find_cur_level(stmt,"tmp2") -                bounds = chill.hard_loop_bounds(stmt, second_level) -                lower = bounds[0] -                upper = 1 + bounds[1]   # BROKEN? -                         -                #print "[Malik]-loop cleanup@tmp2: lower, upper: %d, %d, tx: %d,first level:%d,second_level:%d" % ( lower, upper-1, tx, first_level, second_level)  - -                # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx. -                #print "\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, second_level,1,first_level, "tx", "tx") -                chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted) -                #print_code() - -                first_level = find_cur_level(stmt,"tmp1") -                bounds = chill.hard_loop_bounds(stmt, first_level) -                lower_1 =     bounds[0] -                upper_1 = 1 + bounds[1] -                tx_level = find_cur_level(stmt,"tx") -                bounds = chill.hard_loop_bounds(stmt,tx_level) -                lower_tx =   bounds[0] -                upper_tx = 1+bounds[1] -                #print "UL_1 %d %d     UL_tx %d %d" % ( lower_1, upper_1-1, lower_tx, upper_tx-1) - -                if int(math.ceil( float(upper_tx)/float(tx))) > 1: -                     #print "ceil I say" -                     #print "\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, tx_level,tx,tx_level, "tx", "tmp1") -                     chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted) -                     #print_code() - -                     repeat = find_cur_level(stmt,"tx") -                     #print "\n[Tile1]tile(%d, %d, %d)" % (stmt, repeat, repeat) -                     chill.tile3(stmt, repeat, repeat)  #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx")) -                     #print_code() - -                     if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"): -                        #print "\nagain [Tile1]tile(%d, %d, %d)" % (stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) -                        chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) -                        #print_code() - -                #print_code() - -                #print "\nStarting tmp1\n" -                # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty". -                chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))       -                #print_code() - -                ty_level = find_cur_level(stmt,"tmp1") -                bounds = chill.hard_loop_bounds(stmt,ty_level) -                lower_ty = bounds[0] -                upper_ty = 1 + bounds[1] - -                tx_level = find_cur_level(stmt,"tx") -                bounds = chill.hard_loop_bounds(stmt,tx_level) -                lower_tx = bounds[0] -                upper_tx = 1 + bounds[1] - -                #print "[Malik]-loop cleanup@tmp1: lowerty, upperty: %d, %d, ty: %d,ty level:%d,tx_level:%d, stmt: %d" % ( lower_ty, upper_ty-1, ty, ty_level, tx_level, stmt) -                 -                #print "before ceil" -                #sys.stdout.flush() - -                if(math.ceil(float(upper_ty)/float(ty)) > 1): -                    #print "CEIL IF" -                    #print "\n Inside upper_ty/ty > 1\n" - -                    #print "\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, ty_level,ty,ty_level, "ty", "tmp_ty") -                    chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted) -                    #print_code() - -                    #print "\n[Tile2-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt  ,"ty"),find_cur_level(stmt,"ty")) -                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty")) -                    #print_code() - -                    cur_idxs = chill.cur_indices(stmt) -                    #print "\n cur indexes are ", -                    #print_array( cur_idxs) -                    #sys.stdout.flush() - -                    # Putting ty before any tmp_tx -                    idx_flag = -1 -                    if "tmp_tx" in cur_idxs: -                        idx_flag = 1 + cur_idxs.index("tmp_tx")   # lua index starts at 1 -                    #print "\n (1) so i have found out the value of idx flag as %d" % idx_flag -                    #sys.stdout.flush()       -                     -                    if idx_flag >= 0: -                         if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"): -                             #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) -                             chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) -                             #print_code() -                     -                     -                    #  Now Putting ty before any tmp_ty -                    sys.stdout.flush()       -                    idx_flag = -1 -                    if "tmp_ty" in cur_idxs: -                        idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1 -                    #print "\n IF  so i have found out the value of idx flag as %d" % idx_flag -                    #sys.stdout.flush()       -                                             -                    if idx_flag >= 0: -                        #print "one more test" -                        sys.stdout.flush() -                        if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"): -                            #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) -                            #sys.stdout.flush() -                            chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) -                            #print_code() - - - -                else: -                    #print "CEIL ELSE" -                    #print "\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, ty_level,1,ty_level, "ty", "ty") -                    #sys.stdout.flush() -                    chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted ) -                    #print_code() - -                    #print "\n[Tile3-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) -                    sys.stdout.flush() - -                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) -                    #print_code() - - -                    idx_flag = -1 -                    # LUA code checks to see if cur_idxs exists?  it is unused except in the other clause of this is -                    #if(cur_idxs) then -                        #print "CAN NEVER GET HERE?  cur_idxs" -                        #for num= 0,table.getn(cur_idxs) do -                            #if(cur[num] == "tmp_ty") then -                            #idx_flag = find_cur_level(stmt,cur[num]) -                            #break -                        #end -                    #end -                    print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag -                    if idx_flag >= 0:  # can't happen -                        print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) -                        #chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) -                     -                         -                     - -                     -            #print "\n\n *** at bottom of if in copy to shared, " -            #print_code() -            #print "end of if" - -        else: -            #  copy to shared only created one level, not two, so we use a different approach (MV & TMV) -            #print "\nCopy to shared: [If was error]\n" -            level = find_cur_level(stmt,"tmp1") -            chill.tile3(stmt, level, level) - -            dims = chill.thread_dims() -            #print dims -            tx = dims[0] -            ty = dims[1] - -            bounds = chill.hard_loop_bounds(stmt, level) -            lower = bounds[0]    -            upper = bounds[1] - -            #print "bounds  lower %d    upper %d" % (lower, upper) -            upper = upper+1 # upper bound given as <=, compare to dimensions tx which is < -            if upper == tx: -                #print "upper == tx" -                chill.rename_index( stmt, "tmp1", "tx") -            else: -                #print "upper is not tx" -                #print "upper %d tx %d stmt: %d level: %d" % ( upper, tx, stmt, level) -                chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted) -                #print_code() - -                #print "stmt:%d level+1: %d" % ( stmt, level+1)  -                #print("TILE 7") -                chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted) -                #print("TILE 3") -                chill.tile3( stmt, level+1, level) -                #print_code()            - - -                if ty > 1: -                   #print "GOING IN" -                   bounds = chill.hard_loop_bounds(stmt, level+1) -                   lower = bounds[0]    -                   upper = bounds[1]    -                   #print "ty %d  lower %d  upper %d" % ( ty, lower, upper ) -                   floatdiv = float(upper)/float(ty) -                   bound =  int(math.ceil(float(upper)/float(ty))) -                   #print "NOW FOR Y: upper %d ty %d stmt: %d level: %d bound: %d" % ( upper, ty, stmt, level+1,   bound) -                   chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted) - -        # Always add sync -        chill.add_sync( stmt, start_loop ) -    #print "ending copy to shared\n" -    #sys.stdout.flush() -    #print_code()      - - - - - - - - - - - - - - - - - - - -def unroll_to_depth( max_depth ): -    print "\n\nunroll_to_depth(%d)" % max_depth -    print "SYNC UP" -    sys.stdout.flush() - -    cur = chill.cur_indices(0) -    thread_idxs = chill.thread_indices() -    guard_idx = thread_idxs[-1]  # last one - -    print "cur    indices", -    print_array(cur) -    print "thread indices",  -    print_array(thread_idxs) -    print "guard_idx = %s" % guard_idx - -    #print "thread_idxs = ", -    #print thread_idxs -    guard_idx = thread_idxs[-1] -    #print "guard_idx = %s" % guard_idx - -    #  HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS -    common_loops = [] -    comm_loops_cnt = 0 -    num_stmts = chill.num_statements() -    print "num statements %d" % num_stmts - -    for stmt in range(num_stmts): -        sys.stdout.flush() -        print "\nSTMT %d" % stmt, -        cur_idxs = chill.cur_indices(stmt) -        print "Current Indices:", -        for c in cur_idxs[:-1]: -            print "%s," % c, -        print "%s" % cur_idxs[-1]   # last one -        sys.stdout.flush() -        #print_code() -         -        if chk_cur_level(stmt, "tx") > 0: -             -            for ii in range(find_cur_level(stmt,"tx")-1): -                print "ii = %d\ncur_idxs[%d] = '%s'" % (ii+1, ii+1, cur_idxs[ii]) # print to match lua -                id = cur_idxs[ii] -                if id not in ["bx", "by", "", "tx", "ty"]: - -                    print "id %s is not in the list" % id - -                    for stmt1 in range(stmt+1, num_stmts): -                        print "\nii %d stmt1 is %d" % (ii+1, stmt1)  # print to match lua  -                        cur_idxs1 = chill.cur_indices(stmt1) -                        print "\nstmt1 cur_idxs1 is ", -                        for ind in cur_idxs1[:-1]: -                            print "%s," % ind, -                        print "%s" % cur_idxs1[-1] - -                        print "cur level(%d, %s) = %d" % (stmt, "tx", find_cur_level(stmt,"tx") ) -                        sys.stdout.flush() - -                        endrange = find_cur_level(stmt,"tx")-1 -                        print "for iii=1, %d do" % endrange -                        sys.stdout.flush() -                        for iii in range(endrange):   # off by one?  TODO  -                            print "stmt %d   ii %d   iii %d\n" % (stmt, ii+1, iii+1), -                            sys.stdout.flush() -                             -                            if iii >= len(cur_idxs1): -                                print "stmt %d   ii %d   iii %d  cur_idxs1[%d] = NIL" % (stmt, ii+1, iii+1, iii+1, )  # print to match lua  -                            else: -                                print "stmt %d   ii %d   iii %d  cur_idxs1[%d] = '%s'" % (stmt, ii+1, iii+1, iii+1, cur_idxs1[iii])  # print to match lua  -                            sys.stdout.flush() - -                            # this will still probably die  -                            if iii < len(cur_idxs1) and [iii] not in ["bx", "by", "tx", "ty", ""]: -                                if cur_idxs[ii] == cur_idxs1[iii]: -                                    print "\nfound idx:%s" % cur_idxs[ii] -                                    common_loops.append(cur_idxs[ii]) -                                    print "cl[%d] = '%s'" % ( comm_loops_cnt, cur_idxs[ii] ) -                                    comm_loops_cnt = len(common_loops) - -    if len(common_loops) > 0: -        print "\n COMM LOOPS :TOTAL %d, and are " % comm_loops_cnt, -        print common_loops,  -        print " this loop : %s" % common_loops[0] -    else: -        print "UNROLL can't unroll any loops?" - - -    while True:  # break at bottom of loop   (repeat in lua) -        old_num_statements = chill.num_statements() -        print "old_num_statements %d" % old_num_statements - -        for stmt in range(old_num_statements): -            cur_idxs = chill.cur_indices(stmt) -            print "stmt %d    cur_idxs =" % stmt, -            index = 0 -            for i in cur_idxs: -                index +=1 -                if index == len(cur_idxs): -                    print "%s" %i -                else: -                    print "%s," % i, - -            if len(cur_idxs) > 0: -                guard_level = -1 -                if chk_cur_level(stmt, guard_idx) > 0: -                    guard_level = find_cur_level(stmt,guard_idx) -                print "guard_level(sp) = %d" % guard_level -                if guard_level > -1: -                    level = next_clean_level(cur_idxs,guard_level) -                    print "next clean level %d" % level - -                     -                    #print "looking at %d" % stmt -                    #print "comparing %d and %d in" % (guard_level, level), -                    #index = 0 -                    #for i in cur_idxs: -                    #index +=1 -                    #if index == len(cur_idxs): -                    #    print "%s" %i -                    #else: -                    #    print "%s," % i, - -                    # need to handle max_depth -                    num_unrolled = 0 -                    level_unroll_comm = level -                    level_arr = [] - -                    #print "before while, level = %d" % level  -                    while level >= 0: -                        print "while: level = %d" % level  -                        if num_unrolled == max_depth: -                            break - -                        print "Unrolling %d at level %d index %s" % ( stmt, level, cur_idxs[guard_level])  # ???  -                        level_arr.append(level) - -                        guard_level = find_cur_level(stmt,guard_idx) -                        level = next_clean_level(cur_idxs,level+1) - -                    print "OK, NOW WE UNROLL" -                    if level_unroll_comm >= 0: -                        level_arr.reverse()   -                        for i,lev in enumerate(level_arr): -                            print "\ni=%d" % i -                            print "[Unroll]unroll(%d, %d, 0)" % (stmt, lev) -                            chill.unroll(stmt, lev, 0) - - -        new_num_statements = chill.num_statements() -        if old_num_statements == new_num_statements: -            break  # exit infinite loop - - -#  all other calls to C have a routine in this file   (?) -def unroll( statement, level, unroll_amount ): -    chill.unroll( statement, level, unroll_amount ) - diff --git a/test-chill/test-cases/examples/cuda-chill/mm.c b/test-chill/test-cases/examples/cuda-chill/mm.c deleted file mode 100644 index 0efbeeb..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mm.c +++ /dev/null @@ -1,10 +0,0 @@ -#define N 1024 - -void normalMM(float c[N][N], float a[N][N], float b[N][N]) { -  int i, j, k; - -  for (i = 0; i < N; i++) -    for (j = 0; j < N; j++) -      for (k = 0; k < N; k++) -        c[j][i] = c[j][i] + a[k][i] * b[j][k]; -} diff --git a/test-chill/test-cases/examples/cuda-chill/mm.lua b/test-chill/test-cases/examples/cuda-chill/mm.lua deleted file mode 100644 index 5bde1b0..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mm.lua +++ /dev/null @@ -1,38 +0,0 @@ -init("mm.c", "normalMM", 0) -dofile("cudaize.lua") -N=1024 -Ti=128 -Tj=64 -Tk=16 -Tii=16 -Tjj=16 - - - - -N=1024 - - - - - - - - - - - - - -tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k"})CU=1 - -tile_by_index({"k"},{Tk},{l1_control="kk"},{"ii","jj","kk","i","j","k"})CU=3 - -tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","kk","i","iii","j","jjj","k"},1)CU=2 - -cudaize("mm_GPU",{a=1048576,b=1048576,c=1048576},{block={"ii","jj"}, thread={"i","j"}})CU=2 -copy_to_shared("tx","a",-16) -copy_to_shared("tx","b",-16) -copy_to_registers("kk","c") ---print_code() -unroll_to_depth(2) diff --git a/test-chill/test-cases/examples/cuda-chill/mpeg4.c b/test-chill/test-cases/examples/cuda-chill/mpeg4.c deleted file mode 100755 index 7f83bf7..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mpeg4.c +++ /dev/null @@ -1,23 +0,0 @@ -#define N1 4096 -#define N2 4096 -#define WINDOW_SIZE 16 - -void mpeg4_cpu(float result[N1][N2], float prev[N2+WINDOW_SIZE][N2+WINDOW_SIZE], float  curr[WINDOW_SIZE*WINDOW_SIZE]) -{ -	unsigned int i; -	unsigned int j; -	unsigned int k; -	unsigned int l; - -	for ( i = 0; i < N1; ++i)     -		for ( j = 0; j < N2; ++j)  -                       for ( k = 0; k < WINDOW_SIZE; ++k)  -				for ( l = 0; l < WINDOW_SIZE; ++l)  -					result[i][j] += prev[i+k][j+l] * curr[k*WINDOW_SIZE+l]; -				 -			 - -		 -	 -} - diff --git a/test-chill/test-cases/examples/cuda-chill/mpeg4.lua b/test-chill/test-cases/examples/cuda-chill/mpeg4.lua deleted file mode 100644 index f025dc0..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mpeg4.lua +++ /dev/null @@ -1,45 +0,0 @@ ---CUBLAS 2 MM Multiply - ---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you ---call init() and use global variables to specify procedure and loop - ---Second parameter is procedure # and third is loop # -init("mpeg4.c", "mpeg4_cpu", 0)  - ---dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods - -N=4096 -M=4096 -W=16 - ---TI 4ust be <= M ---TJ must be <=TI -Ti=32 -Tj=32 -Tii=16 -Tjj=16 -Tk=4 ---permute(0,{"j","i","k","l"}) -tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k","l"}) ---tile_by_index({"k","l"},{Tk*2,Tk*2},{l1_control="kk",l2_control="ll"},{"ii","jj","kk","ll","i","j","k","l"}) ---print_code() ---tile_by_index({"k","l"},{Tk,Tk},{l1_control="kk",l2_control="ll"},{"ii","jj","i","j","kk","k","ll","l"}) -tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","iii","i","jjj","j","k","l"}) ---print_code() ---normalize_index("j") ---normalize_index("i") ---print_code() -cudaize("kernel_GPU",{curr=W*W,prev=(N+W)*(M+W),result=N*M},{block={"ii","jj"}, thread={"i","j"}}) ---print_code() -copy_to_shared("iii","prev",16) - -copy_to_registers("jjj","result") - ---print_code() ---copy_to_constant_no_tile("curr") -unroll_to_depth(2) -print_code() -print_space() - - diff --git a/test-chill/test-cases/examples/cuda-chill/mriq-fh.c b/test-chill/test-cases/examples/cuda-chill/mriq-fh.c deleted file mode 100755 index 1e924b7..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mriq-fh.c +++ /dev/null @@ -1,38 +0,0 @@ -#define X 32768 -#define K 256 -struct kValues { -  float Kx; -  float Ky; -  float Kz; -  float PhiMag; -}; -extern float sin(float); -extern float cos(float); - -void mriFH_cpu(float *rPhi,float *rRho,float *iRho, float *iPhi, float *rD, float *iD, float *kx, float *ky, float *kz, float *dx, float *dy, float *dz, float *rFHref, float *iFHref) -{ - -    	float rfh; -	float ifh; -	float exp; -	float cArg; -	float sArg; -    	//float rRho[K]; -	//float iRho[K]; -        unsigned int k; -	unsigned int x; -  -       -    for (x = 0; x < X; ++x) { -        for (k = 0; k < K; ++k) { -             -	       exp = 2 * 3.14159 * (kx[k]* dx[x] + ky[k]* dy[x] + kz[k]* dz[x]); -	       cArg = cos(exp); -	       sArg = sin(exp); -            rFHref[x] += rRho[k]* cArg - iRho[k]* sArg; -            iFHref[x] += iRho[k]*cArg + rRho[k]*sArg; -        } -          -    } -} - diff --git a/test-chill/test-cases/examples/cuda-chill/mriq-fh.lua b/test-chill/test-cases/examples/cuda-chill/mriq-fh.lua deleted file mode 100755 index 3277bac..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mriq-fh.lua +++ /dev/null @@ -1,73 +0,0 @@ ---CUBLAS 2 MM Multiply - ---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you ---call init() and use global variables to specify procedure and loop - ---Second parameter is procedure # and third is loop # -init("mriq-fh.c", "mriFH_cpu", 0)  - -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, -                      --copy_to_shared methods -N=32768 -M=256 -Tx=256 - - -print_code() ---permute(0,{"j","i"}) ---tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"}) -tile_by_index({"x"},{Tx},{l1_control="xx"},{"xx","x","k"}) ---tile_by_index({"x"},{16},{l1_control="xx1"},{"xx","x","xx1","k"}) ---tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) -print_code() - -normalize_index("x") ---normalize_index("i") -print_code() ---tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"}) ---print_code() ---cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}}) -cudaize("kernel_GPU",{dx=N,dy=N,dz=N,iRho=M,kx=M,ky=M,kz=M,rFHref=N,iFHref=N,rRho=M},{block={"xx"}, thread={"x"}}) ---copy_to_shared("tx","iRho",-16) ---copy_to_shared("tx","dz",1) ---copy_to_shared("tx","rRho",-16) ---copy_to_registers("tx","rFHref") ---copy_to_registers("tx","rRho") ---copy_to_registers("tx","iRho") ---copy_to_registers("tx","kx") ---copy_to_registers("tx","dx") ---copy_to_registers("tx","ky") ---copy_to_registers("tx","dy") ---copy_to_registers("tx","kz") ---copy_to_registers("tx","dz") ---copy_to_registers("tx","iFHref") ---copy_to_texture("rRho") ---copy_to_texture("kx") ---copy_to_texture("dx") ---copy_to_texture("ky") ---copy_to_texture("dy") ---copy_to_texture("kz") ---copy_to_texture("dz") ---copy_to_texture("iRho") ---print_code()--]] ---unroll(0,4,0) ---copy_to_constant_no_tile("kx") ---copy_to_constant_no_tile("ky") ---copy_to_constant_no_tile("kz") ---copy_to_constant_no_tile("rRho") ---copy_to_constant_no_tile("iRho") - ---unroll_to_depth(1) -print_code() ---[[ -copy_to_Texture("rRho") -copy_to_Texture("kx") -copy_to_Texture("dx") -copy_to_Texture("ky") -copy_to_Texture("dy") -copy_to_Texture("kz") -copy_to_Texture("dz") -copy_to_Texture("iRho") ---unroll_to_depth(2) ---]] diff --git a/test-chill/test-cases/examples/cuda-chill/mriq.c b/test-chill/test-cases/examples/cuda-chill/mriq.c deleted file mode 100644 index ba4b87c..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mriq.c +++ /dev/null @@ -1,33 +0,0 @@ -#define N 32768 -#define M 3072 -struct kValues { -  float Kx; -  float Ky; -  float Kz; -  float PhiMag; -}; -extern float sinf(float); -extern float cosf(float); - -void -ComputeQCPU(int numK, int numX,struct kValues kVals[M],float x[N], float y[N], float z[N],float Qr[N], float Qi[N]) { -  float expArg; -  float cosArg; -  float sinArg; -  float phi; -  int i; -  int j; -  numK = M; -  numX = N; -  for ( i = 0; i < M; i++) { -    for ( j = 0; j < N; j++) { -      expArg = 6.2831853071795864769252867665590058f * (kVals[i].Kx * x[j] +kVals[i].Ky * y[j] +kVals[i].Kz * z[j]); -      cosArg = cosf(expArg); -      sinArg = sinf(expArg); -      phi = kVals[i].PhiMag; -      Qr[j] += phi * cosArg; -      Qi[j] += phi * sinArg; -    } -  } -} -   diff --git a/test-chill/test-cases/examples/cuda-chill/mriq.lua b/test-chill/test-cases/examples/cuda-chill/mriq.lua deleted file mode 100644 index 1170111..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mriq.lua +++ /dev/null @@ -1,55 +0,0 @@ ---CUBLAS 2 MM Multiply - ---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you ---call init() and use global variables to specify procedure and loop - ---Second parameter is procedure # and third is loop # -init("mriq.c", "ComputeQCPU", 0)  - -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, -                      --copy_to_shared methods -N=32768 -M=3072 -TI=128 -TJ=128 - -permute(0,{"j","i"}) ---tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"}) -tile_by_index({"i"}, {TJ}, {l1_control="ii",l1_tile="i"}, {"ii", "j","i"}) -tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) ---print_code() - -normalize_index("j") -normalize_index("i") ---print_code() ---tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"}) ---print_code() -cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}}) - -copy_to_shared("tx","kVals",1) ---copy_to_shared("tx","x",1) ---copy_to_shared("tx","y",1) ---copy_to_shared("tx","z",1) - ---copy_to_texture("kVals") ---datacopy(0, 3, "kVals", {"tt","t"},false,0,1,-16,true) ---print_code() ---datacopy_privatized(0,"tx","kVals",{"tx"}) ---copy_to_registers("tx","kVals") -copy_to_registers("ii","x") -copy_to_registers("ii","y") -copy_to_registers("ii","z") -copy_to_registers("ii","Qi") -copy_to_registers("ii","Qr") ---[[datacopy_privatized(0,"tx","x",{"tx"}) -datacopy_privatized(0,"tx","y",{"tx"}) -datacopy_privatized(0,"tx","z",{"tx"}) -datacopy_privatized(0,"tx","Qi",{"tx"}) -datacopy_privatized(0,"tx","Qr",{"tx"}) - - -]]-- ---unroll(0,5,64) -print_code() ---unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels diff --git a/test-chill/test-cases/examples/cuda-chill/mv-shadow.c b/test-chill/test-cases/examples/cuda-chill/mv-shadow.c deleted file mode 100644 index 582b187..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mv-shadow.c +++ /dev/null @@ -1,9 +0,0 @@ -#define N 1024 - -void normalMV(float c[N][N], float a[N], float b[N]) { -  int i, j; - -  for (i = 0; i < N; i++) -    for (j = 0; j < N; j++) -      a[i] = a[i] + c[j][i] * b[j]; -} diff --git a/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua b/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua deleted file mode 100644 index 43e8491..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua +++ /dev/null @@ -1,65 +0,0 @@ -init("mv-shadow.c","normalMV",0) -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, -                      --copy_to_shared methods - -N=129 -TI=32 -TJ=64 - -N=1024 -TI=16 - - - - - - - - - - - - - - - - ---Tile the i and j loop, introducing "ii" as the control loop for the "i" ---tile, "k" for the control loop fo the "j" tile, with the final order ---of {"ii", "k", "i", "j"} -tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) ---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) ---print_code() ---Normalize indx will do a tile size of one over the loop level specified ---by the input index. This is useful to get a zero lower bound and hard ---upper bound on a loop instead of it being relative to previous loop ---levels. ---normalize_index("ii") -normalize_index("i") -print_code() - ---Cudaize now determines the grid dimentions from the loops themselves ---(the upper bounds of the block and thread loops). It also renames the ---given block and thread loops's indexes to the approviate values from ---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the ---size of the arrays to be copied in the CUDA scaffolding. -cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}}) ---print_code() - ---Does a datacopy, tile, and add_sync to get a shared memory copy - ---copy_to_shared("tx", "b", 1) ---copy_to_shared("tx", "c", -16) ---print_code() ---copy_to_texture("b") ---copy_to_texture("c") -copy_to_registers("k", "a") ---print_code() - -unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels ---copy_to_texture("b") ---print_code() ---unroll(0,5,0) ---print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/mv.c b/test-chill/test-cases/examples/cuda-chill/mv.c deleted file mode 100644 index 582b187..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mv.c +++ /dev/null @@ -1,9 +0,0 @@ -#define N 1024 - -void normalMV(float c[N][N], float a[N], float b[N]) { -  int i, j; - -  for (i = 0; i < N; i++) -    for (j = 0; j < N; j++) -      a[i] = a[i] + c[j][i] * b[j]; -} diff --git a/test-chill/test-cases/examples/cuda-chill/mv.lua b/test-chill/test-cases/examples/cuda-chill/mv.lua deleted file mode 100644 index ca54501..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mv.lua +++ /dev/null @@ -1,65 +0,0 @@ -init("mv.c","normalMV",0) -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, -                      --copy_to_shared methods - -N=129 -TI=32 -TJ=64 - -N=1024 - - - - - - - - - - - - - - - - ---Tile the i and j loop, introducing "ii" as the control loop for the "i" ---tile, "k" for the control loop fo the "j" tile, with the final order ---of {"ii", "k", "i", "j"} -tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) ---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) ---print_code() ---Normalize indx will do a tile size of one over the loop level specified ---by the input index. This is useful to get a zero lower bound and hard ---upper bound on a loop instead of it being relative to previous loop ---levels. ---normalize_index("ii") -normalize_index("i") -print_code() - ---Cudaize now determines the grid dimentions from the loops themselves ---(the upper bounds of the block and thread loops). It also renames the ---given block and thread loops's indexes to the approviate values from ---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the ---size of the arrays to be copied in the CUDA scaffolding. -cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}}) - ---print_code() - ---Does a datacopy, tile, and add_sync to get a shared memory copy - ---copy_to_shared("tx", "b", 1) ---copy_to_shared("tx", "c", -16) ---print_code() ---copy_to_texture("b") ---copy_to_texture("c") -copy_to_registers("k", "a") ---print_code() - -unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels ---copy_to_texture("b") ---print_code() ---unroll(0,5,0) ---print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/mv_try.c b/test-chill/test-cases/examples/cuda-chill/mv_try.c deleted file mode 100644 index 7781f3b..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mv_try.c +++ /dev/null @@ -1,9 +0,0 @@ -#define N 4096 - -void normalMV(int n, float c[N][N], float a[N], float b[N]) { -  int i, j; - -  for (i = 0; i < n; i++) -    for (j = 0; j < n; j++) -      a[i] = a[i] + c[i][j] * b[j]; -} diff --git a/test-chill/test-cases/examples/cuda-chill/mv_try.lua b/test-chill/test-cases/examples/cuda-chill/mv_try.lua deleted file mode 100644 index db4d9ad..0000000 --- a/test-chill/test-cases/examples/cuda-chill/mv_try.lua +++ /dev/null @@ -1,14 +0,0 @@ -init("mv_try.c","normalMV",0) -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, -                      --copy_to_shared methods - -TI=96 - -N=4096 - - -tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) -cudaize("mv_GPU", {a=N, b=N, c=N*N}, -        {block={"ii"}, thread={"i"}}) - -print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/nbody.c b/test-chill/test-cases/examples/cuda-chill/nbody.c deleted file mode 100644 index 57899b6..0000000 --- a/test-chill/test-cases/examples/cuda-chill/nbody.c +++ /dev/null @@ -1,66 +0,0 @@ -#define NBODIES 16384 -#define SOFTENINGSQUARED 0.01f -#define DELTATIME 0.001f -#define DAMPING 1.0f - -#define NBLOCKSY 1 -#define NBLOCKSX (NBODIES/NTHREADSX) -#define NTHREADSY 1  -#define NTHREADSX 64 - -#define BLOCKSIZE 128 - -#define SHARED 1 -#define TIMER 1 -#define VERIFY 1 - -extern float sqrtf(float); - -void nbody_cpu(float* oldpos,float* oldpos1, float *newpos, float *oldvel, float *newvel, float *force) -{ -    float r0,r1,r2; -    float invDist, invDistCube, mass, invMass; -    unsigned int i,j; -    for(i = 0; i < NBODIES; ++i) { -        //force[i*4  ] = 0; -        //force[i*4+1] = 0; -        //force[i*4+2] = 0; -        //force[i*4+3] = 0; -        for(j = 0; j < NBODIES; ++j) { -	    r0 = oldpos[j*4]-oldpos1[i*4]; -	    r1 = oldpos[j*4+1]-oldpos1[i*4+1]; -	    r2 = oldpos[j*4+2]-oldpos1[i*4+2]; - -	    invDist = 1.0/sqrtf(r0 * r0 + r1 * r1 + r2 * r2 + SOFTENINGSQUARED); -	    invDistCube =  invDist * invDist * invDist; -	    mass = oldpos1[i*4+3]; - -	    force[i*4] = force[i*4] + r0 * mass * invDistCube; -	    force[i*4+1] = force[i*4+1] + r1 * mass * invDistCube; -	    force[i*4+2] = force[i*4+2] + r2 * mass * invDistCube; - -        } -    } - -/*    for (i = 0; i < NBODIES; ++i) { -        invMass = oldvel[4*i+3]; - -        oldvel[4*i] += (force[4*i] * invMass) * DELTATIME * DAMPING; -        oldvel[4*i+1] += (force[4*i+1] * invMass) * DELTATIME * DAMPING; -        oldvel[4*i+2] += (force[4*i+2] * invMass) * DELTATIME * DAMPING; - -        oldpos[4*i] += oldvel[4*i] * DELTATIME; -        oldpos[4*i+1] += oldvel[4*i+1] * DELTATIME; -        oldpos[4*i+2] += oldvel[4*i+2] * DELTATIME; - -        newpos[4*i+0] = oldpos[4*i]; -        newpos[4*i+1] = oldpos[4*i+1]; -        newpos[4*i+2] = oldpos[4*i+2]; -        newpos[4*i+3] = oldpos[4*i+3]; - -        newvel[4*i+0] = oldvel[4*i]; -        newvel[4*i+1] = oldvel[4*i+1]; -        newvel[4*i+2] = oldvel[4*i+2]; -        newvel[4*i+3] = oldvel[4*i+3]; -    }*/ -} diff --git a/test-chill/test-cases/examples/cuda-chill/nbody.lua b/test-chill/test-cases/examples/cuda-chill/nbody.lua deleted file mode 100644 index 08f88a9..0000000 --- a/test-chill/test-cases/examples/cuda-chill/nbody.lua +++ /dev/null @@ -1,53 +0,0 @@ ---CUBLAS 2 MM Multiply - ---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you ---call init() and use global variables to specify procedure and loop - ---Second parameter is procedure # and third is loop # -init("nbody.c", "nbody_cpu" , 0)  - -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, -                     --copy_to_shared methods -NBODIES=16384 - - ---Tj=128 CHANGE FOR BEST..... BEST IS 64BLOCKS 128THREADS ---Ti=256 -Tj=64 -Ti=32 -Tjjj=1 -Tiii=1 -Tn=0.1 ---normalize_index("j") --- ---print_code() ---normalize_index("n") --- TILE COMMANDS ZEROOOOOOOOOOO:3 ---tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j"})--CU=-1 -tile_by_index({"i"},{Ti},{l1_control="ii"},{"ii","i","j"})--CU=-1 ---normalize_index("i") ---tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1 - ---tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3 ---tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"}) ---tile_by_index({"i"}, {Ti/2}, {l1_control="iii"}, {"ii","iii", "jj","i","j"}) ---print_code() -cudaize("kernel_GPU",{oldpos=4*NBODIES,oldpos1=4*NBODIES,oldvel=4*NBODIES,force=4*NBODIES,newpos=4*NBODIES,newvel=4*NBODIES},{block={"ii"}, thread={"i"}})--CU=3 -print_code() ---tile(0,6,6) ---copy_to_shared("tx","oldpos",-16) ---copy_to_registers("j","oldpos") ---copy_to_registers("j","oldpos1") ---copy_to_registers("j","force") - ---copy_to_texture("oldpos") ---tile(1,3,3) ---tile(2,3,3) - -print_code() ---unroll_to_depth(1) --- ---tile(2,3,3) ---unroll(2,3,0) ---unroll(0,5,0) ---print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/tmv-shadow.c b/test-chill/test-cases/examples/cuda-chill/tmv-shadow.c deleted file mode 100644 index cb9ea8d..0000000 --- a/test-chill/test-cases/examples/cuda-chill/tmv-shadow.c +++ /dev/null @@ -1,9 +0,0 @@ -#define N 1024 - -void normalMV(float c[N][N], float a[N], float b[N]) { -  int i, j; - -  for (i = 0; i < N; i++) -    for (j = 0; j < N; j++) -      a[i] = a[i] + c[i][j] * b[j]; -} diff --git a/test-chill/test-cases/examples/cuda-chill/tmv-shadow.lua b/test-chill/test-cases/examples/cuda-chill/tmv-shadow.lua deleted file mode 100644 index 196b939..0000000 --- a/test-chill/test-cases/examples/cuda-chill/tmv-shadow.lua +++ /dev/null @@ -1,50 +0,0 @@ -init("tmv-shadow.c","normalMV",0) -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, -                      --copy_to_shared methods - -N=1024 ---N= 8209 ---N=129 -TI=64 -N=1024 -TI=32 ---tile, "k" for the control loop for the "j" tile, with the final order ---of {"ii", "k", "i", "j"} -tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii",  "i", "j"}) ---print_code() ---tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) - ---print_code() ---Normalize indx will do a tile size of one over the loop level specified ---by the input index. This is useful to get a zero lower bound and hard ---upper bound on a loop instead of it being relative to previous loop ---levels. ---normalize_index("i") ---print_code() - ---Cudaize now determines the grid dimentions from the loops themselves ---(the upper bounds of the block and thread loops). It also renames the ---given block and thread loops's indexes to the approviate values from ---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the ---size of the arrays to be copied in the CUDA scaffolding. -cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}}) - ---print_code() - ---Does a datacopy, tile, and add_sync to get a shared memory copy -copy_to_shared("tx", "b", 1) ---copy_to_texture("b") ---print_code() - -copy_to_shared("tx", "c", -16) ---copy_to_texture("c") ---print_code() - -copy_to_registers("k", "a") -print_code() ---unroll(0,5,0) ---unroll(0,4,0) ---unroll(2,4,16) -unroll_to_depth(1) ---print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/tmv.c b/test-chill/test-cases/examples/cuda-chill/tmv.c deleted file mode 100644 index cb9ea8d..0000000 --- a/test-chill/test-cases/examples/cuda-chill/tmv.c +++ /dev/null @@ -1,9 +0,0 @@ -#define N 1024 - -void normalMV(float c[N][N], float a[N], float b[N]) { -  int i, j; - -  for (i = 0; i < N; i++) -    for (j = 0; j < N; j++) -      a[i] = a[i] + c[i][j] * b[j]; -} diff --git a/test-chill/test-cases/examples/cuda-chill/tmv.lua b/test-chill/test-cases/examples/cuda-chill/tmv.lua deleted file mode 100644 index 5071108..0000000 --- a/test-chill/test-cases/examples/cuda-chill/tmv.lua +++ /dev/null @@ -1,50 +0,0 @@ -init("tmv.c","normalMV",0) -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, -                      --copy_to_shared methods - -N=1024 ---N= 8209 ---N=129 -TI=64 -N=1024 -TI=32 ---tile, "k" for the control loop for the "j" tile, with the final order ---of {"ii", "k", "i", "j"} -tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii",  "i", "j"}) ---print_code() ---tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) - ---print_code() ---Normalize indx will do a tile size of one over the loop level specified ---by the input index. This is useful to get a zero lower bound and hard ---upper bound on a loop instead of it being relative to previous loop ---levels. ---normalize_index("i") ---print_code() - ---Cudaize now determines the grid dimentions from the loops themselves ---(the upper bounds of the block and thread loops). It also renames the ---given block and thread loops's indexes to the approviate values from ---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the ---size of the arrays to be copied in the CUDA scaffolding. -cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}}) - ---print_code() - ---Does a datacopy, tile, and add_sync to get a shared memory copy -copy_to_shared("tx", "b", 1) ---copy_to_texture("b") ---print_code() - -copy_to_shared("tx", "c", -16) ---copy_to_texture("c") ---print_code() - -copy_to_registers("k", "a") -print_code() ---unroll(0,5,0) ---unroll(0,4,0) ---unroll(2,4,16) -unroll_to_depth(1) ---print_code() | 
