summaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
authordhuth <derickhuth@gmail.com>2014-08-28 16:27:13 -0600
committerdhuth <derickhuth@gmail.com>2014-08-28 16:27:13 -0600
commit95f50d6f6f08f603441bb2f4e87dadfc81b46f74 (patch)
tree45d3f63e31915fe26b2afb8411e2e72b9a334011 /examples
parente1d8ec8dfe3dcd25815ac707074b0dfcbe62ad69 (diff)
downloadchill-95f50d6f6f08f603441bb2f4e87dadfc81b46f74.tar.gz
chill-95f50d6f6f08f603441bb2f4e87dadfc81b46f74.tar.bz2
chill-95f50d6f6f08f603441bb2f4e87dadfc81b46f74.zip
added cudaize.py from svn repo
Diffstat (limited to 'examples')
-rwxr-xr-xexamples/cuda-chill/cudaize.py1047
1 files changed, 1047 insertions, 0 deletions
diff --git a/examples/cuda-chill/cudaize.py b/examples/cuda-chill/cudaize.py
new file mode 100755
index 0000000..ffef009
--- /dev/null
+++ b/examples/cuda-chill/cudaize.py
@@ -0,0 +1,1047 @@
+#! /usr/bin/python
+
+# THIS IS CUDAIZE.PY
+
+import chill
+import sys
+import math
+
# Tile-method selectors. `counted` is what this file passes as the last
# argument of chill.tile7; `strided` is not referenced by the code visible
# here (presumably the alternative tile method -- confirm against chill).
strided, counted = 0, 1
+
def print_code():
    """Have chill print the current code, then emit an empty line and flush stdout."""
    chill.print_code()
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print("")
    sys.stdout.flush()
+
+
def table_contains_key(table, key):
    """Return True when `key` is a key of the dict-like `table`.

    Uses the `in` operator rather than dict.has_key(), which only exists in
    Python 2.
    """
    return key in table
+
def print_array(arr):
    """Print the items of `arr` on one line separated by ", " and flush stdout.

    An empty sequence prints an empty line (the previous implementation
    raised IndexError on arr[-1]).
    """
    # "%s" % (item,) formats each item the way the old per-item prints did,
    # while staying safe if an item happens to be a tuple.
    print(", ".join("%s" % (item,) for item in arr))
    sys.stdout.flush()
+
def valid_indices(statement, indices):
    """Return True when every name in `indices` is one of chill's current
    indices for `statement`; False as soon as one is missing.
    """
    known = chill.cur_indices(statement)  # calls into C
    return all(name in known for name in indices)
+
def next_clean_level(indices_at_each_level, level):
    """Return the first 1-based level after `level` whose index name is
    non-empty, or -1 when every later level is a dummy (empty) index.

    Levels are 1-based to match the Lua version of this code, so level L
    lives at list offset L-1.
    """
    print_array(indices_at_each_level)

    # Scanning levels level+1 .. len is the same as offsets level .. len-1.
    for offset in range(level, len(indices_at_each_level)):
        sys.stdout.flush()
        if len(indices_at_each_level[offset]) != 0:
            return offset + 1  # convert the offset back to a 1-based level
    return -1  # no non-dummy indices
+
+
+
+
def build_order(final_order, tile_index_names, control_index_names, tile_index_map, current_level):
    """Return the loop order to use while working on tiling level `current_level`.

    final_order         -- sequence of index names in their final order
    tile_index_names    -- sequence of tile index names, one per level
    control_index_names -- control index name per 0-based level (a dict keyed
                           by level, or any other int-indexable container)
    tile_index_map      -- dict mapping a name in final_order to a substitute
    current_level       -- 0-based level being processed (-1 = before any)

    Each name of final_order is copied to the result unless it is the control
    index of a level deeper than current_level.  A kept name that appears in
    tile_index_map is replaced by its mapped value when that value is one of
    the tile_index_names of a level deeper than current_level.
    """
    first_pending = current_level + 1

    # Names belonging to levels that have not been processed yet.  Indexing
    # (rather than slicing) keeps this working for the dict that
    # tile_by_index passes for control_index_names.
    pending_controls = [control_index_names[j]
                        for j in range(first_pending, len(control_index_names))]
    pending_tiles = [tile_index_names[j]
                     for j in range(first_pending, len(tile_index_names))]

    order = []
    for name in final_order:
        # Control loops below the current level are not part of this order.
        if name in pending_controls:
            continue
        # Possibly substitute a tile index that has not been processed yet.
        if name in tile_index_map:
            substitute = tile_index_map[name]
            if substitute in pending_tiles:
                name = substitute
        order.append(name)
    return order
+
def find_cur_level(stmt, idx):
    """Return the 1-based position of `idx` among chill's current indices for
    `stmt`, or -1 when it is not there.

    Positions start at 1 because the callers were translated from Lua.
    """
    for level, name in enumerate(chill.cur_indices(stmt), 1):
        if name == idx:
            return level
    return -1  # special meaning "it's not there"
+
def chk_cur_level(stmt, idx):
    """Return the 1-based level of `idx` in chill's current indices for
    `stmt`, or -1 when `idx` is not one of them.
    """
    current = chill.cur_indices(stmt)
    try:
        return current.index(idx) + 1  # Lua-style levels start at 1
    except ValueError:
        return -1
+
def find_offset(cur_order, tile, control):
    """Return the signed distance from `tile` to `control` within `cur_order`.

    The result is position(control) - position(tile); when the control comes
    before the tile (a negative distance) one is added to it.  If either
    name is missing a message is printed and the process exits with -1.
    """
    if tile not in cur_order:
        print("find_offset(), unable to find tile %s in current list of indices" % tile)
        sys.exit(-1)
    if control not in cur_order:
        print("find_offset(), unable to find control %s in current list of indices" % control)
        sys.exit(-1)

    distance = cur_order.index(control) - cur_order.index(tile)
    if distance < 0:
        return distance + 1  # control precedes tile ("bad ordering")
    return distance
+
+
+
def _parse_level_key(key, suffix, num_levels):
    """Return the 0-based level encoded in a key such as "l2" + suffix.

    Returns None when the key is not of the form l<digits><suffix> or when
    the 1-based level is outside 1..num_levels.
    """
    if not (key.startswith("l") and key.endswith(suffix)):
        return None
    digits = key[1:len(key) - len(suffix)]
    if not digits.isdigit():
        return None
    level = int(digits)
    if level < 1 or level > num_levels:
        return None
    return level - 1


def tile_by_index(tile_indices, sizes, index_names, final_order, tile_method):
    """Tile each loop named in `tile_indices` of statement 0 and permute the
    loops towards `final_order`.

    tile_indices -- names of the loops to tile; all must be current indices
                    of statement 0, otherwise a message is printed and the
                    process exits with -1
    sizes        -- tile size for each entry of tile_indices
    index_names  -- (key, name) pairs, or a dict of them; key "l<N>_control"
                    names the control loop of the N-th tiled loop and
                    "l<N>_tile" renames its tile loop (N is 1-based).  Keys
                    that do not parse are reported on stdout and ignored.
    final_order  -- index names in the desired final order
    tile_method  -- method passed through to chill.tile7; None selects
                    `counted`
    """
    stmt = 0  # assume statement 0
    if not valid_indices(stmt, tile_indices):
        print("python tile_by_index() one or more of  %s  is not valid" % (tile_indices,))
        sys.exit(-1)

    if tile_method is None:
        tile_method = counted

    tile_index_names = list(tile_indices)  # private copy; entries get renamed below
    control_index_names = {}               # 0-based level -> control index name
    tile_index_map = {}                    # new tile name -> original index name

    pairs = index_names.items() if hasattr(index_names, "items") else index_names
    for pair in pairs:
        control = pair[0]
        name = pair[1]

        level = _parse_level_key(control, "_control", len(tile_indices))
        if level is not None:
            control_index_names[level] = name
            continue

        level = _parse_level_key(control, "_tile", len(tile_indices))
        if level is not None:
            tile_index_names[level] = name
            tile_index_map[name] = tile_indices[level]
            continue

        print("%s is not a proper key for specifying tile or control loop indices\n" % control)

    # Filter out control indices (and substitute the names of unprocessed
    # tile indices) before any level has been tiled.
    cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1)
    chill.permute(tuple([stmt] + cur_order))

    for i, cur_idx in enumerate(tile_indices):
        cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i)

        level = find_cur_level(stmt, cur_idx)
        tile_name = tile_index_names[i]
        control_name = control_index_names[i]

        # Offset between tile loop and control loop:
        #    0 = control loop one level above tile loop
        #   -1 = control loop two levels above tile loop
        #  > 0 = tile loop above control loop
        # In the last case two extra tile commands put the control above the
        # tile, and the permute below handles the rest.
        offset = find_offset(cur_order, tile_name, control_name)

        if offset <= 0:
            chill.tile7(stmt, level, sizes[i], level + offset,
                        tile_name, control_name, tile_method)
        else:
            chill.tile7(stmt, level, sizes[i], level + offset - 1,
                        tile_name, control_name, tile_method)  # regular level
            # flip and tile control loop
            chill.tile3(stmt, level + 1, level + 1)
            chill.tile3(stmt, level + 1, level)

        # Permute to the order computed for this level (build_order depends
        # only on its arguments, so cur_order from above is still current).
        chill.permute(tuple([stmt] + cur_order))
+
def normalize_index(index):
    """Call chill.tile3 on statement 0 with the current level of `index` as
    both level arguments (what this file calls "normalizing" a loop).
    """
    level = find_cur_level(0, index)  # statement 0 is assumed
    chill.tile3(0, level, level)
+
def is_in_indices(stmt, idx):
    """Return True when `idx` is one of chill's current indices for `stmt`."""
    return idx in chill.cur_indices(stmt)
+
def _first_named_level(stmt, indices, first_offset):
    """Return find_cur_level() of the first non-empty name in
    indices[first_offset:], or -1 when there is none.

    first_offset is a 0-based list offset; a negative value indexes from the
    end of the list, as the inline loops this replaces did.
    """
    for offset in range(first_offset, len(indices)):
        if indices[offset] != "":
            return find_cur_level(stmt, indices[offset])
    return -1


def copy_to_registers(start_loop, array_name):
    """Privatize `array_name` for statement 0 via chill.datacopy_privatized.

    start_loop -- name of the loop at which the copy starts
    array_name -- name of the array to copy

    Before the copy, the loops after the "ty" thread loop are re-tiled
    (chill.tile3) so that they sit at the level of "tx".  Every block and
    thread index at or below the level of `start_loop` is passed to chill as
    an index to hold constant.  (That the copy targets registers is implied
    only by the function name -- confirm against chill.)
    """
    stmt = 0  # assume statement 0
    cur = chill.cur_indices(stmt)

    # Levels are 1-based (Lua heritage); -1 means "not present".
    level_tx = find_cur_level(stmt, "tx")
    level_ty = find_cur_level(stmt, "ty")

    # Name used to re-locate the thread-y position after re-tiling: the
    # index just inside "ty" when there is a named one, otherwise "ty"
    # itself.  Because level_ty is 1-based, cur[level_ty] is the entry
    # *after* "ty", so it must be bounds-checked.  With no "ty" loop there
    # is nothing to look up.
    ty_lookup_idx = ""
    if level_ty != -1:
        if level_ty < len(cur) and cur[level_ty] != "":
            ty_lookup_idx = cur[level_ty]
        else:
            ty_lookup_idx = cur[level_ty - 1]
        chill.tile3(stmt, level_ty, level_tx + 1)

    cur = chill.cur_indices(stmt)
    table_size = len(cur)
    if "tx" in cur:
        level_tx = find_cur_level(stmt, "tx")
    if ty_lookup_idx != "" and ty_lookup_idx in cur:
        level_ty = find_cur_level(stmt, ty_lookup_idx)

    ty_lookup = 1
    idx_flag = -1
    # Find the level of the next named index after ty.
    if level_ty > -1:
        idx_flag = _first_named_level(stmt, cur, level_ty + ty_lookup - 1)

    # Count the named levels after idx_flag (all levels when idx_flag is -1).
    how_many_levels = 1
    first_checked = 1 if idx_flag == -1 else idx_flag + 1
    for ch_lev in range(first_checked, table_size + 1):
        if cur[ch_lev - 1] != "":
            how_many_levels += 1

    if how_many_levels < 2:
        while idx_flag >= 0:
            for num in range(level_ty + ty_lookup, table_size + 1):
                if cur[num - 1] != "":
                    idx = cur[num - 1]
                    curlev = find_cur_level(stmt, idx)
                    chill.tile3(stmt, curlev, curlev)
                    curlev = find_cur_level(stmt, idx)
                    chill.tile3(stmt, curlev, level_tx)

                    # The tiling changed the loop nest; refresh everything.
                    cur = chill.cur_indices(stmt)
                    table_size = len(cur)
                    level_tx = find_cur_level(stmt, "tx")
                    level_ty = find_cur_level(stmt, ty_lookup_idx)
                    idx_flag = _first_named_level(stmt, cur, level_ty + ty_lookup - 1)

    start_level = find_cur_level(stmt, start_loop)

    # Hold constant any block or thread loop at or below the start level.
    block_idxs = chill.block_indices()
    thread_idxs = chill.thread_indices()
    hold_constant = [idx for idx in block_idxs
                     if find_cur_level(stmt, idx) >= start_level]
    hold_constant += [idx for idx in thread_idxs
                      if find_cur_level(stmt, idx) >= start_level]

    # The C binding takes one flat tuple:
    # stmt, start loop, array, number of held names, then the names.
    args = [stmt, start_loop, array_name, len(hold_constant)]
    args.extend(hold_constant)
    chill.datacopy_privatized(tuple(args))
    sys.stdout.flush()
+
+
+
def _hoist_above(stmt, inner, outer):
    """If loop `inner` is currently deeper than loop `outer`, tile it to
    `outer`'s level with chill.tile3.
    """
    if find_cur_level(stmt, inner) > find_cur_level(stmt, outer):
        chill.tile3(stmt, find_cur_level(stmt, inner), find_cur_level(stmt, outer))


def _shape_two_level_copy(stmt):
    """Map a two-loop copy ("tmp1" outer, "tmp2" inner) onto "tx"/"ty"."""
    level = find_cur_level(stmt, "tmp2")
    chill.tile3(stmt, level, level)  # normalize tmp2
    # hard_loop_bounds gives an inclusive upper bound; +1 makes it comparable
    # to a thread dimension.
    upper = 1 + chill.hard_loop_bounds(stmt, level)[1]

    dims = chill.thread_dims()
    tx = dims[0]
    ty = dims[1]

    # Move the fastest changing dimension ("tmp2") to the level of "tmp1"
    # and call it "tx".
    first_level = find_cur_level(stmt, "tmp1")
    second_level = find_cur_level(stmt, "tmp2")
    chill.tile7(stmt, second_level, 1, first_level, "tx", "tx", counted)
    if tx == upper and ty == 1:
        return  # no extra tile level needed

    tx_level = find_cur_level(stmt, "tx")
    upper_tx = 1 + chill.hard_loop_bounds(stmt, tx_level)[1]
    if int(math.ceil(float(upper_tx) / float(tx))) > 1:
        chill.tile7(stmt, tx_level, tx, tx_level, "tx", "tmp_tx", counted)
        repeat = find_cur_level(stmt, "tx")
        chill.tile3(stmt, repeat, repeat)
        _hoist_above(stmt, "tx", "tmp_tx")

    # Handle the slower changing dimension, the original outermost loop
    # ("tmp1"), which becomes "ty".
    tmp1_level = find_cur_level(stmt, "tmp1")
    chill.tile3(stmt, tmp1_level, tmp1_level)

    ty_level = find_cur_level(stmt, "tmp1")
    upper_ty = 1 + chill.hard_loop_bounds(stmt, ty_level)[1]
    if math.ceil(float(upper_ty) / float(ty)) > 1:
        chill.tile7(stmt, ty_level, ty, ty_level, "ty", "tmp_ty", counted)
        ty_now = find_cur_level(stmt, "ty")
        chill.tile3(stmt, ty_now, ty_now)

        cur_idxs = chill.cur_indices(stmt)
        # NOTE(review): the original comments said "ty before any tmp_tx"
        # and "ty before any tmp_ty", but both steps compare against and
        # move to "tmp_ty".  Kept as found -- confirm the intent.
        for marker in ("tmp_tx", "tmp_ty"):
            if marker in cur_idxs:
                _hoist_above(stmt, "ty", "tmp_ty")
    else:
        chill.tile7(stmt, ty_level, 1, ty_level, "ty", "ty", counted)
        chill.tile3(stmt, find_cur_level(stmt, "ty"), find_cur_level(stmt, "tx") + 1)


def _shape_one_level_copy(stmt):
    """Map a single-loop copy ("tmp1" only) onto "tx" and, if needed, "ty"."""
    level = find_cur_level(stmt, "tmp1")
    chill.tile3(stmt, level, level)

    dims = chill.thread_dims()
    tx = dims[0]
    ty = dims[1]

    # The upper bound is given as <=; compare with dimension tx, which is <.
    upper = chill.hard_loop_bounds(stmt, level)[1] + 1
    if upper == tx:
        chill.rename_index(stmt, "tmp1", "tx")
        return

    chill.tile7(stmt, level, tx, level, "tx", "tmp_tx", counted)
    chill.tile7(stmt, level + 1, 1, level + 1, "tx", "tx", counted)
    chill.tile3(stmt, level + 1, level)

    if ty > 1:
        # NOTE(review): unlike the other bounds here, this upper bound is
        # used without the +1 adjustment.  Kept as found -- confirm.
        upper = chill.hard_loop_bounds(stmt, level + 1)[1]
        bound = int(math.ceil(float(upper) / float(ty)))
        chill.tile7(stmt, level + 1, bound, level + 1, "tmp_ty", "ty", counted)


def copy_to_shared(start_loop, array_name, alignment):
    """Issue a chill data copy of `array_name` for statement 0 at the level
    of `start_loop`, then reshape the generated copy loops onto the thread
    indices.

    start_loop -- name of the loop at which the copy is placed
    array_name -- name of the array to copy
    alignment  -- alignment value forwarded to chill.datacopy_9arg

    The copy is requested with loop names "tmp1" and "tmp2".  Each statement
    chill adds for it is then tiled so those loops become "tx"/"ty", and a
    sync is added for it.  (That the destination is shared memory is implied
    only by the function name -- confirm against chill.)
    """
    stmt = 0  # assume statement 0
    start_level = find_cur_level(stmt, start_loop)
    old_num_statements = chill.num_statements()

    # Indices for up to two dimensions of the copy loop.
    copy_loop_idxs = ["tmp1", "tmp2"]

    # The C binding takes one flat tuple: stmt, level, array, number of
    # loop names, the names, then the remaining five arguments.
    args = [stmt, start_level, array_name, len(copy_loop_idxs)]
    args.extend(copy_loop_idxs)
    args.extend([0, 0, 1, alignment, 1])  # False, 0, 1, alignment, True
    chill.datacopy_9arg(tuple(args))

    chill.add_sync(stmt, start_loop)
    new_num_statements = chill.num_statements()
    sys.stdout.flush()

    # This is fairly CUBLAS2 specific; it is unclear how well it generalizes.
    for new_stmt in range(old_num_statements, new_num_statements):
        if find_cur_level(new_stmt, "tmp2") != -1:
            _shape_two_level_copy(new_stmt)
        else:
            # The copy only created one level, not two (MV & TMV).
            _shape_one_level_copy(new_stmt)

        # Always add sync
        chill.add_sync(new_stmt, start_loop)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
def unroll_to_depth(max_depth):
    """Unroll up to `max_depth` loop levels inside the last thread loop of
    every statement, repeating until unrolling creates no new statements.

    max_depth -- maximum number of levels to unroll per statement and pass;
                 a negative value never matches the counter and therefore
                 means "no limit"

    The function also reports, on stdout, the loops outside "tx" that
    statements have in common; that list is only printed, not used.
    """
    reserved_names = ("bx", "by", "tx", "ty", "")

    def _joined(names):
        # One line, ", "-separated; tolerates an empty sequence.
        return ", ".join("%s" % (name,) for name in names)

    print("\n\nunroll_to_depth(%d)" % max_depth)
    print("SYNC UP")
    sys.stdout.flush()

    cur = chill.cur_indices(0)
    thread_idxs = chill.thread_indices()
    guard_idx = thread_idxs[-1]  # last thread index

    print("cur indices " + _joined(cur))
    print("thread indices " + _joined(thread_idxs))
    print("guard_idx = %s" % guard_idx)

    # Find the loops which are common between statements.
    common_loops = []
    num_stmts = chill.num_statements()
    print("num statements %d" % num_stmts)

    for stmt in range(num_stmts):
        cur_idxs = chill.cur_indices(stmt)
        print("\nSTMT %d Current Indices: %s" % (stmt, _joined(cur_idxs)))
        sys.stdout.flush()

        tx_level = find_cur_level(stmt, "tx")  # 1-based, -1 when absent
        if tx_level <= 0:
            continue

        for ii in range(tx_level - 1):
            name = cur_idxs[ii]
            print("ii = %d\ncur_idxs[%d] = '%s'" % (ii + 1, ii + 1, name))
            if name in reserved_names:
                continue
            print("id %s is not in the list" % name)

            for stmt1 in range(stmt + 1, num_stmts):
                print("\nii %d stmt1 is %d" % (ii + 1, stmt1))
                cur_idxs1 = chill.cur_indices(stmt1)
                print("\nstmt1 cur_idxs1 is  " + _joined(cur_idxs1))
                print("cur level(%d, %s) = %d" % (stmt, "tx", tx_level))
                print("for iii=1, %d do" % (tx_level - 1))
                sys.stdout.flush()

                for iii in range(tx_level - 1):
                    print("stmt %d ii %d iii %d" % (stmt, ii + 1, iii + 1))
                    if iii >= len(cur_idxs1):
                        print("stmt %d ii %d iii %d cur_idxs1[%d] = NIL"
                              % (stmt, ii + 1, iii + 1, iii + 1))
                        continue
                    print("stmt %d ii %d iii %d cur_idxs1[%d] = '%s'"
                          % (stmt, ii + 1, iii + 1, iii + 1, cur_idxs1[iii]))
                    sys.stdout.flush()

                    # `name` is already known not to be reserved, so an
                    # equal entry cannot be reserved either.
                    if cur_idxs1[iii] == name:
                        print("\nfound idx:%s" % name)
                        print("cl[%d] = '%s'" % (len(common_loops), name))
                        common_loops.append(name)

    if len(common_loops) > 0:
        print("\n COMM LOOPS :TOTAL %d, and are  %s  this loop : %s"
              % (len(common_loops), common_loops, common_loops[0]))
    else:
        print("UNROLL can't unroll any loops?")

    while True:  # repeat until a pass adds no statements
        old_num_statements = chill.num_statements()
        print("old_num_statements %d" % old_num_statements)

        for stmt in range(old_num_statements):
            cur_idxs = chill.cur_indices(stmt)
            print("stmt %d cur_idxs = %s" % (stmt, _joined(cur_idxs)))
            if len(cur_idxs) == 0:
                continue

            guard_level = find_cur_level(stmt, guard_idx)
            print("guard_level(sp) = %d" % guard_level)
            if guard_level <= -1:
                continue

            level = next_clean_level(cur_idxs, guard_level)
            print("next clean level %d" % level)

            first_level = level
            levels = []  # levels to unroll, outermost first
            while level >= 0:
                print("while: level = %d" % level)
                # len(levels) is the number of levels collected so far; the
                # original counter was never incremented, so max_depth had
                # no effect.
                if len(levels) == max_depth:
                    break

                print("Unrolling %d at level %d index %s"
                      % (stmt, level, cur_idxs[guard_level]))
                levels.append(level)

                guard_level = find_cur_level(stmt, guard_idx)
                level = next_clean_level(cur_idxs, level + 1)

            print("OK, NOW WE UNROLL")
            if first_level >= 0:
                # Unroll innermost first.
                for i, lev in enumerate(reversed(levels)):
                    print("\ni=%d" % i)
                    print("[Unroll]unroll(%d, %d, 0)" % (stmt, lev))
                    chill.unroll(stmt, lev, 0)

        new_num_statements = chill.num_statements()
        if old_num_statements == new_num_statements:
            break
+
+
+# all other calls to C have a routine in this file (?)
def unroll(statement, level, unroll_amount):
    """Forward the three arguments unchanged to chill.unroll; nothing is returned."""
    chill.unroll(
        statement,
        level,
        unroll_amount,
    )
+