1 files changed, 0 insertions, 1047 deletions
diff --git a/test-chill/test-cases/examples/cuda-chill/cudaize.py b/test-chill/test-cases/examples/cuda-chill/cudaize.py
deleted file mode 100755
index ffef009..0000000
--- a/test-chill/test-cases/examples/cuda-chill/cudaize.py
+++ /dev/null
@@ -1,1047 +0,0 @@
-#! /usr/bin/python
-
-# THIS IS CUDAIZE.PY
-
-import chill
-import sys
-import math 
-
-strided = 0
-counted = 1
-
-def print_code():
-    chill.print_code()
-    print ""
-    sys.stdout.flush()
-
-    
-def table_contains_key( table, key ):  # use a dict for the 'table'?
-    return table.has_key(key) # (key in table)?
-
-def print_array( arr ):  # a useful function to mimic lua output 
-    for a in arr[:-1]:
-        print "%s," % a,
-    print "%s" % arr[-1]
-    sys.stdout.flush()
-
-def valid_indices( statement, indices ):
-    #print "valid_indices() python calling C cur_indices"
-    #print statement
-    cur = chill.cur_indices(statement) # calls C
-    #print "python valid_indices(), cur = ",
-    #print cur
-    #print "indices = ",
-    #print indices
-
-    for index in indices:
-        if not index in cur:
-            return False
-    return True
-
-def next_clean_level( indices_at_each_level, level):
-    #print "next_clean_level( ..., %d )" % level 
-    #print "indices_at_each_level ",
-    print_array( indices_at_each_level )
-
-    numlevels = len(indices_at_each_level)
-    #print "loop to %d" % numlevels
-    for i in range(level+1, numlevels+1):
-        pythoni = i-1 # LUA index starts at 1
-        #print "Checking level %d = '%s'" % (i, indices_at_each_level[pythoni])
-        sys.stdout.flush()
-        if len(indices_at_each_level[pythoni]) > 0: # LUA INDEX STARTS AT 1
-            #print "returning %d" % i
-            return i  # MATCH lua return value, LUA index starts at one
-    return -1  # no non-dummy indices
-
-
-
-
-def build_order(  final_order, tile_index_names, control_index_names, tile_index_map, current_level):
-    order = []   
-    #print "\nbuild_order()"
-    #print "build_order(): final_order = (",
-    count = 0
-    for f in final_order:
-        #if count+1 == len(final_order):
-        #    print "%s )" % f
-        #else:
-        #    print "%s," % f ,
-        count += 1
-
-        keys = control_index_names.keys()
-        keys.sort()
-        #if (2 == len(keys)):
-        #    print "build_order(): ctrl_idx_names = (%s, %s)" % (control_index_names[0], control_index_names[1])
-        #else:
-        #    print "build_order(): ctrl_idx_names = (%s" % control_index_names[0],
-        #    for k in keys[1:]:
-        #        print ", %s" % control_index_names[k],
-        #    print ")"
-
-    #print control_index_names
-    #print "cur_level %d" % current_level
-    
-    #print "tile index map: ",
-    #print tile_index_map
-
-
-    for i in range(len(final_order)):
-        k = final_order[i]  # not used?
-        skip = False
-        cur = final_order[i]  
-        # control loops below our current level should not be in the current order
-
-        # skip = cur in control_index_names[current_level+2:] 
-        #print "\n%d control_index_names, " % len(control_index_names)
-        #print control_index_names
-
-        for j in range(current_level+1, len(control_index_names)):
-            #print "comparing cur %s with cin[%d] %s" % ( cur, j, control_index_names[j])
-            if control_index_names[j] == cur:
-                skip = True 
-                #print "SKIP %s  " % cur
-
-        # possibly substitute tile indices if necessary
-        if tile_index_map.has_key(cur):
-            approved_sub = False
-            sub_string = tile_index_map[cur]
-            #print "sub_string = ",
-            #print sub_string
-
-            # approved_sub = sub_string in tile_index_names[current_level+2:]
-            for j in range(current_level+1, len(tile_index_names)):
-                if tile_index_names[j] == sub_string:
-                    approved_sub = True
-            if approved_sub:
-                cur = sub_string
-
-        if not skip:
-            order.append( cur)  
-    #print "build_order() returning order (",
-    #print order
-    #for o in order:
-    #    print "%s," % o,
-    #print ")"
-    return order
-
-def find_cur_level( stmt, idx ):
-    #print "find_cur_level(stmt %d, idx %s)  Cur indices" % ( stmt, idx ),
-    
-    cur = chill.cur_indices(stmt)
-    #for c in cur[:-1]:
-    #    print "%s," % c,
-    #print "%s" % cur[ -1 ] 
-
-    index = 1 # lua starts indices at 1 !!  
-    for c in cur:
-        if c == idx:
-            #print "found it at index %d" % index
-            #sys.stdout.flush()
-            #print "in find_cur_level, returning ",
-            #print index
-            return index
-        index += 1
-    #print "find_cur_level(), Unable to find index %s in" % idx,
-    #print cur
-    #print "in find_cur_level, returning -1"
-    return -1  # special meaning "it's not there"
-
-def chk_cur_level( stmt, idx ):
-    # search cur_indices for a ind at stmt
-    cur = chill.cur_indices(stmt)
-    if idx in cur:
-       return 1 + cur.index(idx)  # lua index starts at 1 !
-    return -1
-
-def find_offset( cur_order, tile, control):
-    #print "Looking for tile '%s' and control '%s' in (" % (tile, control),
-    #print cur_order
-    #for o in cur_order:
-    #    print "%s," % o,
-    #print ")"
-
-    idx1 = -1
-    idx2 = -1
-    if tile in cur_order: 
-        idx1 = 1 + cur_order.index(tile) # lua indexes from 1!
-    else:
-        print "find_offset(), unable to find tile %s in current list of indices" % tile
-        sys.exit(-1)
-
-    if control in cur_order:
-        idx2 = 1 + cur_order.index(control) # lua indexes from 1!
-    else:
-        print "find_offset(), unable to find control %s in current list of indices" % control
-        sys.exit(-1)
-
-    #print "found at level %d and %d" % ( idx2, idx1 )
-    # this appears horrible
-    if idx2 < idx1:
-        return idx2-idx1+1 # bad ordering
-    else:
-        return idx2-idx1
-
-
-
-def tile_by_index( tile_indices, sizes, index_names, final_order, tile_method):
-    #print "STARTING TILE BY INDEX"
-    #print "tile_by_index() tile_method ",
-    #print tile_method
-    #print "index_names: ",
-    #print index_names
-
-    stmt = 0 # assume statement 0
-    if not valid_indices( stmt, tile_indices):
-        print "python tile_by_index() one or more of ",
-        print tile_indices,
-        print " is not valid"
-        sys.exit(-1)
-
-    if tile_method == None:
-        #print "CREATING tile_method = 1"
-        tile_method = 1 # "counted"
-
-    tile_index_names = []
-    for ti in tile_indices:
-        tile_index_names.append( ti )  # make a copy? 
-    #print "tile_index_names:",
-    #print tile_index_names
-
-    control_index_names = {} # a dictionary?
-    tile_index_map =  {}
-    
-    #print "index_names: "
-    #print index_names
-
-    for pair in index_names:
-        valid = False
-        control = pair[0]
-        name    = pair[1]
-        #print "control %s   name  %s" % ( control, name )
-        
-        if control[0] == "l" and control[1].isdigit():
-            if control.endswith("_control"):
-                index = int(control[1: -8])
-                control_index_names[index-1] = name
-                valid = True
-
-            elif control.endswith("_tile"):
-                index = int(control[1: -5])
-                #print "index %d" % index
-                tile_index_names[index-1] = name # ?? 
-                tile_index_map[name] = tile_indices[index-1]
-                valid = True
-        if not valid:
-            print "%s is not a proper key for specifying tile or control loop indices\n" % control
-
-    #print "control_index_names = ",
-    #print control_index_names
-
-    #print "tile_index_names = ",
-    #print tile_index_names
-
-    #print "before call to build_order(), tile_index_map = ",
-    #print tile_index_map
-
-
-    # filter out control indices (and do name substitution of unprocessed tile indices) for a given level
-    cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1)
-
-    #print "returned from build_order python\n\n"
-
-    # print("permute("..stmt..", {"..list_to_string(cur_order).."})")
-    #print "permute(%d, {" % stmt,
-    #print "cur_order = ",
-    #print cur_order,
-    #print "})"
-
-    cur_order.insert(0, stmt)
-    #print cur_order
-    chill.permute( tuple( cur_order)) 
-    #print "in cudaize.py, returned from C code chill.permute()\n"
-
-    for i in range(len(tile_indices)):
-        cur_idx = tile_indices[i]
-        #print "i %d  cur_idx %s calling build order ********" % (i, cur_idx)
-        cur_order = build_order( final_order, tile_indices, control_index_names, tile_index_map, i)
-        #print "cur_idx %s return from build order" % cur_idx
-        
-        # Find an offset between tile loop and control loop
-        #  0   = control loop one level above tile loop
-        #  -1  = control loop two levels above tile loop
-        #  > 0 = tile loop above control loop
-        #  In the last case, we do two extra tile commands to get the control
-        #  above the tile and then rely on the final permute to handle the
-        #  rest
-        level = find_cur_level(stmt,cur_idx)
-        #print "level %d\n" % level     
-
-        offset = find_offset(cur_order, tile_index_names[i], control_index_names[i])
-        #print "offset %d" % offset
-
-        if offset <= 0:
-            #print "[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method  )
-            chill.tile7( stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method  )
-            #print "in cudaize.py, returned from C code chill.tile7\n"
-
-        else:
-            #print "2tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method  )
-            chill.tile7( stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method  ) # regular level
-
-            # flip and tile control loop
-            #print "3tile(%d, %d, %d)" % ( stmt, level+1, level+1)
-            chill.tile3( stmt, level+1, level+1)
-
-            #print "4tile(%d, %d, %d)" % ( stmt, level+1, level)
-            chill.tile3( stmt, level+1, level)
-
-            #print_code()
-
-        # Do permutation based on cur_order
-        #print("permute based on build order calling build_order()")
-        cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i)
-
-        #print("permute based on build order return from build_order()")
-
-        #  print("permute("..stmt..", {"..list_to_string(cur_order).."})")
-        topermute = cur_order
-        topermute.insert(0, stmt)
-        chill.permute( tuple(topermute) ) 
-        #print "\nafter permute(), code is:"
-        #print_code()
-
-def normalize_index( index ):
-    #print "in cudaize.py, normalize_index( %s )" % index
-    stmt = 0  # assume stmt 0
-    l = find_cur_level( stmt, index )
-    chill.tile3( stmt, l, l )
-
-def is_in_indices( stmt, idx):
-    cur = chill.cur_indices(stmt)
-    return idx in cur
-
-def copy_to_registers( start_loop, array_name ):
-    #print "\n\n****** starting copy to registers"
-    #sys.stdout.flush()
-
-    stmt = 0    # assume stmt 0
-    cur = chill.cur_indices(stmt) # calls C    
-    table_Size = len(cur)
-
-    #print "Cur indices",
-    #print_array(cur)
-    #print "\nThe table size is %d" % table_Size
-    #count=1
-    #for c in cur:
-    #    print "%d\t%s" % (count,c)
-    #    count += 1
-
-    #print_code()
-
-    # would be much cleaner if not translating this code from lua!
-    level_tx = -1
-    level_ty = -1   
-    if is_in_indices(stmt,"tx"):
-        level_tx = find_cur_level(stmt,"tx")
-    if is_in_indices(stmt,"ty"):
-        level_ty = find_cur_level(stmt,"ty")
-    #print "level_tx %d  level_ty %d" % ( level_tx, level_ty )
-    #sys.stdout.flush()
-
-    ty_lookup_idx = "" 
-    org_level_ty = level_ty
-
-    # UGLY logic. Lua index starts at 1, so all tests etc here are off by 1 from the lua code
-    # level_ty initializes to -1 , which is not a valid index, and so there is added code to 
-    # make it not try to acccess offset -1.   -1 IS a valid python array index
-    # to top it off, the else below can assign a NIL to ty_lookup_idx! 
-    if level_ty != -1 and cur[level_ty] != "":
-        #print "IF  cur[%d] = %s" % ( level_ty, cur[level_ty] )
-        ty_lookup_idx = cur[level_ty] 
-    else:
-        #print "ELSE ty_lookup_idx = cur[%d] = %s" % ( level_ty, cur[level_ty-1]) 
-        ty_lookup_idx = cur[level_ty-1] 
-    #print "ty_lookup_idx '%s'" % ty_lookup_idx
-
-    if level_ty > -1:
-        #print "\ntile3(%d,%d,%d)" % (stmt,level_ty,level_tx+1)
-        chill.tile3(stmt,level_ty,level_tx+1) 
-    #print_code()   
-
-    cur = chill.cur_indices(stmt) # calls C 
-    table_Size = len(cur)
-    #print "Cur indices ",
-    #for c in cur:
-    #    print "%s," % c,
-    #print "\nThe table size is %d" % len(cur)
-    #count=1
-    #for c in cur:
-    #    print "%d\t%s" % (count,c)
-    #    count += 1
-    #sys.stdout.flush()
-
-    if is_in_indices(stmt,"tx"):
-        level_tx = find_cur_level(stmt,"tx")
-    if ty_lookup_idx != "":                      # perhaps incorrect test 
-        if is_in_indices(stmt,ty_lookup_idx):
-           level_ty = find_cur_level(stmt,ty_lookup_idx)
-           
-    ty_lookup = 1
-    idx_flag = -1
-    # find the level of the next valid index after ty+1
-    #print "\nlevel_ty %d" % level_ty
-    if level_ty > -1:
-       #print "table_Size %d" % table_Size
-       for num in range(-1 + level_ty+ty_lookup,table_Size):   # ??  off by one?
-           #print "num=%d   cur[num] = '%s'" % (num+1, cur[num]) # num+1 is lua index ????
-           sys.stdout.flush()
-           if cur[num] != "":
-               idx_flag = find_cur_level(stmt,cur[num])
-               #print "idx_flag = %d" % idx_flag
-               break
-               
-    #print "\n(first) I am checking all indexes after ty+1 %s" % idx_flag
-    #print_code()   
-    #print "" 
-
-    how_many_levels = 1
-    
-    #print "idx_flag = %d   I will check levels starting with %d" % (idx_flag, idx_flag+1)
-    # lua arrays start at index 1. the next loop in lua starts at offset 0, since idx_flag can be -1
-    # thus the check for "not equal nil" in lua (bad idea)
-    # python arrays start at 0, so will check for things that lua doesn't (?)
-    startat = idx_flag + 1
-    if idx_flag == -1:
-        startat = 1  # pretend we're lua for now.   TODO: fix the logic
-
-    for ch_lev in range(startat,table_Size+1):       # logic may be wrong (off by one)
-        #print "ch_lev %d" % ch_lev
-        if ch_lev <= table_Size and cur[ch_lev-1] != "":
-           #print "cur[%d] = '%s'" % ( ch_lev, cur[ch_lev-1] )
-           how_many_levels += 1
-
-    #print "\nHow Many Levels %d" % how_many_levels
-    sys.stdout.flush()
-    sys.stdout.flush()
-
-    if how_many_levels< 2:
-        while( idx_flag >= 0):
-            for num in range(level_ty+ty_lookup,table_Size+1):
-                #print "at top of loop, num is %d" % num
-                #print "cur[num] = '%s'" % cur[num-1]
-                if cur[num-1] != "":
-                    idx = cur[num-1]
-                    #print "idx '%s'" % idx
-                    sys.stdout.flush()
-                    curlev = find_cur_level(stmt,idx)
-                    #print "curlev %d" % curlev
-
-                    #print "\n[COPYTOREG]tile(%d,%d,%d)"%(stmt,curlev,level_tx)
-
-                    chill.tile3(stmt, curlev, curlev)
-                    curlev = find_cur_level(stmt,idx)
-                    #print "curlev %d" % curlev
-                    chill.tile3(stmt,curlev,level_tx)
-                    #print "hehe '%s'" % cur[num-1]
-                    
-                    cur = chill.cur_indices(stmt)
-                    #print "Cur indices INSIDE",
-                    #for c in cur:
-                    #    print "%s," % c,
-                    table_Size = len(cur)
-                    #print "\nTable Size is: %d" % len(cur)
-
-                    level_tx = find_cur_level(stmt,"tx")
-                    #print "\n level TX is: %d" % level_tx
-                    level_ty = find_cur_level(stmt,ty_lookup_idx)
-                    #print "\n level TY is: %d" %level_ty
-                    idx_flag = -1
-                    #print "idx_flag = -1"
-
-
-                    #- find the level of the next valid index after ty+1
-                    #- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?)
-                    for num2 in range( -1 + level_ty+ty_lookup ,table_Size): # lua starts index at one
-                        #print "num mucking num = %d" % num2
-                        if(cur[num2] != ""):
-                            #print "cur[%d] = '%s'" % ( num2, cur[num2] )
-                            idx_flag = find_cur_level(stmt,cur[num2])
-                            #print("\n(second) I am checking all indexes after ty+1 %s",cur[num2])
-                            break
-
-                    #print "num mucked to %d     idx_flag = %d" % (num, idx_flag)
-
-                #print "at bottom of loop, num is %d" % num
-          
-    #print "done with levels"
-
-    # this was a block comment ???
-
-#    for num in range(level_ty+1, table_Size+1):
-#        print "num %d" % num
-#        if cur[num-1] != "":
-#            idx_flag = find_cur_level(stmt,cur[num-1])  ## ugly 
-#    print "idx_flag = %d" % idx_flag
-
-    # change this all to reflect the real logic which is to normalize all loops inside the thread loops. 
-#    print "change this all ...\n"
-#    print "level_ty+1 %d  table_Size-1 %d     idx_flag %d" %( level_ty+1, table_Size-1, idx_flag)
-#    sys.stdout.flush()
-#    sys.stdout.flush()
-
-#    while level_ty+1 < (table_Size-1) and idx_flag >= 0:
-#        print "*** level_ty %d" %  level_ty
-#        for num in range(level_ty+2,table_Size+1):  # lua for includes second value
-#            print "num %d   cur[num] %s" % (num, cur[num])
-#            if cur[num] != "":
-#                idx = cur[num]
-#                print "idx='%s'" % idx
-#                #print_code()
-                
-                
-            
-
-    #print "ARE WE SYNCED HERE?"
-    #print_code()
-
-    #  [Malik] end logic
-    start_level = find_cur_level(stmt, start_loop) # start_loop was passed parameter!
-
-    # We should hold constant any block or tile loop
-    block_idxs  = chill.block_indices()
-    thread_idxs = chill.thread_indices()
-    #print"\nblock indices are"
-    #for index, val in enumerate(block_idxs):
-    #    print "%d\t%s" % ( int(index)+1 , val )
-    #print"\nthread indices are"
-    #for index, val in enumerate(thread_idxs):
-    #    print "%d\t%s" % ( int(index)+1 , val )
-    #print "\nStart Level: %d" % start_level
-
-    hold_constant = []
-    #print("\n Now in Blocks")
-    for idx in block_idxs:
-        blocklevel = find_cur_level(stmt,idx)
-        if blocklevel >= start_level:
-           hold_constant.append(idx)
-           #print "\nJust inserted block %s in hold_constant" %idx
-
-    #print("\n Now in Threads")
-    for idx in thread_idxs:
-        blocklevel = find_cur_level(stmt,idx)
-        if blocklevel >= start_level:
-            hold_constant.append(idx)
-            #print "\nJust inserted thread %s in hold_constant" %idx
-    #print "\nhold constant table is: "
-    #for index, val in enumerate(hold_constant):
-    #    print "%d\t%s" % ( int(index)+1 , val )
-    
-    #print("\nbefore datacopy pvt")
-    old_num_stmts = chill.num_statements()
-    #sys.stdout.flush()
-
-    #print "\n[DataCopy]datacopy_privatized(%d, %s, %s, " % (stmt, start_loop, array_name),
-    #print hold_constant,
-    #print ")"
-    passtoC = [stmt, start_loop, array_name ] # a list
-    passtoC.append( len(hold_constant ) )
-    for h in hold_constant:
-        passtoC.append( h )
-    chill.datacopy_privatized( tuple( passtoC ))
-    sys.stdout.flush()
-    sys.stdout.flush()
-    
-    new_num_statements = chill.num_statements()
-    #print "new num statements %d" % new_num_statements    
-
-    # Unroll to the last thread level
-#    for stmt in range(old_num_statements, new_num_statements):
-#        print "unrolling statement %d" % stmt
-#        level = find_cur_level(stmt,thread_idxs[-1]) #get last thread level
-#        print "level is %d" % level
-#        idxs = chill.cur_indices(stmt)
-#        if level < len(idxs):
-#            chill.unroll(stmt,level+1,0)
-
-
-
-def copy_to_shared( start_loop, array_name, alignment ):
-    #print "\nstarting copy to shared( %s, %s, %d)" % (start_loop, array_name, alignment ) 
-    #print "copy_to_shared( %s, %s, %d) in cudaize.py" % ( start_loop, array_name, alignment )
-    stmt = 0 # assume statement 0
-
-    cur = chill.cur_indices(stmt)
-    #print "Cur indices ",
-    #print_array( cur )
-
-    start_level = find_cur_level( stmt, start_loop )
-    #print "start_level %d" % start_level
-
-    old_num_statements = chill.num_statements()
-    #print "old_num_statements %d" % old_num_statements
-    
-
-    # Now, we give it indices for up to two dimensions for copy loop
-    copy_loop_idxs = ["tmp1","tmp2"]
-    #chill.datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs, False, 0, 1, alignment,True)
-    passtoC = [stmt, start_level, array_name]   # a list
-    passtoC.append( len(copy_loop_idxs))
-    for i in copy_loop_idxs:
-        passtoC.append(i)
-    passtoC.append( 0 ) # False
-    passtoC.append( 0 )
-    passtoC.append( 1 )
-    passtoC.append( alignment )
-    passtoC.append( 1 )   # True
-    #print "\n[DataCopy]datacopy( ",
-    #print passtoC,
-    #print ")"
-
-    #if array_name == "b":
-    #    chill.cheat(1)
-    #if array_name == "c":
-    #    chill.cheat(2)
-    
-    chill.datacopy_9arg( tuple( passtoC ))
-
-    #print "back from datacopy_9arg\n\n\n"
-    #sys.stdout.flush()
-
-
-    #print "calling add_sync( %d, %s )" % ( stmt, start_loop )
-    chill.add_sync( stmt, start_loop )
-    #print "back from add_sync()\n\n"
-
-    new_num_statements = chill.num_statements()
-    
-    #  This is fairly CUBLAS2 specific, not sure how well it generalizes,
-    #  but for a 2D copy, what we want to do is "normalize" the first loop
-    #  "tmp1" then get its hard upper bound. We then want to tile it to
-    #  make the control loop of that tile "ty". We then tile "tmp2" with a
-    #  size of 1 and make it "tx".
-
-    #print "fairly CUBLAS2 specific, OLD %d  NEW %d" % ( old_num_statements, new_num_statements)
-    sys.stdout.flush()
-    sys.stdout.flush()
-
-    for stmt in range(old_num_statements, new_num_statements):
-        #print "for stmt = %d" % stmt
-        level = find_cur_level( stmt, "tmp2")
-        #print "FOUND CUR LEVEL?  level '",
-        #print level,
-        #print "'"
-
-        #print "in loop, stmt %d   level %d" % ( stmt, level )
-        if level != -1:
-            #print "\nCopy to shared: [If was no error]\n"
-            find_cur_level(stmt,"tmp2")
-            chill.tile3( stmt, level, level )
-            
-            #print "hard_loop_bounds( %d, %d )" % (stmt, level)
-            bounds = chill.hard_loop_bounds(stmt, level)
-            lower = bounds[0]
-            upper = 1+ bounds[1]
-            #print "lower %d  upper %d" % ( lower, upper )
-
-            dims = chill.thread_dims()
-            #print "in cudaize.py copy_to_shared, dims =",
-            #print dims
-            tx = dims[0]
-            ty = dims[1]
-            #print "2-loop cleanup: lower, upper: %d, %d,  tx: %d" % ( lower, upper, tx)
-
-            level = find_cur_level(stmt,"tmp1")
-            #print "level %d" % level
-            if tx == upper and ty == 1:
-                #print "tx = %d    upper = %d     ty = %d"% (tx, upper, ty)
-                #print "Don't need"
-
-                # Don't need an extra tile level, just move this loop up
-                second_level = find_cur_level(stmt,"tmp2")
-                chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted)
-
-            else:
-                #print "DO need?"
-                if ty == 1:
-                    new_ctrl = "tmp3" 
-                else:
-                    new_ctrl = "ty"
-
-                # LOTS of commented out code here in cudaize.lua 
-
-                #print_code()
-                #print "\nStarting tmp2\n"
-                first_level  = find_cur_level(stmt,"tmp1")
-                second_level = find_cur_level(stmt,"tmp2")
-                bounds = chill.hard_loop_bounds(stmt, second_level)
-                lower = bounds[0]
-                upper = 1 + bounds[1]   # BROKEN?
-                        
-                #print "[Malik]-loop cleanup@tmp2: lower, upper: %d, %d, tx: %d,first level:%d,second_level:%d" % ( lower, upper-1, tx, first_level, second_level) 
-
-                # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
-                #print "\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, second_level,1,first_level, "tx", "tx")
-                chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted)
-                #print_code()
-
-                first_level = find_cur_level(stmt,"tmp1")
-                bounds = chill.hard_loop_bounds(stmt, first_level)
-                lower_1 =     bounds[0]
-                upper_1 = 1 + bounds[1]
-                tx_level = find_cur_level(stmt,"tx")
-                bounds = chill.hard_loop_bounds(stmt,tx_level)
-                lower_tx =   bounds[0]
-                upper_tx = 1+bounds[1]
-                #print "UL_1 %d %d     UL_tx %d %d" % ( lower_1, upper_1-1, lower_tx, upper_tx-1)
-
-                if int(math.ceil( float(upper_tx)/float(tx))) > 1:
-                     #print "ceil I say"
-                     #print "\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, tx_level,tx,tx_level, "tx", "tmp1")
-                     chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)
-                     #print_code()
-
-                     repeat = find_cur_level(stmt,"tx")
-                     #print "\n[Tile1]tile(%d, %d, %d)" % (stmt, repeat, repeat)
-                     chill.tile3(stmt, repeat, repeat)  #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))
-                     #print_code()
-
-                     if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"):
-                        #print "\nagain [Tile1]tile(%d, %d, %d)" % (stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
-                        chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
-                        #print_code()
-
-                #print_code()
-
-                #print "\nStarting tmp1\n"
-                # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
-                chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))      
-                #print_code()
-
-                ty_level = find_cur_level(stmt,"tmp1")
-                bounds = chill.hard_loop_bounds(stmt,ty_level)
-                lower_ty = bounds[0]
-                upper_ty = 1 + bounds[1]
-
-                tx_level = find_cur_level(stmt,"tx")
-                bounds = chill.hard_loop_bounds(stmt,tx_level)
-                lower_tx = bounds[0]
-                upper_tx = 1 + bounds[1]
-
-                #print "[Malik]-loop cleanup@tmp1: lowerty, upperty: %d, %d, ty: %d,ty level:%d,tx_level:%d, stmt: %d" % ( lower_ty, upper_ty-1, ty, ty_level, tx_level, stmt)
-                
-                #print "before ceil"
-                #sys.stdout.flush()
-
-                if(math.ceil(float(upper_ty)/float(ty)) > 1):
-                    #print "CEIL IF"
-                    #print "\n Inside upper_ty/ty > 1\n"
-
-                    #print "\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, ty_level,ty,ty_level, "ty", "tmp_ty")
-                    chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
-                    #print_code()
-
-                    #print "\n[Tile2-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt  ,"ty"),find_cur_level(stmt,"ty"))
-                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))
-                    #print_code()
-
-                    cur_idxs = chill.cur_indices(stmt)
-                    #print "\n cur indexes are ",
-                    #print_array( cur_idxs)
-                    #sys.stdout.flush()
-
-                    # Putting ty before any tmp_tx
-                    idx_flag = -1
-                    if "tmp_tx" in cur_idxs:
-                        idx_flag = 1 + cur_idxs.index("tmp_tx")   # lua index starts at 1
-                    #print "\n (1) so i have found out the value of idx flag as %d" % idx_flag
-                    #sys.stdout.flush()      
-                    
-                    if idx_flag >= 0:
-                         if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"):
-                             #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                             chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                             #print_code()
-                    
-                    
-                    #  Now Putting ty before any tmp_ty
-                    sys.stdout.flush()      
-                    idx_flag = -1
-                    if "tmp_ty" in cur_idxs:
-                        idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1
-                    #print "\n IF  so i have found out the value of idx flag as %d" % idx_flag
-                    #sys.stdout.flush()      
-                                            
-                    if idx_flag >= 0:
-                        #print "one more test"
-                        sys.stdout.flush()
-                        if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"):
-                            #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                            #sys.stdout.flush()
-                            chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                            #print_code()
-
-
-
-                else:
-                    #print "CEIL ELSE"
-                    #print "\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, ty_level,1,ty_level, "ty", "ty")
-                    #sys.stdout.flush()
-                    chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted )
-                    #print_code()
-
-                    #print "\n[Tile3-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
-                    sys.stdout.flush()
-
-                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
-                    #print_code()
-
-
-                    idx_flag = -1
-                    # LUA code checks to see if cur_idxs exists?  it is unused except in the other clause of this is
-                    #if(cur_idxs) then
-                        #print "CAN NEVER GET HERE?  cur_idxs"
-                        #for num= 0,table.getn(cur_idxs) do
-                            #if(cur[num] == "tmp_ty") then
-                            #idx_flag = find_cur_level(stmt,cur[num])
-                            #break
-                        #end
-                    #end
-                    print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag
-                    if idx_flag >= 0:  # can't happen
-                        print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                        #chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                    
-                        
-                    
-
-                    
-            #print "\n\n *** at bottom of if in copy to shared, "
-            #print_code()
-            #print "end of if"
-
-        else:
-            #  copy to shared only created one level, not two, so we use a different approach (MV & TMV)
-            #print "\nCopy to shared: [If was error]\n"
-            level = find_cur_level(stmt,"tmp1")
-            chill.tile3(stmt, level, level)
-
-            dims = chill.thread_dims()
-            #print dims
-            tx = dims[0]
-            ty = dims[1]
-
-            bounds = chill.hard_loop_bounds(stmt, level)
-            lower = bounds[0]   
-            upper = bounds[1]
-
-            #print "bounds  lower %d    upper %d" % (lower, upper)
-            upper = upper+1 # upper bound given as <=, compare to dimensions tx which is <
-            if upper == tx:
-                #print "upper == tx"
-                chill.rename_index( stmt, "tmp1", "tx")
-            else:
-                #print "upper is not tx"
-                #print "upper %d tx %d stmt: %d level: %d" % ( upper, tx, stmt, level)
-                chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted)
-                #print_code()
-
-                #print "stmt:%d level+1: %d" % ( stmt, level+1) 
-                #print("TILE 7")
-                chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted)
-                #print("TILE 3")
-                chill.tile3( stmt, level+1, level)
-                #print_code()           
-
-
-                if ty > 1:
-                   #print "GOING IN"
-                   bounds = chill.hard_loop_bounds(stmt, level+1)
-                   lower = bounds[0]   
-                   upper = bounds[1]   
-                   #print "ty %d  lower %d  upper %d" % ( ty, lower, upper )
-                   floatdiv = float(upper)/float(ty)
-                   bound =  int(math.ceil(float(upper)/float(ty)))
-                   #print "NOW FOR Y: upper %d ty %d stmt: %d level: %d bound: %d" % ( upper, ty, stmt, level+1,   bound)
-                   chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted)
-
-        # Always add sync
-        chill.add_sync( stmt, start_loop )
-    #print "ending copy to shared\n"
-    #sys.stdout.flush()
-    #print_code()     
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def unroll_to_depth( max_depth ):
-    print "\n\nunroll_to_depth(%d)" % max_depth
-    print "SYNC UP"
-    sys.stdout.flush()
-
-    cur = chill.cur_indices(0)
-    thread_idxs = chill.thread_indices()
-    guard_idx = thread_idxs[-1]  # last one
-
-    print "cur    indices",
-    print_array(cur)
-    print "thread indices", 
-    print_array(thread_idxs)
-    print "guard_idx = %s" % guard_idx
-
-    #print "thread_idxs = ",
-    #print thread_idxs
-    guard_idx = thread_idxs[-1]
-    #print "guard_idx = %s" % guard_idx
-
-    #  HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS
-    common_loops = []
-    comm_loops_cnt = 0
-    num_stmts = chill.num_statements()
-    print "num statements %d" % num_stmts
-
-    for stmt in range(num_stmts):
-        sys.stdout.flush()
-        print "\nSTMT %d" % stmt,
-        cur_idxs = chill.cur_indices(stmt)
-        print "Current Indices:",
-        for c in cur_idxs[:-1]:
-            print "%s," % c,
-        print "%s" % cur_idxs[-1]   # last one
-        sys.stdout.flush()
-        #print_code()
-        
-        if chk_cur_level(stmt, "tx") > 0:
-            
-            for ii in range(find_cur_level(stmt,"tx")-1):
-                print "ii = %d\ncur_idxs[%d] = '%s'" % (ii+1, ii+1, cur_idxs[ii]) # print to match lua
-                id = cur_idxs[ii]
-                if id not in ["bx", "by", "", "tx", "ty"]:
-
-                    print "id %s is not in the list" % id
-
-                    for stmt1 in range(stmt+1, num_stmts):
-                        print "\nii %d stmt1 is %d" % (ii+1, stmt1)  # print to match lua 
-                        cur_idxs1 = chill.cur_indices(stmt1)
-                        print "\nstmt1 cur_idxs1 is ",
-                        for ind in cur_idxs1[:-1]:
-                            print "%s," % ind,
-                        print "%s" % cur_idxs1[-1]
-
-                        print "cur level(%d, %s) = %d" % (stmt, "tx", find_cur_level(stmt,"tx") )
-                        sys.stdout.flush()
-
-                        endrange = find_cur_level(stmt,"tx")-1
-                        print "for iii=1, %d do" % endrange
-                        sys.stdout.flush()
-                        for iii in range(endrange):   # off by one?  TODO 
-                            print "stmt %d   ii %d   iii %d\n" % (stmt, ii+1, iii+1),
-                            sys.stdout.flush()
-                            
-                            if iii >= len(cur_idxs1):
-                                print "stmt %d   ii %d   iii %d  cur_idxs1[%d] = NIL" % (stmt, ii+1, iii+1, iii+1, )  # print to match lua 
-                            else:
-                                print "stmt %d   ii %d   iii %d  cur_idxs1[%d] = '%s'" % (stmt, ii+1, iii+1, iii+1, cur_idxs1[iii])  # print to match lua 
-                            sys.stdout.flush()
-
-                            # this will still probably die 
-                            if iii < len(cur_idxs1) and [iii] not in ["bx", "by", "tx", "ty", ""]:
-                                if cur_idxs[ii] == cur_idxs1[iii]:
-                                    print "\nfound idx:%s" % cur_idxs[ii]
-                                    common_loops.append(cur_idxs[ii])
-                                    print "cl[%d] = '%s'" % ( comm_loops_cnt, cur_idxs[ii] )
-                                    comm_loops_cnt = len(common_loops)
-
-    if len(common_loops) > 0:
-        print "\n COMM LOOPS :TOTAL %d, and are " % comm_loops_cnt,
-        print common_loops, 
-        print " this loop : %s" % common_loops[0]
-    else:
-        print "UNROLL can't unroll any loops?"
-
-
-    while True:  # break at bottom of loop   (repeat in lua)
-        old_num_statements = chill.num_statements()
-        print "old_num_statements %d" % old_num_statements
-
-        for stmt in range(old_num_statements):
-            cur_idxs = chill.cur_indices(stmt)
-            print "stmt %d    cur_idxs =" % stmt,
-            index = 0
-            for i in cur_idxs:
-                index +=1
-                if index == len(cur_idxs):
-                    print "%s" %i
-                else:
-                    print "%s," % i,
-
-            if len(cur_idxs) > 0:
-                guard_level = -1
-                if chk_cur_level(stmt, guard_idx) > 0:
-                    guard_level = find_cur_level(stmt,guard_idx)
-                print "guard_level(sp) = %d" % guard_level
-                if guard_level > -1:
-                    level = next_clean_level(cur_idxs,guard_level)
-                    print "next clean level %d" % level
-
-                    
-                    #print "looking at %d" % stmt
-                    #print "comparing %d and %d in" % (guard_level, level),
-                    #index = 0
-                    #for i in cur_idxs:
-                    #index +=1
-                    #if index == len(cur_idxs):
-                    #    print "%s" %i
-                    #else:
-                    #    print "%s," % i,
-
-                    # need to handle max_depth
-                    num_unrolled = 0
-                    level_unroll_comm = level
-                    level_arr = []
-
-                    #print "before while, level = %d" % level 
-                    while level >= 0:
-                        print "while: level = %d" % level 
-                        if num_unrolled == max_depth:
-                            break
-
-                        print "Unrolling %d at level %d index %s" % ( stmt, level, cur_idxs[guard_level])  # ??? 
-                        level_arr.append(level)
-
-                        guard_level = find_cur_level(stmt,guard_idx)
-                        level = next_clean_level(cur_idxs,level+1)
-
-                    print "OK, NOW WE UNROLL"
-                    if level_unroll_comm >= 0:
-                        level_arr.reverse()  
-                        for i,lev in enumerate(level_arr):
-                            print "\ni=%d" % i
-                            print "[Unroll]unroll(%d, %d, 0)" % (stmt, lev)
-                            chill.unroll(stmt, lev, 0)
-
-
-        new_num_statements = chill.num_statements()
-        if old_num_statements == new_num_statements:
-            break  # exit infinite loop
-
-
-#  all other calls to C have a routine in this file   (?)
-def unroll( statement, level, unroll_amount ):
-    chill.unroll( statement, level, unroll_amount )
-