diff options
Diffstat (limited to 'examples')
| -rwxr-xr-x | examples/cuda-chill/cudaize.py | 1047 | 
1 files changed, 1047 insertions, 0 deletions
| diff --git a/examples/cuda-chill/cudaize.py b/examples/cuda-chill/cudaize.py new file mode 100755 index 0000000..ffef009 --- /dev/null +++ b/examples/cuda-chill/cudaize.py @@ -0,0 +1,1047 @@ +#! /usr/bin/python + +# THIS IS CUDAIZE.PY + +import chill +import sys +import math  + +strided = 0 +counted = 1 + +def print_code(): +    chill.print_code() +    print "" +    sys.stdout.flush() + +     +def table_contains_key( table, key ):  # use a dict for the 'table'? +    return table.has_key(key) # (key in table)? + +def print_array( arr ):  # a useful function to mimic lua output  +    for a in arr[:-1]: +        print "%s," % a, +    print "%s" % arr[-1] +    sys.stdout.flush() + +def valid_indices( statement, indices ): +    #print "valid_indices() python calling C cur_indices" +    #print statement +    cur = chill.cur_indices(statement) # calls C +    #print "python valid_indices(), cur = ", +    #print cur +    #print "indices = ", +    #print indices + +    for index in indices: +        if not index in cur: +            return False +    return True + +def next_clean_level( indices_at_each_level, level): +    #print "next_clean_level( ..., %d )" % level  +    #print "indices_at_each_level ", +    print_array( indices_at_each_level ) + +    numlevels = len(indices_at_each_level) +    #print "loop to %d" % numlevels +    for i in range(level+1, numlevels+1): +        pythoni = i-1 # LUA index starts at 1 +        #print "Checking level %d = '%s'" % (i, indices_at_each_level[pythoni]) +        sys.stdout.flush() +        if len(indices_at_each_level[pythoni]) > 0: # LUA INDEX STARTS AT 1 +            #print "returning %d" % i +            return i  # MATCH lua return value, LUA index starts at one +    return -1  # no non-dummy indices + + + + +def build_order(  final_order, tile_index_names, control_index_names, tile_index_map, current_level): +    order = []    +    #print "\nbuild_order()" +    #print "build_order(): final_order = (", +    count = 0 +    for f in final_order: +        #if count+1 == len(final_order): +        #    print "%s )" % f +        #else: +        #    print "%s," % f , +        count += 1 + +        keys = control_index_names.keys() +        keys.sort() +        #if (2 == len(keys)): +        #    print "build_order(): ctrl_idx_names = (%s, %s)" % (control_index_names[0], control_index_names[1]) +        #else: +        #    print "build_order(): ctrl_idx_names = (%s" % control_index_names[0], +        #    for k in keys[1:]: +        #        print ", %s" % control_index_names[k], +        #    print ")" + +    #print control_index_names +    #print "cur_level %d" % current_level +     +    #print "tile index map: ", +    #print tile_index_map + + +    for i in range(len(final_order)): +        k = final_order[i]  # not used? +        skip = False +        cur = final_order[i]   +        # control loops below our current level should not be in the current order + +        # skip = cur in control_index_names[current_level+2:]  +        #print "\n%d control_index_names, " % len(control_index_names) +        #print control_index_names + +        for j in range(current_level+1, len(control_index_names)): +            #print "comparing cur %s with cin[%d] %s" % ( cur, j, control_index_names[j]) +            if control_index_names[j] == cur: +                skip = True  +                #print "SKIP %s  " % cur + +        # possibly substitute tile indices if necessary +        if tile_index_map.has_key(cur): +            approved_sub = False +            sub_string = tile_index_map[cur] +            #print "sub_string = ", +            #print sub_string + +            # approved_sub = sub_string in tile_index_names[current_level+2:] +            for j in range(current_level+1, len(tile_index_names)): +                if tile_index_names[j] == sub_string: +                    approved_sub = True +            if approved_sub: +                cur = sub_string + +        if not skip: +            order.append( cur)   +    #print "build_order() returning order (", +    #print order +    #for o in order: +    #    print "%s," % o, +    #print ")" +    return order + +def find_cur_level( stmt, idx ): +    #print "find_cur_level(stmt %d, idx %s)  Cur indices" % ( stmt, idx ), +     +    cur = chill.cur_indices(stmt) +    #for c in cur[:-1]: +    #    print "%s," % c, +    #print "%s" % cur[ -1 ]  + +    index = 1 # lua starts indices at 1 !!   +    for c in cur: +        if c == idx: +            #print "found it at index %d" % index +            #sys.stdout.flush() +            #print "in find_cur_level, returning ", +            #print index +            return index +        index += 1 +    #print "find_cur_level(), Unable to find index %s in" % idx, +    #print cur +    #print "in find_cur_level, returning -1" +    return -1  # special meaning "it's not there" + +def chk_cur_level( stmt, idx ): +    # search cur_indices for a ind at stmt +    cur = chill.cur_indices(stmt) +    if idx in cur: +       return 1 + cur.index(idx)  # lua index starts at 1 ! +    return -1 + +def find_offset( cur_order, tile, control): +    #print "Looking for tile '%s' and control '%s' in (" % (tile, control), +    #print cur_order +    #for o in cur_order: +    #    print "%s," % o, +    #print ")" + +    idx1 = -1 +    idx2 = -1 +    if tile in cur_order:  +        idx1 = 1 + cur_order.index(tile) # lua indexes from 1! +    else: +        print "find_offset(), unable to find tile %s in current list of indices" % tile +        sys.exit(-1) + +    if control in cur_order: +        idx2 = 1 + cur_order.index(control) # lua indexes from 1! +    else: +        print "find_offset(), unable to find control %s in current list of indices" % control +        sys.exit(-1) + +    #print "found at level %d and %d" % ( idx2, idx1 ) +    # this appears horrible +    if idx2 < idx1: +        return idx2-idx1+1 # bad ordering +    else: +        return idx2-idx1 + + + +def tile_by_index( tile_indices, sizes, index_names, final_order, tile_method): +    #print "STARTING TILE BY INDEX" +    #print "tile_by_index() tile_method ", +    #print tile_method +    #print "index_names: ", +    #print index_names + +    stmt = 0 # assume statement 0 +    if not valid_indices( stmt, tile_indices): +        print "python tile_by_index() one or more of ", +        print tile_indices, +        print " is not valid" +        sys.exit(-1) + +    if tile_method == None: +        #print "CREATING tile_method = 1" +        tile_method = 1 # "counted" + +    tile_index_names = [] +    for ti in tile_indices: +        tile_index_names.append( ti )  # make a copy?  +    #print "tile_index_names:", +    #print tile_index_names + +    control_index_names = {} # a dictionary? +    tile_index_map =  {} +     +    #print "index_names: " +    #print index_names + +    for pair in index_names: +        valid = False +        control = pair[0] +        name    = pair[1] +        #print "control %s   name  %s" % ( control, name ) +         +        if control[0] == "l" and control[1].isdigit(): +            if control.endswith("_control"): +                index = int(control[1: -8]) +                control_index_names[index-1] = name +                valid = True + +            elif control.endswith("_tile"): +                index = int(control[1: -5]) +                #print "index %d" % index +                tile_index_names[index-1] = name # ??  +                tile_index_map[name] = tile_indices[index-1] +                valid = True +        if not valid: +            print "%s is not a proper key for specifying tile or control loop indices\n" % control + +    #print "control_index_names = ", +    #print control_index_names + +    #print "tile_index_names = ", +    #print tile_index_names + +    #print "before call to build_order(), tile_index_map = ", +    #print tile_index_map + + +    # filter out control indices (and do name substitution of unprocessed tile indices) for a given level +    cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1) + +    #print "returned from build_order python\n\n" + +    # print("permute("..stmt..", {"..list_to_string(cur_order).."})") +    #print "permute(%d, {" % stmt, +    #print "cur_order = ", +    #print cur_order, +    #print "})" + +    cur_order.insert(0, stmt) +    #print cur_order +    chill.permute( tuple( cur_order))  +    #print "in cudaize.py, returned from C code chill.permute()\n" + +    for i in range(len(tile_indices)): +        cur_idx = tile_indices[i] +        #print "i %d  cur_idx %s calling build order ********" % (i, cur_idx) +        cur_order = build_order( final_order, tile_indices, control_index_names, tile_index_map, i) +        #print "cur_idx %s return from build order" % cur_idx +         +        # Find an offset between tile loop and control loop +        #  0   = control loop one level above tile loop +        #  -1  = control loop two levels above tile loop +        #  > 0 = tile loop above control loop +        #  In the last case, we do two extra tile commands to get the control +        #  above the tile and then rely on the final permute to handle the +        #  rest +        level = find_cur_level(stmt,cur_idx) +        #print "level %d\n" % level      + +        offset = find_offset(cur_order, tile_index_names[i], control_index_names[i]) +        #print "offset %d" % offset + +        if offset <= 0: +            #print "[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method  ) +            chill.tile7( stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method  ) +            #print "in cudaize.py, returned from C code chill.tile7\n" + +        else: +            #print "2tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method  ) +            chill.tile7( stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method  ) # regular level + +            # flip and tile control loop +            #print "3tile(%d, %d, %d)" % ( stmt, level+1, level+1) +            chill.tile3( stmt, level+1, level+1) + +            #print "4tile(%d, %d, %d)" % ( stmt, level+1, level) +            chill.tile3( stmt, level+1, level) + +            #print_code() + +        # Do permutation based on cur_order +        #print("permute based on build order calling build_order()") +        cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i) + +        #print("permute based on build order return from build_order()") + +        #  print("permute("..stmt..", {"..list_to_string(cur_order).."})") +        topermute = cur_order +        topermute.insert(0, stmt) +        chill.permute( tuple(topermute) )  +        #print "\nafter permute(), code is:" +        #print_code() + +def normalize_index( index ): +    #print "in cudaize.py, normalize_index( %s )" % index +    stmt = 0  # assume stmt 0 +    l = find_cur_level( stmt, index ) +    chill.tile3( stmt, l, l ) + +def is_in_indices( stmt, idx): +    cur = chill.cur_indices(stmt) +    return idx in cur + +def copy_to_registers( start_loop, array_name ): +    #print "\n\n****** starting copy to registers" +    #sys.stdout.flush() + +    stmt = 0    # assume stmt 0 +    cur = chill.cur_indices(stmt) # calls C     +    table_Size = len(cur) + +    #print "Cur indices", +    #print_array(cur) +    #print "\nThe table size is %d" % table_Size +    #count=1 +    #for c in cur: +    #    print "%d\t%s" % (count,c) +    #    count += 1 + +    #print_code() + +    # would be much cleaner if not translating this code from lua! +    level_tx = -1 +    level_ty = -1    +    if is_in_indices(stmt,"tx"): +        level_tx = find_cur_level(stmt,"tx") +    if is_in_indices(stmt,"ty"): +        level_ty = find_cur_level(stmt,"ty") +    #print "level_tx %d  level_ty %d" % ( level_tx, level_ty ) +    #sys.stdout.flush() + +    ty_lookup_idx = ""  +    org_level_ty = level_ty + +    # UGLY logic. Lua index starts at 1, so all tests etc here are off by 1 from the lua code +    # level_ty initializes to -1 , which is not a valid index, and so there is added code to  +    # make it not try to acccess offset -1.   -1 IS a valid python array index +    # to top it off, the else below can assign a NIL to ty_lookup_idx!  +    if level_ty != -1 and cur[level_ty] != "": +        #print "IF  cur[%d] = %s" % ( level_ty, cur[level_ty] ) +        ty_lookup_idx = cur[level_ty]  +    else: +        #print "ELSE ty_lookup_idx = cur[%d] = %s" % ( level_ty, cur[level_ty-1])  +        ty_lookup_idx = cur[level_ty-1]  +    #print "ty_lookup_idx '%s'" % ty_lookup_idx + +    if level_ty > -1: +        #print "\ntile3(%d,%d,%d)" % (stmt,level_ty,level_tx+1) +        chill.tile3(stmt,level_ty,level_tx+1)  +    #print_code()    + +    cur = chill.cur_indices(stmt) # calls C  +    table_Size = len(cur) +    #print "Cur indices ", +    #for c in cur: +    #    print "%s," % c, +    #print "\nThe table size is %d" % len(cur) +    #count=1 +    #for c in cur: +    #    print "%d\t%s" % (count,c) +    #    count += 1 +    #sys.stdout.flush() + +    if is_in_indices(stmt,"tx"): +        level_tx = find_cur_level(stmt,"tx") +    if ty_lookup_idx != "":                      # perhaps incorrect test  +        if is_in_indices(stmt,ty_lookup_idx): +           level_ty = find_cur_level(stmt,ty_lookup_idx) +            +    ty_lookup = 1 +    idx_flag = -1 +    # find the level of the next valid index after ty+1 +    #print "\nlevel_ty %d" % level_ty +    if level_ty > -1: +       #print "table_Size %d" % table_Size +       for num in range(-1 + level_ty+ty_lookup,table_Size):   # ??  off by one? +           #print "num=%d   cur[num] = '%s'" % (num+1, cur[num]) # num+1 is lua index ???? +           sys.stdout.flush() +           if cur[num] != "": +               idx_flag = find_cur_level(stmt,cur[num]) +               #print "idx_flag = %d" % idx_flag +               break +                +    #print "\n(first) I am checking all indexes after ty+1 %s" % idx_flag +    #print_code()    +    #print ""  + +    how_many_levels = 1 +     +    #print "idx_flag = %d   I will check levels starting with %d" % (idx_flag, idx_flag+1) +    # lua arrays start at index 1. the next loop in lua starts at offset 0, since idx_flag can be -1 +    # thus the check for "not equal nil" in lua (bad idea) +    # python arrays start at 0, so will check for things that lua doesn't (?) +    startat = idx_flag + 1 +    if idx_flag == -1: +        startat = 1  # pretend we're lua for now.   TODO: fix the logic + +    for ch_lev in range(startat,table_Size+1):       # logic may be wrong (off by one) +        #print "ch_lev %d" % ch_lev +        if ch_lev <= table_Size and cur[ch_lev-1] != "": +           #print "cur[%d] = '%s'" % ( ch_lev, cur[ch_lev-1] ) +           how_many_levels += 1 + +    #print "\nHow Many Levels %d" % how_many_levels +    sys.stdout.flush() +    sys.stdout.flush() + +    if how_many_levels< 2: +        while( idx_flag >= 0): +            for num in range(level_ty+ty_lookup,table_Size+1): +                #print "at top of loop, num is %d" % num +                #print "cur[num] = '%s'" % cur[num-1] +                if cur[num-1] != "": +                    idx = cur[num-1] +                    #print "idx '%s'" % idx +                    sys.stdout.flush() +                    curlev = find_cur_level(stmt,idx) +                    #print "curlev %d" % curlev + +                    #print "\n[COPYTOREG]tile(%d,%d,%d)"%(stmt,curlev,level_tx) + +                    chill.tile3(stmt, curlev, curlev) +                    curlev = find_cur_level(stmt,idx) +                    #print "curlev %d" % curlev +                    chill.tile3(stmt,curlev,level_tx) +                    #print "hehe '%s'" % cur[num-1] +                     +                    cur = chill.cur_indices(stmt) +                    #print "Cur indices INSIDE", +                    #for c in cur: +                    #    print "%s," % c, +                    table_Size = len(cur) +                    #print "\nTable Size is: %d" % len(cur) + +                    level_tx = find_cur_level(stmt,"tx") +                    #print "\n level TX is: %d" % level_tx +                    level_ty = find_cur_level(stmt,ty_lookup_idx) +                    #print "\n level TY is: %d" %level_ty +                    idx_flag = -1 +                    #print "idx_flag = -1" + + +                    #- find the level of the next valid index after ty+1 +                    #- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?) +                    for num2 in range( -1 + level_ty+ty_lookup ,table_Size): # lua starts index at one +                        #print "num mucking num = %d" % num2 +                        if(cur[num2] != ""): +                            #print "cur[%d] = '%s'" % ( num2, cur[num2] ) +                            idx_flag = find_cur_level(stmt,cur[num2]) +                            #print("\n(second) I am checking all indexes after ty+1 %s",cur[num2]) +                            break + +                    #print "num mucked to %d     idx_flag = %d" % (num, idx_flag) + +                #print "at bottom of loop, num is %d" % num +           +    #print "done with levels" + +    # this was a block comment ??? + +#    for num in range(level_ty+1, table_Size+1): +#        print "num %d" % num +#        if cur[num-1] != "": +#            idx_flag = find_cur_level(stmt,cur[num-1])  ## ugly  +#    print "idx_flag = %d" % idx_flag + +    # change this all to reflect the real logic which is to normalize all loops inside the thread loops.  +#    print "change this all ...\n" +#    print "level_ty+1 %d  table_Size-1 %d     idx_flag %d" %( level_ty+1, table_Size-1, idx_flag) +#    sys.stdout.flush() +#    sys.stdout.flush() + +#    while level_ty+1 < (table_Size-1) and idx_flag >= 0: +#        print "*** level_ty %d" %  level_ty +#        for num in range(level_ty+2,table_Size+1):  # lua for includes second value +#            print "num %d   cur[num] %s" % (num, cur[num]) +#            if cur[num] != "": +#                idx = cur[num] +#                print "idx='%s'" % idx +#                #print_code() +                 +                 +             + +    #print "ARE WE SYNCED HERE?" +    #print_code() + +    #  [Malik] end logic +    start_level = find_cur_level(stmt, start_loop) # start_loop was passed parameter! + +    # We should hold constant any block or tile loop +    block_idxs  = chill.block_indices() +    thread_idxs = chill.thread_indices() +    #print"\nblock indices are" +    #for index, val in enumerate(block_idxs): +    #    print "%d\t%s" % ( int(index)+1 , val ) +    #print"\nthread indices are" +    #for index, val in enumerate(thread_idxs): +    #    print "%d\t%s" % ( int(index)+1 , val ) +    #print "\nStart Level: %d" % start_level + +    hold_constant = [] +    #print("\n Now in Blocks") +    for idx in block_idxs: +        blocklevel = find_cur_level(stmt,idx) +        if blocklevel >= start_level: +           hold_constant.append(idx) +           #print "\nJust inserted block %s in hold_constant" %idx + +    #print("\n Now in Threads") +    for idx in thread_idxs: +        blocklevel = find_cur_level(stmt,idx) +        if blocklevel >= start_level: +            hold_constant.append(idx) +            #print "\nJust inserted thread %s in hold_constant" %idx +    #print "\nhold constant table is: " +    #for index, val in enumerate(hold_constant): +    #    print "%d\t%s" % ( int(index)+1 , val ) +     +    #print("\nbefore datacopy pvt") +    old_num_stmts = chill.num_statements() +    #sys.stdout.flush() + +    #print "\n[DataCopy]datacopy_privatized(%d, %s, %s, " % (stmt, start_loop, array_name), +    #print hold_constant, +    #print ")" +    passtoC = [stmt, start_loop, array_name ] # a list +    passtoC.append( len(hold_constant ) ) +    for h in hold_constant: +        passtoC.append( h ) +    chill.datacopy_privatized( tuple( passtoC )) +    sys.stdout.flush() +    sys.stdout.flush() +     +    new_num_statements = chill.num_statements() +    #print "new num statements %d" % new_num_statements     + +    # Unroll to the last thread level +#    for stmt in range(old_num_statements, new_num_statements): +#        print "unrolling statement %d" % stmt +#        level = find_cur_level(stmt,thread_idxs[-1]) #get last thread level +#        print "level is %d" % level +#        idxs = chill.cur_indices(stmt) +#        if level < len(idxs): +#            chill.unroll(stmt,level+1,0) + + + +def copy_to_shared( start_loop, array_name, alignment ): +    #print "\nstarting copy to shared( %s, %s, %d)" % (start_loop, array_name, alignment )  +    #print "copy_to_shared( %s, %s, %d) in cudaize.py" % ( start_loop, array_name, alignment ) +    stmt = 0 # assume statement 0 + +    cur = chill.cur_indices(stmt) +    #print "Cur indices ", +    #print_array( cur ) + +    start_level = find_cur_level( stmt, start_loop ) +    #print "start_level %d" % start_level + +    old_num_statements = chill.num_statements() +    #print "old_num_statements %d" % old_num_statements +     + +    # Now, we give it indices for up to two dimensions for copy loop +    copy_loop_idxs = ["tmp1","tmp2"] +    #chill.datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs, False, 0, 1, alignment,True) +    passtoC = [stmt, start_level, array_name]   # a list +    passtoC.append( len(copy_loop_idxs)) +    for i in copy_loop_idxs: +        passtoC.append(i) +    passtoC.append( 0 ) # False +    passtoC.append( 0 ) +    passtoC.append( 1 ) +    passtoC.append( alignment ) +    passtoC.append( 1 )   # True +    #print "\n[DataCopy]datacopy( ", +    #print passtoC, +    #print ")" + +    #if array_name == "b": +    #    chill.cheat(1) +    #if array_name == "c": +    #    chill.cheat(2) +     +    chill.datacopy_9arg( tuple( passtoC )) + +    #print "back from datacopy_9arg\n\n\n" +    #sys.stdout.flush() + + +    #print "calling add_sync( %d, %s )" % ( stmt, start_loop ) +    chill.add_sync( stmt, start_loop ) +    #print "back from add_sync()\n\n" + +    new_num_statements = chill.num_statements() +     +    #  This is fairly CUBLAS2 specific, not sure how well it generalizes, +    #  but for a 2D copy, what we want to do is "normalize" the first loop +    #  "tmp1" then get its hard upper bound. We then want to tile it to +    #  make the control loop of that tile "ty". We then tile "tmp2" with a +    #  size of 1 and make it "tx". + +    #print "fairly CUBLAS2 specific, OLD %d  NEW %d" % ( old_num_statements, new_num_statements) +    sys.stdout.flush() +    sys.stdout.flush() + +    for stmt in range(old_num_statements, new_num_statements): +        #print "for stmt = %d" % stmt +        level = find_cur_level( stmt, "tmp2") +        #print "FOUND CUR LEVEL?  level '", +        #print level, +        #print "'" + +        #print "in loop, stmt %d   level %d" % ( stmt, level ) +        if level != -1: +            #print "\nCopy to shared: [If was no error]\n" +            find_cur_level(stmt,"tmp2") +            chill.tile3( stmt, level, level ) +             +            #print "hard_loop_bounds( %d, %d )" % (stmt, level) +            bounds = chill.hard_loop_bounds(stmt, level) +            lower = bounds[0] +            upper = 1+ bounds[1] +            #print "lower %d  upper %d" % ( lower, upper ) + +            dims = chill.thread_dims() +            #print "in cudaize.py copy_to_shared, dims =", +            #print dims +            tx = dims[0] +            ty = dims[1] +            #print "2-loop cleanup: lower, upper: %d, %d,  tx: %d" % ( lower, upper, tx) + +            level = find_cur_level(stmt,"tmp1") +            #print "level %d" % level +            if tx == upper and ty == 1: +                #print "tx = %d    upper = %d     ty = %d"% (tx, upper, ty) +                #print "Don't need" + +                # Don't need an extra tile level, just move this loop up +                second_level = find_cur_level(stmt,"tmp2") +                chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted) + +            else: +                #print "DO need?" +                if ty == 1: +                    new_ctrl = "tmp3"  +                else: +                    new_ctrl = "ty" + +                # LOTS of commented out code here in cudaize.lua  + +                #print_code() +                #print "\nStarting tmp2\n" +                first_level  = find_cur_level(stmt,"tmp1") +                second_level = find_cur_level(stmt,"tmp2") +                bounds = chill.hard_loop_bounds(stmt, second_level) +                lower = bounds[0] +                upper = 1 + bounds[1]   # BROKEN? +                         +                #print "[Malik]-loop cleanup@tmp2: lower, upper: %d, %d, tx: %d,first level:%d,second_level:%d" % ( lower, upper-1, tx, first_level, second_level)  + +                # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx. +                #print "\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, second_level,1,first_level, "tx", "tx") +                chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted) +                #print_code() + +                first_level = find_cur_level(stmt,"tmp1") +                bounds = chill.hard_loop_bounds(stmt, first_level) +                lower_1 =     bounds[0] +                upper_1 = 1 + bounds[1] +                tx_level = find_cur_level(stmt,"tx") +                bounds = chill.hard_loop_bounds(stmt,tx_level) +                lower_tx =   bounds[0] +                upper_tx = 1+bounds[1] +                #print "UL_1 %d %d     UL_tx %d %d" % ( lower_1, upper_1-1, lower_tx, upper_tx-1) + +                if int(math.ceil( float(upper_tx)/float(tx))) > 1: +                     #print "ceil I say" +                     #print "\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, tx_level,tx,tx_level, "tx", "tmp1") +                     chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted) +                     #print_code() + +                     repeat = find_cur_level(stmt,"tx") +                     #print "\n[Tile1]tile(%d, %d, %d)" % (stmt, repeat, repeat) +                     chill.tile3(stmt, repeat, repeat)  #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx")) +                     #print_code() + +                     if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"): +                        #print "\nagain [Tile1]tile(%d, %d, %d)" % (stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) +                        chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) +                        #print_code() + +                #print_code() + +                #print "\nStarting tmp1\n" +                # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty". +                chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))       +                #print_code() + +                ty_level = find_cur_level(stmt,"tmp1") +                bounds = chill.hard_loop_bounds(stmt,ty_level) +                lower_ty = bounds[0] +                upper_ty = 1 + bounds[1] + +                tx_level = find_cur_level(stmt,"tx") +                bounds = chill.hard_loop_bounds(stmt,tx_level) +                lower_tx = bounds[0] +                upper_tx = 1 + bounds[1] + +                #print "[Malik]-loop cleanup@tmp1: lowerty, upperty: %d, %d, ty: %d,ty level:%d,tx_level:%d, stmt: %d" % ( lower_ty, upper_ty-1, ty, ty_level, tx_level, stmt) +                 +                #print "before ceil" +                #sys.stdout.flush() + +                if(math.ceil(float(upper_ty)/float(ty)) > 1): +                    #print "CEIL IF" +                    #print "\n Inside upper_ty/ty > 1\n" + +                    #print "\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, ty_level,ty,ty_level, "ty", "tmp_ty") +                    chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted) +                    #print_code() + +                    #print "\n[Tile2-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt  ,"ty"),find_cur_level(stmt,"ty")) +                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty")) +                    #print_code() + +                    cur_idxs = chill.cur_indices(stmt) +                    #print "\n cur indexes are ", +                    #print_array( cur_idxs) +                    #sys.stdout.flush() + +                    # Putting ty before any tmp_tx +                    idx_flag = -1 +                    if "tmp_tx" in cur_idxs: +                        idx_flag = 1 + cur_idxs.index("tmp_tx")   # lua index starts at 1 +                    #print "\n (1) so i have found out the value of idx flag as %d" % idx_flag +                    #sys.stdout.flush()       +                     +                    if idx_flag >= 0: +                         if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"): +                             #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                             chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                             #print_code() +                     +                     +                    #  Now Putting ty before any tmp_ty +                    sys.stdout.flush()       +                    idx_flag = -1 +                    if "tmp_ty" in cur_idxs: +                        idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1 +                    #print "\n IF  so i have found out the value of idx flag as %d" % idx_flag +                    #sys.stdout.flush()       +                                             +                    if idx_flag >= 0: +                        #print "one more test" +                        sys.stdout.flush() +                        if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"): +                            #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                            #sys.stdout.flush() +                            chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                            #print_code() + + + +                else: +                    #print "CEIL ELSE" +                    #print "\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, ty_level,1,ty_level, "ty", "ty") +                    #sys.stdout.flush() +                    chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted ) +                    #print_code() + +                    #print "\n[Tile3-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) +                    sys.stdout.flush() + +                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) +                    #print_code() + + +                    idx_flag = -1 +                    # LUA code checks to see if cur_idxs exists?  it is unused except in the other clause of this is +                    #if(cur_idxs) then +                        #print "CAN NEVER GET HERE?  cur_idxs" +                        #for num= 0,table.getn(cur_idxs) do +                            #if(cur[num] == "tmp_ty") then +                            #idx_flag = find_cur_level(stmt,cur[num]) +                            #break +                        #end +                    #end +                    print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag +                    if idx_flag >= 0:  # can't happen +                        print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                        #chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                     +                         +                     + +                     +            #print "\n\n *** at bottom of if in copy to shared, " +            #print_code() +            #print "end of if" + +        else: +            #  copy to shared only created one level, not two, so we use a different approach (MV & TMV) +            #print "\nCopy to shared: [If was error]\n" +            level = find_cur_level(stmt,"tmp1") +            chill.tile3(stmt, level, level) + +            dims = chill.thread_dims() +            #print dims +            tx = dims[0] +            ty = dims[1] + +            bounds = chill.hard_loop_bounds(stmt, level) +            lower = bounds[0]    +            upper = bounds[1] + +            #print "bounds  lower %d    upper %d" % (lower, upper) +            upper = upper+1 # upper bound given as <=, compare to dimensions tx which is < +            if upper == tx: +                #print "upper == tx" +                chill.rename_index( stmt, "tmp1", "tx") +            else: +                #print "upper is not tx" +                #print "upper %d tx %d stmt: %d level: %d" % ( upper, tx, stmt, level) +                chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted) +                #print_code() + +                #print "stmt:%d level+1: %d" % ( stmt, level+1)  +                #print("TILE 7") +                chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted) +                #print("TILE 3") +                chill.tile3( stmt, level+1, level) +                #print_code()            + + +                if ty > 1: +                   #print "GOING IN" +                   bounds = chill.hard_loop_bounds(stmt, level+1) +                   lower = bounds[0]    +                   upper = bounds[1]    +                   #print "ty %d  lower %d  upper %d" % ( ty, lower, upper ) +                   floatdiv = float(upper)/float(ty) +                   bound =  int(math.ceil(float(upper)/float(ty))) +                   #print "NOW FOR Y: upper %d ty %d stmt: %d level: %d bound: %d" % ( upper, ty, stmt, level+1,   bound) +                   chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted) + +        # Always add sync +        chill.add_sync( stmt, start_loop ) +    #print "ending copy to shared\n" +    #sys.stdout.flush() +    #print_code()      + + + + + + + + + + + + + + + + + + + +def unroll_to_depth( max_depth ): +    print "\n\nunroll_to_depth(%d)" % max_depth +    print "SYNC UP" +    sys.stdout.flush() + +    cur = chill.cur_indices(0) +    thread_idxs = chill.thread_indices() +    guard_idx = thread_idxs[-1]  # last one + +    print "cur    indices", +    print_array(cur) +    print "thread indices",  +    print_array(thread_idxs) +    print "guard_idx = %s" % guard_idx + +    #print "thread_idxs = ", +    #print thread_idxs +    guard_idx = thread_idxs[-1] +    #print "guard_idx = %s" % guard_idx + +    #  HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS +    common_loops = [] +    comm_loops_cnt = 0 +    num_stmts = chill.num_statements() +    print "num statements %d" % num_stmts + +    for stmt in range(num_stmts): +        sys.stdout.flush() +        print "\nSTMT %d" % stmt, +        cur_idxs = chill.cur_indices(stmt) +        print "Current Indices:", +        for c in cur_idxs[:-1]: +            print "%s," % c, +        print "%s" % cur_idxs[-1]   # last one +        sys.stdout.flush() +        #print_code() +         +        if chk_cur_level(stmt, "tx") > 0: +             +            for ii in range(find_cur_level(stmt,"tx")-1): +                print "ii = %d\ncur_idxs[%d] = '%s'" % (ii+1, ii+1, cur_idxs[ii]) # print to match lua +                id = cur_idxs[ii] +                if id not in ["bx", "by", "", "tx", "ty"]: + +                    print "id %s is not in the list" % id + +                    for stmt1 in range(stmt+1, num_stmts): +                        print "\nii %d stmt1 is %d" % (ii+1, stmt1)  # print to match lua  +                        cur_idxs1 = chill.cur_indices(stmt1) +                        print "\nstmt1 cur_idxs1 is ", +                        for ind in cur_idxs1[:-1]: +                            print "%s," % ind, +                        print "%s" % cur_idxs1[-1] + +                        print "cur level(%d, %s) = %d" % (stmt, "tx", find_cur_level(stmt,"tx") ) +                        sys.stdout.flush() + +                        endrange = find_cur_level(stmt,"tx")-1 +                        print "for iii=1, %d do" % endrange +                        sys.stdout.flush() +                        for iii in range(endrange):   # off by one?  TODO  +                            print "stmt %d   ii %d   iii %d\n" % (stmt, ii+1, iii+1), +                            sys.stdout.flush() +                             +                            if iii >= len(cur_idxs1): +                                print "stmt %d   ii %d   iii %d  cur_idxs1[%d] = NIL" % (stmt, ii+1, iii+1, iii+1, )  # print to match lua  +                            else: +                                print "stmt %d   ii %d   iii %d  cur_idxs1[%d] = '%s'" % (stmt, ii+1, iii+1, iii+1, cur_idxs1[iii])  # print to match lua  +                            sys.stdout.flush() + +                            # this will still probably die  +                            if iii < len(cur_idxs1) and [iii] not in ["bx", "by", "tx", "ty", ""]: +                                if cur_idxs[ii] == cur_idxs1[iii]: +                                    print "\nfound idx:%s" % cur_idxs[ii] +                                    common_loops.append(cur_idxs[ii]) +                                    print "cl[%d] = '%s'" % ( comm_loops_cnt, cur_idxs[ii] ) +                                    comm_loops_cnt = len(common_loops) + +    if len(common_loops) > 0: +        print "\n COMM LOOPS :TOTAL %d, and are " % comm_loops_cnt, +        print common_loops,  +        print " this loop : %s" % common_loops[0] +    else: +        print "UNROLL can't unroll any loops?" + + +    while True:  # break at bottom of loop   (repeat in lua) +        old_num_statements = chill.num_statements() +        print "old_num_statements %d" % old_num_statements + +        for stmt in range(old_num_statements): +            cur_idxs = chill.cur_indices(stmt) +            print "stmt %d    cur_idxs =" % stmt, +            index = 0 +            for i in cur_idxs: +                index +=1 +                if index == len(cur_idxs): +                    print "%s" %i +                else: +                    print "%s," % i, + +            if len(cur_idxs) > 0: +                guard_level = -1 +                if chk_cur_level(stmt, guard_idx) > 0: +                    guard_level = find_cur_level(stmt,guard_idx) +                print "guard_level(sp) = %d" % guard_level +                if guard_level > -1: +                    level = next_clean_level(cur_idxs,guard_level) +                    print "next clean level %d" % level + +                     +                    #print "looking at %d" % stmt +                    #print "comparing %d and %d in" % (guard_level, level), +                    #index = 0 +                    #for i in cur_idxs: +                    #index +=1 +                    #if index == len(cur_idxs): +                    #    print "%s" %i +                    #else: +                    #    print "%s," % i, + +                    # need to handle max_depth +                    num_unrolled = 0 +                    level_unroll_comm = level +                    level_arr = [] + +                    #print "before while, level = %d" % level  +                    while level >= 0: +                        print "while: level = %d" % level  +                        if num_unrolled == max_depth: +                            break + +                        print "Unrolling %d at level %d index %s" % ( stmt, level, cur_idxs[guard_level])  # ???  +                        level_arr.append(level) + +                        guard_level = find_cur_level(stmt,guard_idx) +                        level = next_clean_level(cur_idxs,level+1) + +                    print "OK, NOW WE UNROLL" +                    if level_unroll_comm >= 0: +                        level_arr.reverse()   +                        for i,lev in enumerate(level_arr): +                            print "\ni=%d" % i +                            print "[Unroll]unroll(%d, %d, 0)" % (stmt, lev) +                            chill.unroll(stmt, lev, 0) + + +        new_num_statements = chill.num_statements() +        if old_num_statements == new_num_statements: +            break  # exit infinite loop + + +#  all other calls to C have a routine in this file   (?) +def unroll( statement, level, unroll_amount ): +    chill.unroll( statement, level, unroll_amount ) + | 
