diff options
Diffstat (limited to 'loop_cuda.cc')
-rw-r--r-- | loop_cuda.cc | 2123 |
1 files changed, 0 insertions, 2123 deletions
diff --git a/loop_cuda.cc b/loop_cuda.cc deleted file mode 100644 index a23990d..0000000 --- a/loop_cuda.cc +++ /dev/null @@ -1,2123 +0,0 @@ -/***************************************************************************** - Copyright (C) 2009 University of Utah - All Rights Reserved. - - Purpose: - Cudaize methods - - Notes: - - History: - 1/7/10 Created by Gabe Rudy by migrating code from loop.cc - 31/1/11 Modified by Protonu Basu -*****************************************************************************/ - -#include <code_gen/code_gen.h> -#include <code_gen/CG_stringBuilder.h> -#include <code_gen/output_repr.h> -#include <code_gen/CG_outputRepr.h> -#include "loop_cuda.hh" -#include "loop.hh" -#include <math.h> -#include <useful.h> -#include "omegatools.hh" -#include "ir_cudasuif.hh" -#include "ir_suif.hh" -#include "ir_suif_utils.hh" -#include "chill_error.hh" -#include <vector> - -using namespace omega; -char *k_cuda_texture_memory; //protonu--added to track texture memory type -char *k_cuda_constant_memory; //protonu--added to track constant memory type -//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type -extern char *omega::k_ocg_comment; - - -static int cudaDebug; -class CudaStaticInit{ public: CudaStaticInit(){ cudaDebug=0; //Change this to 1 for debug -}}; -static CudaStaticInit junkInitInstance__; - - - -std::string& upcase(std::string& s) -{ - for(int i=0; i<s.size(); i++) - s[i] = toupper(s[i]); - return s; -} - -void printVs(const std::vector<std::string>& curOrder){ - if(!cudaDebug) return; - for(int i=0; i<curOrder.size(); i++){ - if(i>0) - printf(","); - printf("%s", curOrder[i].c_str()); - } - printf("\n"); -} - -void printVS(const std::vector<std::string>& curOrder){ - //if(!cudaDebug) return; - for(int i=0; i<curOrder.size(); i++){ - if(i>0) - printf(","); - printf("%s", curOrder[i].c_str()); - } - printf("\n"); -} - -LoopCuda::~LoopCuda() { - const int m = stmt.size(); - for (int i = 0; i < m; i++) - stmt[i].code->clear(); -} - -bool LoopCuda::symbolExists(std::string s){ - if(symtab->lookup_sym(s.c_str(), SYM_VAR, false)) - return true; - if(globals->lookup_sym(s.c_str(), SYM_VAR, false)) - return true; - for(int i=0; i<idxNames.size(); i++) - for(int j=0; j<idxNames[i].size(); j++) - if(strcmp(idxNames[i][j].c_str(), s.c_str()) == 0) - return true; - return false; -} - -void LoopCuda::addSync(int stmt_num, std::string idxName) -{ - //we store these and code-gen inserts sync to omega comments where stmt - //in loop that has idxName being generated - syncs.push_back(make_pair(stmt_num,idxName)); -} - -void LoopCuda::renameIndex(int stmt_num, std::string idx, std::string newName) -{ - int level = findCurLevel(stmt_num, idx); - if(idxNames.size() <= stmt_num || idxNames[stmt_num].size() < level) - throw std::runtime_error("Invalid statment number of index"); - idxNames[stmt_num][level-1] = newName.c_str(); -} - - - -enum Type{ Int }; - -struct VarDefs{ - std::string name; - std::string secondName; - operand size_expr; //array size as an expression (can be a product of other variables etc) - type_node * type; - var_sym* in_data; //Variable of array to copy data in from (before kernel call) - var_sym* out_data; //Variable of array to copy data out to (after kernel call) - int size_2d; //-1 if linearized, the constant size N, of a NxN 2D array otherwise - bool tex_mapped; //protonu-- true if this variable will be texture mapped, so no need to pass it as a argument - bool cons_mapped; //protonu-- true if this variable will be constant mem mapped, so no need to pass it as a argument - std::string original_name; //this is such a hack, to store the original name, to store a table to textures used - int var_ref_size ; -}; - -tree_node_list* wrapInIfFromMinBound(tree_node_list* then_part, tree_for* loop, base_symtab* symtab, var_sym* bound_sym) -{ - tree_node_list* ub = loop->ub_list(); - tree_node_list_iter upli(ub); - while(!upli.is_empty()){ - tree_node *node = upli.step(); - if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr) - { - in_rrr* ins = (in_rrr*)((tree_instr*)node)->instr(); - //expect the structure: cpy( _ = min(grab_me, _)) - if(ins->opcode() == io_cpy && ins->src1_op().is_instr()){ - ins = (in_rrr*)ins->src1_op().instr(); - if(ins->opcode() == io_min){ - tree_node_list* tnl = new tree_node_list; - tnl->append(if_node(symtab, fold_sle(operand(bound_sym), ins->src1_op().instr()->clone()), then_part)); - return tnl; - } - } - } - } - return then_part; //Failed to go to proper loop level -} - -/** - * This would be better if it was done by a CHiLL xformation instead of at codegen - * - * state: - * for(...) - * for(...) - * cur_body - * stmt1 - * - * stm1 is in-between two loops that are going to be reduced. The - * solution is to put stmt1 at the end of cur_body but conditionally run - * in on the last step of the for loop. - * - * A CHiLL command that would work better: - * - * for(...) - * stmt0 - * for(for i=0; i<n; i++) - * cur_body - * stmt1 - * => - * for(...) - * for(for i=0; i<n; i++) - * if(i==0) stmt0 - * cur_body - * if(i==n-1) stmt1 - */ - -std::vector<tree_for*> findCommentedFors(const char* index, tree_node_list* tnl){ - std::vector<tree_for *> result; - - tree_node_list_iter iter(tnl); - bool next_loop_ok = false; - while (!iter.is_empty()) { - tree_node *tn = iter.step(); - if (tn->kind() == TREE_INSTR && ((tree_instr*)tn)->instr()->opcode() == io_mrk) - { - instruction* inst = ((tree_instr*)tn)->instr(); - std::string comment; - if ((inst->peek_annote(k_ocg_comment) != NULL)) - { - immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment)); - immed_list_iter data_iter(data); - if(!data_iter.is_empty()){ - immed first_immed = data_iter.step(); - if(first_immed.kind() == im_string) - comment = first_immed.string(); - } - } - if(comment.find("~cuda~") != std::string::npos - && comment.find("preferredIdx: ") != std::string::npos){ - std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos); - if(idx.find(" ") != std::string::npos) - idx = idx.substr(0,idx.find(" ")); - if(strcmp(idx.c_str(),index) == 0) - next_loop_ok = true; - } - } - if (tn->kind() == TREE_FOR){ - if(next_loop_ok){ - //printf("found loop %s\n", static_cast<tree_for *>(tn)->index()->name()); - result.push_back(static_cast<tree_for *>(tn)); - } - else{ - //printf("looking down for loop %s\n", static_cast<tree_for *>(tn)->index()->name()); - std::vector<tree_for*> t = findCommentedFors(index, static_cast<tree_for *>(tn)->body()); - std::copy(t.begin(), t.end(), back_inserter(result)); - } - next_loop_ok = false; - } - if (tn->kind() == TREE_IF) { - //printf("looking down if\n"); - tree_if *tni = static_cast<tree_if *>(tn); - std::vector<tree_for*> t = findCommentedFors(index, tni->then_part()); - std::copy(t.begin(), t.end(), back_inserter(result)); - } - } - - return result; -} - -tree_node_list* forReduce(tree_for* loop, var_sym* reduceIndex, proc_symtab* proc_syms) -{ - //We did the replacements all at once with recursiveFindPreferedIdxs - //replacements r; - //r.oldsyms.append(loop->index()); - //r.newsyms.append(reduceIndex); - //tree_for* new_loop = (tree_for*)loop->clone_helper(&r, true); - tree_for* new_loop = loop; - - //return body one loops in - tree_node_list* tnl = loop_body_at_level(new_loop, 1); - //wrap in conditional if necessary - tnl = wrapInIfFromMinBound(tnl, new_loop, proc_syms, reduceIndex); - return tnl; -} - -void recursiveFindRefs(tree_node_list* code, proc_symtab* proc_syms, replacements* r) -{ - if(code->parent() && code->scope()->is_block()) - ((block_symtab*)code->scope())->find_exposed_refs(proc_syms, r); - tree_node_list_iter tnli(code); - while (!tnli.is_empty()) { - tree_node *node = tnli.step(); - //printf("node kind: %d\n", node->kind()); - if(node->is_instr()) - { - tree_instr* t_instr = (tree_instr*)node; - t_instr->find_exposed_refs(proc_syms, r); - } - if(node->is_block()){ - recursiveFindRefs(static_cast<tree_block *>(node)->body(), proc_syms, r); - } - else if(node->is_for()){ - tree_for* tn_for = static_cast<tree_for *>(node); - //Find refs in statemetns and body - tn_for->find_exposed_refs(proc_syms, r); - //recursiveFindRefs(tn_for->body(), proc_syms, r); - } - } -} - -tree_node_list* recursiveFindReplacePreferedIdxs(tree_node_list* code, proc_symtab* proc_syms, - proc_sym* cudaSync, func_type* unkown_func, - std::map<std::string, var_sym*>& loop_idxs) -{ - tree_node_list* tnl = new tree_node_list; - tree_node_list_iter tnli(code); - var_sym* idxSym=0; - bool sync = false; - std::vector<tree_node*> r1; - std::vector<tree_node_list*> r2; - while (!tnli.is_empty()) { - tree_node *node = tnli.step(); - //printf("node kind: %d\n", node->kind()); - if(node->is_instr()) - { - if(((tree_instr*)node)->instr()->format() == inf_rrr){ - in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr(); - if(inst->opcode() == io_mrk){ - std::string comment; - if ((inst->peek_annote(k_ocg_comment) != NULL)) - { - immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment)); - immed_list_iter data_iter(data); - if(!data_iter.is_empty()){ - immed first_immed = data_iter.step(); - if(first_immed.kind() == im_string) - comment = first_immed.string(); - } - } - if(comment.find("~cuda~") != std::string::npos - && comment.find("preferredIdx: ") != std::string::npos){ - std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos); - if(idx.find(" ") != std::string::npos) - idx = idx.substr(0,idx.find(" ")); - //printf("sym_tab preferred index: %s\n", idx.c_str()); - if(loop_idxs.find(idx) != loop_idxs.end()) - idxSym = loop_idxs.find(idx)->second; - //Get the proc variable sybol for this preferred index - if(idxSym == 0){ - idxSym = (var_sym*)proc_syms->lookup_sym(idx.c_str(), SYM_VAR, false); - //printf("idx not found: lookup %p\n", idxSym); - if(!idxSym){ - idxSym = new var_sym(type_s32, (char*)idx.c_str()); - proc_syms->add_sym(idxSym); - //printf("idx created and inserted\n"); - } - //Now insert into our map for future - loop_idxs.insert(make_pair(idx, idxSym)); - } - //See if we have a sync as well - if(comment.find("sync") != std::string::npos){ - //printf("Inserting sync after current block\n"); - sync = true; - } - } - } - } - tnl->append(node); - } - else if(node->is_block()){ - tree_block* b = static_cast<tree_block *>(node); - b->set_body(recursiveFindReplacePreferedIdxs(b->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(b); - } - else if(node->is_for()){ - tree_for* tn_for = static_cast<tree_for *>(node); - if(idxSym){ - //Replace the current tn_for's index variable with idxSym - //printf("replacing sym %s -> %s\n", tn_for->index()->name(), idxSym->name()); - replacements r; - r.oldsyms.append(tn_for->index()); - r.newsyms.append(idxSym); - tree_for* new_loop = (tree_for*)tn_for->clone_helper(&r, true); - idxSym = 0; //Reset for more loops in this tnl - new_loop->set_body(recursiveFindReplacePreferedIdxs(new_loop->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(new_loop); - - if(sync){ - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaSync))), 0); - tnl->append(new tree_instr(the_call)); - //tnl->print(); - sync = true; - } - }else{ - tn_for->set_body(recursiveFindReplacePreferedIdxs(tn_for->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(tn_for); - } - }else if (node->kind() == TREE_IF) { - tree_if *tni = static_cast<tree_if *>(node); - tni->set_then_part(recursiveFindReplacePreferedIdxs(tni->then_part(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(tni); - } - } - //Do this after the loop to not screw up the pointer interator - /* - for(int i=0; i<r1.size(); i++){ - swap_node_for_node_list(r1[i],r2[i]); - }*/ - return tnl; -} - -// loop_vars -> array references -// loop_idxs -> <idx_name,idx_sym> map for when we encounter a loop with a different preferredIndex -// dim_vars -> out param, fills with <old,new> var_sym pair for 2D array dimentions (messy stuff) -tree_node_list* swapVarReferences(tree_node_list* code, replacements* r, CG_suifBuilder *ocg, - std::map<std::string, var_sym*>& loop_vars, - proc_symtab *proc_syms, - std::vector< std::pair<var_sym*,var_sym*> >& dim_vars) -{ - //Iterate over every expression, looking up each variable and type - //reference used and possibly replacing it or adding it to our symbol - //table - // - //We use the built-in cloning helper methods to seriously help us with this! - - //Need to do a recursive mark - recursiveFindRefs(code, proc_syms, r); - - - //We can't rely on type_node->clone() to do the heavy lifting when the - //old type is a two dimentional array with variable upper bounds as - //that requires creating and saveing variable references to the upper - //bounds. So we do one pass over the oldtypes doing this type of - //conversion, putting results in the fixed_types map for a second pass - //to pick up. - std::map<type_node*,type_node*> fixed_types; //array_types needing their upper bound installed - type_node_list_iter tlip(&r->oldtypes); - while(!tlip.is_empty()) - { - type_node* old_tn = tlip.step(); - type_node* new_tn = 0; - type_node* base_type = old_tn; - std::vector< std::pair<var_sym*, type_node*> > variable_upper_bouneds; - if(old_tn->is_ptr()){ - while (base_type->is_array() || base_type->is_ptr()) { - if (base_type->is_array()){ - array_bound ub = ((array_type*)base_type)->upper_bound(); - if(ub.is_variable()){ - var_sym* old_ub = (var_sym*)ub.variable(); - var_sym *new_ub = proc_syms->new_unique_var(type_s32); - dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub)); - variable_upper_bouneds.push_back( std::pair<var_sym*, type_node*>(new_ub, base_type) ); - } - base_type = static_cast<array_type *>(base_type)->elem_type(); - } - else if (base_type->is_ptr()) - base_type = static_cast<ptr_type *>(base_type)->ref_type(); - } - } - for (int i = variable_upper_bouneds.size()-1; i >= 0; i--) { - var_sym *var_ub = variable_upper_bouneds[i].first; - type_node* old_tn = variable_upper_bouneds[i].second; - if(new_tn == 0) - new_tn = new array_type(base_type, array_bound(1), array_bound(var_ub)); - else - new_tn = new array_type(new_tn, array_bound(1), array_bound(var_ub)); - proc_syms->add_type(new_tn); - fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn)); - } - if(new_tn){ - if(old_tn->is_ptr()){ - new_tn = new ptr_type(new_tn); - proc_syms->add_type(new_tn); - } - fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn)); - } - } - - //Quickly look for modifiers on our our array types (__shared__ float [][]) - type_node_list_iter tliq(&r->oldtypes); - while(!tliq.is_empty()) - { - type_node* old_tn = tliq.step(); - if(old_tn->is_modifier()){ - type_node* base_type = static_cast<modifier_type *>(old_tn)->base(); - if(fixed_types.find(base_type) != fixed_types.end()){ - type_node* fixed_base = (*fixed_types.find(base_type)).second; - //printf("Fix modifier with fixed base\n"); - //This should work to copy over the annotations, but apparently doesn't work so well - type_node* new_tn = new modifier_type(static_cast<modifier_type*>(old_tn)->op(), fixed_base); - old_tn->copy_annotes(new_tn); - fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn)); - } - } - } - - //Run through the types and create entries in r->newtypes but don't install - type_node_list_iter tli(&r->oldtypes); - while(!tli.is_empty()) - { - type_node* old_tn = tli.step(); - type_node* new_tn = 0; - - //If we recorded this as fixed by our special case, use that type - //instead of cloning. - if(fixed_types.find(old_tn) != fixed_types.end()){ - new_tn = (*fixed_types.find(old_tn)).second; - //printf("Reusing fixed typ %u: ", new_tn->type_id()); - }else{ - new_tn = old_tn->clone(); - //printf("Cloning type %u: ", old_tn->type_id()); - } - new_tn = proc_syms->install_type(new_tn); - - //Ok, there is a weird case where an array type that has var_sym as - //their upper bounds can't be covered fully in this loop or the - //var_sym loop, so we need special code. - /* - if(old_tn->op() == TYPE_PTR && ((ptr_type*)old_tn)->ref_type()->op() == TYPE_ARRAY){ - array_type* outer_array = (array_type*)((ptr_type*)old_tn)->ref_type(); - array_bound ub = outer_array->upper_bound(); - if(ub.is_variable()){ - var_sym* old_ub = (var_sym*)ub.variable(); - var_sym* new_ub = (var_sym*)((array_type*)((ptr_type*)new_tn)->ref_type())->upper_bound().variable(); - //r->oldsyms.append(old_ub); - fix_ub.insert(std::pair<var_sym*,array_type*>(old_ub, (array_type*)((ptr_type*)new_tn)->ref_type())); - dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub)); - printf("array var_sym: %p\n", new_ub); - } - if(outer_array->elem_type()->op() == TYPE_ARRAY) - { - array_type* inner_array = (array_type*)outer_array->elem_type(); - array_bound ub = inner_array->upper_bound(); - if(ub.is_variable()){ - var_sym* old_ub = (var_sym*)ub.variable(); - var_sym* new_ub = (var_sym*)((array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())->upper_bound().variable(); - dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub)); - printf("array var_sym: %p\n", new_ub); - //r->oldsyms.append(old_ub); - fix_ub.insert(std::pair<var_sym*,array_type*>(old_ub, (array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())); - } - } - } - */ - r->newtypes.append(new_tn); - } - - //printf("proc_syms symbol run through\n"); - //proc_syms->print(); - - //Run through the syms creating new copies - sym_node_list_iter snli(&r->oldsyms); - while(!snli.is_empty()) - { - sym_node *old_sn = snli.step(); - - if(loop_vars.count(std::string(old_sn->name())) > 0) - { - r->newsyms.append(loop_vars[std::string(old_sn->name())]); - //printf("def exists: %s\n", old_sn->name()); - }else{ - sym_node *new_sn = old_sn->copy(); - if(new_sn->is_var()){ - var_sym* var = (var_sym*)new_sn; - type_node* new_type = var->type()->clone_helper(r); - - //TODO: Have a tagged list of variables to make shared - //Make local 2D arrays __shared__ - if(new_type->op() == TYPE_ARRAY && ((array_type*)new_type)->elem_type()->op() == TYPE_ARRAY){ - //protonu--changes suggested by Malik - //printf("Adding __shared__ annotation to : %s\n", new_sn->name()); - //new_type = ocg->ModifyType(new_type, "__shared__"); - //proc_syms->add_type(new_type); - } - var->set_type(new_type); - } - proc_syms->add_sym(new_sn); - r->newsyms.append(new_sn); - //printf("def new: %s\n", new_sn->name()); - } - } - - //printf("proc_syms var runthrough\n"); - //proc_syms->print(); - return code->clone_helper(r); -} - -bool LoopCuda::validIndexes(int stmt, const std::vector<std::string>& idxs){ - for(int i=0; i<idxs.size(); i++){ - bool found = false; - for(int j=0; j<idxNames[stmt].size(); j++){ - if(strcmp(idxNames[stmt][j].c_str(), idxs[i].c_str()) == 0){ - found=true; - } - } - if(!found){ - return false; - } - } - return true; -} - - -bool LoopCuda::cudaize_v2(std::string kernel_name, std::map<std::string, int> array_dims, - std::vector<std::string> blockIdxs, std::vector<std::string> threadIdxs) -{ - int stmt_num = 0; - if(cudaDebug){ - printf("cudaize_v2(%s, {", kernel_name.c_str()); - //for( - printf("}, blocks={"); printVs(blockIdxs); printf("}, thread={"); printVs(threadIdxs); printf("})\n"); - } - - this->array_dims = array_dims; - if(!validIndexes(stmt_num, blockIdxs)){ - throw std::runtime_error("One of the indexes in the block list was not " - "found in the current set of indexes."); - } - if(!validIndexes(stmt_num, threadIdxs)){ - throw std::runtime_error("One of the indexes in the thread list was not " - "found in the current set of indexes."); - } - if(blockIdxs.size() ==0) - throw std::runtime_error("Cudaize: Need at least one block dimention"); - int block_level=0; - //Now, we will determine the actual size (if possible, otherwise - //complain) for the block dimentions and thread dimentions based on our - //indexes and the relations for our stmt; - for(int i=0; i<blockIdxs.size(); i++){ - int level = findCurLevel(stmt_num, blockIdxs[i]); - int ub,lb; - extractCudaUB(stmt_num,level,ub,lb); - if(lb!= 0){ - //attempt to "normalize" the loop with an in-place tile and then re-check our bounds - if(cudaDebug) printf("Cudaize: doing tile at level %d to try and normalize lower bounds\n", level); - tile(stmt_num,level,1,level,CountedTile); - idxNames[stmt_num].insert(idxNames[stmt_num].begin()+(level),"");//TODO: possibly handle this for all sibling stmts - extractCudaUB(stmt_num,level,ub,lb); - } - if(lb != 0){ - char buf[1024]; - sprintf(buf, "Cudaize: Loop at level %d does not have 0 as it's lower bound", level); - throw std::runtime_error(buf); - } - if(ub < 0){ - char buf[1024]; - sprintf(buf, "Cudaize: Loop at level %d does not have a hard upper bound", level); - throw std::runtime_error(buf); - } - if(cudaDebug) printf("block idx %s level %d lb: %d ub %d\n", blockIdxs[i].c_str(), level, lb, ub); - if(i == 0){ - block_level = level; - cu_bx = ub+1; - idxNames[stmt_num][level-1] = "bx"; - } - else if(i == 1){ - cu_by = ub+1; - idxNames[stmt_num][level-1] = "by"; - } - } - if(!cu_by) - block_level=0; - int thread_level1 = 0; - int thread_level2 = 0; - for(int i=0; i<threadIdxs.size(); i++){ - int level = findCurLevel(stmt_num, threadIdxs[i]); - int ub,lb; - extractCudaUB(stmt_num,level,ub,lb); - if(lb!= 0){ - //attempt to "normalize" the loop with an in-place tile and then re-check our bounds - if(cudaDebug) printf("Cudaize: doing tile at level %d to try and normalize lower bounds\n", level); - tile(stmt_num,level,1,level,CountedTile); - idxNames[stmt_num].insert(idxNames[stmt_num].begin()+(level),""); - extractCudaUB(stmt_num,level,ub,lb); - } - if(lb != 0){ - char buf[1024]; - sprintf(buf, "Cudaize: Loop at level %d does not have 0 as it's lower bound", level); - throw std::runtime_error(buf); - } - if(ub < 0){ - char buf[1024]; - sprintf(buf, "Cudaize: Loop at level %d does not have a hard upper bound", level); - throw std::runtime_error(buf); - } - - if(cudaDebug) printf("thread idx %s level %d lb: %d ub %d\n", threadIdxs[i].c_str(), level, lb, ub); - if(i == 0){ - thread_level1 = level; - cu_tx = ub+1; - idxNames[stmt_num][level-1] = "tx"; - } - else if(i == 1){ - thread_level2 = level; - cu_ty = ub+1; - idxNames[stmt_num][level-1] = "ty"; - } - else if(i == 2){ - cu_tz = ub+1; - idxNames[stmt_num][level-1] = "tz"; - } - } - if(!cu_ty) - thread_level1 = 0; - if(!cu_tz) - thread_level2 = 0; - - //Make changes to nonsplitlevels - const int m = stmt.size(); - for (int i = 0; i < m; i++) { - if(block_level){ - //stmt[i].nonSplitLevels.append((block_level)*2); - stmt_nonSplitLevels[i].append((block_level)*2); - } - if(thread_level1){ - //stmt[i].nonSplitLevels.append((thread_level1)*2); - stmt_nonSplitLevels[i].append((thread_level1)*2); - } - if(thread_level2){ - //stmt[i].nonSplitLevels.append((thread_level1)*2); - stmt_nonSplitLevels[i].append((thread_level1)*2); - } - } - - if(cudaDebug) { - printf("Codegen: current names: "); - printVS(idxNames[stmt_num]); - } - //Set codegen flag - code_gen_flags |= GenCudaizeV2; - - //Save array dimention sizes - this->array_dims = array_dims; - cu_kernel_name = kernel_name.c_str(); - -} - -tree_node_list* LoopCuda::cudaize_codegen_v2() -{ - //printf("cudaize codegen V2\n"); - CG_suifBuilder *ocg = dynamic_cast<CG_suifBuilder*>(ir->builder()); - if(!ocg) return false; - - //protonu--adding an annote to track texture memory type - ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE); - ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE); - int tex_mem_on = 0; - int cons_mem_on = 0; - - - - CG_outputRepr* repr; - std::vector<VarDefs> arrayVars; - std::vector<VarDefs> localScopedVars; - - std::vector<IR_ArrayRef *> ro_refs; - std::vector<IR_ArrayRef *> wo_refs; - std::set<std::string> uniqueRefs; - std::set<std::string> uniqueWoRefs; - //protonu--let's try a much simpler approach of a map instead - //we also keep a map for constant memories - std::map<std::string , var_sym *>tex_ref_map; - std::map<std::string , var_sym *>cons_ref_map; - - for(int j=0; j<stmt.size(); j++) - { - std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[j].code); - for (int i = 0; i < refs.size(); i++) - { - //printf("ref %s wo %d\n", static_cast<const char*>(refs[i]->name()), refs[i]->is_write()); - var_sym* var = symtab->lookup_var((char*)refs[i]->name().c_str(),false); - //If the array is not a parameter, then it's a local array and we - //want to recreate it as a stack variable in the kernel as opposed to - //passing it in. - if(!var->is_param()) - continue; - if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end()) - { - uniqueRefs.insert(refs[i]->name()); - if(refs[i]->is_write()){ - uniqueWoRefs.insert(refs[i]->name()); - wo_refs.push_back(refs[i]); - } - else - ro_refs.push_back(refs[i]); - } - if (refs[i]->is_write() && uniqueWoRefs.find(refs[i]->name()) == uniqueWoRefs.end()){ - uniqueWoRefs.insert(refs[i]->name()); - wo_refs.push_back(refs[i]); - //printf("adding %s to wo\n", static_cast<const char*>(refs[i]->name())); - } - } - } - - // printf("reading from array "); - // for(int i=0; i<ro_refs.size(); i++) - // printf("'%s' ", ro_refs[i]->name().c_str()); - // printf("and writting to array "); - // for(int i=0; i<wo_refs.size(); i++) - // printf("'%s' ", wo_refs[i]->name().c_str()); - // printf("\n"); - - const char* gridName = "dimGrid"; - const char* blockName = "dimBlock"; - - //TODO: Could allow for array_dims_vars to be a mapping from array - //references to to variable names that define their length. - var_sym* dim1 = 0; - var_sym* dim2 = 0; - - for(int i=0; i<wo_refs.size(); i++) - { - //TODO: Currently assume all arrays are floats of one or two dimentions - var_sym* outArray = 0; - std::string name = wo_refs[i]->name(); - outArray = symtab->lookup_var((char*)name.c_str(),false); - - VarDefs v; - v.size_2d = -1; - char buf[32]; - snprintf(buf, 32, "devO%dPtr", i+1); - v.name = buf; - if(outArray->type()->is_ptr()) - if(((ptr_type *)(outArray->type()))->ref_type()->is_array()) - v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type(); - else - v.type = ((ptr_type *)(outArray->type()))->ref_type(); - else - v.type = type_f32; - v.tex_mapped = false; - v.cons_mapped = false; - v.original_name = wo_refs[i]->name(); - //Size of the array = dim1 * dim2 * num bytes of our array type - - //If our input array is 2D (non-linearized), we want the actual - //dimentions of the array - CG_outputRepr* size; - //Lookup in array_dims - std::map<std::string, int>::iterator it = array_dims.find(name.c_str()); - if(outArray->type()->is_ptr() && outArray->type()->ref_type(0)->is_array()) - { - array_type* t = (array_type*)outArray->type()->ref_type(0); - v.size_2d = t->upper_bound().constant()+1; - printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)wo_refs[i]->name().c_str()); - size = ocg->CreateInt(v.size_2d * v.size_2d); - }else if(it != array_dims.end()){ - int ref_size = it->second; - v.var_ref_size = ref_size; - size = ocg->CreateInt(ref_size); - } - else{ - if(dim1){ - size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)), - new CG_suifRepr(operand(dim2))); - }else{ - char buf[1024]; - sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a " - "detectable size or specififed dimentions", name.c_str()); - throw std::runtime_error(buf); - } - } - v.size_expr = operand(static_cast<CG_suifRepr*>(ocg->CreateTimes( - size, - ocg->CreateInt(v.type->size()/8)))->GetExpression()); - v.in_data = 0; - v.out_data = outArray; - //Check for in ro_refs and remove it at this point - std::vector<IR_ArrayRef *>::iterator it_; - for(it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++) - { - if((*it_)->name() == wo_refs[i]->name()){ - break; - } - } - if(it_ != ro_refs.end()) - { - v.in_data = outArray; - ro_refs.erase(it_); - } - - arrayVars.push_back(v); - - } - - //protonu-- assuming that all texture mapped memories were originally read only mems - //there should be safety checks for that, will implement those later - - int cs_ref_size = 0; - - for(int i=0; i<ro_refs.size(); i++) - { - var_sym* inArray = 0; - std::string name = ro_refs[i]->name(); - inArray = symtab->lookup_var((char*)name.c_str(),false); - VarDefs v; - v.size_2d = -1; - char buf[32]; - snprintf(buf, 32, "devI%dPtr", i+1); - v.name = buf; - if(inArray->type()->is_ptr()) - if(((ptr_type *)(inArray->type()))->ref_type()->is_array()) - v.type = ((array_type *)(((ptr_type *)(inArray->type()))->ref_type()))->elem_type(); - else - v.type = ((ptr_type *)(inArray->type()))->ref_type(); - else - v.type = type_f32; - v.tex_mapped = false; - v.cons_mapped = false; - v.original_name = ro_refs[i]->name(); - if ( texture != NULL) - v.tex_mapped = (texture->is_array_tex_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars - if (v.tex_mapped){ - printf("this variable %s is mapped to texture memory", name.c_str()); - } - if ( constant_mem != NULL) - v.cons_mapped = (constant_mem->is_array_cons_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars - if (v.cons_mapped){ - printf("this variable %s is mapped to constant memory", name.c_str()); - } - - //Size of the array = dim1 * dim2 * num bytes of our array type - - //If our input array is 2D (non-linearized), we want the actual - //dimentions of the array (as it might be less than cu_n - CG_outputRepr* size; - //Lookup in array_dims - std::map<std::string, int>::iterator it = array_dims.find(name.c_str()); - int ref_size = 0; - if(inArray->type()->is_ptr() && inArray->type()->ref_type(0)->is_array()) - { - array_type* t = (array_type*)inArray->type()->ref_type(0); - v.size_2d = t->upper_bound().constant()+1; - printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)ro_refs[i]->name().c_str()); - size = ocg->CreateInt(v.size_2d * v.size_2d); - }else if(it != array_dims.end()){ - ref_size = it->second; - v.var_ref_size = ref_size; - size = ocg->CreateInt(ref_size); - }else{ - if(dim1){ - size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)), - new CG_suifRepr(operand(dim2))); - }else{ - char buf[1024]; - sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a " - "detectable size or specififed dimentions", name.c_str()); - throw std::runtime_error(buf); - } - } - - - - v.size_expr = operand(static_cast<CG_suifRepr*>(ocg->CreateTimes( - size, - ocg->CreateInt(v.type->size()/8)))->GetExpression()); - - v.in_data = inArray; - v.out_data = 0; - arrayVars.push_back(v); - } - - - if(arrayVars.size() < 2) - { - fprintf(stderr, "cudaize error: Did not find two arrays being accessed\n"); - return false; - } - - //protonu--debugging tool--the printf statement - //tex_mem_on signals use of tex mem - for(int i=0; i<arrayVars.size(); i++) - { - //printf("var name %s, tex_mem used %s\n", arrayVars[i].name.c_str(), (arrayVars[i].tex_mapped)?"true":"false"); - if (arrayVars[i].tex_mapped ) tex_mem_on ++; - if (arrayVars[i].cons_mapped ) cons_mem_on ++; - } - - //Add CUDA function extern prototypes and function types - func_type* unkown_func = new func_type(type_s32); //function on unkown args that returns a i32 - unkown_func = (func_type*)symtab->install_type(unkown_func); - func_type* void_func = new func_type(type_void); //function on unkown args that returns a void - void_func = (func_type*)globals->install_type(void_func); - func_type* float_func = new func_type(type_f32); //function on unkown args that returns a float - float_func = (func_type*)globals->install_type(float_func); - - type_node* result = ocg->ModifyType(type_void, "__global__"); - result = globals->install_type(result); - func_type* kernel_type = new func_type(result); //function returns a '__global__ void' - - int numArgs = arrayVars.size() + (dim1 ? 2 : 0) + localScopedVars.size(); - //protonu--need to account for texture memory here, reduce the #args - if( tex_mem_on ) numArgs -= tex_mem_on; - if( cons_mem_on ) numArgs -= cons_mem_on; - kernel_type->set_num_args(numArgs); - int argCount = 0; - for(int i=0; i<arrayVars.size(); i++) - { - type_node* fptr; - if(arrayVars[i].in_data) - fptr = arrayVars[i].in_data->type()->clone(); - else - fptr = arrayVars[i].out_data->type()->clone(); - //protonu--skip this for texture mems - if( arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true ) - kernel_type->set_arg_type(argCount++, fptr); - } - if(dim1){ - kernel_type->set_arg_type(argCount++, type_s32); //width x height dimentions - kernel_type->set_arg_type(argCount++, type_s32); - } - kernel_type = (func_type*)globals->install_type(kernel_type); - - proc_sym* cudaMalloc = globals->new_proc(unkown_func, src_c, "cudaMalloc"); - proc_sym* cudaMemcpy = globals->new_proc(unkown_func, src_c, "cudaMemcpy"); - proc_sym* cudaFree = globals->new_proc(unkown_func, src_c, "cudaFree"); - proc_sym* cudaSync = globals->new_proc(void_func, src_c, "__syncthreads"); - proc_sym* cudaBind = globals->new_proc(unkown_func, src_c, "cudaBindTexture"); - proc_sym* cudaMemcpySym = globals->new_proc(unkown_func, src_c, "cudaMemcpyToSymbol"); - - - //protonu-removing Gabe's function, introducing mine, this is pretty cosmetic - //proc_sym* cudaFetch = globals->new_proc(float_func, src_c, "tex1Dfetch"); - proc_sym* tex1D = globals->new_proc(float_func, src_c, "tex1Dfetch"); - - var_sym *cudaMemcpyHostToDevice = new var_sym(type_s32, "cudaMemcpyHostToDevice"); - var_sym *cudaMemcpyDeviceToHost = new var_sym(type_s32, "cudaMemcpyDeviceToHost"); - cudaMemcpyDeviceToHost->set_param(); - cudaMemcpyHostToDevice->set_param(); - globals->add_sym(cudaMemcpyHostToDevice); - globals->add_sym(cudaMemcpyDeviceToHost); - - //protonu--adding the bool tex_mem to the structure struct_type - //to bypass the re-naming of struct texture, this is a hack fix - struct_type* texType = new struct_type(TYPE_GROUP, 0, "texture<float, 1, cudaReadModeElementType>", 0, true); - immed_list *iml_tex = new immed_list; - iml_tex->append(immed("texture memory")); - texType->append_annote(k_cuda_texture_memory, iml_tex); - //protonu--end my changes - texType = (struct_type*)globals->install_type(texType); - //protonu--should register the locals later on - //when we do the bind operation - //var_sym* texRef = new var_sym(texType, "texRef"); - //globals->add_sym(texRef); - - //Add our mallocs (and input array memcpys) - for(int i=0; i<arrayVars.size(); i++) - { - //protonu--check if the variable is not a tex-mapped variable. If it is tex mapped - // allow a malloc and memcpy operation, and a bind, but only if it is tex mapped, but dont call - // the kernel with it as an argument. - - //Make a pointer of type a[i].type - //type_node* fptr = new ptr_type(arrayVars[i].type->clone()); - //protonu--temporary change - type_node* fptr = new ptr_type(arrayVars[i].type); - fptr = symtab->install_type(fptr); - var_sym *dvs = new var_sym(fptr, const_cast<char*>( - arrayVars[i].name.c_str())); - dvs->set_addr_taken(); - symtab->add_sym(dvs); - - //cudaMalloc args - //protonu--no cudaMalloc required for constant memory - tree_node_list* tnl = new tree_node_list; - if(arrayVars[i].cons_mapped != true ) - { - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMalloc))), 2); - the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to()->ptr_to(), operand(), immed(dvs)))); - the_call->set_argument(1, arrayVars[i].size_expr); - - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - } - if(arrayVars[i].in_data) - { - //cudaMemcpy args - //protonu-- no cudaMemcpy required for constant memory - if ( arrayVars[i].cons_mapped != true ) - { - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4); - the_call->set_argument(0, operand(dvs)); - the_call->set_argument(1, operand(arrayVars[i].in_data)); - the_call->set_argument(2, arrayVars[i].size_expr.clone()); - the_call->set_argument(3, operand(cudaMemcpyHostToDevice)); - - tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - } - - //protonu--check if the arrayvar is tex mapped - if(arrayVars[i].tex_mapped == true) - { - //Need a texture reference variable - char buf[32]; - snprintf(buf, 32, "tex%dRef", i+1); - arrayVars[i].secondName = buf; - - var_sym* texRef = new var_sym(texType, buf); - //printf("\n putting in %s\n", arrayVars[i].original_name.c_str()); - tex_ref_map[arrayVars[i].original_name] = texRef; - globals->add_sym(texRef); - //protonu--added the above two lines - - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaBind))), 4); - in_ldc *ins = new in_ldc(type_s32, operand(), immed(0)); - the_call->set_argument(0, operand(ins)); - the_call->set_argument(1, operand(texRef));//protonu--change to add the new sym - the_call->set_argument(2, operand(dvs)); - the_call->set_argument(3, arrayVars[i].size_expr.clone()); - - tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - } - - //protonu--if arrayvar is mapped to constant memory - if(arrayVars[i].cons_mapped == true) - { - char buf[32]; - snprintf(buf, 32, "cs%dRef", i+1); - //arrayVars[i].secondName = buf; - array_bound low (0); - array_bound high (arrayVars[i].var_ref_size -1); - array_type *arr = new array_type(arrayVars[i].type,low, high); - type_node* cons_arr = ocg->ModifyType(arr, "__device__ __constant__"); - cons_arr = globals->install_type(cons_arr); - var_sym* consRef = new var_sym(cons_arr, buf); - cons_ref_map[arrayVars[i].original_name] = consRef; - globals->add_sym(consRef); - - - - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpySym))), 3); - the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to(), operand(), immed(consRef)))); - the_call->set_argument(1, operand(arrayVars[i].in_data)); - the_call->set_argument(2, arrayVars[i].size_expr.clone()); - - tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - - } - } - } - - //Build dimGrid dim3 variables based on loop dimentions and ti/tj - char blockD1[120]; - char blockD2[120]; - if(dim1){ - snprintf(blockD1, 120, "%s/%d", dim1->name(), cu_tx); - snprintf(blockD2, 120, "%s/%d", dim2->name(), cu_ty); - }else{ - snprintf(blockD1, 120, "%d", cu_bx); - snprintf(blockD2, 120, "%d", cu_by); - //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx); - //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty); - } - repr = ocg->CreateDim3(immed((char*)gridName), - immed(blockD1), - immed(blockD2)); - setup_code = ocg->StmtListAppend(setup_code, repr); - - repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx),immed(cu_ty)); - - if(cu_tz > 1) - repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty), immed(cu_tz)); - else - repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty)); - setup_code = ocg->StmtListAppend(setup_code, repr); - - //call kernel function with name loop_name - //like: transpose_k<<<dimGrid,dimBlock>>>(devOPtr, devIPtr , width, height); - char dims[120]; - snprintf(dims,120,"<<<%s,%s>>>",gridName, blockName); - immed_list *iml = new immed_list; - iml->append(immed((char*)cu_kernel_name.c_str())); - iml->append(immed(dims)); - //printf("%s %s\n", static_cast<const char*>(cu_kernel_name), dims); - for(int i=0; i<arrayVars.size(); i++) - //Throw in a type cast if our kernel takes 2D array notation - //like (float(*) [1024]) - { - //protonu--throwing in another hack to stop the caller from passing tex mapped - //vars to the kernel. - if(arrayVars[i].tex_mapped == true || arrayVars[i].cons_mapped == true ) - continue; - if(arrayVars[i].size_2d >= 0) - { - snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d, - const_cast<char*>(arrayVars[i].name.c_str())); - //printf("%d %s\n", i, dims); - iml->append(immed(dims)); - }else{ - //printf("%d %s\n", i, static_cast<const char*>(arrayVars[i].name)); - iml->append(immed(const_cast<char*>( - arrayVars[i].name.c_str()))); - } - } - if(dim1){ - iml->append(immed(dim1)); - iml->append(immed(dim2)); - } - repr = ocg->CreateKernel(iml);//kernel call - setup_code = ocg->StmtListAppend(setup_code, repr); - - //cuda free variables - for(int i=0; i<arrayVars.size(); i++) - { - if(arrayVars[i].out_data) - { - //cudaMemcpy args - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4); - the_call->set_argument(0, operand(arrayVars[i].out_data)); - the_call->set_argument(1, operand(symtab->lookup_var(const_cast<char*>( - arrayVars[i].name.c_str())))); - the_call->set_argument(2, arrayVars[i].size_expr.clone()); - the_call->set_argument(3, operand(cudaMemcpyDeviceToHost)); - - tree_node_list* tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - teardown_code = ocg->StmtListAppend(teardown_code, - new CG_suifRepr(tnl)); - } - - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaFree))), 1); - the_call->set_argument(0, operand(symtab->lookup_var(const_cast<char*>( - arrayVars[i].name.c_str())))); - - tree_node_list* tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - teardown_code = ocg->StmtListAppend(teardown_code, - new CG_suifRepr(tnl)); - } - - // --------------- - // BUILD THE KERNEL - // --------------- - - //Extract out kernel body - tree_node_list* code = getCode(); - //Get rid of wrapper if that original() added - if(code->head()->contents->kind() == TREE_IF) - { - tree_if* ifn = (tree_if*)code->head()->contents; - code = ifn->then_part(); - } - - //Create kernel function body - proc_sym *new_psym = globals->new_proc(kernel_type, src_c, (char*)cu_kernel_name.c_str()); - proc_symtab *new_proc_syms = new proc_symtab(new_psym->name()); - globals->add_child(new_proc_syms); - - //Add Params - std::map<std::string, var_sym*> loop_vars; - //In-Out arrays - type_node* fptr; - for(int i=0; i<arrayVars.size(); i++) - { - if(arrayVars[i].in_data) - //fptr = arrayVars[i].in_data->type()->clone(); - fptr = arrayVars[i].in_data->type(); - else - //fptr = arrayVars[i].out_data->type()->clone(); - fptr = arrayVars[i].out_data->type(); - fptr = new_proc_syms->install_type(fptr); - std::string name = arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name(); - var_sym* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name()); - //protonu--adding a check to ensure that texture memories are not passed in as arguments - if(arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true ) - { - sym->set_param(); - new_proc_syms->params()->append(sym); - new_proc_syms->add_sym(sym);//protonu--added to suppress the addition of the redundant var in the kernel - } - if (arrayVars[i].cons_mapped == true) - { - sym->set_param(); - new_proc_syms->add_sym(sym); - } - //printf("inserting name: %s\n", static_cast<const char*>(name)); - loop_vars.insert(std::pair<std::string, var_sym*>(std::string(name), sym)); - } - - if(dim1) - { - //Array dimentions - var_sym* kdim1 = new var_sym(dim1->type(), dim1->name()); - kdim1->set_param(); - new_proc_syms->add_sym(kdim1); - loop_vars.insert(std::pair<std::string, var_sym*>(std::string(dim1->name()), kdim1)); - var_sym* kdim2 = new var_sym(dim2->type(), dim2->name()); - kdim2->set_param(); - new_proc_syms->add_sym(kdim2); - loop_vars.insert(std::pair<std::string, var_sym*>(std::string(dim2->name()), kdim2)); - new_proc_syms->params()->append(kdim1); - new_proc_syms->params()->append(kdim2); - } - //Put block and thread implicit variables into scope - std::vector<var_sym *> index_syms; - /* Currently we don't use the block dimentions - var_sym* blockDim_x = new var_sym(type_s32, "blockDim.x"); - blockDim_x->set_param(); - new_proc_syms->add_sym(blockDim_x); - var_sym* blockDim_y = new var_sym(type_s32, "blockDim.y"); - blockDim_y->set_param(); - new_proc_syms->add_sym(blockDim_y); - */ - if(cu_bx > 1){ - var_sym* blockIdx_x = new var_sym(type_s32, "blockIdx.x"); - blockIdx_x->set_param(); - new_proc_syms->add_sym(blockIdx_x); - index_syms.push_back(blockIdx_x); - } - if(cu_by > 1){ - var_sym* blockIdx_y = new var_sym(type_s32, "blockIdx.y"); - blockIdx_y->set_param(); - new_proc_syms->add_sym(blockIdx_y); - index_syms.push_back(blockIdx_y); - } - if(cu_tx > 1){ - var_sym* threadIdx_x = new var_sym(type_s32, "threadIdx.x"); - threadIdx_x->set_param(); - new_proc_syms->add_sym(threadIdx_x); - index_syms.push_back(threadIdx_x); - } - if(cu_ty > 1){ - var_sym* threadIdx_y = new var_sym(type_s32, "threadIdx.y"); - threadIdx_y->set_param(); - new_proc_syms->add_sym(threadIdx_y); - index_syms.push_back(threadIdx_y); - } - - if(cu_tz > 1){ - var_sym* threadIdx_z = new var_sym(type_s32, "threadIdx.z"); - threadIdx_z->set_param(); - new_proc_syms->add_sym(threadIdx_z); - index_syms.push_back(threadIdx_z); - } - - //Figure out which loop variables will be our thread and block dimention variables - std::vector<var_sym *> loop_syms; - //Get our indexes - std::vector<const char*> indexes;// = get_loop_indexes(code,cu_num_reduce); - int threadsPos=0; - if(cu_bx > 1) - indexes.push_back("bx"); - if(cu_by > 1) - indexes.push_back("by"); - if(cu_tx > 1){ - threadsPos = indexes.size(); - indexes.push_back("tx"); - } - if(cu_ty > 1) - indexes.push_back("ty"); - if(cu_tz > 1) - indexes.push_back("tz"); - for(int i=0; i<indexes.size(); i++) - { - //printf("indexes[%d] = %s\n", i, (char*)indexes[i]); - loop_syms.push_back(new var_sym(type_s32, (char*)indexes[i])); - new_proc_syms->add_sym(loop_syms[i]); - //loop_vars.insert(std::pair<std::string, var_sym*>(std::string(indexes[i]), loop_syms[i])); - } - - //Generate this code - //int bx = blockIdx.x - //int by = blockIdx.y - //int tx = threadIdx.x - //int ty = threadIdx.y - CG_outputRepr *body=NULL; - for(int i=0; i<indexes.size(); i++){ - CG_outputRepr *lhs = new CG_suifRepr(operand(loop_syms[i])); - //body = ocg->StmtListAppend(body, ocg->CreateStmtList( - // ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))))); - body = ocg->StmtListAppend(body, ocg->StmtListAppend( - ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))), NULL)); - } - - //Get our inital code prepped for loop reduction. First we need to swap - //out internal SUIF variable references to point to the new local - //function symbol table. - std::map<std::string, var_sym*> loop_idxs; //map from idx names to their new syms - std::vector< std::pair<var_sym*, var_sym*> > dim_vars; //pair is of <old,new> var_sym (for 2D array size initializations) - replacements r; - tree_node_list* swapped = swapVarReferences(code, &r, ocg, loop_vars, new_proc_syms, dim_vars); - //printf("\n code before recursiveFindReplacePreferedIdxs :\n"); - //swapped->print(); - swapped = recursiveFindReplacePreferedIdxs(swapped, new_proc_syms, cudaSync, void_func, loop_idxs);//in-place swapping - //printf("\n code after recursiveFindReplacePreferedIdxs :\n"); - //swapped->print(); - - for(int i=0; i<indexes.size(); i++){ - std::vector<tree_for*> tfs = findCommentedFors(indexes[i], swapped); - for(int k=0; k<tfs.size(); k++){ - //printf("replacing %p tfs for index %s\n", tfs[k], indexes[i]); - tree_node_list* newBlock = forReduce(tfs[k], loop_idxs[indexes[i]], new_proc_syms); - //newBlock->print(); - swap_node_for_node_list(tfs[k], newBlock); - //printf("AFTER SWAP\n"); newBlock->print(); - } - } - //printf("AFTER REDUCE\n"); swapped->print(); - - if(static_cast<const IR_cudasuifCode *>(ir)->init_code()){ - tree_node_list* orig_init_code = static_cast<CG_suifRepr *>(static_cast<const IR_cudasuifCode *>(ir)->init_code())->GetCode(); - for(int i=0; i<dim_vars.size(); i++){ - //We have a map of var_sym from the original function body and we know - //that these var_syms have initialization statements which define the - //array size. We need to mimic these initialization statements. - - //First find the assignment and pull out the constant initialization - //value - int value = -1; - tree_node_list_iter tnli(orig_init_code); - while (!tnli.is_empty()) { - tree_node *node = tnli.step(); - if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr) - { - in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr(); - //expect the structure: cpy( _ = min(grab_me, _)) - if(inst->opcode() == io_cpy && inst->dst_op().is_symbol()){ - //printf("looking at instruction: "); - //inst->print(); - var_sym* dest = inst->dst_op().symbol(); - if(dest == dim_vars[i].first) - { - if(inst->src1_op().is_instr() && inst->src1_op().instr()->format() == inf_ldc){ - value = ((in_ldc*)inst->src1_op().instr())->value().integer(); - } - } - } - } - } - if(value < 0){ - fprintf(stderr, "ERROR: Could not find initializing statement for variable used in upper_bound of array type"); - } - CG_outputRepr *lhs = new CG_suifRepr(operand(dim_vars[i].second)); - //body = ocg->StmtListAppend(body, ocg->CreateStmtList(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)))); - body = ocg->StmtListAppend(body, ocg->StmtListAppend(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)), NULL)); - } - } - - - body = ocg->StmtListAppend(body, new CG_suifRepr(swapped)); - - //protonu--lets try creating our function definiton here - var_sym *tsym = NULL; - - - std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(body); - for(int i=0; i<refs.size(); i++) - { - //check if the array is tex mapped - if(texture != NULL && texture->is_array_tex_mapped(refs[i]->name().c_str())) - { - //protonu--our new tex lookup function - in_cal *tex_lookup = - new in_cal(type_f32, operand(), operand(new in_ldc(float_func->ptr_to(), operand(), immed(tex1D))), 2); - - //printf("name of the array to be mapped is %s\n", refs[i]->name().c_str()); - tsym = tex_ref_map[refs[i]->name()]; - tex_lookup->set_argument(0, operand(tsym)); - - - int array_dims = ((IR_suifArrayRef *)refs[i])->ia_->dims(); - - if (array_dims == 1){ - tex_lookup->set_argument(1, ((IR_suifArrayRef *)refs[i])->ia_->index(0).clone()); - }else if (array_dims > 2) { - printf(" \n we don't handle more than 2D arrays mapped to textures yet\n"); - }else if (array_dims == 2) { - - IR_ArraySymbol *sym = refs[i]->symbol(); - CG_outputRepr *sz = sym->size(1); - delete sym; // free the wrapper object only - // find the builder ocg - CG_outputRepr *expr = ocg->CreateTimes(sz->clone(),refs[i]->index(0)); - delete sz; // free the wrapper object only - expr = ocg->CreatePlus(expr, refs[i]->index(1)); - // expr holds the 1D access expression and take it out - tex_lookup->set_argument(1, ((CG_suifRepr *)expr)->GetExpression()); - } - - //using chun's function to replace the array look up with the function call - ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(tex_lookup))); - } - - } - - - tsym = NULL; - //protonu--now let's try what we did above for constant memory - for(int i=0; i<refs.size(); i++) - { - //check if the array is tex mapped - if(constant_mem != NULL && constant_mem->is_array_cons_mapped(refs[i]->name().c_str())) - { - - //printf("name of the array to be cons mapped is %s\n", refs[i]->name().c_str()); - tsym = cons_ref_map[refs[i]->name()]; - //we should create a IR_SuifArray here - IR_ArraySymbol *ar_sym = new IR_suifArraySymbol(ir,tsym); - std::vector<CG_outputRepr *> ar_index; - ar_index.push_back(((IR_suifArrayRef *)refs[i])->index(0)); - IR_ArrayRef *ar_ref = ((IR_suifCode *)ir)->CreateArrayRef(ar_sym, ar_index); - //using chun's function to replace the array look up with the function call - ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(((IR_suifArrayRef *)ar_ref)->ia_))); - - } - } - - - tree_proc *new_body = new tree_proc(static_cast<CG_suifRepr*>(body)->GetCode(), new_proc_syms); - //globals->add_child(new_proc_syms); - new_psym->set_block(new_body); - new_procs.push_back(new_psym); - - return swapped; -} - -//Order taking out dummy variables -std::vector<std::string> cleanOrder(std::vector<std::string> idxNames){ - std::vector<std::string> results; - for(int j=0; j<idxNames.size(); j++){ - if(idxNames[j].length() != 0) - results.push_back(idxNames[j]); - } - return results; -} - -//First non-dummy level in ascending order -int LoopCuda::nonDummyLevel(int stmt, int level){ - //level comes in 1-basd and should leave 1-based - for(int j=level-1; j<idxNames[stmt].size(); j++){ - if(idxNames[stmt][j].length() != 0){ - //printf("found non dummy level of %d with idx: %s when searching for %d\n", j+1, (const char*) idxNames[stmt][j], level); - return j+1; - } - } - char buf[128]; sprintf(buf, "%d", level); - throw std::runtime_error(std::string("Unable to find a non-dummy level starting from ") + std::string(buf)); -} - -int LoopCuda::findCurLevel(int stmt, std::string idx){ - for(int j=0; j<idxNames[stmt].size(); j++){ - if(strcmp(idxNames[stmt][j].c_str(),idx.c_str()) == 0) - return j+1; - } - throw std::runtime_error(std::string("Unable to find index ") + idx + std::string(" in current list of indexes")); -} - -void LoopCuda::permute_cuda(int stmt, const std::vector<std::string>& curOrder) -{ - //printf("curOrder: "); - //printVs(curOrder); - //printf("idxNames: "); - //printVS(idxNames[stmt]); - std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt]); - bool same=true; - std::vector<int> pi; - for(int i=0; i<curOrder.size(); i++){ - bool found = false; - for(int j=0; j<cIdxNames.size(); j++){ - if(strcmp(cIdxNames[j].c_str(), curOrder[i].c_str()) == 0){ - pi.push_back(j+1); - found=true; - if(j!=i) - same=false; - } - } - if(!found){ - throw std::runtime_error("One of the indexes in the permute order where not " - "found in the current set of indexes."); - } - } - for(int i=curOrder.size(); i<cIdxNames.size(); i++){ - pi.push_back(i); - } - if(same) - return; - permute(stmt, pi); - //Set old indexe names as new - for(int i=0; i<curOrder.size(); i++){ - idxNames[stmt][i] = curOrder[i].c_str(); //what about sibling stmts? - } -} - - -bool LoopCuda::permute(int stmt_num, const std::vector<int> &pi) -{ -// check for sanity of parameters - if (stmt_num >= stmt.size() || stmt_num < 0) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - const int n = stmt[stmt_num].xform.n_out(); - if (pi.size() > (n-1)/2) - throw std::invalid_argument("iteration space dimensionality does not match permute dimensionality"); - int first_level = 0; - int last_level = 0; - for (int i = 0; i < pi.size(); i++) { - if (pi[i] > (n-1)/2 || pi[i] <= 0) - throw std::invalid_argument("invalid loop level " + to_string(pi[i]) + " in permuation"); - - if (pi[i] != i+1) { - if (first_level == 0) - first_level = i+1; - last_level = i+1; - } - } - if (first_level == 0) - return true; - - std::vector<int> lex = getLexicalOrder(stmt_num); - std::set<int> active = getStatements(lex, 2*first_level-2); - Loop::permute(active, pi); -} - - -void LoopCuda::tile_cuda(int stmt, int level, int outer_level) -{ - tile_cuda(stmt,level,1,outer_level,"","",CountedTile); -} -void LoopCuda::tile_cuda(int level, int tile_size, int outer_level, std::string idxName, - std::string ctrlName, TilingMethodType method){ - tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method); -} - -void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level, std::string idxName, - std::string ctrlName, TilingMethodType method){ - //Do regular tile but then update the index and control loop variable - //names as well as the idxName to reflect the current state of things. - //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level); - //printf("idxNames before: "); - //printVS(idxNames[stmt]); - - tile(stmt, level, tile_size, outer_level, method); - - if(idxName.size()) - idxNames[stmt][level-1] = idxName.c_str(); - if(tile_size == 1){ - //potentially rearrange loops - if(outer_level < level){ - std::string tmp = idxNames[stmt][level-1]; - for(int i=level-1; i>outer_level-1; i--){ - if(i-1 >= 0) - idxNames[stmt][i] = idxNames[stmt][i-1]; - } - idxNames[stmt][outer_level-1] = tmp; - } - //TODO: even with a tile size of one, you need a insert (of a dummy loop) - idxNames[stmt].insert(idxNames[stmt].begin()+(level),""); - }else{ - if(!ctrlName.size()) - throw std::runtime_error("No ctrl loop name for tile"); - //insert - idxNames[stmt].insert(idxNames[stmt].begin()+(outer_level-1),ctrlName.c_str()); - } - - //printf("idxNames after: "); - //printVS(idxNames[stmt]); -} - - -bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level, const std::string &array_name, const std::vector<int> &privatized_levels, bool allow_extra_read , int fastest_changing_dimension , int padding_stride , int padding_alignment , bool cuda_shared) -{ - int old_stmts =stmt.size(); - //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared); - if(cuda_shared) - datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 1); - else - datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 0); - - - //Adjust idxNames to reflect updated state - std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]); - int new_stmts = stmt.size(); - for(int i=old_stmts; i<new_stmts; i++){ - //printf("fixing up statement %d\n", i); - std::vector<std::string> idxs; - - - //protonu-making sure the vector of nonSplitLevels grows along with - //the statement structure - stmt_nonSplitLevels.push_back(omega::Tuple<int>()); - - //Indexes up to level will be the same - for(int j=0; j<level-1; j++) - idxs.push_back(cIdxNames[j]); - - //Expect privatized_levels to match - for(int j=0; j<privatized_levels.size(); j++) - idxs.push_back(cIdxNames[privatized_levels[j]-1]);//level is one-based - - //all further levels should match order they are in originally - if(privatized_levels.size()){ - int last_privatized = privatized_levels.back(); - int top_level = last_privatized + (stmt[i].IS.n_set()-idxs.size()); - //printf("last privatized_levels: %d top_level: %d\n", last_privatized, top_level); - for(int j=last_privatized; j<top_level; j++){ - idxs.push_back(cIdxNames[j]); - //printf("pushing back: %s\n", (const char*)cIdxNames[j]); - } - } - idxNames.push_back(idxs); - } -} - -bool LoopCuda::datacopy_cuda(int stmt_num, int level, const std::string &array_name, std::vector<std::string> new_idxs, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, bool cuda_shared) -{ - - int old_stmts =stmt.size(); - //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared); - if(cuda_shared) - datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 1); - else - datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 0); - //Adjust idxNames to reflect updated state - std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]); - int new_stmts = stmt.size(); - for(int i=old_stmts; i<new_stmts; i++){ - //printf("fixing up statement %d\n", i); - std::vector<std::string> idxs; - - //protonu-making sure the vector of nonSplitLevels grows along with - //the statement structure - stmt_nonSplitLevels.push_back(omega::Tuple<int>()); - - //protonu--lets dump out the code from each statement here - //printf("\n dumping statement :%d", i); - //stmt[i].code->Dump(); - - //Indexes up to level will be the same - for(int j=0; j<level-1; j++) - idxs.push_back(cIdxNames[j]); - - //all further levels should get names from new_idxs - int top_level = stmt[i].IS.n_set(); - //printf("top_level: %d level: %d\n", top_level, level); - if(new_idxs.size() < top_level-level+1) - throw std::runtime_error("Need more new index names for new datacopy loop levels"); - - for(int j=level-1; j<top_level; j++){ - idxs.push_back(new_idxs[j-level+1].c_str()); - //printf("pushing back: %s\n", new_idxs[j-level+1].c_str()); - } - idxNames.push_back(idxs); - } -} - -bool LoopCuda::unroll_cuda(int stmt_num, int level, int unroll_amount) -{ - int old_stmts =stmt.size(); - //bool b= unroll(stmt_num, , unroll_amount); - - - int dim = 2*level-1; - std::vector<int> lex = getLexicalOrder(stmt_num); - std::set<int> same_loop = getStatements(lex, dim-1); - - level = nonDummyLevel(stmt_num,level); - //printf("unrolling %d at level %d\n", stmt_num,level); - - //protonu--using the new version of unroll, which returns - //a set of ints instead of a bool. To keep Gabe's logic - //I'll check the size of the set, if it's 0 return true - //bool b= unroll(stmt_num, level, unroll_amount); - std::set<int> b_set= unroll(stmt_num, level, unroll_amount); - bool b = false; - if (b_set.size() == 0) b = true; - //end--protonu - - //Adjust idxNames to reflect updated state - std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]); - std::vector<std::string> origSource = idxNames[stmt_num];; - //Drop index names at level - if(unroll_amount == 0){ - //For all statements that were in this unroll together, drop index name for unrolled level - idxNames[stmt_num][level-1] = ""; - for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - //printf("in same loop as %d is %d\n", stmt_num, (*i)); - //idxNames[(*i)][level-1] = ""; - idxNames[(*i)] = idxNames[stmt_num]; - } - } - - lex = getLexicalOrder(stmt_num); - same_loop = getStatements(lex, dim-1); - - bool same_as_source = false; - int new_stmts = stmt.size(); - for(int i=old_stmts; i<new_stmts; i++){ - //Check whether we had a sync for the statement we are unrolling, if - //so, propogate that to newly created statements so that if they are - //in a different loop structure, they will also get a syncthreads - int size = syncs.size(); - for(int j=0; j<size; j++){ - if(syncs[j].first == stmt_num) - syncs.push_back(make_pair(i,syncs[j].second)); - } - - //protonu-making sure the vector of nonSplitLevels grows along with - //the statement structure - stmt_nonSplitLevels.push_back(omega::Tuple<int>()); - - - //We expect that new statements have a constant for the variable in - //stmt[i].IS at level (as seen with print_with_subs), otherwise there - //will be a for loop at level and idxNames should match stmt's - //idxNames pre-unrolled - Relation IS = stmt[i].IS; - //Ok, if you know how the hell to get anything out of a Relation, you - //should probably be able to do this more elegantly. But for now, I'm - //hacking it. - std::string s = IS.print_with_subs_to_string(); - //s looks looks like - //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128} - //where level == 5, you see a integer in the input set - - //If that's not an integer and this is the first new statement, then - //we think codegen will have a loop at that level. It's not perfect, - //not sure if it can be determined without round-tripping to codegen. - int sIdx = 0; - int eIdx = 0; - for(int j=0; j<level-1; j++){ - sIdx = s.find(",",sIdx+1); - if(sIdx < 0) break; - } - if(sIdx > 0){ - eIdx = s.find("]"); - int tmp = s.find(",",sIdx+1); - if(tmp > 0 && tmp < eIdx) - eIdx = tmp; //", before ]" - if(eIdx > 0){ - sIdx++; - std::string var = s.substr(sIdx,eIdx-sIdx); - //printf("%s\n", s.c_str()); - //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str()); - if(atoi(var.c_str()) == 0 && i ==old_stmts){ - //TODO:Maybe do see if this new statement would be in the same - //group as the original and if it would, don't say - //same_as_source - if(same_loop.find(i) == same_loop.end()){ - printf("stmt %d level %d, newly created unroll statement should have same level indexes as source\n", i, level); - same_as_source = true; - } - } - } - } - - - //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1); - if(same_as_source) - idxNames.push_back(origSource); - else - idxNames.push_back(idxNames[stmt_num]); - } - - return b; -} - -void LoopCuda::copy_to_texture(const char *array_name) -{ - //protonu--placeholder for now - //set the bool for using cuda memory as true - //in a vector of strings, put the names of arrays to tex mapped - if ( !texture ) - texture = new texture_memory_mapping(true, array_name); - else - texture->add(array_name); - - -} - - -void LoopCuda::copy_to_constant(const char *array_name) -{ - //protonu--placeholder for now - //set the bool for using cuda memory as true - //in a vector of strings, put the names of arrays to tex mapped - if ( !constant_mem ) - constant_mem = new constant_memory_mapping(true, array_name); - else - constant_mem->add(array_name); -} - -//protonu--moving this from Loop -tree_node_list* LoopCuda::codegen() -{ - if(code_gen_flags & GenCudaizeV2) - return cudaize_codegen_v2(); - //Do other flagged codegen methods, return plain vanilla generated code - return getCode(); -} - -//These three are in Omega code_gen.cc and are used as a massive hack to -//get out some info from MMGenerateCode. Yea for nasty side-effects. -namespace omega{ - extern int checkLoopLevel; - extern int stmtForLoopCheck; - extern int upperBoundForLevel; - extern int lowerBoundForLevel; -} - - -void LoopCuda::extractCudaUB(int stmt_num, int level, int &outUpperBound, int &outLowerBound){ - // check for sanity of parameters - const int m = stmt.size(); - if (stmt_num >= m || stmt_num < 0) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - const int n = stmt[stmt_num].xform.n_out(); - if (level > (n-1)/2 || level <= 0) - throw std::invalid_argument("invalid loop level " + to_string(level)); - - int dim = 2*level-1; - - std::vector<int> lex = getLexicalOrder(stmt_num); - std::set<int> same_loop = getStatements(lex, dim-1); - - // extract the intersection of the iteration space to be considered - Relation hull; - { - hull = Relation::True(n); - for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - hull = Intersection(hull, project_onto_levels(getNewIS(*i), dim+1, true)); - hull.simplify(2, 4); - } - - for (int i = 2; i <= dim+1; i+=2) { - //std::string name = std::string("_t") + to_string(t_counter++); - std::string name = std::string("_t") + to_string(tmp_loop_var_name_counter++); - hull.name_set_var(i, name); - } - hull.setup_names(); - } - - // extract the exact loop bound of the dimension to be unrolled - if (is_single_iteration(hull, dim)){ - throw std::runtime_error("No loop availabe at level to extract upper bound."); - } - Relation bound = get_loop_bound(hull, dim); - if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology()) - throw loop_error("loop error: unable to extract loop bound for cudaize"); - - // extract the loop stride - EQ_Handle stride_eq; - int stride = 1; - { - bool simple_stride = true; - int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(dim+1), stride_eq, simple_stride); - if (strides > 1) - throw loop_error("loop error: too many strides"); - else if (strides == 1) { - int sign = stride_eq.get_coef(bound.set_var(dim+1)); -// assert(sign == 1 || sign == -1); - Constr_Vars_Iter it(stride_eq, true); - stride = abs((*it).coef/sign); - } - } - if(stride != 1){ - char buf[1024]; - sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d", level, stride); - throw std::runtime_error(buf); - } - - //Use code generation system to build tell us our bound information. We - //need a hard upper bound a 0 lower bound. - - checkLoopLevel = level*2; - stmtForLoopCheck = stmt_num; - upperBoundForLevel = -1; - lowerBoundForLevel = -1; - printCode(1,false); - checkLoopLevel = 0; - - outUpperBound = upperBoundForLevel; - outLowerBound = lowerBoundForLevel; - return; -} - - -void LoopCuda::printCode(int effort, bool actuallyPrint) const { - const int m = stmt.size(); - if (m == 0) - return; - const int n = stmt[0].xform.n_out(); - - - - Tuple<Relation> IS(m); - Tuple<Relation> xform(m); - Tuple<IntTuple > nonSplitLevels(m); - for (int i = 0; i < m; i++) { - IS[i+1] = stmt[i].IS; - xform[i+1] = stmt[i].xform; - nonSplitLevels[i+1] = stmt_nonSplitLevels[i]; - //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; - } - - Tuple< Tuple<std::string> > idxTupleNames; - if(useIdxNames){ - for(int i=0; i<idxNames.size(); i++){ - Tuple<std::string> idxs; - for(int j=0; j<idxNames[i].size(); j++) - idxs.append(idxNames[i][j]); - idxTupleNames.append( idxs ); - } - } - - Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); - CG_stringBuilder *ocg = new CG_stringBuilder(); - Tuple<CG_outputRepr *> nameInfo; - for (int i = 1; i <= m; i++) - nameInfo.append(new CG_stringRepr("s" + to_string(i))); - CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort); - if(actuallyPrint) - std::cout << GetString(repr); -/* - for (int i = 1; i <= m; i++) - delete nameInfo[i]; -*/ - - delete ocg; -} - - - -void LoopCuda::printRuntimeInfo() const { - for(int i=0; i<stmt.size(); i++){ - Relation IS = stmt[i].IS; - Relation xform = stmt[i].xform; - printf("stmt[%d]\n", i); - printf("IS\n"); - IS.print_with_subs(); - - printf("xform[%d]\n", i); - xform.print_with_subs(); - - //printf("code\n"); - //static_cast<CG_suifRepr *>(stmt[i].code)->GetCode()->print_expr(); - } -} - -void LoopCuda::printIndexes() const { - for(int i=0; i<stmt.size(); i++){ - printf("stmt %d nset %d ", i, stmt[i].IS.n_set()); - - for(int j=0; j<idxNames[i].size(); j++){ - if(j>0) - printf(","); - printf("%s", idxNames[i][j].c_str()); - } - printf("\n"); - } -} - -tree_node_list* LoopCuda::getCode(int effort) const { - const int m = stmt.size(); - if (m == 0) - return new tree_node_list; - const int n = stmt[0].xform.n_out(); - - - - Tuple<CG_outputRepr *> ni(m); - Tuple<Relation> IS(m); - Tuple<Relation> xform(m); - Tuple< IntTuple > nonSplitLevels(m); - for (int i = 0; i < m; i++) { - ni[i+1] = stmt[i].code; - IS[i+1] = stmt[i].IS; - xform[i+1] = stmt[i].xform; - nonSplitLevels[i+1] = stmt_nonSplitLevels[i]; - //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; - } - - - Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); -#ifdef DEBUG -// std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, known, effort)); -#endif - Tuple< Tuple<std::string> > idxTupleNames; - if(useIdxNames){ - for(int i=0; i<idxNames.size(); i++){ - Tuple<std::string> idxs; - for(int j=0; j<idxNames[i].size(); j++) - idxs.append(idxNames[i][j]); - idxTupleNames.append( idxs ); - } - } - - CG_outputBuilder *ocg = ir->builder(); - CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, nonSplitLevels, syncs, idxTupleNames, effort); - - //CG_outputRepr *overflow_initialization = ocg->CreateStmtList(); - //protonu--using the new function CG_suifBuilder::StmtListAppend - CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL); - for (std::map<int, std::vector<Free_Var_Decl *> >::const_iterator i = overflow.begin(); i != overflow.end(); i++) - for (std::vector<Free_Var_Decl *>::const_iterator j = i->second.begin(); j != i->second.end(); j++) - //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)))); - overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->StmtListAppend(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)), NULL)); - - repr = ocg->StmtListAppend(overflow_initialization, repr); - tree_node_list *tnl = static_cast<CG_suifRepr *>(repr)->GetCode(); - - delete repr; - /* - for (int i = 1; i <= m; i++) - delete ni[i]; - */ - - return tnl; -} - - -//protonu--adding constructors for the new derived class -LoopCuda::LoopCuda():Loop(), code_gen_flags(GenInit){} - -LoopCuda::LoopCuda(IR_Control *irc, int loop_num) - :Loop(irc) -{ - setup_code = NULL; - teardown_code = NULL; - code_gen_flags = 0; - cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1; - cu_num_reduce = 0; - cu_mode = GlobalMem; - texture = NULL; - constant_mem = NULL; - - int m=stmt.size(); - //printf("\n the size of stmt(initially) is: %d\n", stmt.size()); - for(int i=0; i<m; i++) - stmt_nonSplitLevels.push_back(omega::Tuple<int>()); - - - //protonu--setting up - //proc_symtab *symtab - //global_symtab *globals - - globals = ((IR_cudasuifCode *)ir)->gsym_ ; - std::vector<tree_for *> tf = ((IR_cudasuifCode *)ir)->get_loops(); - - symtab = tf[loop_num]->proc()->block()->proc_syms(); - - std::vector<tree_for *> deepest = find_deepest_loops(tf[loop_num]); - - for (int i = 0; i < deepest.size(); i++){ - index.push_back(deepest[i]->index()->name()); //reflects original code index names - } - - for(int i=0; i< stmt.size(); i++) - idxNames.push_back(index); //refects prefered index names (used as handles in cudaize v2) - useIdxNames=false; - -} - |