diff options
author | dhuth <derickhuth@gmail.com> | 2014-08-27 09:52:06 -0600 |
---|---|---|
committer | dhuth <derickhuth@gmail.com> | 2014-08-27 09:52:06 -0600 |
commit | bff810cc371a38f493d688c54f71013f5a7d53bf (patch) | |
tree | fbe86954bb3c01deb21da9e41ebff5baa2889a45 /loop_cuda.cc | |
download | chill-bff810cc371a38f493d688c54f71013f5a7d53bf.tar.gz chill-bff810cc371a38f493d688c54f71013f5a7d53bf.tar.bz2 chill-bff810cc371a38f493d688c54f71013f5a7d53bf.zip |
Initial commit
Diffstat (limited to 'loop_cuda.cc')
-rw-r--r-- | loop_cuda.cc | 2123 |
1 files changed, 2123 insertions, 0 deletions
diff --git a/loop_cuda.cc b/loop_cuda.cc new file mode 100644 index 0000000..a23990d --- /dev/null +++ b/loop_cuda.cc @@ -0,0 +1,2123 @@ +/***************************************************************************** + Copyright (C) 2009 University of Utah + All Rights Reserved. + + Purpose: + Cudaize methods + + Notes: + + History: + 1/7/10 Created by Gabe Rudy by migrating code from loop.cc + 31/1/11 Modified by Protonu Basu +*****************************************************************************/ + +#include <code_gen/code_gen.h> +#include <code_gen/CG_stringBuilder.h> +#include <code_gen/output_repr.h> +#include <code_gen/CG_outputRepr.h> +#include "loop_cuda.hh" +#include "loop.hh" +#include <math.h> +#include <useful.h> +#include "omegatools.hh" +#include "ir_cudasuif.hh" +#include "ir_suif.hh" +#include "ir_suif_utils.hh" +#include "chill_error.hh" +#include <vector> + +using namespace omega; +char *k_cuda_texture_memory; //protonu--added to track texture memory type +char *k_cuda_constant_memory; //protonu--added to track constant memory type +//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type +extern char *omega::k_ocg_comment; + + +static int cudaDebug; +class CudaStaticInit{ public: CudaStaticInit(){ cudaDebug=0; //Change this to 1 for debug +}}; +static CudaStaticInit junkInitInstance__; + + + +std::string& upcase(std::string& s) +{ + for(int i=0; i<s.size(); i++) + s[i] = toupper(s[i]); + return s; +} + +void printVs(const std::vector<std::string>& curOrder){ + if(!cudaDebug) return; + for(int i=0; i<curOrder.size(); i++){ + if(i>0) + printf(","); + printf("%s", curOrder[i].c_str()); + } + printf("\n"); +} + +void printVS(const std::vector<std::string>& curOrder){ + //if(!cudaDebug) return; + for(int i=0; i<curOrder.size(); i++){ + if(i>0) + printf(","); + printf("%s", curOrder[i].c_str()); + } + printf("\n"); +} + +LoopCuda::~LoopCuda() { + const int m = stmt.size(); + for (int i = 0; i < m; i++) + stmt[i].code->clear(); +} + +bool LoopCuda::symbolExists(std::string s){ + if(symtab->lookup_sym(s.c_str(), SYM_VAR, false)) + return true; + if(globals->lookup_sym(s.c_str(), SYM_VAR, false)) + return true; + for(int i=0; i<idxNames.size(); i++) + for(int j=0; j<idxNames[i].size(); j++) + if(strcmp(idxNames[i][j].c_str(), s.c_str()) == 0) + return true; + return false; +} + +void LoopCuda::addSync(int stmt_num, std::string idxName) +{ + //we store these and code-gen inserts sync to omega comments where stmt + //in loop that has idxName being generated + syncs.push_back(make_pair(stmt_num,idxName)); +} + +void LoopCuda::renameIndex(int stmt_num, std::string idx, std::string newName) +{ + int level = findCurLevel(stmt_num, idx); + if(idxNames.size() <= stmt_num || idxNames[stmt_num].size() < level) + throw std::runtime_error("Invalid statment number of index"); + idxNames[stmt_num][level-1] = newName.c_str(); +} + + + +enum Type{ Int }; + +struct VarDefs{ + std::string name; + std::string secondName; + operand size_expr; //array size as an expression (can be a product of other variables etc) + type_node * type; + var_sym* in_data; //Variable of array to copy data in from (before kernel call) + var_sym* out_data; //Variable of array to copy data out to (after kernel call) + int size_2d; //-1 if linearized, the constant size N, of a NxN 2D array otherwise + bool tex_mapped; //protonu-- true if this variable will be texture mapped, so no need to pass it as a argument + bool cons_mapped; //protonu-- true if this variable will be constant mem mapped, so no need to pass it as a argument + std::string original_name; //this is such a hack, to store the original name, to store a table to textures used + int var_ref_size ; +}; + +tree_node_list* wrapInIfFromMinBound(tree_node_list* then_part, tree_for* loop, base_symtab* symtab, var_sym* bound_sym) +{ + tree_node_list* ub = loop->ub_list(); + tree_node_list_iter upli(ub); + while(!upli.is_empty()){ + tree_node *node = upli.step(); + if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr) + { + in_rrr* ins = (in_rrr*)((tree_instr*)node)->instr(); + //expect the structure: cpy( _ = min(grab_me, _)) + if(ins->opcode() == io_cpy && ins->src1_op().is_instr()){ + ins = (in_rrr*)ins->src1_op().instr(); + if(ins->opcode() == io_min){ + tree_node_list* tnl = new tree_node_list; + tnl->append(if_node(symtab, fold_sle(operand(bound_sym), ins->src1_op().instr()->clone()), then_part)); + return tnl; + } + } + } + } + return then_part; //Failed to go to proper loop level +} + +/** + * This would be better if it was done by a CHiLL xformation instead of at codegen + * + * state: + * for(...) + * for(...) + * cur_body + * stmt1 + * + * stm1 is in-between two loops that are going to be reduced. The + * solution is to put stmt1 at the end of cur_body but conditionally run + * in on the last step of the for loop. + * + * A CHiLL command that would work better: + * + * for(...) + * stmt0 + * for(for i=0; i<n; i++) + * cur_body + * stmt1 + * => + * for(...) + * for(for i=0; i<n; i++) + * if(i==0) stmt0 + * cur_body + * if(i==n-1) stmt1 + */ + +std::vector<tree_for*> findCommentedFors(const char* index, tree_node_list* tnl){ + std::vector<tree_for *> result; + + tree_node_list_iter iter(tnl); + bool next_loop_ok = false; + while (!iter.is_empty()) { + tree_node *tn = iter.step(); + if (tn->kind() == TREE_INSTR && ((tree_instr*)tn)->instr()->opcode() == io_mrk) + { + instruction* inst = ((tree_instr*)tn)->instr(); + std::string comment; + if ((inst->peek_annote(k_ocg_comment) != NULL)) + { + immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment)); + immed_list_iter data_iter(data); + if(!data_iter.is_empty()){ + immed first_immed = data_iter.step(); + if(first_immed.kind() == im_string) + comment = first_immed.string(); + } + } + if(comment.find("~cuda~") != std::string::npos + && comment.find("preferredIdx: ") != std::string::npos){ + std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos); + if(idx.find(" ") != std::string::npos) + idx = idx.substr(0,idx.find(" ")); + if(strcmp(idx.c_str(),index) == 0) + next_loop_ok = true; + } + } + if (tn->kind() == TREE_FOR){ + if(next_loop_ok){ + //printf("found loop %s\n", static_cast<tree_for *>(tn)->index()->name()); + result.push_back(static_cast<tree_for *>(tn)); + } + else{ + //printf("looking down for loop %s\n", static_cast<tree_for *>(tn)->index()->name()); + std::vector<tree_for*> t = findCommentedFors(index, static_cast<tree_for *>(tn)->body()); + std::copy(t.begin(), t.end(), back_inserter(result)); + } + next_loop_ok = false; + } + if (tn->kind() == TREE_IF) { + //printf("looking down if\n"); + tree_if *tni = static_cast<tree_if *>(tn); + std::vector<tree_for*> t = findCommentedFors(index, tni->then_part()); + std::copy(t.begin(), t.end(), back_inserter(result)); + } + } + + return result; +} + +tree_node_list* forReduce(tree_for* loop, var_sym* reduceIndex, proc_symtab* proc_syms) +{ + //We did the replacements all at once with recursiveFindPreferedIdxs + //replacements r; + //r.oldsyms.append(loop->index()); + //r.newsyms.append(reduceIndex); + //tree_for* new_loop = (tree_for*)loop->clone_helper(&r, true); + tree_for* new_loop = loop; + + //return body one loops in + tree_node_list* tnl = loop_body_at_level(new_loop, 1); + //wrap in conditional if necessary + tnl = wrapInIfFromMinBound(tnl, new_loop, proc_syms, reduceIndex); + return tnl; +} + +void recursiveFindRefs(tree_node_list* code, proc_symtab* proc_syms, replacements* r) +{ + if(code->parent() && code->scope()->is_block()) + ((block_symtab*)code->scope())->find_exposed_refs(proc_syms, r); + tree_node_list_iter tnli(code); + while (!tnli.is_empty()) { + tree_node *node = tnli.step(); + //printf("node kind: %d\n", node->kind()); + if(node->is_instr()) + { + tree_instr* t_instr = (tree_instr*)node; + t_instr->find_exposed_refs(proc_syms, r); + } + if(node->is_block()){ + recursiveFindRefs(static_cast<tree_block *>(node)->body(), proc_syms, r); + } + else if(node->is_for()){ + tree_for* tn_for = static_cast<tree_for *>(node); + //Find refs in statemetns and body + tn_for->find_exposed_refs(proc_syms, r); + //recursiveFindRefs(tn_for->body(), proc_syms, r); + } + } +} + +tree_node_list* recursiveFindReplacePreferedIdxs(tree_node_list* code, proc_symtab* proc_syms, + proc_sym* cudaSync, func_type* unkown_func, + std::map<std::string, var_sym*>& loop_idxs) +{ + tree_node_list* tnl = new tree_node_list; + tree_node_list_iter tnli(code); + var_sym* idxSym=0; + bool sync = false; + std::vector<tree_node*> r1; + std::vector<tree_node_list*> r2; + while (!tnli.is_empty()) { + tree_node *node = tnli.step(); + //printf("node kind: %d\n", node->kind()); + if(node->is_instr()) + { + if(((tree_instr*)node)->instr()->format() == inf_rrr){ + in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr(); + if(inst->opcode() == io_mrk){ + std::string comment; + if ((inst->peek_annote(k_ocg_comment) != NULL)) + { + immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment)); + immed_list_iter data_iter(data); + if(!data_iter.is_empty()){ + immed first_immed = data_iter.step(); + if(first_immed.kind() == im_string) + comment = first_immed.string(); + } + } + if(comment.find("~cuda~") != std::string::npos + && comment.find("preferredIdx: ") != std::string::npos){ + std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos); + if(idx.find(" ") != std::string::npos) + idx = idx.substr(0,idx.find(" ")); + //printf("sym_tab preferred index: %s\n", idx.c_str()); + if(loop_idxs.find(idx) != loop_idxs.end()) + idxSym = loop_idxs.find(idx)->second; + //Get the proc variable sybol for this preferred index + if(idxSym == 0){ + idxSym = (var_sym*)proc_syms->lookup_sym(idx.c_str(), SYM_VAR, false); + //printf("idx not found: lookup %p\n", idxSym); + if(!idxSym){ + idxSym = new var_sym(type_s32, (char*)idx.c_str()); + proc_syms->add_sym(idxSym); + //printf("idx created and inserted\n"); + } + //Now insert into our map for future + loop_idxs.insert(make_pair(idx, idxSym)); + } + //See if we have a sync as well + if(comment.find("sync") != std::string::npos){ + //printf("Inserting sync after current block\n"); + sync = true; + } + } + } + } + tnl->append(node); + } + else if(node->is_block()){ + tree_block* b = static_cast<tree_block *>(node); + b->set_body(recursiveFindReplacePreferedIdxs(b->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); + tnl->append(b); + } + else if(node->is_for()){ + tree_for* tn_for = static_cast<tree_for *>(node); + if(idxSym){ + //Replace the current tn_for's index variable with idxSym + //printf("replacing sym %s -> %s\n", tn_for->index()->name(), idxSym->name()); + replacements r; + r.oldsyms.append(tn_for->index()); + r.newsyms.append(idxSym); + tree_for* new_loop = (tree_for*)tn_for->clone_helper(&r, true); + idxSym = 0; //Reset for more loops in this tnl + new_loop->set_body(recursiveFindReplacePreferedIdxs(new_loop->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); + tnl->append(new_loop); + + if(sync){ + in_cal *the_call = + new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaSync))), 0); + tnl->append(new tree_instr(the_call)); + //tnl->print(); + sync = true; + } + }else{ + tn_for->set_body(recursiveFindReplacePreferedIdxs(tn_for->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); + tnl->append(tn_for); + } + }else if (node->kind() == TREE_IF) { + tree_if *tni = static_cast<tree_if *>(node); + tni->set_then_part(recursiveFindReplacePreferedIdxs(tni->then_part(), proc_syms, cudaSync, unkown_func, loop_idxs)); + tnl->append(tni); + } + } + //Do this after the loop to not screw up the pointer interator + /* + for(int i=0; i<r1.size(); i++){ + swap_node_for_node_list(r1[i],r2[i]); + }*/ + return tnl; +} + +// loop_vars -> array references +// loop_idxs -> <idx_name,idx_sym> map for when we encounter a loop with a different preferredIndex +// dim_vars -> out param, fills with <old,new> var_sym pair for 2D array dimentions (messy stuff) +tree_node_list* swapVarReferences(tree_node_list* code, replacements* r, CG_suifBuilder *ocg, + std::map<std::string, var_sym*>& loop_vars, + proc_symtab *proc_syms, + std::vector< std::pair<var_sym*,var_sym*> >& dim_vars) +{ + //Iterate over every expression, looking up each variable and type + //reference used and possibly replacing it or adding it to our symbol + //table + // + //We use the built-in cloning helper methods to seriously help us with this! + + //Need to do a recursive mark + recursiveFindRefs(code, proc_syms, r); + + + //We can't rely on type_node->clone() to do the heavy lifting when the + //old type is a two dimentional array with variable upper bounds as + //that requires creating and saveing variable references to the upper + //bounds. So we do one pass over the oldtypes doing this type of + //conversion, putting results in the fixed_types map for a second pass + //to pick up. + std::map<type_node*,type_node*> fixed_types; //array_types needing their upper bound installed + type_node_list_iter tlip(&r->oldtypes); + while(!tlip.is_empty()) + { + type_node* old_tn = tlip.step(); + type_node* new_tn = 0; + type_node* base_type = old_tn; + std::vector< std::pair<var_sym*, type_node*> > variable_upper_bouneds; + if(old_tn->is_ptr()){ + while (base_type->is_array() || base_type->is_ptr()) { + if (base_type->is_array()){ + array_bound ub = ((array_type*)base_type)->upper_bound(); + if(ub.is_variable()){ + var_sym* old_ub = (var_sym*)ub.variable(); + var_sym *new_ub = proc_syms->new_unique_var(type_s32); + dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub)); + variable_upper_bouneds.push_back( std::pair<var_sym*, type_node*>(new_ub, base_type) ); + } + base_type = static_cast<array_type *>(base_type)->elem_type(); + } + else if (base_type->is_ptr()) + base_type = static_cast<ptr_type *>(base_type)->ref_type(); + } + } + for (int i = variable_upper_bouneds.size()-1; i >= 0; i--) { + var_sym *var_ub = variable_upper_bouneds[i].first; + type_node* old_tn = variable_upper_bouneds[i].second; + if(new_tn == 0) + new_tn = new array_type(base_type, array_bound(1), array_bound(var_ub)); + else + new_tn = new array_type(new_tn, array_bound(1), array_bound(var_ub)); + proc_syms->add_type(new_tn); + fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn)); + } + if(new_tn){ + if(old_tn->is_ptr()){ + new_tn = new ptr_type(new_tn); + proc_syms->add_type(new_tn); + } + fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn)); + } + } + + //Quickly look for modifiers on our our array types (__shared__ float [][]) + type_node_list_iter tliq(&r->oldtypes); + while(!tliq.is_empty()) + { + type_node* old_tn = tliq.step(); + if(old_tn->is_modifier()){ + type_node* base_type = static_cast<modifier_type *>(old_tn)->base(); + if(fixed_types.find(base_type) != fixed_types.end()){ + type_node* fixed_base = (*fixed_types.find(base_type)).second; + //printf("Fix modifier with fixed base\n"); + //This should work to copy over the annotations, but apparently doesn't work so well + type_node* new_tn = new modifier_type(static_cast<modifier_type*>(old_tn)->op(), fixed_base); + old_tn->copy_annotes(new_tn); + fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn)); + } + } + } + + //Run through the types and create entries in r->newtypes but don't install + type_node_list_iter tli(&r->oldtypes); + while(!tli.is_empty()) + { + type_node* old_tn = tli.step(); + type_node* new_tn = 0; + + //If we recorded this as fixed by our special case, use that type + //instead of cloning. + if(fixed_types.find(old_tn) != fixed_types.end()){ + new_tn = (*fixed_types.find(old_tn)).second; + //printf("Reusing fixed typ %u: ", new_tn->type_id()); + }else{ + new_tn = old_tn->clone(); + //printf("Cloning type %u: ", old_tn->type_id()); + } + new_tn = proc_syms->install_type(new_tn); + + //Ok, there is a weird case where an array type that has var_sym as + //their upper bounds can't be covered fully in this loop or the + //var_sym loop, so we need special code. + /* + if(old_tn->op() == TYPE_PTR && ((ptr_type*)old_tn)->ref_type()->op() == TYPE_ARRAY){ + array_type* outer_array = (array_type*)((ptr_type*)old_tn)->ref_type(); + array_bound ub = outer_array->upper_bound(); + if(ub.is_variable()){ + var_sym* old_ub = (var_sym*)ub.variable(); + var_sym* new_ub = (var_sym*)((array_type*)((ptr_type*)new_tn)->ref_type())->upper_bound().variable(); + //r->oldsyms.append(old_ub); + fix_ub.insert(std::pair<var_sym*,array_type*>(old_ub, (array_type*)((ptr_type*)new_tn)->ref_type())); + dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub)); + printf("array var_sym: %p\n", new_ub); + } + if(outer_array->elem_type()->op() == TYPE_ARRAY) + { + array_type* inner_array = (array_type*)outer_array->elem_type(); + array_bound ub = inner_array->upper_bound(); + if(ub.is_variable()){ + var_sym* old_ub = (var_sym*)ub.variable(); + var_sym* new_ub = (var_sym*)((array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())->upper_bound().variable(); + dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub)); + printf("array var_sym: %p\n", new_ub); + //r->oldsyms.append(old_ub); + fix_ub.insert(std::pair<var_sym*,array_type*>(old_ub, (array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())); + } + } + } + */ + r->newtypes.append(new_tn); + } + + //printf("proc_syms symbol run through\n"); + //proc_syms->print(); + + //Run through the syms creating new copies + sym_node_list_iter snli(&r->oldsyms); + while(!snli.is_empty()) + { + sym_node *old_sn = snli.step(); + + if(loop_vars.count(std::string(old_sn->name())) > 0) + { + r->newsyms.append(loop_vars[std::string(old_sn->name())]); + //printf("def exists: %s\n", old_sn->name()); + }else{ + sym_node *new_sn = old_sn->copy(); + if(new_sn->is_var()){ + var_sym* var = (var_sym*)new_sn; + type_node* new_type = var->type()->clone_helper(r); + + //TODO: Have a tagged list of variables to make shared + //Make local 2D arrays __shared__ + if(new_type->op() == TYPE_ARRAY && ((array_type*)new_type)->elem_type()->op() == TYPE_ARRAY){ + //protonu--changes suggested by Malik + //printf("Adding __shared__ annotation to : %s\n", new_sn->name()); + //new_type = ocg->ModifyType(new_type, "__shared__"); + //proc_syms->add_type(new_type); + } + var->set_type(new_type); + } + proc_syms->add_sym(new_sn); + r->newsyms.append(new_sn); + //printf("def new: %s\n", new_sn->name()); + } + } + + //printf("proc_syms var runthrough\n"); + //proc_syms->print(); + return code->clone_helper(r); +} + +bool LoopCuda::validIndexes(int stmt, const std::vector<std::string>& idxs){ + for(int i=0; i<idxs.size(); i++){ + bool found = false; + for(int j=0; j<idxNames[stmt].size(); j++){ + if(strcmp(idxNames[stmt][j].c_str(), idxs[i].c_str()) == 0){ + found=true; + } + } + if(!found){ + return false; + } + } + return true; +} + + +bool LoopCuda::cudaize_v2(std::string kernel_name, std::map<std::string, int> array_dims, + std::vector<std::string> blockIdxs, std::vector<std::string> threadIdxs) +{ + int stmt_num = 0; + if(cudaDebug){ + printf("cudaize_v2(%s, {", kernel_name.c_str()); + //for( + printf("}, blocks={"); printVs(blockIdxs); printf("}, thread={"); printVs(threadIdxs); printf("})\n"); + } + + this->array_dims = array_dims; + if(!validIndexes(stmt_num, blockIdxs)){ + throw std::runtime_error("One of the indexes in the block list was not " + "found in the current set of indexes."); + } + if(!validIndexes(stmt_num, threadIdxs)){ + throw std::runtime_error("One of the indexes in the thread list was not " + "found in the current set of indexes."); + } + if(blockIdxs.size() ==0) + throw std::runtime_error("Cudaize: Need at least one block dimention"); + int block_level=0; + //Now, we will determine the actual size (if possible, otherwise + //complain) for the block dimentions and thread dimentions based on our + //indexes and the relations for our stmt; + for(int i=0; i<blockIdxs.size(); i++){ + int level = findCurLevel(stmt_num, blockIdxs[i]); + int ub,lb; + extractCudaUB(stmt_num,level,ub,lb); + if(lb!= 0){ + //attempt to "normalize" the loop with an in-place tile and then re-check our bounds + if(cudaDebug) printf("Cudaize: doing tile at level %d to try and normalize lower bounds\n", level); + tile(stmt_num,level,1,level,CountedTile); + idxNames[stmt_num].insert(idxNames[stmt_num].begin()+(level),"");//TODO: possibly handle this for all sibling stmts + extractCudaUB(stmt_num,level,ub,lb); + } + if(lb != 0){ + char buf[1024]; + sprintf(buf, "Cudaize: Loop at level %d does not have 0 as it's lower bound", level); + throw std::runtime_error(buf); + } + if(ub < 0){ + char buf[1024]; + sprintf(buf, "Cudaize: Loop at level %d does not have a hard upper bound", level); + throw std::runtime_error(buf); + } + if(cudaDebug) printf("block idx %s level %d lb: %d ub %d\n", blockIdxs[i].c_str(), level, lb, ub); + if(i == 0){ + block_level = level; + cu_bx = ub+1; + idxNames[stmt_num][level-1] = "bx"; + } + else if(i == 1){ + cu_by = ub+1; + idxNames[stmt_num][level-1] = "by"; + } + } + if(!cu_by) + block_level=0; + int thread_level1 = 0; + int thread_level2 = 0; + for(int i=0; i<threadIdxs.size(); i++){ + int level = findCurLevel(stmt_num, threadIdxs[i]); + int ub,lb; + extractCudaUB(stmt_num,level,ub,lb); + if(lb!= 0){ + //attempt to "normalize" the loop with an in-place tile and then re-check our bounds + if(cudaDebug) printf("Cudaize: doing tile at level %d to try and normalize lower bounds\n", level); + tile(stmt_num,level,1,level,CountedTile); + idxNames[stmt_num].insert(idxNames[stmt_num].begin()+(level),""); + extractCudaUB(stmt_num,level,ub,lb); + } + if(lb != 0){ + char buf[1024]; + sprintf(buf, "Cudaize: Loop at level %d does not have 0 as it's lower bound", level); + throw std::runtime_error(buf); + } + if(ub < 0){ + char buf[1024]; + sprintf(buf, "Cudaize: Loop at level %d does not have a hard upper bound", level); + throw std::runtime_error(buf); + } + + if(cudaDebug) printf("thread idx %s level %d lb: %d ub %d\n", threadIdxs[i].c_str(), level, lb, ub); + if(i == 0){ + thread_level1 = level; + cu_tx = ub+1; + idxNames[stmt_num][level-1] = "tx"; + } + else if(i == 1){ + thread_level2 = level; + cu_ty = ub+1; + idxNames[stmt_num][level-1] = "ty"; + } + else if(i == 2){ + cu_tz = ub+1; + idxNames[stmt_num][level-1] = "tz"; + } + } + if(!cu_ty) + thread_level1 = 0; + if(!cu_tz) + thread_level2 = 0; + + //Make changes to nonsplitlevels + const int m = stmt.size(); + for (int i = 0; i < m; i++) { + if(block_level){ + //stmt[i].nonSplitLevels.append((block_level)*2); + stmt_nonSplitLevels[i].append((block_level)*2); + } + if(thread_level1){ + //stmt[i].nonSplitLevels.append((thread_level1)*2); + stmt_nonSplitLevels[i].append((thread_level1)*2); + } + if(thread_level2){ + //stmt[i].nonSplitLevels.append((thread_level1)*2); + stmt_nonSplitLevels[i].append((thread_level1)*2); + } + } + + if(cudaDebug) { + printf("Codegen: current names: "); + printVS(idxNames[stmt_num]); + } + //Set codegen flag + code_gen_flags |= GenCudaizeV2; + + //Save array dimention sizes + this->array_dims = array_dims; + cu_kernel_name = kernel_name.c_str(); + +} + +tree_node_list* LoopCuda::cudaize_codegen_v2() +{ + //printf("cudaize codegen V2\n"); + CG_suifBuilder *ocg = dynamic_cast<CG_suifBuilder*>(ir->builder()); + if(!ocg) return false; + + //protonu--adding an annote to track texture memory type + ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE); + ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE); + int tex_mem_on = 0; + int cons_mem_on = 0; + + + + CG_outputRepr* repr; + std::vector<VarDefs> arrayVars; + std::vector<VarDefs> localScopedVars; + + std::vector<IR_ArrayRef *> ro_refs; + std::vector<IR_ArrayRef *> wo_refs; + std::set<std::string> uniqueRefs; + std::set<std::string> uniqueWoRefs; + //protonu--let's try a much simpler approach of a map instead + //we also keep a map for constant memories + std::map<std::string , var_sym *>tex_ref_map; + std::map<std::string , var_sym *>cons_ref_map; + + for(int j=0; j<stmt.size(); j++) + { + std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[j].code); + for (int i = 0; i < refs.size(); i++) + { + //printf("ref %s wo %d\n", static_cast<const char*>(refs[i]->name()), refs[i]->is_write()); + var_sym* var = symtab->lookup_var((char*)refs[i]->name().c_str(),false); + //If the array is not a parameter, then it's a local array and we + //want to recreate it as a stack variable in the kernel as opposed to + //passing it in. + if(!var->is_param()) + continue; + if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end()) + { + uniqueRefs.insert(refs[i]->name()); + if(refs[i]->is_write()){ + uniqueWoRefs.insert(refs[i]->name()); + wo_refs.push_back(refs[i]); + } + else + ro_refs.push_back(refs[i]); + } + if (refs[i]->is_write() && uniqueWoRefs.find(refs[i]->name()) == uniqueWoRefs.end()){ + uniqueWoRefs.insert(refs[i]->name()); + wo_refs.push_back(refs[i]); + //printf("adding %s to wo\n", static_cast<const char*>(refs[i]->name())); + } + } + } + + // printf("reading from array "); + // for(int i=0; i<ro_refs.size(); i++) + // printf("'%s' ", ro_refs[i]->name().c_str()); + // printf("and writting to array "); + // for(int i=0; i<wo_refs.size(); i++) + // printf("'%s' ", wo_refs[i]->name().c_str()); + // printf("\n"); + + const char* gridName = "dimGrid"; + const char* blockName = "dimBlock"; + + //TODO: Could allow for array_dims_vars to be a mapping from array + //references to to variable names that define their length. + var_sym* dim1 = 0; + var_sym* dim2 = 0; + + for(int i=0; i<wo_refs.size(); i++) + { + //TODO: Currently assume all arrays are floats of one or two dimentions + var_sym* outArray = 0; + std::string name = wo_refs[i]->name(); + outArray = symtab->lookup_var((char*)name.c_str(),false); + + VarDefs v; + v.size_2d = -1; + char buf[32]; + snprintf(buf, 32, "devO%dPtr", i+1); + v.name = buf; + if(outArray->type()->is_ptr()) + if(((ptr_type *)(outArray->type()))->ref_type()->is_array()) + v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type(); + else + v.type = ((ptr_type *)(outArray->type()))->ref_type(); + else + v.type = type_f32; + v.tex_mapped = false; + v.cons_mapped = false; + v.original_name = wo_refs[i]->name(); + //Size of the array = dim1 * dim2 * num bytes of our array type + + //If our input array is 2D (non-linearized), we want the actual + //dimentions of the array + CG_outputRepr* size; + //Lookup in array_dims + std::map<std::string, int>::iterator it = array_dims.find(name.c_str()); + if(outArray->type()->is_ptr() && outArray->type()->ref_type(0)->is_array()) + { + array_type* t = (array_type*)outArray->type()->ref_type(0); + v.size_2d = t->upper_bound().constant()+1; + printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)wo_refs[i]->name().c_str()); + size = ocg->CreateInt(v.size_2d * v.size_2d); + }else if(it != array_dims.end()){ + int ref_size = it->second; + v.var_ref_size = ref_size; + size = ocg->CreateInt(ref_size); + } + else{ + if(dim1){ + size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)), + new CG_suifRepr(operand(dim2))); + }else{ + char buf[1024]; + sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a " + "detectable size or specififed dimentions", name.c_str()); + throw std::runtime_error(buf); + } + } + v.size_expr = operand(static_cast<CG_suifRepr*>(ocg->CreateTimes( + size, + ocg->CreateInt(v.type->size()/8)))->GetExpression()); + v.in_data = 0; + v.out_data = outArray; + //Check for in ro_refs and remove it at this point + std::vector<IR_ArrayRef *>::iterator it_; + for(it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++) + { + if((*it_)->name() == wo_refs[i]->name()){ + break; + } + } + if(it_ != ro_refs.end()) + { + v.in_data = outArray; + ro_refs.erase(it_); + } + + arrayVars.push_back(v); + + } + + //protonu-- assuming that all texture mapped memories were originally read only mems + //there should be safety checks for that, will implement those later + + int cs_ref_size = 0; + + for(int i=0; i<ro_refs.size(); i++) + { + var_sym* inArray = 0; + std::string name = ro_refs[i]->name(); + inArray = symtab->lookup_var((char*)name.c_str(),false); + VarDefs v; + v.size_2d = -1; + char buf[32]; + snprintf(buf, 32, "devI%dPtr", i+1); + v.name = buf; + if(inArray->type()->is_ptr()) + if(((ptr_type *)(inArray->type()))->ref_type()->is_array()) + v.type = ((array_type *)(((ptr_type *)(inArray->type()))->ref_type()))->elem_type(); + else + v.type = ((ptr_type *)(inArray->type()))->ref_type(); + else + v.type = type_f32; + v.tex_mapped = false; + v.cons_mapped = false; + v.original_name = ro_refs[i]->name(); + if ( texture != NULL) + v.tex_mapped = (texture->is_array_tex_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars + if (v.tex_mapped){ + printf("this variable %s is mapped to texture memory", name.c_str()); + } + if ( constant_mem != NULL) + v.cons_mapped = (constant_mem->is_array_cons_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars + if (v.cons_mapped){ + printf("this variable %s is mapped to constant memory", name.c_str()); + } + + //Size of the array = dim1 * dim2 * num bytes of our array type + + //If our input array is 2D (non-linearized), we want the actual + //dimentions of the array (as it might be less than cu_n + CG_outputRepr* size; + //Lookup in array_dims + std::map<std::string, int>::iterator it = array_dims.find(name.c_str()); + int ref_size = 0; + if(inArray->type()->is_ptr() && inArray->type()->ref_type(0)->is_array()) + { + array_type* t = (array_type*)inArray->type()->ref_type(0); + v.size_2d = t->upper_bound().constant()+1; + printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)ro_refs[i]->name().c_str()); + size = ocg->CreateInt(v.size_2d * v.size_2d); + }else if(it != array_dims.end()){ + ref_size = it->second; + v.var_ref_size = ref_size; + size = ocg->CreateInt(ref_size); + }else{ + if(dim1){ + size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)), + new CG_suifRepr(operand(dim2))); + }else{ + char buf[1024]; + sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a " + "detectable size or specififed dimentions", name.c_str()); + throw std::runtime_error(buf); + } + } + + + + v.size_expr = operand(static_cast<CG_suifRepr*>(ocg->CreateTimes( + size, + ocg->CreateInt(v.type->size()/8)))->GetExpression()); + + v.in_data = inArray; + v.out_data = 0; + arrayVars.push_back(v); + } + + + if(arrayVars.size() < 2) + { + fprintf(stderr, "cudaize error: Did not find two arrays being accessed\n"); + return false; + } + + //protonu--debugging tool--the printf statement + //tex_mem_on signals use of tex mem + for(int i=0; i<arrayVars.size(); i++) + { + //printf("var name %s, tex_mem used %s\n", arrayVars[i].name.c_str(), (arrayVars[i].tex_mapped)?"true":"false"); + if (arrayVars[i].tex_mapped ) tex_mem_on ++; + if (arrayVars[i].cons_mapped ) cons_mem_on ++; + } + + //Add CUDA function extern prototypes and function types + func_type* unkown_func = new func_type(type_s32); //function on unkown args that returns a i32 + unkown_func = (func_type*)symtab->install_type(unkown_func); + func_type* void_func = new func_type(type_void); //function on unkown args that returns a void + void_func = (func_type*)globals->install_type(void_func); + func_type* float_func = new func_type(type_f32); //function on unkown args that returns a float + float_func = (func_type*)globals->install_type(float_func); + + type_node* result = ocg->ModifyType(type_void, "__global__"); + result = globals->install_type(result); + func_type* kernel_type = new func_type(result); //function returns a '__global__ void' + + int numArgs = arrayVars.size() + (dim1 ? 2 : 0) + localScopedVars.size(); + //protonu--need to account for texture memory here, reduce the #args + if( tex_mem_on ) numArgs -= tex_mem_on; + if( cons_mem_on ) numArgs -= cons_mem_on; + kernel_type->set_num_args(numArgs); + int argCount = 0; + for(int i=0; i<arrayVars.size(); i++) + { + type_node* fptr; + if(arrayVars[i].in_data) + fptr = arrayVars[i].in_data->type()->clone(); + else + fptr = arrayVars[i].out_data->type()->clone(); + //protonu--skip this for texture mems + if( arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true ) + kernel_type->set_arg_type(argCount++, fptr); + } + if(dim1){ + kernel_type->set_arg_type(argCount++, type_s32); //width x height dimentions + kernel_type->set_arg_type(argCount++, type_s32); + } + kernel_type = (func_type*)globals->install_type(kernel_type); + + proc_sym* cudaMalloc = globals->new_proc(unkown_func, src_c, "cudaMalloc"); + proc_sym* cudaMemcpy = globals->new_proc(unkown_func, src_c, "cudaMemcpy"); + proc_sym* cudaFree = globals->new_proc(unkown_func, src_c, "cudaFree"); + proc_sym* cudaSync = globals->new_proc(void_func, src_c, "__syncthreads"); + proc_sym* cudaBind = globals->new_proc(unkown_func, src_c, "cudaBindTexture"); + proc_sym* cudaMemcpySym = globals->new_proc(unkown_func, src_c, "cudaMemcpyToSymbol"); + + + //protonu-removing Gabe's function, introducing mine, this is pretty cosmetic + //proc_sym* cudaFetch = globals->new_proc(float_func, src_c, "tex1Dfetch"); + proc_sym* tex1D = globals->new_proc(float_func, src_c, "tex1Dfetch"); + + var_sym *cudaMemcpyHostToDevice = new var_sym(type_s32, "cudaMemcpyHostToDevice"); + var_sym *cudaMemcpyDeviceToHost = new var_sym(type_s32, "cudaMemcpyDeviceToHost"); + cudaMemcpyDeviceToHost->set_param(); + cudaMemcpyHostToDevice->set_param(); + globals->add_sym(cudaMemcpyHostToDevice); + globals->add_sym(cudaMemcpyDeviceToHost); + + //protonu--adding the bool tex_mem to the structure struct_type + //to bypass the re-naming of struct texture, this is a hack fix + struct_type* texType = new struct_type(TYPE_GROUP, 0, "texture<float, 1, cudaReadModeElementType>", 0, true); + immed_list *iml_tex = new immed_list; + iml_tex->append(immed("texture memory")); + texType->append_annote(k_cuda_texture_memory, iml_tex); + //protonu--end my changes + texType = (struct_type*)globals->install_type(texType); + //protonu--should register the locals later on + //when we do the bind operation + //var_sym* texRef = new var_sym(texType, "texRef"); + //globals->add_sym(texRef); + + //Add our mallocs (and input array memcpys) + for(int i=0; i<arrayVars.size(); i++) + { + //protonu--check if the variable is not a tex-mapped variable. If it is tex mapped + // allow a malloc and memcpy operation, and a bind, but only if it is tex mapped, but dont call + // the kernel with it as an argument. + + //Make a pointer of type a[i].type + //type_node* fptr = new ptr_type(arrayVars[i].type->clone()); + //protonu--temporary change + type_node* fptr = new ptr_type(arrayVars[i].type); + fptr = symtab->install_type(fptr); + var_sym *dvs = new var_sym(fptr, const_cast<char*>( + arrayVars[i].name.c_str())); + dvs->set_addr_taken(); + symtab->add_sym(dvs); + + //cudaMalloc args + //protonu--no cudaMalloc required for constant memory + tree_node_list* tnl = new tree_node_list; + if(arrayVars[i].cons_mapped != true ) + { + in_cal *the_call = + new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMalloc))), 2); + the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to()->ptr_to(), operand(), immed(dvs)))); + the_call->set_argument(1, arrayVars[i].size_expr); + + tnl->append(new tree_instr(the_call)); + setup_code = ocg->StmtListAppend(setup_code, + new CG_suifRepr(tnl)); + } + if(arrayVars[i].in_data) + { + //cudaMemcpy args + //protonu-- no cudaMemcpy required for constant memory + if ( arrayVars[i].cons_mapped != true ) + { + in_cal *the_call = + new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4); + the_call->set_argument(0, operand(dvs)); + the_call->set_argument(1, operand(arrayVars[i].in_data)); + the_call->set_argument(2, arrayVars[i].size_expr.clone()); + the_call->set_argument(3, operand(cudaMemcpyHostToDevice)); + + tnl = new tree_node_list; + tnl->append(new tree_instr(the_call)); + setup_code = ocg->StmtListAppend(setup_code, + new CG_suifRepr(tnl)); + } + + //protonu--check if the arrayvar is tex mapped + if(arrayVars[i].tex_mapped == true) + { + //Need a texture reference variable + char buf[32]; + snprintf(buf, 32, "tex%dRef", i+1); + arrayVars[i].secondName = buf; + + var_sym* texRef = new var_sym(texType, buf); + //printf("\n putting in %s\n", arrayVars[i].original_name.c_str()); + tex_ref_map[arrayVars[i].original_name] = texRef; + globals->add_sym(texRef); + //protonu--added the above two lines + + in_cal *the_call = + new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaBind))), 4); + in_ldc *ins = new in_ldc(type_s32, operand(), immed(0)); + the_call->set_argument(0, operand(ins)); + the_call->set_argument(1, operand(texRef));//protonu--change to add the new sym + the_call->set_argument(2, operand(dvs)); + the_call->set_argument(3, arrayVars[i].size_expr.clone()); + + tnl = new tree_node_list; + tnl->append(new tree_instr(the_call)); + setup_code = ocg->StmtListAppend(setup_code, + new CG_suifRepr(tnl)); + } + + //protonu--if arrayvar is mapped to constant memory + if(arrayVars[i].cons_mapped == true) + { + char buf[32]; + snprintf(buf, 32, "cs%dRef", i+1); + //arrayVars[i].secondName = buf; + array_bound low (0); + array_bound high (arrayVars[i].var_ref_size -1); + array_type *arr = new array_type(arrayVars[i].type,low, high); + type_node* cons_arr = ocg->ModifyType(arr, "__device__ __constant__"); + cons_arr = globals->install_type(cons_arr); + var_sym* consRef = new var_sym(cons_arr, buf); + cons_ref_map[arrayVars[i].original_name] = consRef; + globals->add_sym(consRef); + + + + in_cal *the_call = + new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpySym))), 3); + the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to(), operand(), immed(consRef)))); + the_call->set_argument(1, operand(arrayVars[i].in_data)); + the_call->set_argument(2, arrayVars[i].size_expr.clone()); + + tnl = new tree_node_list; + tnl->append(new tree_instr(the_call)); + setup_code = ocg->StmtListAppend(setup_code, + new CG_suifRepr(tnl)); + + } + } + } + + //Build dimGrid dim3 variables based on loop dimentions and ti/tj + char blockD1[120]; + char blockD2[120]; + if(dim1){ + snprintf(blockD1, 120, "%s/%d", dim1->name(), cu_tx); + snprintf(blockD2, 120, "%s/%d", dim2->name(), cu_ty); + }else{ + snprintf(blockD1, 120, "%d", cu_bx); + snprintf(blockD2, 120, "%d", cu_by); + //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx); + //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty); + } + repr = ocg->CreateDim3(immed((char*)gridName), + immed(blockD1), + immed(blockD2)); + setup_code = ocg->StmtListAppend(setup_code, repr); + + repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx),immed(cu_ty)); + + if(cu_tz > 1) + repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty), immed(cu_tz)); + else + repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty)); + setup_code = ocg->StmtListAppend(setup_code, repr); + + //call kernel function with name loop_name + //like: transpose_k<<<dimGrid,dimBlock>>>(devOPtr, devIPtr , width, height); + char dims[120]; + snprintf(dims,120,"<<<%s,%s>>>",gridName, blockName); + immed_list *iml = new immed_list; + iml->append(immed((char*)cu_kernel_name.c_str())); + iml->append(immed(dims)); + //printf("%s %s\n", static_cast<const char*>(cu_kernel_name), dims); + for(int i=0; i<arrayVars.size(); i++) + //Throw in a type cast if our kernel takes 2D array notation + //like (float(*) [1024]) + { + //protonu--throwing in another hack to stop the caller from passing tex mapped + //vars to the kernel. + if(arrayVars[i].tex_mapped == true || arrayVars[i].cons_mapped == true ) + continue; + if(arrayVars[i].size_2d >= 0) + { + snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d, + const_cast<char*>(arrayVars[i].name.c_str())); + //printf("%d %s\n", i, dims); + iml->append(immed(dims)); + }else{ + //printf("%d %s\n", i, static_cast<const char*>(arrayVars[i].name)); + iml->append(immed(const_cast<char*>( + arrayVars[i].name.c_str()))); + } + } + if(dim1){ + iml->append(immed(dim1)); + iml->append(immed(dim2)); + } + repr = ocg->CreateKernel(iml);//kernel call + setup_code = ocg->StmtListAppend(setup_code, repr); + + //cuda free variables + for(int i=0; i<arrayVars.size(); i++) + { + if(arrayVars[i].out_data) + { + //cudaMemcpy args + in_cal *the_call = + new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4); + the_call->set_argument(0, operand(arrayVars[i].out_data)); + the_call->set_argument(1, operand(symtab->lookup_var(const_cast<char*>( + arrayVars[i].name.c_str())))); + the_call->set_argument(2, arrayVars[i].size_expr.clone()); + the_call->set_argument(3, operand(cudaMemcpyDeviceToHost)); + + tree_node_list* tnl = new tree_node_list; + tnl->append(new tree_instr(the_call)); + teardown_code = ocg->StmtListAppend(teardown_code, + new CG_suifRepr(tnl)); + } + + in_cal *the_call = + new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaFree))), 1); + the_call->set_argument(0, operand(symtab->lookup_var(const_cast<char*>( + arrayVars[i].name.c_str())))); + + tree_node_list* tnl = new tree_node_list; + tnl->append(new tree_instr(the_call)); + teardown_code = ocg->StmtListAppend(teardown_code, + new CG_suifRepr(tnl)); + } + + // --------------- + // BUILD THE KERNEL + // --------------- + + //Extract out kernel body + tree_node_list* code = getCode(); + //Get rid of wrapper if that original() added + if(code->head()->contents->kind() == TREE_IF) + { + tree_if* ifn = (tree_if*)code->head()->contents; + code = ifn->then_part(); + } + + //Create kernel function body + proc_sym *new_psym = globals->new_proc(kernel_type, src_c, (char*)cu_kernel_name.c_str()); + proc_symtab *new_proc_syms = new proc_symtab(new_psym->name()); + globals->add_child(new_proc_syms); + + //Add Params + std::map<std::string, var_sym*> loop_vars; + //In-Out arrays + type_node* fptr; + for(int i=0; i<arrayVars.size(); i++) + { + if(arrayVars[i].in_data) + //fptr = arrayVars[i].in_data->type()->clone(); + fptr = arrayVars[i].in_data->type(); + else + //fptr = arrayVars[i].out_data->type()->clone(); + fptr = arrayVars[i].out_data->type(); + fptr = new_proc_syms->install_type(fptr); + std::string name = arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name(); + var_sym* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name()); + //protonu--adding a check to ensure that texture memories are not passed in as arguments + if(arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true ) + { + sym->set_param(); + new_proc_syms->params()->append(sym); + new_proc_syms->add_sym(sym);//protonu--added to suppress the addition of the redundant var in the kernel + } + if (arrayVars[i].cons_mapped == true) + { + sym->set_param(); + new_proc_syms->add_sym(sym); + } + //printf("inserting name: %s\n", static_cast<const char*>(name)); + loop_vars.insert(std::pair<std::string, var_sym*>(std::string(name), sym)); + } + + if(dim1) + { + //Array dimentions + var_sym* kdim1 = new var_sym(dim1->type(), dim1->name()); + kdim1->set_param(); + new_proc_syms->add_sym(kdim1); + loop_vars.insert(std::pair<std::string, var_sym*>(std::string(dim1->name()), kdim1)); + var_sym* kdim2 = new var_sym(dim2->type(), dim2->name()); + kdim2->set_param(); + new_proc_syms->add_sym(kdim2); + loop_vars.insert(std::pair<std::string, var_sym*>(std::string(dim2->name()), kdim2)); + new_proc_syms->params()->append(kdim1); + new_proc_syms->params()->append(kdim2); + } + //Put block and thread implicit variables into scope + std::vector<var_sym *> index_syms; + /* Currently we don't use the block dimentions + var_sym* blockDim_x = new var_sym(type_s32, "blockDim.x"); + blockDim_x->set_param(); + new_proc_syms->add_sym(blockDim_x); + var_sym* blockDim_y = new var_sym(type_s32, "blockDim.y"); + blockDim_y->set_param(); + new_proc_syms->add_sym(blockDim_y); + */ + if(cu_bx > 1){ + var_sym* blockIdx_x = new var_sym(type_s32, "blockIdx.x"); + blockIdx_x->set_param(); + new_proc_syms->add_sym(blockIdx_x); + index_syms.push_back(blockIdx_x); + } + if(cu_by > 1){ + var_sym* blockIdx_y = new var_sym(type_s32, "blockIdx.y"); + blockIdx_y->set_param(); + new_proc_syms->add_sym(blockIdx_y); + index_syms.push_back(blockIdx_y); + } + if(cu_tx > 1){ + var_sym* threadIdx_x = new var_sym(type_s32, "threadIdx.x"); + threadIdx_x->set_param(); + new_proc_syms->add_sym(threadIdx_x); + index_syms.push_back(threadIdx_x); + } + if(cu_ty > 1){ + var_sym* threadIdx_y = new var_sym(type_s32, "threadIdx.y"); + threadIdx_y->set_param(); + new_proc_syms->add_sym(threadIdx_y); + index_syms.push_back(threadIdx_y); + } + + if(cu_tz > 1){ + var_sym* threadIdx_z = new var_sym(type_s32, "threadIdx.z"); + threadIdx_z->set_param(); + new_proc_syms->add_sym(threadIdx_z); + index_syms.push_back(threadIdx_z); + } + + //Figure out which loop variables will be our thread and block dimention variables + std::vector<var_sym *> loop_syms; + //Get our indexes + std::vector<const char*> indexes;// = get_loop_indexes(code,cu_num_reduce); + int threadsPos=0; + if(cu_bx > 1) + indexes.push_back("bx"); + if(cu_by > 1) + indexes.push_back("by"); + if(cu_tx > 1){ + threadsPos = indexes.size(); + indexes.push_back("tx"); + } + if(cu_ty > 1) + indexes.push_back("ty"); + if(cu_tz > 1) + indexes.push_back("tz"); + for(int i=0; i<indexes.size(); i++) + { + //printf("indexes[%d] = %s\n", i, (char*)indexes[i]); + loop_syms.push_back(new var_sym(type_s32, (char*)indexes[i])); + new_proc_syms->add_sym(loop_syms[i]); + //loop_vars.insert(std::pair<std::string, var_sym*>(std::string(indexes[i]), loop_syms[i])); + } + + //Generate this code + //int bx = blockIdx.x + //int by = blockIdx.y + //int tx = threadIdx.x + //int ty = threadIdx.y + CG_outputRepr *body=NULL; + for(int i=0; i<indexes.size(); i++){ + CG_outputRepr *lhs = new CG_suifRepr(operand(loop_syms[i])); + //body = ocg->StmtListAppend(body, ocg->CreateStmtList( + // ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))))); + body = ocg->StmtListAppend(body, ocg->StmtListAppend( + ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))), NULL)); + } + + //Get our inital code prepped for loop reduction. First we need to swap + //out internal SUIF variable references to point to the new local + //function symbol table. + std::map<std::string, var_sym*> loop_idxs; //map from idx names to their new syms + std::vector< std::pair<var_sym*, var_sym*> > dim_vars; //pair is of <old,new> var_sym (for 2D array size initializations) + replacements r; + tree_node_list* swapped = swapVarReferences(code, &r, ocg, loop_vars, new_proc_syms, dim_vars); + //printf("\n code before recursiveFindReplacePreferedIdxs :\n"); + //swapped->print(); + swapped = recursiveFindReplacePreferedIdxs(swapped, new_proc_syms, cudaSync, void_func, loop_idxs);//in-place swapping + //printf("\n code after recursiveFindReplacePreferedIdxs :\n"); + //swapped->print(); + + for(int i=0; i<indexes.size(); i++){ + std::vector<tree_for*> tfs = findCommentedFors(indexes[i], swapped); + for(int k=0; k<tfs.size(); k++){ + //printf("replacing %p tfs for index %s\n", tfs[k], indexes[i]); + tree_node_list* newBlock = forReduce(tfs[k], loop_idxs[indexes[i]], new_proc_syms); + //newBlock->print(); + swap_node_for_node_list(tfs[k], newBlock); + //printf("AFTER SWAP\n"); newBlock->print(); + } + } + //printf("AFTER REDUCE\n"); swapped->print(); + + if(static_cast<const IR_cudasuifCode *>(ir)->init_code()){ + tree_node_list* orig_init_code = static_cast<CG_suifRepr *>(static_cast<const IR_cudasuifCode *>(ir)->init_code())->GetCode(); + for(int i=0; i<dim_vars.size(); i++){ + //We have a map of var_sym from the original function body and we know + //that these var_syms have initialization statements which define the + //array size. We need to mimic these initialization statements. + + //First find the assignment and pull out the constant initialization + //value + int value = -1; + tree_node_list_iter tnli(orig_init_code); + while (!tnli.is_empty()) { + tree_node *node = tnli.step(); + if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr) + { + in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr(); + //expect the structure: cpy( _ = min(grab_me, _)) + if(inst->opcode() == io_cpy && inst->dst_op().is_symbol()){ + //printf("looking at instruction: "); + //inst->print(); + var_sym* dest = inst->dst_op().symbol(); + if(dest == dim_vars[i].first) + { + if(inst->src1_op().is_instr() && inst->src1_op().instr()->format() == inf_ldc){ + value = ((in_ldc*)inst->src1_op().instr())->value().integer(); + } + } + } + } + } + if(value < 0){ + fprintf(stderr, "ERROR: Could not find initializing statement for variable used in upper_bound of array type"); + } + CG_outputRepr *lhs = new CG_suifRepr(operand(dim_vars[i].second)); + //body = ocg->StmtListAppend(body, ocg->CreateStmtList(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)))); + body = ocg->StmtListAppend(body, ocg->StmtListAppend(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)), NULL)); + } + } + + + body = ocg->StmtListAppend(body, new CG_suifRepr(swapped)); + + //protonu--lets try creating our function definiton here + var_sym *tsym = NULL; + + + std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(body); + for(int i=0; i<refs.size(); i++) + { + //check if the array is tex mapped + if(texture != NULL && texture->is_array_tex_mapped(refs[i]->name().c_str())) + { + //protonu--our new tex lookup function + in_cal *tex_lookup = + new in_cal(type_f32, operand(), operand(new in_ldc(float_func->ptr_to(), operand(), immed(tex1D))), 2); + + //printf("name of the array to be mapped is %s\n", refs[i]->name().c_str()); + tsym = tex_ref_map[refs[i]->name()]; + tex_lookup->set_argument(0, operand(tsym)); + + + int array_dims = ((IR_suifArrayRef *)refs[i])->ia_->dims(); + + if (array_dims == 1){ + tex_lookup->set_argument(1, ((IR_suifArrayRef *)refs[i])->ia_->index(0).clone()); + }else if (array_dims > 2) { + printf(" \n we don't handle more than 2D arrays mapped to textures yet\n"); + }else if (array_dims == 2) { + + IR_ArraySymbol *sym = refs[i]->symbol(); + CG_outputRepr *sz = sym->size(1); + delete sym; // free the wrapper object only + // find the builder ocg + CG_outputRepr *expr = ocg->CreateTimes(sz->clone(),refs[i]->index(0)); + delete sz; // free the wrapper object only + expr = ocg->CreatePlus(expr, refs[i]->index(1)); + // expr holds the 1D access expression and take it out + tex_lookup->set_argument(1, ((CG_suifRepr *)expr)->GetExpression()); + } + + //using chun's function to replace the array look up with the function call + ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(tex_lookup))); + } + + } + + + tsym = NULL; + //protonu--now let's try what we did above for constant memory + for(int i=0; i<refs.size(); i++) + { + //check if the array is tex mapped + if(constant_mem != NULL && constant_mem->is_array_cons_mapped(refs[i]->name().c_str())) + { + + //printf("name of the array to be cons mapped is %s\n", refs[i]->name().c_str()); + tsym = cons_ref_map[refs[i]->name()]; + //we should create a IR_SuifArray here + IR_ArraySymbol *ar_sym = new IR_suifArraySymbol(ir,tsym); + std::vector<CG_outputRepr *> ar_index; + ar_index.push_back(((IR_suifArrayRef *)refs[i])->index(0)); + IR_ArrayRef *ar_ref = ((IR_suifCode *)ir)->CreateArrayRef(ar_sym, ar_index); + //using chun's function to replace the array look up with the function call + ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(((IR_suifArrayRef *)ar_ref)->ia_))); + + } + } + + + tree_proc *new_body = new tree_proc(static_cast<CG_suifRepr*>(body)->GetCode(), new_proc_syms); + //globals->add_child(new_proc_syms); + new_psym->set_block(new_body); + new_procs.push_back(new_psym); + + return swapped; +} + +//Order taking out dummy variables +std::vector<std::string> cleanOrder(std::vector<std::string> idxNames){ + std::vector<std::string> results; + for(int j=0; j<idxNames.size(); j++){ + if(idxNames[j].length() != 0) + results.push_back(idxNames[j]); + } + return results; +} + +//First non-dummy level in ascending order +int LoopCuda::nonDummyLevel(int stmt, int level){ + //level comes in 1-basd and should leave 1-based + for(int j=level-1; j<idxNames[stmt].size(); j++){ + if(idxNames[stmt][j].length() != 0){ + //printf("found non dummy level of %d with idx: %s when searching for %d\n", j+1, (const char*) idxNames[stmt][j], level); + return j+1; + } + } + char buf[128]; sprintf(buf, "%d", level); + throw std::runtime_error(std::string("Unable to find a non-dummy level starting from ") + std::string(buf)); +} + +int LoopCuda::findCurLevel(int stmt, std::string idx){ + for(int j=0; j<idxNames[stmt].size(); j++){ + if(strcmp(idxNames[stmt][j].c_str(),idx.c_str()) == 0) + return j+1; + } + throw std::runtime_error(std::string("Unable to find index ") + idx + std::string(" in current list of indexes")); +} + +void LoopCuda::permute_cuda(int stmt, const std::vector<std::string>& curOrder) +{ + //printf("curOrder: "); + //printVs(curOrder); + //printf("idxNames: "); + //printVS(idxNames[stmt]); + std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt]); + bool same=true; + std::vector<int> pi; + for(int i=0; i<curOrder.size(); i++){ + bool found = false; + for(int j=0; j<cIdxNames.size(); j++){ + if(strcmp(cIdxNames[j].c_str(), curOrder[i].c_str()) == 0){ + pi.push_back(j+1); + found=true; + if(j!=i) + same=false; + } + } + if(!found){ + throw std::runtime_error("One of the indexes in the permute order where not " + "found in the current set of indexes."); + } + } + for(int i=curOrder.size(); i<cIdxNames.size(); i++){ + pi.push_back(i); + } + if(same) + return; + permute(stmt, pi); + //Set old indexe names as new + for(int i=0; i<curOrder.size(); i++){ + idxNames[stmt][i] = curOrder[i].c_str(); //what about sibling stmts? + } +} + + +bool LoopCuda::permute(int stmt_num, const std::vector<int> &pi) +{ +// check for sanity of parameters + if (stmt_num >= stmt.size() || stmt_num < 0) + throw std::invalid_argument("invalid statement " + to_string(stmt_num)); + const int n = stmt[stmt_num].xform.n_out(); + if (pi.size() > (n-1)/2) + throw std::invalid_argument("iteration space dimensionality does not match permute dimensionality"); + int first_level = 0; + int last_level = 0; + for (int i = 0; i < pi.size(); i++) { + if (pi[i] > (n-1)/2 || pi[i] <= 0) + throw std::invalid_argument("invalid loop level " + to_string(pi[i]) + " in permuation"); + + if (pi[i] != i+1) { + if (first_level == 0) + first_level = i+1; + last_level = i+1; + } + } + if (first_level == 0) + return true; + + std::vector<int> lex = getLexicalOrder(stmt_num); + std::set<int> active = getStatements(lex, 2*first_level-2); + Loop::permute(active, pi); +} + + +void LoopCuda::tile_cuda(int stmt, int level, int outer_level) +{ + tile_cuda(stmt,level,1,outer_level,"","",CountedTile); +} +void LoopCuda::tile_cuda(int level, int tile_size, int outer_level, std::string idxName, + std::string ctrlName, TilingMethodType method){ + tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method); +} + +void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level, std::string idxName, + std::string ctrlName, TilingMethodType method){ + //Do regular tile but then update the index and control loop variable + //names as well as the idxName to reflect the current state of things. + //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level); + //printf("idxNames before: "); + //printVS(idxNames[stmt]); + + tile(stmt, level, tile_size, outer_level, method); + + if(idxName.size()) + idxNames[stmt][level-1] = idxName.c_str(); + if(tile_size == 1){ + //potentially rearrange loops + if(outer_level < level){ + std::string tmp = idxNames[stmt][level-1]; + for(int i=level-1; i>outer_level-1; i--){ + if(i-1 >= 0) + idxNames[stmt][i] = idxNames[stmt][i-1]; + } + idxNames[stmt][outer_level-1] = tmp; + } + //TODO: even with a tile size of one, you need a insert (of a dummy loop) + idxNames[stmt].insert(idxNames[stmt].begin()+(level),""); + }else{ + if(!ctrlName.size()) + throw std::runtime_error("No ctrl loop name for tile"); + //insert + idxNames[stmt].insert(idxNames[stmt].begin()+(outer_level-1),ctrlName.c_str()); + } + + //printf("idxNames after: "); + //printVS(idxNames[stmt]); +} + + +bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level, const std::string &array_name, const std::vector<int> &privatized_levels, bool allow_extra_read , int fastest_changing_dimension , int padding_stride , int padding_alignment , bool cuda_shared) +{ + int old_stmts =stmt.size(); + //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared); + if(cuda_shared) + datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 1); + else + datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 0); + + + //Adjust idxNames to reflect updated state + std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]); + int new_stmts = stmt.size(); + for(int i=old_stmts; i<new_stmts; i++){ + //printf("fixing up statement %d\n", i); + std::vector<std::string> idxs; + + + //protonu-making sure the vector of nonSplitLevels grows along with + //the statement structure + stmt_nonSplitLevels.push_back(omega::Tuple<int>()); + + //Indexes up to level will be the same + for(int j=0; j<level-1; j++) + idxs.push_back(cIdxNames[j]); + + //Expect privatized_levels to match + for(int j=0; j<privatized_levels.size(); j++) + idxs.push_back(cIdxNames[privatized_levels[j]-1]);//level is one-based + + //all further levels should match order they are in originally + if(privatized_levels.size()){ + int last_privatized = privatized_levels.back(); + int top_level = last_privatized + (stmt[i].IS.n_set()-idxs.size()); + //printf("last privatized_levels: %d top_level: %d\n", last_privatized, top_level); + for(int j=last_privatized; j<top_level; j++){ + idxs.push_back(cIdxNames[j]); + //printf("pushing back: %s\n", (const char*)cIdxNames[j]); + } + } + idxNames.push_back(idxs); + } +} + +bool LoopCuda::datacopy_cuda(int stmt_num, int level, const std::string &array_name, std::vector<std::string> new_idxs, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, bool cuda_shared) +{ + + int old_stmts =stmt.size(); + //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared); + if(cuda_shared) + datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 1); + else + datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 0); + //Adjust idxNames to reflect updated state + std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]); + int new_stmts = stmt.size(); + for(int i=old_stmts; i<new_stmts; i++){ + //printf("fixing up statement %d\n", i); + std::vector<std::string> idxs; + + //protonu-making sure the vector of nonSplitLevels grows along with + //the statement structure + stmt_nonSplitLevels.push_back(omega::Tuple<int>()); + + //protonu--lets dump out the code from each statement here + //printf("\n dumping statement :%d", i); + //stmt[i].code->Dump(); + + //Indexes up to level will be the same + for(int j=0; j<level-1; j++) + idxs.push_back(cIdxNames[j]); + + //all further levels should get names from new_idxs + int top_level = stmt[i].IS.n_set(); + //printf("top_level: %d level: %d\n", top_level, level); + if(new_idxs.size() < top_level-level+1) + throw std::runtime_error("Need more new index names for new datacopy loop levels"); + + for(int j=level-1; j<top_level; j++){ + idxs.push_back(new_idxs[j-level+1].c_str()); + //printf("pushing back: %s\n", new_idxs[j-level+1].c_str()); + } + idxNames.push_back(idxs); + } +} + +bool LoopCuda::unroll_cuda(int stmt_num, int level, int unroll_amount) +{ + int old_stmts =stmt.size(); + //bool b= unroll(stmt_num, , unroll_amount); + + + int dim = 2*level-1; + std::vector<int> lex = getLexicalOrder(stmt_num); + std::set<int> same_loop = getStatements(lex, dim-1); + + level = nonDummyLevel(stmt_num,level); + //printf("unrolling %d at level %d\n", stmt_num,level); + + //protonu--using the new version of unroll, which returns + //a set of ints instead of a bool. To keep Gabe's logic + //I'll check the size of the set, if it's 0 return true + //bool b= unroll(stmt_num, level, unroll_amount); + std::set<int> b_set= unroll(stmt_num, level, unroll_amount); + bool b = false; + if (b_set.size() == 0) b = true; + //end--protonu + + //Adjust idxNames to reflect updated state + std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]); + std::vector<std::string> origSource = idxNames[stmt_num];; + //Drop index names at level + if(unroll_amount == 0){ + //For all statements that were in this unroll together, drop index name for unrolled level + idxNames[stmt_num][level-1] = ""; + for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) { + //printf("in same loop as %d is %d\n", stmt_num, (*i)); + //idxNames[(*i)][level-1] = ""; + idxNames[(*i)] = idxNames[stmt_num]; + } + } + + lex = getLexicalOrder(stmt_num); + same_loop = getStatements(lex, dim-1); + + bool same_as_source = false; + int new_stmts = stmt.size(); + for(int i=old_stmts; i<new_stmts; i++){ + //Check whether we had a sync for the statement we are unrolling, if + //so, propogate that to newly created statements so that if they are + //in a different loop structure, they will also get a syncthreads + int size = syncs.size(); + for(int j=0; j<size; j++){ + if(syncs[j].first == stmt_num) + syncs.push_back(make_pair(i,syncs[j].second)); + } + + //protonu-making sure the vector of nonSplitLevels grows along with + //the statement structure + stmt_nonSplitLevels.push_back(omega::Tuple<int>()); + + + //We expect that new statements have a constant for the variable in + //stmt[i].IS at level (as seen with print_with_subs), otherwise there + //will be a for loop at level and idxNames should match stmt's + //idxNames pre-unrolled + Relation IS = stmt[i].IS; + //Ok, if you know how the hell to get anything out of a Relation, you + //should probably be able to do this more elegantly. But for now, I'm + //hacking it. + std::string s = IS.print_with_subs_to_string(); + //s looks looks like + //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128} + //where level == 5, you see a integer in the input set + + //If that's not an integer and this is the first new statement, then + //we think codegen will have a loop at that level. It's not perfect, + //not sure if it can be determined without round-tripping to codegen. + int sIdx = 0; + int eIdx = 0; + for(int j=0; j<level-1; j++){ + sIdx = s.find(",",sIdx+1); + if(sIdx < 0) break; + } + if(sIdx > 0){ + eIdx = s.find("]"); + int tmp = s.find(",",sIdx+1); + if(tmp > 0 && tmp < eIdx) + eIdx = tmp; //", before ]" + if(eIdx > 0){ + sIdx++; + std::string var = s.substr(sIdx,eIdx-sIdx); + //printf("%s\n", s.c_str()); + //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str()); + if(atoi(var.c_str()) == 0 && i ==old_stmts){ + //TODO:Maybe do see if this new statement would be in the same + //group as the original and if it would, don't say + //same_as_source + if(same_loop.find(i) == same_loop.end()){ + printf("stmt %d level %d, newly created unroll statement should have same level indexes as source\n", i, level); + same_as_source = true; + } + } + } + } + + + //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1); + if(same_as_source) + idxNames.push_back(origSource); + else + idxNames.push_back(idxNames[stmt_num]); + } + + return b; +} + +void LoopCuda::copy_to_texture(const char *array_name) +{ + //protonu--placeholder for now + //set the bool for using cuda memory as true + //in a vector of strings, put the names of arrays to tex mapped + if ( !texture ) + texture = new texture_memory_mapping(true, array_name); + else + texture->add(array_name); + + +} + + +void LoopCuda::copy_to_constant(const char *array_name) +{ + //protonu--placeholder for now + //set the bool for using cuda memory as true + //in a vector of strings, put the names of arrays to tex mapped + if ( !constant_mem ) + constant_mem = new constant_memory_mapping(true, array_name); + else + constant_mem->add(array_name); +} + +//protonu--moving this from Loop +tree_node_list* LoopCuda::codegen() +{ + if(code_gen_flags & GenCudaizeV2) + return cudaize_codegen_v2(); + //Do other flagged codegen methods, return plain vanilla generated code + return getCode(); +} + +//These three are in Omega code_gen.cc and are used as a massive hack to +//get out some info from MMGenerateCode. Yea for nasty side-effects. +namespace omega{ + extern int checkLoopLevel; + extern int stmtForLoopCheck; + extern int upperBoundForLevel; + extern int lowerBoundForLevel; +} + + +void LoopCuda::extractCudaUB(int stmt_num, int level, int &outUpperBound, int &outLowerBound){ + // check for sanity of parameters + const int m = stmt.size(); + if (stmt_num >= m || stmt_num < 0) + throw std::invalid_argument("invalid statement " + to_string(stmt_num)); + const int n = stmt[stmt_num].xform.n_out(); + if (level > (n-1)/2 || level <= 0) + throw std::invalid_argument("invalid loop level " + to_string(level)); + + int dim = 2*level-1; + + std::vector<int> lex = getLexicalOrder(stmt_num); + std::set<int> same_loop = getStatements(lex, dim-1); + + // extract the intersection of the iteration space to be considered + Relation hull; + { + hull = Relation::True(n); + for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) { + hull = Intersection(hull, project_onto_levels(getNewIS(*i), dim+1, true)); + hull.simplify(2, 4); + } + + for (int i = 2; i <= dim+1; i+=2) { + //std::string name = std::string("_t") + to_string(t_counter++); + std::string name = std::string("_t") + to_string(tmp_loop_var_name_counter++); + hull.name_set_var(i, name); + } + hull.setup_names(); + } + + // extract the exact loop bound of the dimension to be unrolled + if (is_single_iteration(hull, dim)){ + throw std::runtime_error("No loop availabe at level to extract upper bound."); + } + Relation bound = get_loop_bound(hull, dim); + if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology()) + throw loop_error("loop error: unable to extract loop bound for cudaize"); + + // extract the loop stride + EQ_Handle stride_eq; + int stride = 1; + { + bool simple_stride = true; + int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(dim+1), stride_eq, simple_stride); + if (strides > 1) + throw loop_error("loop error: too many strides"); + else if (strides == 1) { + int sign = stride_eq.get_coef(bound.set_var(dim+1)); +// assert(sign == 1 || sign == -1); + Constr_Vars_Iter it(stride_eq, true); + stride = abs((*it).coef/sign); + } + } + if(stride != 1){ + char buf[1024]; + sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d", level, stride); + throw std::runtime_error(buf); + } + + //Use code generation system to build tell us our bound information. We + //need a hard upper bound a 0 lower bound. + + checkLoopLevel = level*2; + stmtForLoopCheck = stmt_num; + upperBoundForLevel = -1; + lowerBoundForLevel = -1; + printCode(1,false); + checkLoopLevel = 0; + + outUpperBound = upperBoundForLevel; + outLowerBound = lowerBoundForLevel; + return; +} + + +void LoopCuda::printCode(int effort, bool actuallyPrint) const { + const int m = stmt.size(); + if (m == 0) + return; + const int n = stmt[0].xform.n_out(); + + + + Tuple<Relation> IS(m); + Tuple<Relation> xform(m); + Tuple<IntTuple > nonSplitLevels(m); + for (int i = 0; i < m; i++) { + IS[i+1] = stmt[i].IS; + xform[i+1] = stmt[i].xform; + nonSplitLevels[i+1] = stmt_nonSplitLevels[i]; + //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; + } + + Tuple< Tuple<std::string> > idxTupleNames; + if(useIdxNames){ + for(int i=0; i<idxNames.size(); i++){ + Tuple<std::string> idxs; + for(int j=0; j<idxNames[i].size(); j++) + idxs.append(idxNames[i][j]); + idxTupleNames.append( idxs ); + } + } + + Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); + CG_stringBuilder *ocg = new CG_stringBuilder(); + Tuple<CG_outputRepr *> nameInfo; + for (int i = 1; i <= m; i++) + nameInfo.append(new CG_stringRepr("s" + to_string(i))); + CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort); + if(actuallyPrint) + std::cout << GetString(repr); +/* + for (int i = 1; i <= m; i++) + delete nameInfo[i]; +*/ + + delete ocg; +} + + + +void LoopCuda::printRuntimeInfo() const { + for(int i=0; i<stmt.size(); i++){ + Relation IS = stmt[i].IS; + Relation xform = stmt[i].xform; + printf("stmt[%d]\n", i); + printf("IS\n"); + IS.print_with_subs(); + + printf("xform[%d]\n", i); + xform.print_with_subs(); + + //printf("code\n"); + //static_cast<CG_suifRepr *>(stmt[i].code)->GetCode()->print_expr(); + } +} + +void LoopCuda::printIndexes() const { + for(int i=0; i<stmt.size(); i++){ + printf("stmt %d nset %d ", i, stmt[i].IS.n_set()); + + for(int j=0; j<idxNames[i].size(); j++){ + if(j>0) + printf(","); + printf("%s", idxNames[i][j].c_str()); + } + printf("\n"); + } +} + +tree_node_list* LoopCuda::getCode(int effort) const { + const int m = stmt.size(); + if (m == 0) + return new tree_node_list; + const int n = stmt[0].xform.n_out(); + + + + Tuple<CG_outputRepr *> ni(m); + Tuple<Relation> IS(m); + Tuple<Relation> xform(m); + Tuple< IntTuple > nonSplitLevels(m); + for (int i = 0; i < m; i++) { + ni[i+1] = stmt[i].code; + IS[i+1] = stmt[i].IS; + xform[i+1] = stmt[i].xform; + nonSplitLevels[i+1] = stmt_nonSplitLevels[i]; + //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; + } + + + Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); +#ifdef DEBUG +// std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, known, effort)); +#endif + Tuple< Tuple<std::string> > idxTupleNames; + if(useIdxNames){ + for(int i=0; i<idxNames.size(); i++){ + Tuple<std::string> idxs; + for(int j=0; j<idxNames[i].size(); j++) + idxs.append(idxNames[i][j]); + idxTupleNames.append( idxs ); + } + } + + CG_outputBuilder *ocg = ir->builder(); + CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, nonSplitLevels, syncs, idxTupleNames, effort); + + //CG_outputRepr *overflow_initialization = ocg->CreateStmtList(); + //protonu--using the new function CG_suifBuilder::StmtListAppend + CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL); + for (std::map<int, std::vector<Free_Var_Decl *> >::const_iterator i = overflow.begin(); i != overflow.end(); i++) + for (std::vector<Free_Var_Decl *>::const_iterator j = i->second.begin(); j != i->second.end(); j++) + //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)))); + overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->StmtListAppend(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)), NULL)); + + repr = ocg->StmtListAppend(overflow_initialization, repr); + tree_node_list *tnl = static_cast<CG_suifRepr *>(repr)->GetCode(); + + delete repr; + /* + for (int i = 1; i <= m; i++) + delete ni[i]; + */ + + return tnl; +} + + +//protonu--adding constructors for the new derived class +LoopCuda::LoopCuda():Loop(), code_gen_flags(GenInit){} + +LoopCuda::LoopCuda(IR_Control *irc, int loop_num) + :Loop(irc) +{ + setup_code = NULL; + teardown_code = NULL; + code_gen_flags = 0; + cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1; + cu_num_reduce = 0; + cu_mode = GlobalMem; + texture = NULL; + constant_mem = NULL; + + int m=stmt.size(); + //printf("\n the size of stmt(initially) is: %d\n", stmt.size()); + for(int i=0; i<m; i++) + stmt_nonSplitLevels.push_back(omega::Tuple<int>()); + + + //protonu--setting up + //proc_symtab *symtab + //global_symtab *globals + + globals = ((IR_cudasuifCode *)ir)->gsym_ ; + std::vector<tree_for *> tf = ((IR_cudasuifCode *)ir)->get_loops(); + + symtab = tf[loop_num]->proc()->block()->proc_syms(); + + std::vector<tree_for *> deepest = find_deepest_loops(tf[loop_num]); + + for (int i = 0; i < deepest.size(); i++){ + index.push_back(deepest[i]->index()->name()); //reflects original code index names + } + + for(int i=0; i< stmt.size(); i++) + idxNames.push_back(index); //refects prefered index names (used as handles in cudaize v2) + useIdxNames=false; + +} + |