From c285135eb903c31cd221f90f03e288a6b67770cd Mon Sep 17 00:00:00 2001 From: Derick Huth Date: Thu, 24 Sep 2015 11:26:53 -0600 Subject: pre-v0.2.1 --- loop_cuda.cc | 2123 ---------------------------------------------------------- 1 file changed, 2123 deletions(-) delete mode 100644 loop_cuda.cc (limited to 'loop_cuda.cc') diff --git a/loop_cuda.cc b/loop_cuda.cc deleted file mode 100644 index a23990d..0000000 --- a/loop_cuda.cc +++ /dev/null @@ -1,2123 +0,0 @@ -/***************************************************************************** - Copyright (C) 2009 University of Utah - All Rights Reserved. - - Purpose: - Cudaize methods - - Notes: - - History: - 1/7/10 Created by Gabe Rudy by migrating code from loop.cc - 31/1/11 Modified by Protonu Basu -*****************************************************************************/ - -#include -#include -#include -#include -#include "loop_cuda.hh" -#include "loop.hh" -#include -#include -#include "omegatools.hh" -#include "ir_cudasuif.hh" -#include "ir_suif.hh" -#include "ir_suif_utils.hh" -#include "chill_error.hh" -#include - -using namespace omega; -char *k_cuda_texture_memory; //protonu--added to track texture memory type -char *k_cuda_constant_memory; //protonu--added to track constant memory type -//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type -extern char *omega::k_ocg_comment; - - -static int cudaDebug; -class CudaStaticInit{ public: CudaStaticInit(){ cudaDebug=0; //Change this to 1 for debug -}}; -static CudaStaticInit junkInitInstance__; - - - -std::string& upcase(std::string& s) -{ - for(int i=0; i& curOrder){ - if(!cudaDebug) return; - for(int i=0; i0) - printf(","); - printf("%s", curOrder[i].c_str()); - } - printf("\n"); -} - -void printVS(const std::vector& curOrder){ - //if(!cudaDebug) return; - for(int i=0; i0) - printf(","); - printf("%s", curOrder[i].c_str()); - } - printf("\n"); -} - -LoopCuda::~LoopCuda() { - const int m = stmt.size(); - for (int i = 0; i < m; i++) - stmt[i].code->clear(); -} - -bool LoopCuda::symbolExists(std::string s){ - if(symtab->lookup_sym(s.c_str(), SYM_VAR, false)) - return true; - if(globals->lookup_sym(s.c_str(), SYM_VAR, false)) - return true; - for(int i=0; iub_list(); - tree_node_list_iter upli(ub); - while(!upli.is_empty()){ - tree_node *node = upli.step(); - if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr) - { - in_rrr* ins = (in_rrr*)((tree_instr*)node)->instr(); - //expect the structure: cpy( _ = min(grab_me, _)) - if(ins->opcode() == io_cpy && ins->src1_op().is_instr()){ - ins = (in_rrr*)ins->src1_op().instr(); - if(ins->opcode() == io_min){ - tree_node_list* tnl = new tree_node_list; - tnl->append(if_node(symtab, fold_sle(operand(bound_sym), ins->src1_op().instr()->clone()), then_part)); - return tnl; - } - } - } - } - return then_part; //Failed to go to proper loop level -} - -/** - * This would be better if it was done by a CHiLL xformation instead of at codegen - * - * state: - * for(...) - * for(...) - * cur_body - * stmt1 - * - * stm1 is in-between two loops that are going to be reduced. The - * solution is to put stmt1 at the end of cur_body but conditionally run - * in on the last step of the for loop. - * - * A CHiLL command that would work better: - * - * for(...) - * stmt0 - * for(for i=0; i - * for(...) - * for(for i=0; i findCommentedFors(const char* index, tree_node_list* tnl){ - std::vector result; - - tree_node_list_iter iter(tnl); - bool next_loop_ok = false; - while (!iter.is_empty()) { - tree_node *tn = iter.step(); - if (tn->kind() == TREE_INSTR && ((tree_instr*)tn)->instr()->opcode() == io_mrk) - { - instruction* inst = ((tree_instr*)tn)->instr(); - std::string comment; - if ((inst->peek_annote(k_ocg_comment) != NULL)) - { - immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment)); - immed_list_iter data_iter(data); - if(!data_iter.is_empty()){ - immed first_immed = data_iter.step(); - if(first_immed.kind() == im_string) - comment = first_immed.string(); - } - } - if(comment.find("~cuda~") != std::string::npos - && comment.find("preferredIdx: ") != std::string::npos){ - std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos); - if(idx.find(" ") != std::string::npos) - idx = idx.substr(0,idx.find(" ")); - if(strcmp(idx.c_str(),index) == 0) - next_loop_ok = true; - } - } - if (tn->kind() == TREE_FOR){ - if(next_loop_ok){ - //printf("found loop %s\n", static_cast(tn)->index()->name()); - result.push_back(static_cast(tn)); - } - else{ - //printf("looking down for loop %s\n", static_cast(tn)->index()->name()); - std::vector t = findCommentedFors(index, static_cast(tn)->body()); - std::copy(t.begin(), t.end(), back_inserter(result)); - } - next_loop_ok = false; - } - if (tn->kind() == TREE_IF) { - //printf("looking down if\n"); - tree_if *tni = static_cast(tn); - std::vector t = findCommentedFors(index, tni->then_part()); - std::copy(t.begin(), t.end(), back_inserter(result)); - } - } - - return result; -} - -tree_node_list* forReduce(tree_for* loop, var_sym* reduceIndex, proc_symtab* proc_syms) -{ - //We did the replacements all at once with recursiveFindPreferedIdxs - //replacements r; - //r.oldsyms.append(loop->index()); - //r.newsyms.append(reduceIndex); - //tree_for* new_loop = (tree_for*)loop->clone_helper(&r, true); - tree_for* new_loop = loop; - - //return body one loops in - tree_node_list* tnl = loop_body_at_level(new_loop, 1); - //wrap in conditional if necessary - tnl = wrapInIfFromMinBound(tnl, new_loop, proc_syms, reduceIndex); - return tnl; -} - -void recursiveFindRefs(tree_node_list* code, proc_symtab* proc_syms, replacements* r) -{ - if(code->parent() && code->scope()->is_block()) - ((block_symtab*)code->scope())->find_exposed_refs(proc_syms, r); - tree_node_list_iter tnli(code); - while (!tnli.is_empty()) { - tree_node *node = tnli.step(); - //printf("node kind: %d\n", node->kind()); - if(node->is_instr()) - { - tree_instr* t_instr = (tree_instr*)node; - t_instr->find_exposed_refs(proc_syms, r); - } - if(node->is_block()){ - recursiveFindRefs(static_cast(node)->body(), proc_syms, r); - } - else if(node->is_for()){ - tree_for* tn_for = static_cast(node); - //Find refs in statemetns and body - tn_for->find_exposed_refs(proc_syms, r); - //recursiveFindRefs(tn_for->body(), proc_syms, r); - } - } -} - -tree_node_list* recursiveFindReplacePreferedIdxs(tree_node_list* code, proc_symtab* proc_syms, - proc_sym* cudaSync, func_type* unkown_func, - std::map& loop_idxs) -{ - tree_node_list* tnl = new tree_node_list; - tree_node_list_iter tnli(code); - var_sym* idxSym=0; - bool sync = false; - std::vector r1; - std::vector r2; - while (!tnli.is_empty()) { - tree_node *node = tnli.step(); - //printf("node kind: %d\n", node->kind()); - if(node->is_instr()) - { - if(((tree_instr*)node)->instr()->format() == inf_rrr){ - in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr(); - if(inst->opcode() == io_mrk){ - std::string comment; - if ((inst->peek_annote(k_ocg_comment) != NULL)) - { - immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment)); - immed_list_iter data_iter(data); - if(!data_iter.is_empty()){ - immed first_immed = data_iter.step(); - if(first_immed.kind() == im_string) - comment = first_immed.string(); - } - } - if(comment.find("~cuda~") != std::string::npos - && comment.find("preferredIdx: ") != std::string::npos){ - std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos); - if(idx.find(" ") != std::string::npos) - idx = idx.substr(0,idx.find(" ")); - //printf("sym_tab preferred index: %s\n", idx.c_str()); - if(loop_idxs.find(idx) != loop_idxs.end()) - idxSym = loop_idxs.find(idx)->second; - //Get the proc variable sybol for this preferred index - if(idxSym == 0){ - idxSym = (var_sym*)proc_syms->lookup_sym(idx.c_str(), SYM_VAR, false); - //printf("idx not found: lookup %p\n", idxSym); - if(!idxSym){ - idxSym = new var_sym(type_s32, (char*)idx.c_str()); - proc_syms->add_sym(idxSym); - //printf("idx created and inserted\n"); - } - //Now insert into our map for future - loop_idxs.insert(make_pair(idx, idxSym)); - } - //See if we have a sync as well - if(comment.find("sync") != std::string::npos){ - //printf("Inserting sync after current block\n"); - sync = true; - } - } - } - } - tnl->append(node); - } - else if(node->is_block()){ - tree_block* b = static_cast(node); - b->set_body(recursiveFindReplacePreferedIdxs(b->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(b); - } - else if(node->is_for()){ - tree_for* tn_for = static_cast(node); - if(idxSym){ - //Replace the current tn_for's index variable with idxSym - //printf("replacing sym %s -> %s\n", tn_for->index()->name(), idxSym->name()); - replacements r; - r.oldsyms.append(tn_for->index()); - r.newsyms.append(idxSym); - tree_for* new_loop = (tree_for*)tn_for->clone_helper(&r, true); - idxSym = 0; //Reset for more loops in this tnl - new_loop->set_body(recursiveFindReplacePreferedIdxs(new_loop->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(new_loop); - - if(sync){ - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaSync))), 0); - tnl->append(new tree_instr(the_call)); - //tnl->print(); - sync = true; - } - }else{ - tn_for->set_body(recursiveFindReplacePreferedIdxs(tn_for->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(tn_for); - } - }else if (node->kind() == TREE_IF) { - tree_if *tni = static_cast(node); - tni->set_then_part(recursiveFindReplacePreferedIdxs(tni->then_part(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(tni); - } - } - //Do this after the loop to not screw up the pointer interator - /* - for(int i=0; i array references -// loop_idxs -> map for when we encounter a loop with a different preferredIndex -// dim_vars -> out param, fills with var_sym pair for 2D array dimentions (messy stuff) -tree_node_list* swapVarReferences(tree_node_list* code, replacements* r, CG_suifBuilder *ocg, - std::map& loop_vars, - proc_symtab *proc_syms, - std::vector< std::pair >& dim_vars) -{ - //Iterate over every expression, looking up each variable and type - //reference used and possibly replacing it or adding it to our symbol - //table - // - //We use the built-in cloning helper methods to seriously help us with this! - - //Need to do a recursive mark - recursiveFindRefs(code, proc_syms, r); - - - //We can't rely on type_node->clone() to do the heavy lifting when the - //old type is a two dimentional array with variable upper bounds as - //that requires creating and saveing variable references to the upper - //bounds. So we do one pass over the oldtypes doing this type of - //conversion, putting results in the fixed_types map for a second pass - //to pick up. - std::map fixed_types; //array_types needing their upper bound installed - type_node_list_iter tlip(&r->oldtypes); - while(!tlip.is_empty()) - { - type_node* old_tn = tlip.step(); - type_node* new_tn = 0; - type_node* base_type = old_tn; - std::vector< std::pair > variable_upper_bouneds; - if(old_tn->is_ptr()){ - while (base_type->is_array() || base_type->is_ptr()) { - if (base_type->is_array()){ - array_bound ub = ((array_type*)base_type)->upper_bound(); - if(ub.is_variable()){ - var_sym* old_ub = (var_sym*)ub.variable(); - var_sym *new_ub = proc_syms->new_unique_var(type_s32); - dim_vars.push_back(std::pair(old_ub, new_ub)); - variable_upper_bouneds.push_back( std::pair(new_ub, base_type) ); - } - base_type = static_cast(base_type)->elem_type(); - } - else if (base_type->is_ptr()) - base_type = static_cast(base_type)->ref_type(); - } - } - for (int i = variable_upper_bouneds.size()-1; i >= 0; i--) { - var_sym *var_ub = variable_upper_bouneds[i].first; - type_node* old_tn = variable_upper_bouneds[i].second; - if(new_tn == 0) - new_tn = new array_type(base_type, array_bound(1), array_bound(var_ub)); - else - new_tn = new array_type(new_tn, array_bound(1), array_bound(var_ub)); - proc_syms->add_type(new_tn); - fixed_types.insert(std::pair(old_tn, new_tn)); - } - if(new_tn){ - if(old_tn->is_ptr()){ - new_tn = new ptr_type(new_tn); - proc_syms->add_type(new_tn); - } - fixed_types.insert(std::pair(old_tn, new_tn)); - } - } - - //Quickly look for modifiers on our our array types (__shared__ float [][]) - type_node_list_iter tliq(&r->oldtypes); - while(!tliq.is_empty()) - { - type_node* old_tn = tliq.step(); - if(old_tn->is_modifier()){ - type_node* base_type = static_cast(old_tn)->base(); - if(fixed_types.find(base_type) != fixed_types.end()){ - type_node* fixed_base = (*fixed_types.find(base_type)).second; - //printf("Fix modifier with fixed base\n"); - //This should work to copy over the annotations, but apparently doesn't work so well - type_node* new_tn = new modifier_type(static_cast(old_tn)->op(), fixed_base); - old_tn->copy_annotes(new_tn); - fixed_types.insert(std::pair(old_tn, new_tn)); - } - } - } - - //Run through the types and create entries in r->newtypes but don't install - type_node_list_iter tli(&r->oldtypes); - while(!tli.is_empty()) - { - type_node* old_tn = tli.step(); - type_node* new_tn = 0; - - //If we recorded this as fixed by our special case, use that type - //instead of cloning. - if(fixed_types.find(old_tn) != fixed_types.end()){ - new_tn = (*fixed_types.find(old_tn)).second; - //printf("Reusing fixed typ %u: ", new_tn->type_id()); - }else{ - new_tn = old_tn->clone(); - //printf("Cloning type %u: ", old_tn->type_id()); - } - new_tn = proc_syms->install_type(new_tn); - - //Ok, there is a weird case where an array type that has var_sym as - //their upper bounds can't be covered fully in this loop or the - //var_sym loop, so we need special code. - /* - if(old_tn->op() == TYPE_PTR && ((ptr_type*)old_tn)->ref_type()->op() == TYPE_ARRAY){ - array_type* outer_array = (array_type*)((ptr_type*)old_tn)->ref_type(); - array_bound ub = outer_array->upper_bound(); - if(ub.is_variable()){ - var_sym* old_ub = (var_sym*)ub.variable(); - var_sym* new_ub = (var_sym*)((array_type*)((ptr_type*)new_tn)->ref_type())->upper_bound().variable(); - //r->oldsyms.append(old_ub); - fix_ub.insert(std::pair(old_ub, (array_type*)((ptr_type*)new_tn)->ref_type())); - dim_vars.push_back(std::pair(old_ub, new_ub)); - printf("array var_sym: %p\n", new_ub); - } - if(outer_array->elem_type()->op() == TYPE_ARRAY) - { - array_type* inner_array = (array_type*)outer_array->elem_type(); - array_bound ub = inner_array->upper_bound(); - if(ub.is_variable()){ - var_sym* old_ub = (var_sym*)ub.variable(); - var_sym* new_ub = (var_sym*)((array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())->upper_bound().variable(); - dim_vars.push_back(std::pair(old_ub, new_ub)); - printf("array var_sym: %p\n", new_ub); - //r->oldsyms.append(old_ub); - fix_ub.insert(std::pair(old_ub, (array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())); - } - } - } - */ - r->newtypes.append(new_tn); - } - - //printf("proc_syms symbol run through\n"); - //proc_syms->print(); - - //Run through the syms creating new copies - sym_node_list_iter snli(&r->oldsyms); - while(!snli.is_empty()) - { - sym_node *old_sn = snli.step(); - - if(loop_vars.count(std::string(old_sn->name())) > 0) - { - r->newsyms.append(loop_vars[std::string(old_sn->name())]); - //printf("def exists: %s\n", old_sn->name()); - }else{ - sym_node *new_sn = old_sn->copy(); - if(new_sn->is_var()){ - var_sym* var = (var_sym*)new_sn; - type_node* new_type = var->type()->clone_helper(r); - - //TODO: Have a tagged list of variables to make shared - //Make local 2D arrays __shared__ - if(new_type->op() == TYPE_ARRAY && ((array_type*)new_type)->elem_type()->op() == TYPE_ARRAY){ - //protonu--changes suggested by Malik - //printf("Adding __shared__ annotation to : %s\n", new_sn->name()); - //new_type = ocg->ModifyType(new_type, "__shared__"); - //proc_syms->add_type(new_type); - } - var->set_type(new_type); - } - proc_syms->add_sym(new_sn); - r->newsyms.append(new_sn); - //printf("def new: %s\n", new_sn->name()); - } - } - - //printf("proc_syms var runthrough\n"); - //proc_syms->print(); - return code->clone_helper(r); -} - -bool LoopCuda::validIndexes(int stmt, const std::vector& idxs){ - for(int i=0; i array_dims, - std::vector blockIdxs, std::vector threadIdxs) -{ - int stmt_num = 0; - if(cudaDebug){ - printf("cudaize_v2(%s, {", kernel_name.c_str()); - //for( - printf("}, blocks={"); printVs(blockIdxs); printf("}, thread={"); printVs(threadIdxs); printf("})\n"); - } - - this->array_dims = array_dims; - if(!validIndexes(stmt_num, blockIdxs)){ - throw std::runtime_error("One of the indexes in the block list was not " - "found in the current set of indexes."); - } - if(!validIndexes(stmt_num, threadIdxs)){ - throw std::runtime_error("One of the indexes in the thread list was not " - "found in the current set of indexes."); - } - if(blockIdxs.size() ==0) - throw std::runtime_error("Cudaize: Need at least one block dimention"); - int block_level=0; - //Now, we will determine the actual size (if possible, otherwise - //complain) for the block dimentions and thread dimentions based on our - //indexes and the relations for our stmt; - for(int i=0; iarray_dims = array_dims; - cu_kernel_name = kernel_name.c_str(); - -} - -tree_node_list* LoopCuda::cudaize_codegen_v2() -{ - //printf("cudaize codegen V2\n"); - CG_suifBuilder *ocg = dynamic_cast(ir->builder()); - if(!ocg) return false; - - //protonu--adding an annote to track texture memory type - ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE); - ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE); - int tex_mem_on = 0; - int cons_mem_on = 0; - - - - CG_outputRepr* repr; - std::vector arrayVars; - std::vector localScopedVars; - - std::vector ro_refs; - std::vector wo_refs; - std::set uniqueRefs; - std::set uniqueWoRefs; - //protonu--let's try a much simpler approach of a map instead - //we also keep a map for constant memories - std::maptex_ref_map; - std::mapcons_ref_map; - - for(int j=0; j refs = ir->FindArrayRef(stmt[j].code); - for (int i = 0; i < refs.size(); i++) - { - //printf("ref %s wo %d\n", static_cast(refs[i]->name()), refs[i]->is_write()); - var_sym* var = symtab->lookup_var((char*)refs[i]->name().c_str(),false); - //If the array is not a parameter, then it's a local array and we - //want to recreate it as a stack variable in the kernel as opposed to - //passing it in. - if(!var->is_param()) - continue; - if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end()) - { - uniqueRefs.insert(refs[i]->name()); - if(refs[i]->is_write()){ - uniqueWoRefs.insert(refs[i]->name()); - wo_refs.push_back(refs[i]); - } - else - ro_refs.push_back(refs[i]); - } - if (refs[i]->is_write() && uniqueWoRefs.find(refs[i]->name()) == uniqueWoRefs.end()){ - uniqueWoRefs.insert(refs[i]->name()); - wo_refs.push_back(refs[i]); - //printf("adding %s to wo\n", static_cast(refs[i]->name())); - } - } - } - - // printf("reading from array "); - // for(int i=0; iname().c_str()); - // printf("and writting to array "); - // for(int i=0; iname().c_str()); - // printf("\n"); - - const char* gridName = "dimGrid"; - const char* blockName = "dimBlock"; - - //TODO: Could allow for array_dims_vars to be a mapping from array - //references to to variable names that define their length. - var_sym* dim1 = 0; - var_sym* dim2 = 0; - - for(int i=0; iname(); - outArray = symtab->lookup_var((char*)name.c_str(),false); - - VarDefs v; - v.size_2d = -1; - char buf[32]; - snprintf(buf, 32, "devO%dPtr", i+1); - v.name = buf; - if(outArray->type()->is_ptr()) - if(((ptr_type *)(outArray->type()))->ref_type()->is_array()) - v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type(); - else - v.type = ((ptr_type *)(outArray->type()))->ref_type(); - else - v.type = type_f32; - v.tex_mapped = false; - v.cons_mapped = false; - v.original_name = wo_refs[i]->name(); - //Size of the array = dim1 * dim2 * num bytes of our array type - - //If our input array is 2D (non-linearized), we want the actual - //dimentions of the array - CG_outputRepr* size; - //Lookup in array_dims - std::map::iterator it = array_dims.find(name.c_str()); - if(outArray->type()->is_ptr() && outArray->type()->ref_type(0)->is_array()) - { - array_type* t = (array_type*)outArray->type()->ref_type(0); - v.size_2d = t->upper_bound().constant()+1; - printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)wo_refs[i]->name().c_str()); - size = ocg->CreateInt(v.size_2d * v.size_2d); - }else if(it != array_dims.end()){ - int ref_size = it->second; - v.var_ref_size = ref_size; - size = ocg->CreateInt(ref_size); - } - else{ - if(dim1){ - size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)), - new CG_suifRepr(operand(dim2))); - }else{ - char buf[1024]; - sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a " - "detectable size or specififed dimentions", name.c_str()); - throw std::runtime_error(buf); - } - } - v.size_expr = operand(static_cast(ocg->CreateTimes( - size, - ocg->CreateInt(v.type->size()/8)))->GetExpression()); - v.in_data = 0; - v.out_data = outArray; - //Check for in ro_refs and remove it at this point - std::vector::iterator it_; - for(it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++) - { - if((*it_)->name() == wo_refs[i]->name()){ - break; - } - } - if(it_ != ro_refs.end()) - { - v.in_data = outArray; - ro_refs.erase(it_); - } - - arrayVars.push_back(v); - - } - - //protonu-- assuming that all texture mapped memories were originally read only mems - //there should be safety checks for that, will implement those later - - int cs_ref_size = 0; - - for(int i=0; iname(); - inArray = symtab->lookup_var((char*)name.c_str(),false); - VarDefs v; - v.size_2d = -1; - char buf[32]; - snprintf(buf, 32, "devI%dPtr", i+1); - v.name = buf; - if(inArray->type()->is_ptr()) - if(((ptr_type *)(inArray->type()))->ref_type()->is_array()) - v.type = ((array_type *)(((ptr_type *)(inArray->type()))->ref_type()))->elem_type(); - else - v.type = ((ptr_type *)(inArray->type()))->ref_type(); - else - v.type = type_f32; - v.tex_mapped = false; - v.cons_mapped = false; - v.original_name = ro_refs[i]->name(); - if ( texture != NULL) - v.tex_mapped = (texture->is_array_tex_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars - if (v.tex_mapped){ - printf("this variable %s is mapped to texture memory", name.c_str()); - } - if ( constant_mem != NULL) - v.cons_mapped = (constant_mem->is_array_cons_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars - if (v.cons_mapped){ - printf("this variable %s is mapped to constant memory", name.c_str()); - } - - //Size of the array = dim1 * dim2 * num bytes of our array type - - //If our input array is 2D (non-linearized), we want the actual - //dimentions of the array (as it might be less than cu_n - CG_outputRepr* size; - //Lookup in array_dims - std::map::iterator it = array_dims.find(name.c_str()); - int ref_size = 0; - if(inArray->type()->is_ptr() && inArray->type()->ref_type(0)->is_array()) - { - array_type* t = (array_type*)inArray->type()->ref_type(0); - v.size_2d = t->upper_bound().constant()+1; - printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)ro_refs[i]->name().c_str()); - size = ocg->CreateInt(v.size_2d * v.size_2d); - }else if(it != array_dims.end()){ - ref_size = it->second; - v.var_ref_size = ref_size; - size = ocg->CreateInt(ref_size); - }else{ - if(dim1){ - size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)), - new CG_suifRepr(operand(dim2))); - }else{ - char buf[1024]; - sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a " - "detectable size or specififed dimentions", name.c_str()); - throw std::runtime_error(buf); - } - } - - - - v.size_expr = operand(static_cast(ocg->CreateTimes( - size, - ocg->CreateInt(v.type->size()/8)))->GetExpression()); - - v.in_data = inArray; - v.out_data = 0; - arrayVars.push_back(v); - } - - - if(arrayVars.size() < 2) - { - fprintf(stderr, "cudaize error: Did not find two arrays being accessed\n"); - return false; - } - - //protonu--debugging tool--the printf statement - //tex_mem_on signals use of tex mem - for(int i=0; iinstall_type(unkown_func); - func_type* void_func = new func_type(type_void); //function on unkown args that returns a void - void_func = (func_type*)globals->install_type(void_func); - func_type* float_func = new func_type(type_f32); //function on unkown args that returns a float - float_func = (func_type*)globals->install_type(float_func); - - type_node* result = ocg->ModifyType(type_void, "__global__"); - result = globals->install_type(result); - func_type* kernel_type = new func_type(result); //function returns a '__global__ void' - - int numArgs = arrayVars.size() + (dim1 ? 2 : 0) + localScopedVars.size(); - //protonu--need to account for texture memory here, reduce the #args - if( tex_mem_on ) numArgs -= tex_mem_on; - if( cons_mem_on ) numArgs -= cons_mem_on; - kernel_type->set_num_args(numArgs); - int argCount = 0; - for(int i=0; itype()->clone(); - else - fptr = arrayVars[i].out_data->type()->clone(); - //protonu--skip this for texture mems - if( arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true ) - kernel_type->set_arg_type(argCount++, fptr); - } - if(dim1){ - kernel_type->set_arg_type(argCount++, type_s32); //width x height dimentions - kernel_type->set_arg_type(argCount++, type_s32); - } - kernel_type = (func_type*)globals->install_type(kernel_type); - - proc_sym* cudaMalloc = globals->new_proc(unkown_func, src_c, "cudaMalloc"); - proc_sym* cudaMemcpy = globals->new_proc(unkown_func, src_c, "cudaMemcpy"); - proc_sym* cudaFree = globals->new_proc(unkown_func, src_c, "cudaFree"); - proc_sym* cudaSync = globals->new_proc(void_func, src_c, "__syncthreads"); - proc_sym* cudaBind = globals->new_proc(unkown_func, src_c, "cudaBindTexture"); - proc_sym* cudaMemcpySym = globals->new_proc(unkown_func, src_c, "cudaMemcpyToSymbol"); - - - //protonu-removing Gabe's function, introducing mine, this is pretty cosmetic - //proc_sym* cudaFetch = globals->new_proc(float_func, src_c, "tex1Dfetch"); - proc_sym* tex1D = globals->new_proc(float_func, src_c, "tex1Dfetch"); - - var_sym *cudaMemcpyHostToDevice = new var_sym(type_s32, "cudaMemcpyHostToDevice"); - var_sym *cudaMemcpyDeviceToHost = new var_sym(type_s32, "cudaMemcpyDeviceToHost"); - cudaMemcpyDeviceToHost->set_param(); - cudaMemcpyHostToDevice->set_param(); - globals->add_sym(cudaMemcpyHostToDevice); - globals->add_sym(cudaMemcpyDeviceToHost); - - //protonu--adding the bool tex_mem to the structure struct_type - //to bypass the re-naming of struct texture, this is a hack fix - struct_type* texType = new struct_type(TYPE_GROUP, 0, "texture", 0, true); - immed_list *iml_tex = new immed_list; - iml_tex->append(immed("texture memory")); - texType->append_annote(k_cuda_texture_memory, iml_tex); - //protonu--end my changes - texType = (struct_type*)globals->install_type(texType); - //protonu--should register the locals later on - //when we do the bind operation - //var_sym* texRef = new var_sym(texType, "texRef"); - //globals->add_sym(texRef); - - //Add our mallocs (and input array memcpys) - for(int i=0; iclone()); - //protonu--temporary change - type_node* fptr = new ptr_type(arrayVars[i].type); - fptr = symtab->install_type(fptr); - var_sym *dvs = new var_sym(fptr, const_cast( - arrayVars[i].name.c_str())); - dvs->set_addr_taken(); - symtab->add_sym(dvs); - - //cudaMalloc args - //protonu--no cudaMalloc required for constant memory - tree_node_list* tnl = new tree_node_list; - if(arrayVars[i].cons_mapped != true ) - { - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMalloc))), 2); - the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to()->ptr_to(), operand(), immed(dvs)))); - the_call->set_argument(1, arrayVars[i].size_expr); - - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - } - if(arrayVars[i].in_data) - { - //cudaMemcpy args - //protonu-- no cudaMemcpy required for constant memory - if ( arrayVars[i].cons_mapped != true ) - { - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4); - the_call->set_argument(0, operand(dvs)); - the_call->set_argument(1, operand(arrayVars[i].in_data)); - the_call->set_argument(2, arrayVars[i].size_expr.clone()); - the_call->set_argument(3, operand(cudaMemcpyHostToDevice)); - - tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - } - - //protonu--check if the arrayvar is tex mapped - if(arrayVars[i].tex_mapped == true) - { - //Need a texture reference variable - char buf[32]; - snprintf(buf, 32, "tex%dRef", i+1); - arrayVars[i].secondName = buf; - - var_sym* texRef = new var_sym(texType, buf); - //printf("\n putting in %s\n", arrayVars[i].original_name.c_str()); - tex_ref_map[arrayVars[i].original_name] = texRef; - globals->add_sym(texRef); - //protonu--added the above two lines - - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaBind))), 4); - in_ldc *ins = new in_ldc(type_s32, operand(), immed(0)); - the_call->set_argument(0, operand(ins)); - the_call->set_argument(1, operand(texRef));//protonu--change to add the new sym - the_call->set_argument(2, operand(dvs)); - the_call->set_argument(3, arrayVars[i].size_expr.clone()); - - tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - } - - //protonu--if arrayvar is mapped to constant memory - if(arrayVars[i].cons_mapped == true) - { - char buf[32]; - snprintf(buf, 32, "cs%dRef", i+1); - //arrayVars[i].secondName = buf; - array_bound low (0); - array_bound high (arrayVars[i].var_ref_size -1); - array_type *arr = new array_type(arrayVars[i].type,low, high); - type_node* cons_arr = ocg->ModifyType(arr, "__device__ __constant__"); - cons_arr = globals->install_type(cons_arr); - var_sym* consRef = new var_sym(cons_arr, buf); - cons_ref_map[arrayVars[i].original_name] = consRef; - globals->add_sym(consRef); - - - - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpySym))), 3); - the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to(), operand(), immed(consRef)))); - the_call->set_argument(1, operand(arrayVars[i].in_data)); - the_call->set_argument(2, arrayVars[i].size_expr.clone()); - - tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - - } - } - } - - //Build dimGrid dim3 variables based on loop dimentions and ti/tj - char blockD1[120]; - char blockD2[120]; - if(dim1){ - snprintf(blockD1, 120, "%s/%d", dim1->name(), cu_tx); - snprintf(blockD2, 120, "%s/%d", dim2->name(), cu_ty); - }else{ - snprintf(blockD1, 120, "%d", cu_bx); - snprintf(blockD2, 120, "%d", cu_by); - //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx); - //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty); - } - repr = ocg->CreateDim3(immed((char*)gridName), - immed(blockD1), - immed(blockD2)); - setup_code = ocg->StmtListAppend(setup_code, repr); - - repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx),immed(cu_ty)); - - if(cu_tz > 1) - repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty), immed(cu_tz)); - else - repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty)); - setup_code = ocg->StmtListAppend(setup_code, repr); - - //call kernel function with name loop_name - //like: transpose_k<<>>(devOPtr, devIPtr , width, height); - char dims[120]; - snprintf(dims,120,"<<<%s,%s>>>",gridName, blockName); - immed_list *iml = new immed_list; - iml->append(immed((char*)cu_kernel_name.c_str())); - iml->append(immed(dims)); - //printf("%s %s\n", static_cast(cu_kernel_name), dims); - for(int i=0; i= 0) - { - snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d, - const_cast(arrayVars[i].name.c_str())); - //printf("%d %s\n", i, dims); - iml->append(immed(dims)); - }else{ - //printf("%d %s\n", i, static_cast(arrayVars[i].name)); - iml->append(immed(const_cast( - arrayVars[i].name.c_str()))); - } - } - if(dim1){ - iml->append(immed(dim1)); - iml->append(immed(dim2)); - } - repr = ocg->CreateKernel(iml);//kernel call - setup_code = ocg->StmtListAppend(setup_code, repr); - - //cuda free variables - for(int i=0; iptr_to(), operand(), immed(cudaMemcpy))), 4); - the_call->set_argument(0, operand(arrayVars[i].out_data)); - the_call->set_argument(1, operand(symtab->lookup_var(const_cast( - arrayVars[i].name.c_str())))); - the_call->set_argument(2, arrayVars[i].size_expr.clone()); - the_call->set_argument(3, operand(cudaMemcpyDeviceToHost)); - - tree_node_list* tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - teardown_code = ocg->StmtListAppend(teardown_code, - new CG_suifRepr(tnl)); - } - - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaFree))), 1); - the_call->set_argument(0, operand(symtab->lookup_var(const_cast( - arrayVars[i].name.c_str())))); - - tree_node_list* tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - teardown_code = ocg->StmtListAppend(teardown_code, - new CG_suifRepr(tnl)); - } - - // --------------- - // BUILD THE KERNEL - // --------------- - - //Extract out kernel body - tree_node_list* code = getCode(); - //Get rid of wrapper if that original() added - if(code->head()->contents->kind() == TREE_IF) - { - tree_if* ifn = (tree_if*)code->head()->contents; - code = ifn->then_part(); - } - - //Create kernel function body - proc_sym *new_psym = globals->new_proc(kernel_type, src_c, (char*)cu_kernel_name.c_str()); - proc_symtab *new_proc_syms = new proc_symtab(new_psym->name()); - globals->add_child(new_proc_syms); - - //Add Params - std::map loop_vars; - //In-Out arrays - type_node* fptr; - for(int i=0; itype()->clone(); - fptr = arrayVars[i].in_data->type(); - else - //fptr = arrayVars[i].out_data->type()->clone(); - fptr = arrayVars[i].out_data->type(); - fptr = new_proc_syms->install_type(fptr); - std::string name = arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name(); - var_sym* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name()); - //protonu--adding a check to ensure that texture memories are not passed in as arguments - if(arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true ) - { - sym->set_param(); - new_proc_syms->params()->append(sym); - new_proc_syms->add_sym(sym);//protonu--added to suppress the addition of the redundant var in the kernel - } - if (arrayVars[i].cons_mapped == true) - { - sym->set_param(); - new_proc_syms->add_sym(sym); - } - //printf("inserting name: %s\n", static_cast(name)); - loop_vars.insert(std::pair(std::string(name), sym)); - } - - if(dim1) - { - //Array dimentions - var_sym* kdim1 = new var_sym(dim1->type(), dim1->name()); - kdim1->set_param(); - new_proc_syms->add_sym(kdim1); - loop_vars.insert(std::pair(std::string(dim1->name()), kdim1)); - var_sym* kdim2 = new var_sym(dim2->type(), dim2->name()); - kdim2->set_param(); - new_proc_syms->add_sym(kdim2); - loop_vars.insert(std::pair(std::string(dim2->name()), kdim2)); - new_proc_syms->params()->append(kdim1); - new_proc_syms->params()->append(kdim2); - } - //Put block and thread implicit variables into scope - std::vector index_syms; - /* Currently we don't use the block dimentions - var_sym* blockDim_x = new var_sym(type_s32, "blockDim.x"); - blockDim_x->set_param(); - new_proc_syms->add_sym(blockDim_x); - var_sym* blockDim_y = new var_sym(type_s32, "blockDim.y"); - blockDim_y->set_param(); - new_proc_syms->add_sym(blockDim_y); - */ - if(cu_bx > 1){ - var_sym* blockIdx_x = new var_sym(type_s32, "blockIdx.x"); - blockIdx_x->set_param(); - new_proc_syms->add_sym(blockIdx_x); - index_syms.push_back(blockIdx_x); - } - if(cu_by > 1){ - var_sym* blockIdx_y = new var_sym(type_s32, "blockIdx.y"); - blockIdx_y->set_param(); - new_proc_syms->add_sym(blockIdx_y); - index_syms.push_back(blockIdx_y); - } - if(cu_tx > 1){ - var_sym* threadIdx_x = new var_sym(type_s32, "threadIdx.x"); - threadIdx_x->set_param(); - new_proc_syms->add_sym(threadIdx_x); - index_syms.push_back(threadIdx_x); - } - if(cu_ty > 1){ - var_sym* threadIdx_y = new var_sym(type_s32, "threadIdx.y"); - threadIdx_y->set_param(); - new_proc_syms->add_sym(threadIdx_y); - index_syms.push_back(threadIdx_y); - } - - if(cu_tz > 1){ - var_sym* threadIdx_z = new var_sym(type_s32, "threadIdx.z"); - threadIdx_z->set_param(); - new_proc_syms->add_sym(threadIdx_z); - index_syms.push_back(threadIdx_z); - } - - //Figure out which loop variables will be our thread and block dimention variables - std::vector loop_syms; - //Get our indexes - std::vector indexes;// = get_loop_indexes(code,cu_num_reduce); - int threadsPos=0; - if(cu_bx > 1) - indexes.push_back("bx"); - if(cu_by > 1) - indexes.push_back("by"); - if(cu_tx > 1){ - threadsPos = indexes.size(); - indexes.push_back("tx"); - } - if(cu_ty > 1) - indexes.push_back("ty"); - if(cu_tz > 1) - indexes.push_back("tz"); - for(int i=0; iadd_sym(loop_syms[i]); - //loop_vars.insert(std::pair(std::string(indexes[i]), loop_syms[i])); - } - - //Generate this code - //int bx = blockIdx.x - //int by = blockIdx.y - //int tx = threadIdx.x - //int ty = threadIdx.y - CG_outputRepr *body=NULL; - for(int i=0; iStmtListAppend(body, ocg->CreateStmtList( - // ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))))); - body = ocg->StmtListAppend(body, ocg->StmtListAppend( - ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))), NULL)); - } - - //Get our inital code prepped for loop reduction. First we need to swap - //out internal SUIF variable references to point to the new local - //function symbol table. - std::map loop_idxs; //map from idx names to their new syms - std::vector< std::pair > dim_vars; //pair is of var_sym (for 2D array size initializations) - replacements r; - tree_node_list* swapped = swapVarReferences(code, &r, ocg, loop_vars, new_proc_syms, dim_vars); - //printf("\n code before recursiveFindReplacePreferedIdxs :\n"); - //swapped->print(); - swapped = recursiveFindReplacePreferedIdxs(swapped, new_proc_syms, cudaSync, void_func, loop_idxs);//in-place swapping - //printf("\n code after recursiveFindReplacePreferedIdxs :\n"); - //swapped->print(); - - for(int i=0; i tfs = findCommentedFors(indexes[i], swapped); - for(int k=0; kprint(); - swap_node_for_node_list(tfs[k], newBlock); - //printf("AFTER SWAP\n"); newBlock->print(); - } - } - //printf("AFTER REDUCE\n"); swapped->print(); - - if(static_cast(ir)->init_code()){ - tree_node_list* orig_init_code = static_cast(static_cast(ir)->init_code())->GetCode(); - for(int i=0; ikind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr) - { - in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr(); - //expect the structure: cpy( _ = min(grab_me, _)) - if(inst->opcode() == io_cpy && inst->dst_op().is_symbol()){ - //printf("looking at instruction: "); - //inst->print(); - var_sym* dest = inst->dst_op().symbol(); - if(dest == dim_vars[i].first) - { - if(inst->src1_op().is_instr() && inst->src1_op().instr()->format() == inf_ldc){ - value = ((in_ldc*)inst->src1_op().instr())->value().integer(); - } - } - } - } - } - if(value < 0){ - fprintf(stderr, "ERROR: Could not find initializing statement for variable used in upper_bound of array type"); - } - CG_outputRepr *lhs = new CG_suifRepr(operand(dim_vars[i].second)); - //body = ocg->StmtListAppend(body, ocg->CreateStmtList(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)))); - body = ocg->StmtListAppend(body, ocg->StmtListAppend(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)), NULL)); - } - } - - - body = ocg->StmtListAppend(body, new CG_suifRepr(swapped)); - - //protonu--lets try creating our function definiton here - var_sym *tsym = NULL; - - - std::vector refs = ir->FindArrayRef(body); - for(int i=0; iis_array_tex_mapped(refs[i]->name().c_str())) - { - //protonu--our new tex lookup function - in_cal *tex_lookup = - new in_cal(type_f32, operand(), operand(new in_ldc(float_func->ptr_to(), operand(), immed(tex1D))), 2); - - //printf("name of the array to be mapped is %s\n", refs[i]->name().c_str()); - tsym = tex_ref_map[refs[i]->name()]; - tex_lookup->set_argument(0, operand(tsym)); - - - int array_dims = ((IR_suifArrayRef *)refs[i])->ia_->dims(); - - if (array_dims == 1){ - tex_lookup->set_argument(1, ((IR_suifArrayRef *)refs[i])->ia_->index(0).clone()); - }else if (array_dims > 2) { - printf(" \n we don't handle more than 2D arrays mapped to textures yet\n"); - }else if (array_dims == 2) { - - IR_ArraySymbol *sym = refs[i]->symbol(); - CG_outputRepr *sz = sym->size(1); - delete sym; // free the wrapper object only - // find the builder ocg - CG_outputRepr *expr = ocg->CreateTimes(sz->clone(),refs[i]->index(0)); - delete sz; // free the wrapper object only - expr = ocg->CreatePlus(expr, refs[i]->index(1)); - // expr holds the 1D access expression and take it out - tex_lookup->set_argument(1, ((CG_suifRepr *)expr)->GetExpression()); - } - - //using chun's function to replace the array look up with the function call - ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(tex_lookup))); - } - - } - - - tsym = NULL; - //protonu--now let's try what we did above for constant memory - for(int i=0; iis_array_cons_mapped(refs[i]->name().c_str())) - { - - //printf("name of the array to be cons mapped is %s\n", refs[i]->name().c_str()); - tsym = cons_ref_map[refs[i]->name()]; - //we should create a IR_SuifArray here - IR_ArraySymbol *ar_sym = new IR_suifArraySymbol(ir,tsym); - std::vector ar_index; - ar_index.push_back(((IR_suifArrayRef *)refs[i])->index(0)); - IR_ArrayRef *ar_ref = ((IR_suifCode *)ir)->CreateArrayRef(ar_sym, ar_index); - //using chun's function to replace the array look up with the function call - ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(((IR_suifArrayRef *)ar_ref)->ia_))); - - } - } - - - tree_proc *new_body = new tree_proc(static_cast(body)->GetCode(), new_proc_syms); - //globals->add_child(new_proc_syms); - new_psym->set_block(new_body); - new_procs.push_back(new_psym); - - return swapped; -} - -//Order taking out dummy variables -std::vector cleanOrder(std::vector idxNames){ - std::vector results; - for(int j=0; j& curOrder) -{ - //printf("curOrder: "); - //printVs(curOrder); - //printf("idxNames: "); - //printVS(idxNames[stmt]); - std::vector cIdxNames = cleanOrder(idxNames[stmt]); - bool same=true; - std::vector pi; - for(int i=0; i &pi) -{ -// check for sanity of parameters - if (stmt_num >= stmt.size() || stmt_num < 0) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - const int n = stmt[stmt_num].xform.n_out(); - if (pi.size() > (n-1)/2) - throw std::invalid_argument("iteration space dimensionality does not match permute dimensionality"); - int first_level = 0; - int last_level = 0; - for (int i = 0; i < pi.size(); i++) { - if (pi[i] > (n-1)/2 || pi[i] <= 0) - throw std::invalid_argument("invalid loop level " + to_string(pi[i]) + " in permuation"); - - if (pi[i] != i+1) { - if (first_level == 0) - first_level = i+1; - last_level = i+1; - } - } - if (first_level == 0) - return true; - - std::vector lex = getLexicalOrder(stmt_num); - std::set active = getStatements(lex, 2*first_level-2); - Loop::permute(active, pi); -} - - -void LoopCuda::tile_cuda(int stmt, int level, int outer_level) -{ - tile_cuda(stmt,level,1,outer_level,"","",CountedTile); -} -void LoopCuda::tile_cuda(int level, int tile_size, int outer_level, std::string idxName, - std::string ctrlName, TilingMethodType method){ - tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method); -} - -void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level, std::string idxName, - std::string ctrlName, TilingMethodType method){ - //Do regular tile but then update the index and control loop variable - //names as well as the idxName to reflect the current state of things. - //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level); - //printf("idxNames before: "); - //printVS(idxNames[stmt]); - - tile(stmt, level, tile_size, outer_level, method); - - if(idxName.size()) - idxNames[stmt][level-1] = idxName.c_str(); - if(tile_size == 1){ - //potentially rearrange loops - if(outer_level < level){ - std::string tmp = idxNames[stmt][level-1]; - for(int i=level-1; i>outer_level-1; i--){ - if(i-1 >= 0) - idxNames[stmt][i] = idxNames[stmt][i-1]; - } - idxNames[stmt][outer_level-1] = tmp; - } - //TODO: even with a tile size of one, you need a insert (of a dummy loop) - idxNames[stmt].insert(idxNames[stmt].begin()+(level),""); - }else{ - if(!ctrlName.size()) - throw std::runtime_error("No ctrl loop name for tile"); - //insert - idxNames[stmt].insert(idxNames[stmt].begin()+(outer_level-1),ctrlName.c_str()); - } - - //printf("idxNames after: "); - //printVS(idxNames[stmt]); -} - - -bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level, const std::string &array_name, const std::vector &privatized_levels, bool allow_extra_read , int fastest_changing_dimension , int padding_stride , int padding_alignment , bool cuda_shared) -{ - int old_stmts =stmt.size(); - //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared); - if(cuda_shared) - datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 1); - else - datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 0); - - - //Adjust idxNames to reflect updated state - std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); - int new_stmts = stmt.size(); - for(int i=old_stmts; i idxs; - - - //protonu-making sure the vector of nonSplitLevels grows along with - //the statement structure - stmt_nonSplitLevels.push_back(omega::Tuple()); - - //Indexes up to level will be the same - for(int j=0; j new_idxs, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, bool cuda_shared) -{ - - int old_stmts =stmt.size(); - //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared); - if(cuda_shared) - datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 1); - else - datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 0); - //Adjust idxNames to reflect updated state - std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); - int new_stmts = stmt.size(); - for(int i=old_stmts; i idxs; - - //protonu-making sure the vector of nonSplitLevels grows along with - //the statement structure - stmt_nonSplitLevels.push_back(omega::Tuple()); - - //protonu--lets dump out the code from each statement here - //printf("\n dumping statement :%d", i); - //stmt[i].code->Dump(); - - //Indexes up to level will be the same - for(int j=0; j lex = getLexicalOrder(stmt_num); - std::set same_loop = getStatements(lex, dim-1); - - level = nonDummyLevel(stmt_num,level); - //printf("unrolling %d at level %d\n", stmt_num,level); - - //protonu--using the new version of unroll, which returns - //a set of ints instead of a bool. To keep Gabe's logic - //I'll check the size of the set, if it's 0 return true - //bool b= unroll(stmt_num, level, unroll_amount); - std::set b_set= unroll(stmt_num, level, unroll_amount); - bool b = false; - if (b_set.size() == 0) b = true; - //end--protonu - - //Adjust idxNames to reflect updated state - std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); - std::vector origSource = idxNames[stmt_num];; - //Drop index names at level - if(unroll_amount == 0){ - //For all statements that were in this unroll together, drop index name for unrolled level - idxNames[stmt_num][level-1] = ""; - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - //printf("in same loop as %d is %d\n", stmt_num, (*i)); - //idxNames[(*i)][level-1] = ""; - idxNames[(*i)] = idxNames[stmt_num]; - } - } - - lex = getLexicalOrder(stmt_num); - same_loop = getStatements(lex, dim-1); - - bool same_as_source = false; - int new_stmts = stmt.size(); - for(int i=old_stmts; i()); - - - //We expect that new statements have a constant for the variable in - //stmt[i].IS at level (as seen with print_with_subs), otherwise there - //will be a for loop at level and idxNames should match stmt's - //idxNames pre-unrolled - Relation IS = stmt[i].IS; - //Ok, if you know how the hell to get anything out of a Relation, you - //should probably be able to do this more elegantly. But for now, I'm - //hacking it. - std::string s = IS.print_with_subs_to_string(); - //s looks looks like - //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128} - //where level == 5, you see a integer in the input set - - //If that's not an integer and this is the first new statement, then - //we think codegen will have a loop at that level. It's not perfect, - //not sure if it can be determined without round-tripping to codegen. - int sIdx = 0; - int eIdx = 0; - for(int j=0; j 0){ - eIdx = s.find("]"); - int tmp = s.find(",",sIdx+1); - if(tmp > 0 && tmp < eIdx) - eIdx = tmp; //", before ]" - if(eIdx > 0){ - sIdx++; - std::string var = s.substr(sIdx,eIdx-sIdx); - //printf("%s\n", s.c_str()); - //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str()); - if(atoi(var.c_str()) == 0 && i ==old_stmts){ - //TODO:Maybe do see if this new statement would be in the same - //group as the original and if it would, don't say - //same_as_source - if(same_loop.find(i) == same_loop.end()){ - printf("stmt %d level %d, newly created unroll statement should have same level indexes as source\n", i, level); - same_as_source = true; - } - } - } - } - - - //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1); - if(same_as_source) - idxNames.push_back(origSource); - else - idxNames.push_back(idxNames[stmt_num]); - } - - return b; -} - -void LoopCuda::copy_to_texture(const char *array_name) -{ - //protonu--placeholder for now - //set the bool for using cuda memory as true - //in a vector of strings, put the names of arrays to tex mapped - if ( !texture ) - texture = new texture_memory_mapping(true, array_name); - else - texture->add(array_name); - - -} - - -void LoopCuda::copy_to_constant(const char *array_name) -{ - //protonu--placeholder for now - //set the bool for using cuda memory as true - //in a vector of strings, put the names of arrays to tex mapped - if ( !constant_mem ) - constant_mem = new constant_memory_mapping(true, array_name); - else - constant_mem->add(array_name); -} - -//protonu--moving this from Loop -tree_node_list* LoopCuda::codegen() -{ - if(code_gen_flags & GenCudaizeV2) - return cudaize_codegen_v2(); - //Do other flagged codegen methods, return plain vanilla generated code - return getCode(); -} - -//These three are in Omega code_gen.cc and are used as a massive hack to -//get out some info from MMGenerateCode. Yea for nasty side-effects. -namespace omega{ - extern int checkLoopLevel; - extern int stmtForLoopCheck; - extern int upperBoundForLevel; - extern int lowerBoundForLevel; -} - - -void LoopCuda::extractCudaUB(int stmt_num, int level, int &outUpperBound, int &outLowerBound){ - // check for sanity of parameters - const int m = stmt.size(); - if (stmt_num >= m || stmt_num < 0) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - const int n = stmt[stmt_num].xform.n_out(); - if (level > (n-1)/2 || level <= 0) - throw std::invalid_argument("invalid loop level " + to_string(level)); - - int dim = 2*level-1; - - std::vector lex = getLexicalOrder(stmt_num); - std::set same_loop = getStatements(lex, dim-1); - - // extract the intersection of the iteration space to be considered - Relation hull; - { - hull = Relation::True(n); - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - hull = Intersection(hull, project_onto_levels(getNewIS(*i), dim+1, true)); - hull.simplify(2, 4); - } - - for (int i = 2; i <= dim+1; i+=2) { - //std::string name = std::string("_t") + to_string(t_counter++); - std::string name = std::string("_t") + to_string(tmp_loop_var_name_counter++); - hull.name_set_var(i, name); - } - hull.setup_names(); - } - - // extract the exact loop bound of the dimension to be unrolled - if (is_single_iteration(hull, dim)){ - throw std::runtime_error("No loop availabe at level to extract upper bound."); - } - Relation bound = get_loop_bound(hull, dim); - if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology()) - throw loop_error("loop error: unable to extract loop bound for cudaize"); - - // extract the loop stride - EQ_Handle stride_eq; - int stride = 1; - { - bool simple_stride = true; - int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(dim+1), stride_eq, simple_stride); - if (strides > 1) - throw loop_error("loop error: too many strides"); - else if (strides == 1) { - int sign = stride_eq.get_coef(bound.set_var(dim+1)); -// assert(sign == 1 || sign == -1); - Constr_Vars_Iter it(stride_eq, true); - stride = abs((*it).coef/sign); - } - } - if(stride != 1){ - char buf[1024]; - sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d", level, stride); - throw std::runtime_error(buf); - } - - //Use code generation system to build tell us our bound information. We - //need a hard upper bound a 0 lower bound. - - checkLoopLevel = level*2; - stmtForLoopCheck = stmt_num; - upperBoundForLevel = -1; - lowerBoundForLevel = -1; - printCode(1,false); - checkLoopLevel = 0; - - outUpperBound = upperBoundForLevel; - outLowerBound = lowerBoundForLevel; - return; -} - - -void LoopCuda::printCode(int effort, bool actuallyPrint) const { - const int m = stmt.size(); - if (m == 0) - return; - const int n = stmt[0].xform.n_out(); - - - - Tuple IS(m); - Tuple xform(m); - Tuple nonSplitLevels(m); - for (int i = 0; i < m; i++) { - IS[i+1] = stmt[i].IS; - xform[i+1] = stmt[i].xform; - nonSplitLevels[i+1] = stmt_nonSplitLevels[i]; - //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; - } - - Tuple< Tuple > idxTupleNames; - if(useIdxNames){ - for(int i=0; i idxs; - for(int j=0; jknown), n - this->known.n_set()); - CG_stringBuilder *ocg = new CG_stringBuilder(); - Tuple nameInfo; - for (int i = 1; i <= m; i++) - nameInfo.append(new CG_stringRepr("s" + to_string(i))); - CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort); - if(actuallyPrint) - std::cout << GetString(repr); -/* - for (int i = 1; i <= m; i++) - delete nameInfo[i]; -*/ - - delete ocg; -} - - - -void LoopCuda::printRuntimeInfo() const { - for(int i=0; i(stmt[i].code)->GetCode()->print_expr(); - } -} - -void LoopCuda::printIndexes() const { - for(int i=0; i0) - printf(","); - printf("%s", idxNames[i][j].c_str()); - } - printf("\n"); - } -} - -tree_node_list* LoopCuda::getCode(int effort) const { - const int m = stmt.size(); - if (m == 0) - return new tree_node_list; - const int n = stmt[0].xform.n_out(); - - - - Tuple ni(m); - Tuple IS(m); - Tuple xform(m); - Tuple< IntTuple > nonSplitLevels(m); - for (int i = 0; i < m; i++) { - ni[i+1] = stmt[i].code; - IS[i+1] = stmt[i].IS; - xform[i+1] = stmt[i].xform; - nonSplitLevels[i+1] = stmt_nonSplitLevels[i]; - //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; - } - - - Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); -#ifdef DEBUG -// std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, known, effort)); -#endif - Tuple< Tuple > idxTupleNames; - if(useIdxNames){ - for(int i=0; i idxs; - for(int j=0; jbuilder(); - CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, nonSplitLevels, syncs, idxTupleNames, effort); - - //CG_outputRepr *overflow_initialization = ocg->CreateStmtList(); - //protonu--using the new function CG_suifBuilder::StmtListAppend - CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL); - for (std::map >::const_iterator i = overflow.begin(); i != overflow.end(); i++) - for (std::vector::const_iterator j = i->second.begin(); j != i->second.end(); j++) - //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)))); - overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->StmtListAppend(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)), NULL)); - - repr = ocg->StmtListAppend(overflow_initialization, repr); - tree_node_list *tnl = static_cast(repr)->GetCode(); - - delete repr; - /* - for (int i = 1; i <= m; i++) - delete ni[i]; - */ - - return tnl; -} - - -//protonu--adding constructors for the new derived class -LoopCuda::LoopCuda():Loop(), code_gen_flags(GenInit){} - -LoopCuda::LoopCuda(IR_Control *irc, int loop_num) - :Loop(irc) -{ - setup_code = NULL; - teardown_code = NULL; - code_gen_flags = 0; - cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1; - cu_num_reduce = 0; - cu_mode = GlobalMem; - texture = NULL; - constant_mem = NULL; - - int m=stmt.size(); - //printf("\n the size of stmt(initially) is: %d\n", stmt.size()); - for(int i=0; i()); - - - //protonu--setting up - //proc_symtab *symtab - //global_symtab *globals - - globals = ((IR_cudasuifCode *)ir)->gsym_ ; - std::vector tf = ((IR_cudasuifCode *)ir)->get_loops(); - - symtab = tf[loop_num]->proc()->block()->proc_syms(); - - std::vector deepest = find_deepest_loops(tf[loop_num]); - - for (int i = 0; i < deepest.size(); i++){ - index.push_back(deepest[i]->index()->name()); //reflects original code index names - } - - for(int i=0; i< stmt.size(); i++) - idxNames.push_back(index); //refects prefered index names (used as handles in cudaize v2) - useIdxNames=false; - -} - -- cgit v1.2.3-70-g09d2