/***************************************************************************** Copyright (C) 2009 University of Utah All Rights Reserved. Purpose: Cudaize methods Notes: History: 1/7/10 Created by Gabe Rudy by migrating code from loop.cc 31/1/11 Modified by Protonu Basu *****************************************************************************/ #include #include #include #include #include "loop_cuda.hh" #include "loop.hh" #include #include #include "omegatools.hh" #include "ir_cudasuif.hh" #include "ir_suif.hh" #include "ir_suif_utils.hh" #include "chill_error.hh" #include using namespace omega; char *k_cuda_texture_memory; //protonu--added to track texture memory type char *k_cuda_constant_memory; //protonu--added to track constant memory type //extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type extern char *omega::k_ocg_comment; static int cudaDebug; class CudaStaticInit{ public: CudaStaticInit(){ cudaDebug=0; //Change this to 1 for debug }}; static CudaStaticInit junkInitInstance__; std::string& upcase(std::string& s) { for(int i=0; i& curOrder){ if(!cudaDebug) return; for(int i=0; i0) printf(","); printf("%s", curOrder[i].c_str()); } printf("\n"); } void printVS(const std::vector& curOrder){ //if(!cudaDebug) return; for(int i=0; i0) printf(","); printf("%s", curOrder[i].c_str()); } printf("\n"); } LoopCuda::~LoopCuda() { const int m = stmt.size(); for (int i = 0; i < m; i++) stmt[i].code->clear(); } bool LoopCuda::symbolExists(std::string s){ if(symtab->lookup_sym(s.c_str(), SYM_VAR, false)) return true; if(globals->lookup_sym(s.c_str(), SYM_VAR, false)) return true; for(int i=0; iub_list(); tree_node_list_iter upli(ub); while(!upli.is_empty()){ tree_node *node = upli.step(); if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr) { in_rrr* ins = (in_rrr*)((tree_instr*)node)->instr(); //expect the structure: cpy( _ = min(grab_me, _)) if(ins->opcode() == io_cpy && ins->src1_op().is_instr()){ ins = (in_rrr*)ins->src1_op().instr(); if(ins->opcode() == io_min){ tree_node_list* tnl = new tree_node_list; tnl->append(if_node(symtab, fold_sle(operand(bound_sym), ins->src1_op().instr()->clone()), then_part)); return tnl; } } } } return then_part; //Failed to go to proper loop level } /** * This would be better if it was done by a CHiLL xformation instead of at codegen * * state: * for(...) * for(...) * cur_body * stmt1 * * stm1 is in-between two loops that are going to be reduced. The * solution is to put stmt1 at the end of cur_body but conditionally run * in on the last step of the for loop. * * A CHiLL command that would work better: * * for(...) * stmt0 * for(for i=0; i * for(...) * for(for i=0; i findCommentedFors(const char* index, tree_node_list* tnl){ std::vector result; tree_node_list_iter iter(tnl); bool next_loop_ok = false; while (!iter.is_empty()) { tree_node *tn = iter.step(); if (tn->kind() == TREE_INSTR && ((tree_instr*)tn)->instr()->opcode() == io_mrk) { instruction* inst = ((tree_instr*)tn)->instr(); std::string comment; if ((inst->peek_annote(k_ocg_comment) != NULL)) { immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment)); immed_list_iter data_iter(data); if(!data_iter.is_empty()){ immed first_immed = data_iter.step(); if(first_immed.kind() == im_string) comment = first_immed.string(); } } if(comment.find("~cuda~") != std::string::npos && comment.find("preferredIdx: ") != std::string::npos){ std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos); if(idx.find(" ") != std::string::npos) idx = idx.substr(0,idx.find(" ")); if(strcmp(idx.c_str(),index) == 0) next_loop_ok = true; } } if (tn->kind() == TREE_FOR){ if(next_loop_ok){ //printf("found loop %s\n", static_cast(tn)->index()->name()); result.push_back(static_cast(tn)); } else{ //printf("looking down for loop %s\n", static_cast(tn)->index()->name()); std::vector t = findCommentedFors(index, static_cast(tn)->body()); std::copy(t.begin(), t.end(), back_inserter(result)); } next_loop_ok = false; } if (tn->kind() == TREE_IF) { //printf("looking down if\n"); tree_if *tni = static_cast(tn); std::vector t = findCommentedFors(index, tni->then_part()); std::copy(t.begin(), t.end(), back_inserter(result)); } } return result; } tree_node_list* forReduce(tree_for* loop, var_sym* reduceIndex, proc_symtab* proc_syms) { //We did the replacements all at once with recursiveFindPreferedIdxs //replacements r; //r.oldsyms.append(loop->index()); //r.newsyms.append(reduceIndex); //tree_for* new_loop = (tree_for*)loop->clone_helper(&r, true); tree_for* new_loop = loop; //return body one loops in tree_node_list* tnl = loop_body_at_level(new_loop, 1); //wrap in conditional if necessary tnl = wrapInIfFromMinBound(tnl, new_loop, proc_syms, reduceIndex); return tnl; } void recursiveFindRefs(tree_node_list* code, proc_symtab* proc_syms, replacements* r) { if(code->parent() && code->scope()->is_block()) ((block_symtab*)code->scope())->find_exposed_refs(proc_syms, r); tree_node_list_iter tnli(code); while (!tnli.is_empty()) { tree_node *node = tnli.step(); //printf("node kind: %d\n", node->kind()); if(node->is_instr()) { tree_instr* t_instr = (tree_instr*)node; t_instr->find_exposed_refs(proc_syms, r); } if(node->is_block()){ recursiveFindRefs(static_cast(node)->body(), proc_syms, r); } else if(node->is_for()){ tree_for* tn_for = static_cast(node); //Find refs in statemetns and body tn_for->find_exposed_refs(proc_syms, r); //recursiveFindRefs(tn_for->body(), proc_syms, r); } } } tree_node_list* recursiveFindReplacePreferedIdxs(tree_node_list* code, proc_symtab* proc_syms, proc_sym* cudaSync, func_type* unkown_func, std::map& loop_idxs) { tree_node_list* tnl = new tree_node_list; tree_node_list_iter tnli(code); var_sym* idxSym=0; bool sync = false; std::vector r1; std::vector r2; while (!tnli.is_empty()) { tree_node *node = tnli.step(); //printf("node kind: %d\n", node->kind()); if(node->is_instr()) { if(((tree_instr*)node)->instr()->format() == inf_rrr){ in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr(); if(inst->opcode() == io_mrk){ std::string comment; if ((inst->peek_annote(k_ocg_comment) != NULL)) { immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment)); immed_list_iter data_iter(data); if(!data_iter.is_empty()){ immed first_immed = data_iter.step(); if(first_immed.kind() == im_string) comment = first_immed.string(); } } if(comment.find("~cuda~") != std::string::npos && comment.find("preferredIdx: ") != std::string::npos){ std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos); if(idx.find(" ") != std::string::npos) idx = idx.substr(0,idx.find(" ")); //printf("sym_tab preferred index: %s\n", idx.c_str()); if(loop_idxs.find(idx) != loop_idxs.end()) idxSym = loop_idxs.find(idx)->second; //Get the proc variable sybol for this preferred index if(idxSym == 0){ idxSym = (var_sym*)proc_syms->lookup_sym(idx.c_str(), SYM_VAR, false); //printf("idx not found: lookup %p\n", idxSym); if(!idxSym){ idxSym = new var_sym(type_s32, (char*)idx.c_str()); proc_syms->add_sym(idxSym); //printf("idx created and inserted\n"); } //Now insert into our map for future loop_idxs.insert(make_pair(idx, idxSym)); } //See if we have a sync as well if(comment.find("sync") != std::string::npos){ //printf("Inserting sync after current block\n"); sync = true; } } } } tnl->append(node); } else if(node->is_block()){ tree_block* b = static_cast(node); b->set_body(recursiveFindReplacePreferedIdxs(b->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); tnl->append(b); } else if(node->is_for()){ tree_for* tn_for = static_cast(node); if(idxSym){ //Replace the current tn_for's index variable with idxSym //printf("replacing sym %s -> %s\n", tn_for->index()->name(), idxSym->name()); replacements r; r.oldsyms.append(tn_for->index()); r.newsyms.append(idxSym); tree_for* new_loop = (tree_for*)tn_for->clone_helper(&r, true); idxSym = 0; //Reset for more loops in this tnl new_loop->set_body(recursiveFindReplacePreferedIdxs(new_loop->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); tnl->append(new_loop); if(sync){ in_cal *the_call = new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaSync))), 0); tnl->append(new tree_instr(the_call)); //tnl->print(); sync = true; } }else{ tn_for->set_body(recursiveFindReplacePreferedIdxs(tn_for->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); tnl->append(tn_for); } }else if (node->kind() == TREE_IF) { tree_if *tni = static_cast(node); tni->set_then_part(recursiveFindReplacePreferedIdxs(tni->then_part(), proc_syms, cudaSync, unkown_func, loop_idxs)); tnl->append(tni); } } //Do this after the loop to not screw up the pointer interator /* for(int i=0; i array references // loop_idxs -> map for when we encounter a loop with a different preferredIndex // dim_vars -> out param, fills with var_sym pair for 2D array dimentions (messy stuff) tree_node_list* swapVarReferences(tree_node_list* code, replacements* r, CG_suifBuilder *ocg, std::map& loop_vars, proc_symtab *proc_syms, std::vector< std::pair >& dim_vars) { //Iterate over every expression, looking up each variable and type //reference used and possibly replacing it or adding it to our symbol //table // //We use the built-in cloning helper methods to seriously help us with this! //Need to do a recursive mark recursiveFindRefs(code, proc_syms, r); //We can't rely on type_node->clone() to do the heavy lifting when the //old type is a two dimentional array with variable upper bounds as //that requires creating and saveing variable references to the upper //bounds. So we do one pass over the oldtypes doing this type of //conversion, putting results in the fixed_types map for a second pass //to pick up. std::map fixed_types; //array_types needing their upper bound installed type_node_list_iter tlip(&r->oldtypes); while(!tlip.is_empty()) { type_node* old_tn = tlip.step(); type_node* new_tn = 0; type_node* base_type = old_tn; std::vector< std::pair > variable_upper_bouneds; if(old_tn->is_ptr()){ while (base_type->is_array() || base_type->is_ptr()) { if (base_type->is_array()){ array_bound ub = ((array_type*)base_type)->upper_bound(); if(ub.is_variable()){ var_sym* old_ub = (var_sym*)ub.variable(); var_sym *new_ub = proc_syms->new_unique_var(type_s32); dim_vars.push_back(std::pair(old_ub, new_ub)); variable_upper_bouneds.push_back( std::pair(new_ub, base_type) ); } base_type = static_cast(base_type)->elem_type(); } else if (base_type->is_ptr()) base_type = static_cast(base_type)->ref_type(); } } for (int i = variable_upper_bouneds.size()-1; i >= 0; i--) { var_sym *var_ub = variable_upper_bouneds[i].first; type_node* old_tn = variable_upper_bouneds[i].second; if(new_tn == 0) new_tn = new array_type(base_type, array_bound(1), array_bound(var_ub)); else new_tn = new array_type(new_tn, array_bound(1), array_bound(var_ub)); proc_syms->add_type(new_tn); fixed_types.insert(std::pair(old_tn, new_tn)); } if(new_tn){ if(old_tn->is_ptr()){ new_tn = new ptr_type(new_tn); proc_syms->add_type(new_tn); } fixed_types.insert(std::pair(old_tn, new_tn)); } } //Quickly look for modifiers on our our array types (__shared__ float [][]) type_node_list_iter tliq(&r->oldtypes); while(!tliq.is_empty()) { type_node* old_tn = tliq.step(); if(old_tn->is_modifier()){ type_node* base_type = static_cast(old_tn)->base(); if(fixed_types.find(base_type) != fixed_types.end()){ type_node* fixed_base = (*fixed_types.find(base_type)).second; //printf("Fix modifier with fixed base\n"); //This should work to copy over the annotations, but apparently doesn't work so well type_node* new_tn = new modifier_type(static_cast(old_tn)->op(), fixed_base); old_tn->copy_annotes(new_tn); fixed_types.insert(std::pair(old_tn, new_tn)); } } } //Run through the types and create entries in r->newtypes but don't install type_node_list_iter tli(&r->oldtypes); while(!tli.is_empty()) { type_node* old_tn = tli.step(); type_node* new_tn = 0; //If we recorded this as fixed by our special case, use that type //instead of cloning. if(fixed_types.find(old_tn) != fixed_types.end()){ new_tn = (*fixed_types.find(old_tn)).second; //printf("Reusing fixed typ %u: ", new_tn->type_id()); }else{ new_tn = old_tn->clone(); //printf("Cloning type %u: ", old_tn->type_id()); } new_tn = proc_syms->install_type(new_tn); //Ok, there is a weird case where an array type that has var_sym as //their upper bounds can't be covered fully in this loop or the //var_sym loop, so we need special code. /* if(old_tn->op() == TYPE_PTR && ((ptr_type*)old_tn)->ref_type()->op() == TYPE_ARRAY){ array_type* outer_array = (array_type*)((ptr_type*)old_tn)->ref_type(); array_bound ub = outer_array->upper_bound(); if(ub.is_variable()){ var_sym* old_ub = (var_sym*)ub.variable(); var_sym* new_ub = (var_sym*)((array_type*)((ptr_type*)new_tn)->ref_type())->upper_bound().variable(); //r->oldsyms.append(old_ub); fix_ub.insert(std::pair(old_ub, (array_type*)((ptr_type*)new_tn)->ref_type())); dim_vars.push_back(std::pair(old_ub, new_ub)); printf("array var_sym: %p\n", new_ub); } if(outer_array->elem_type()->op() == TYPE_ARRAY) { array_type* inner_array = (array_type*)outer_array->elem_type(); array_bound ub = inner_array->upper_bound(); if(ub.is_variable()){ var_sym* old_ub = (var_sym*)ub.variable(); var_sym* new_ub = (var_sym*)((array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())->upper_bound().variable(); dim_vars.push_back(std::pair(old_ub, new_ub)); printf("array var_sym: %p\n", new_ub); //r->oldsyms.append(old_ub); fix_ub.insert(std::pair(old_ub, (array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())); } } } */ r->newtypes.append(new_tn); } //printf("proc_syms symbol run through\n"); //proc_syms->print(); //Run through the syms creating new copies sym_node_list_iter snli(&r->oldsyms); while(!snli.is_empty()) { sym_node *old_sn = snli.step(); if(loop_vars.count(std::string(old_sn->name())) > 0) { r->newsyms.append(loop_vars[std::string(old_sn->name())]); //printf("def exists: %s\n", old_sn->name()); }else{ sym_node *new_sn = old_sn->copy(); if(new_sn->is_var()){ var_sym* var = (var_sym*)new_sn; type_node* new_type = var->type()->clone_helper(r); //TODO: Have a tagged list of variables to make shared //Make local 2D arrays __shared__ if(new_type->op() == TYPE_ARRAY && ((array_type*)new_type)->elem_type()->op() == TYPE_ARRAY){ //protonu--changes suggested by Malik //printf("Adding __shared__ annotation to : %s\n", new_sn->name()); //new_type = ocg->ModifyType(new_type, "__shared__"); //proc_syms->add_type(new_type); } var->set_type(new_type); } proc_syms->add_sym(new_sn); r->newsyms.append(new_sn); //printf("def new: %s\n", new_sn->name()); } } //printf("proc_syms var runthrough\n"); //proc_syms->print(); return code->clone_helper(r); } bool LoopCuda::validIndexes(int stmt, const std::vector& idxs){ for(int i=0; i array_dims, std::vector blockIdxs, std::vector threadIdxs) { int stmt_num = 0; if(cudaDebug){ printf("cudaize_v2(%s, {", kernel_name.c_str()); //for( printf("}, blocks={"); printVs(blockIdxs); printf("}, thread={"); printVs(threadIdxs); printf("})\n"); } this->array_dims = array_dims; if(!validIndexes(stmt_num, blockIdxs)){ throw std::runtime_error("One of the indexes in the block list was not " "found in the current set of indexes."); } if(!validIndexes(stmt_num, threadIdxs)){ throw std::runtime_error("One of the indexes in the thread list was not " "found in the current set of indexes."); } if(blockIdxs.size() ==0) throw std::runtime_error("Cudaize: Need at least one block dimention"); int block_level=0; //Now, we will determine the actual size (if possible, otherwise //complain) for the block dimentions and thread dimentions based on our //indexes and the relations for our stmt; for(int i=0; iarray_dims = array_dims; cu_kernel_name = kernel_name.c_str(); } tree_node_list* LoopCuda::cudaize_codegen_v2() { //printf("cudaize codegen V2\n"); CG_suifBuilder *ocg = dynamic_cast(ir->builder()); if(!ocg) return false; //protonu--adding an annote to track texture memory type ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE); ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE); int tex_mem_on = 0; int cons_mem_on = 0; CG_outputRepr* repr; std::vector arrayVars; std::vector localScopedVars; std::vector ro_refs; std::vector wo_refs; std::set uniqueRefs; std::set uniqueWoRefs; //protonu--let's try a much simpler approach of a map instead //we also keep a map for constant memories std::maptex_ref_map; std::mapcons_ref_map; for(int j=0; j refs = ir->FindArrayRef(stmt[j].code); for (int i = 0; i < refs.size(); i++) { //printf("ref %s wo %d\n", static_cast(refs[i]->name()), refs[i]->is_write()); var_sym* var = symtab->lookup_var((char*)refs[i]->name().c_str(),false); //If the array is not a parameter, then it's a local array and we //want to recreate it as a stack variable in the kernel as opposed to //passing it in. if(!var->is_param()) continue; if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end()) { uniqueRefs.insert(refs[i]->name()); if(refs[i]->is_write()){ uniqueWoRefs.insert(refs[i]->name()); wo_refs.push_back(refs[i]); } else ro_refs.push_back(refs[i]); } if (refs[i]->is_write() && uniqueWoRefs.find(refs[i]->name()) == uniqueWoRefs.end()){ uniqueWoRefs.insert(refs[i]->name()); wo_refs.push_back(refs[i]); //printf("adding %s to wo\n", static_cast(refs[i]->name())); } } } // printf("reading from array "); // for(int i=0; iname().c_str()); // printf("and writting to array "); // for(int i=0; iname().c_str()); // printf("\n"); const char* gridName = "dimGrid"; const char* blockName = "dimBlock"; //TODO: Could allow for array_dims_vars to be a mapping from array //references to to variable names that define their length. var_sym* dim1 = 0; var_sym* dim2 = 0; for(int i=0; iname(); outArray = symtab->lookup_var((char*)name.c_str(),false); VarDefs v; v.size_2d = -1; char buf[32]; snprintf(buf, 32, "devO%dPtr", i+1); v.name = buf; if(outArray->type()->is_ptr()) if(((ptr_type *)(outArray->type()))->ref_type()->is_array()) v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type(); else v.type = ((ptr_type *)(outArray->type()))->ref_type(); else v.type = type_f32; v.tex_mapped = false; v.cons_mapped = false; v.original_name = wo_refs[i]->name(); //Size of the array = dim1 * dim2 * num bytes of our array type //If our input array is 2D (non-linearized), we want the actual //dimentions of the array CG_outputRepr* size; //Lookup in array_dims std::map::iterator it = array_dims.find(name.c_str()); if(outArray->type()->is_ptr() && outArray->type()->ref_type(0)->is_array()) { array_type* t = (array_type*)outArray->type()->ref_type(0); v.size_2d = t->upper_bound().constant()+1; printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)wo_refs[i]->name().c_str()); size = ocg->CreateInt(v.size_2d * v.size_2d); }else if(it != array_dims.end()){ int ref_size = it->second; v.var_ref_size = ref_size; size = ocg->CreateInt(ref_size); } else{ if(dim1){ size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)), new CG_suifRepr(operand(dim2))); }else{ char buf[1024]; sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a " "detectable size or specififed dimentions", name.c_str()); throw std::runtime_error(buf); } } v.size_expr = operand(static_cast(ocg->CreateTimes( size, ocg->CreateInt(v.type->size()/8)))->GetExpression()); v.in_data = 0; v.out_data = outArray; //Check for in ro_refs and remove it at this point std::vector::iterator it_; for(it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++) { if((*it_)->name() == wo_refs[i]->name()){ break; } } if(it_ != ro_refs.end()) { v.in_data = outArray; ro_refs.erase(it_); } arrayVars.push_back(v); } //protonu-- assuming that all texture mapped memories were originally read only mems //there should be safety checks for that, will implement those later int cs_ref_size = 0; for(int i=0; iname(); inArray = symtab->lookup_var((char*)name.c_str(),false); VarDefs v; v.size_2d = -1; char buf[32]; snprintf(buf, 32, "devI%dPtr", i+1); v.name = buf; if(inArray->type()->is_ptr()) if(((ptr_type *)(inArray->type()))->ref_type()->is_array()) v.type = ((array_type *)(((ptr_type *)(inArray->type()))->ref_type()))->elem_type(); else v.type = ((ptr_type *)(inArray->type()))->ref_type(); else v.type = type_f32; v.tex_mapped = false; v.cons_mapped = false; v.original_name = ro_refs[i]->name(); if ( texture != NULL) v.tex_mapped = (texture->is_array_tex_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars if (v.tex_mapped){ printf("this variable %s is mapped to texture memory", name.c_str()); } if ( constant_mem != NULL) v.cons_mapped = (constant_mem->is_array_cons_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars if (v.cons_mapped){ printf("this variable %s is mapped to constant memory", name.c_str()); } //Size of the array = dim1 * dim2 * num bytes of our array type //If our input array is 2D (non-linearized), we want the actual //dimentions of the array (as it might be less than cu_n CG_outputRepr* size; //Lookup in array_dims std::map::iterator it = array_dims.find(name.c_str()); int ref_size = 0; if(inArray->type()->is_ptr() && inArray->type()->ref_type(0)->is_array()) { array_type* t = (array_type*)inArray->type()->ref_type(0); v.size_2d = t->upper_bound().constant()+1; printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)ro_refs[i]->name().c_str()); size = ocg->CreateInt(v.size_2d * v.size_2d); }else if(it != array_dims.end()){ ref_size = it->second; v.var_ref_size = ref_size; size = ocg->CreateInt(ref_size); }else{ if(dim1){ size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)), new CG_suifRepr(operand(dim2))); }else{ char buf[1024]; sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a " "detectable size or specififed dimentions", name.c_str()); throw std::runtime_error(buf); } } v.size_expr = operand(static_cast(ocg->CreateTimes( size, ocg->CreateInt(v.type->size()/8)))->GetExpression()); v.in_data = inArray; v.out_data = 0; arrayVars.push_back(v); } if(arrayVars.size() < 2) { fprintf(stderr, "cudaize error: Did not find two arrays being accessed\n"); return false; } //protonu--debugging tool--the printf statement //tex_mem_on signals use of tex mem for(int i=0; iinstall_type(unkown_func); func_type* void_func = new func_type(type_void); //function on unkown args that returns a void void_func = (func_type*)globals->install_type(void_func); func_type* float_func = new func_type(type_f32); //function on unkown args that returns a float float_func = (func_type*)globals->install_type(float_func); type_node* result = ocg->ModifyType(type_void, "__global__"); result = globals->install_type(result); func_type* kernel_type = new func_type(result); //function returns a '__global__ void' int numArgs = arrayVars.size() + (dim1 ? 2 : 0) + localScopedVars.size(); //protonu--need to account for texture memory here, reduce the #args if( tex_mem_on ) numArgs -= tex_mem_on; if( cons_mem_on ) numArgs -= cons_mem_on; kernel_type->set_num_args(numArgs); int argCount = 0; for(int i=0; itype()->clone(); else fptr = arrayVars[i].out_data->type()->clone(); //protonu--skip this for texture mems if( arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true ) kernel_type->set_arg_type(argCount++, fptr); } if(dim1){ kernel_type->set_arg_type(argCount++, type_s32); //width x height dimentions kernel_type->set_arg_type(argCount++, type_s32); } kernel_type = (func_type*)globals->install_type(kernel_type); proc_sym* cudaMalloc = globals->new_proc(unkown_func, src_c, "cudaMalloc"); proc_sym* cudaMemcpy = globals->new_proc(unkown_func, src_c, "cudaMemcpy"); proc_sym* cudaFree = globals->new_proc(unkown_func, src_c, "cudaFree"); proc_sym* cudaSync = globals->new_proc(void_func, src_c, "__syncthreads"); proc_sym* cudaBind = globals->new_proc(unkown_func, src_c, "cudaBindTexture"); proc_sym* cudaMemcpySym = globals->new_proc(unkown_func, src_c, "cudaMemcpyToSymbol"); //protonu-removing Gabe's function, introducing mine, this is pretty cosmetic //proc_sym* cudaFetch = globals->new_proc(float_func, src_c, "tex1Dfetch"); proc_sym* tex1D = globals->new_proc(float_func, src_c, "tex1Dfetch"); var_sym *cudaMemcpyHostToDevice = new var_sym(type_s32, "cudaMemcpyHostToDevice"); var_sym *cudaMemcpyDeviceToHost = new var_sym(type_s32, "cudaMemcpyDeviceToHost"); cudaMemcpyDeviceToHost->set_param(); cudaMemcpyHostToDevice->set_param(); globals->add_sym(cudaMemcpyHostToDevice); globals->add_sym(cudaMemcpyDeviceToHost); //protonu--adding the bool tex_mem to the structure struct_type //to bypass the re-naming of struct texture, this is a hack fix struct_type* texType = new struct_type(TYPE_GROUP, 0, "texture", 0, true); immed_list *iml_tex = new immed_list; iml_tex->append(immed("texture memory")); texType->append_annote(k_cuda_texture_memory, iml_tex); //protonu--end my changes texType = (struct_type*)globals->install_type(texType); //protonu--should register the locals later on //when we do the bind operation //var_sym* texRef = new var_sym(texType, "texRef"); //globals->add_sym(texRef); //Add our mallocs (and input array memcpys) for(int i=0; iclone()); //protonu--temporary change type_node* fptr = new ptr_type(arrayVars[i].type); fptr = symtab->install_type(fptr); var_sym *dvs = new var_sym(fptr, const_cast( arrayVars[i].name.c_str())); dvs->set_addr_taken(); symtab->add_sym(dvs); //cudaMalloc args //protonu--no cudaMalloc required for constant memory tree_node_list* tnl = new tree_node_list; if(arrayVars[i].cons_mapped != true ) { in_cal *the_call = new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMalloc))), 2); the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to()->ptr_to(), operand(), immed(dvs)))); the_call->set_argument(1, arrayVars[i].size_expr); tnl->append(new tree_instr(the_call)); setup_code = ocg->StmtListAppend(setup_code, new CG_suifRepr(tnl)); } if(arrayVars[i].in_data) { //cudaMemcpy args //protonu-- no cudaMemcpy required for constant memory if ( arrayVars[i].cons_mapped != true ) { in_cal *the_call = new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4); the_call->set_argument(0, operand(dvs)); the_call->set_argument(1, operand(arrayVars[i].in_data)); the_call->set_argument(2, arrayVars[i].size_expr.clone()); the_call->set_argument(3, operand(cudaMemcpyHostToDevice)); tnl = new tree_node_list; tnl->append(new tree_instr(the_call)); setup_code = ocg->StmtListAppend(setup_code, new CG_suifRepr(tnl)); } //protonu--check if the arrayvar is tex mapped if(arrayVars[i].tex_mapped == true) { //Need a texture reference variable char buf[32]; snprintf(buf, 32, "tex%dRef", i+1); arrayVars[i].secondName = buf; var_sym* texRef = new var_sym(texType, buf); //printf("\n putting in %s\n", arrayVars[i].original_name.c_str()); tex_ref_map[arrayVars[i].original_name] = texRef; globals->add_sym(texRef); //protonu--added the above two lines in_cal *the_call = new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaBind))), 4); in_ldc *ins = new in_ldc(type_s32, operand(), immed(0)); the_call->set_argument(0, operand(ins)); the_call->set_argument(1, operand(texRef));//protonu--change to add the new sym the_call->set_argument(2, operand(dvs)); the_call->set_argument(3, arrayVars[i].size_expr.clone()); tnl = new tree_node_list; tnl->append(new tree_instr(the_call)); setup_code = ocg->StmtListAppend(setup_code, new CG_suifRepr(tnl)); } //protonu--if arrayvar is mapped to constant memory if(arrayVars[i].cons_mapped == true) { char buf[32]; snprintf(buf, 32, "cs%dRef", i+1); //arrayVars[i].secondName = buf; array_bound low (0); array_bound high (arrayVars[i].var_ref_size -1); array_type *arr = new array_type(arrayVars[i].type,low, high); type_node* cons_arr = ocg->ModifyType(arr, "__device__ __constant__"); cons_arr = globals->install_type(cons_arr); var_sym* consRef = new var_sym(cons_arr, buf); cons_ref_map[arrayVars[i].original_name] = consRef; globals->add_sym(consRef); in_cal *the_call = new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpySym))), 3); the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to(), operand(), immed(consRef)))); the_call->set_argument(1, operand(arrayVars[i].in_data)); the_call->set_argument(2, arrayVars[i].size_expr.clone()); tnl = new tree_node_list; tnl->append(new tree_instr(the_call)); setup_code = ocg->StmtListAppend(setup_code, new CG_suifRepr(tnl)); } } } //Build dimGrid dim3 variables based on loop dimentions and ti/tj char blockD1[120]; char blockD2[120]; if(dim1){ snprintf(blockD1, 120, "%s/%d", dim1->name(), cu_tx); snprintf(blockD2, 120, "%s/%d", dim2->name(), cu_ty); }else{ snprintf(blockD1, 120, "%d", cu_bx); snprintf(blockD2, 120, "%d", cu_by); //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx); //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty); } repr = ocg->CreateDim3(immed((char*)gridName), immed(blockD1), immed(blockD2)); setup_code = ocg->StmtListAppend(setup_code, repr); repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx),immed(cu_ty)); if(cu_tz > 1) repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty), immed(cu_tz)); else repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty)); setup_code = ocg->StmtListAppend(setup_code, repr); //call kernel function with name loop_name //like: transpose_k<<>>(devOPtr, devIPtr , width, height); char dims[120]; snprintf(dims,120,"<<<%s,%s>>>",gridName, blockName); immed_list *iml = new immed_list; iml->append(immed((char*)cu_kernel_name.c_str())); iml->append(immed(dims)); //printf("%s %s\n", static_cast(cu_kernel_name), dims); for(int i=0; i= 0) { snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d, const_cast(arrayVars[i].name.c_str())); //printf("%d %s\n", i, dims); iml->append(immed(dims)); }else{ //printf("%d %s\n", i, static_cast(arrayVars[i].name)); iml->append(immed(const_cast( arrayVars[i].name.c_str()))); } } if(dim1){ iml->append(immed(dim1)); iml->append(immed(dim2)); } repr = ocg->CreateKernel(iml);//kernel call setup_code = ocg->StmtListAppend(setup_code, repr); //cuda free variables for(int i=0; iptr_to(), operand(), immed(cudaMemcpy))), 4); the_call->set_argument(0, operand(arrayVars[i].out_data)); the_call->set_argument(1, operand(symtab->lookup_var(const_cast( arrayVars[i].name.c_str())))); the_call->set_argument(2, arrayVars[i].size_expr.clone()); the_call->set_argument(3, operand(cudaMemcpyDeviceToHost)); tree_node_list* tnl = new tree_node_list; tnl->append(new tree_instr(the_call)); teardown_code = ocg->StmtListAppend(teardown_code, new CG_suifRepr(tnl)); } in_cal *the_call = new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaFree))), 1); the_call->set_argument(0, operand(symtab->lookup_var(const_cast( arrayVars[i].name.c_str())))); tree_node_list* tnl = new tree_node_list; tnl->append(new tree_instr(the_call)); teardown_code = ocg->StmtListAppend(teardown_code, new CG_suifRepr(tnl)); } // --------------- // BUILD THE KERNEL // --------------- //Extract out kernel body tree_node_list* code = getCode(); //Get rid of wrapper if that original() added if(code->head()->contents->kind() == TREE_IF) { tree_if* ifn = (tree_if*)code->head()->contents; code = ifn->then_part(); } //Create kernel function body proc_sym *new_psym = globals->new_proc(kernel_type, src_c, (char*)cu_kernel_name.c_str()); proc_symtab *new_proc_syms = new proc_symtab(new_psym->name()); globals->add_child(new_proc_syms); //Add Params std::map loop_vars; //In-Out arrays type_node* fptr; for(int i=0; itype()->clone(); fptr = arrayVars[i].in_data->type(); else //fptr = arrayVars[i].out_data->type()->clone(); fptr = arrayVars[i].out_data->type(); fptr = new_proc_syms->install_type(fptr); std::string name = arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name(); var_sym* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name()); //protonu--adding a check to ensure that texture memories are not passed in as arguments if(arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true ) { sym->set_param(); new_proc_syms->params()->append(sym); new_proc_syms->add_sym(sym);//protonu--added to suppress the addition of the redundant var in the kernel } if (arrayVars[i].cons_mapped == true) { sym->set_param(); new_proc_syms->add_sym(sym); } //printf("inserting name: %s\n", static_cast(name)); loop_vars.insert(std::pair(std::string(name), sym)); } if(dim1) { //Array dimentions var_sym* kdim1 = new var_sym(dim1->type(), dim1->name()); kdim1->set_param(); new_proc_syms->add_sym(kdim1); loop_vars.insert(std::pair(std::string(dim1->name()), kdim1)); var_sym* kdim2 = new var_sym(dim2->type(), dim2->name()); kdim2->set_param(); new_proc_syms->add_sym(kdim2); loop_vars.insert(std::pair(std::string(dim2->name()), kdim2)); new_proc_syms->params()->append(kdim1); new_proc_syms->params()->append(kdim2); } //Put block and thread implicit variables into scope std::vector index_syms; /* Currently we don't use the block dimentions var_sym* blockDim_x = new var_sym(type_s32, "blockDim.x"); blockDim_x->set_param(); new_proc_syms->add_sym(blockDim_x); var_sym* blockDim_y = new var_sym(type_s32, "blockDim.y"); blockDim_y->set_param(); new_proc_syms->add_sym(blockDim_y); */ if(cu_bx > 1){ var_sym* blockIdx_x = new var_sym(type_s32, "blockIdx.x"); blockIdx_x->set_param(); new_proc_syms->add_sym(blockIdx_x); index_syms.push_back(blockIdx_x); } if(cu_by > 1){ var_sym* blockIdx_y = new var_sym(type_s32, "blockIdx.y"); blockIdx_y->set_param(); new_proc_syms->add_sym(blockIdx_y); index_syms.push_back(blockIdx_y); } if(cu_tx > 1){ var_sym* threadIdx_x = new var_sym(type_s32, "threadIdx.x"); threadIdx_x->set_param(); new_proc_syms->add_sym(threadIdx_x); index_syms.push_back(threadIdx_x); } if(cu_ty > 1){ var_sym* threadIdx_y = new var_sym(type_s32, "threadIdx.y"); threadIdx_y->set_param(); new_proc_syms->add_sym(threadIdx_y); index_syms.push_back(threadIdx_y); } if(cu_tz > 1){ var_sym* threadIdx_z = new var_sym(type_s32, "threadIdx.z"); threadIdx_z->set_param(); new_proc_syms->add_sym(threadIdx_z); index_syms.push_back(threadIdx_z); } //Figure out which loop variables will be our thread and block dimention variables std::vector loop_syms; //Get our indexes std::vector indexes;// = get_loop_indexes(code,cu_num_reduce); int threadsPos=0; if(cu_bx > 1) indexes.push_back("bx"); if(cu_by > 1) indexes.push_back("by"); if(cu_tx > 1){ threadsPos = indexes.size(); indexes.push_back("tx"); } if(cu_ty > 1) indexes.push_back("ty"); if(cu_tz > 1) indexes.push_back("tz"); for(int i=0; iadd_sym(loop_syms[i]); //loop_vars.insert(std::pair(std::string(indexes[i]), loop_syms[i])); } //Generate this code //int bx = blockIdx.x //int by = blockIdx.y //int tx = threadIdx.x //int ty = threadIdx.y CG_outputRepr *body=NULL; for(int i=0; iStmtListAppend(body, ocg->CreateStmtList( // ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))))); body = ocg->StmtListAppend(body, ocg->StmtListAppend( ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))), NULL)); } //Get our inital code prepped for loop reduction. First we need to swap //out internal SUIF variable references to point to the new local //function symbol table. std::map loop_idxs; //map from idx names to their new syms std::vector< std::pair > dim_vars; //pair is of var_sym (for 2D array size initializations) replacements r; tree_node_list* swapped = swapVarReferences(code, &r, ocg, loop_vars, new_proc_syms, dim_vars); //printf("\n code before recursiveFindReplacePreferedIdxs :\n"); //swapped->print(); swapped = recursiveFindReplacePreferedIdxs(swapped, new_proc_syms, cudaSync, void_func, loop_idxs);//in-place swapping //printf("\n code after recursiveFindReplacePreferedIdxs :\n"); //swapped->print(); for(int i=0; i tfs = findCommentedFors(indexes[i], swapped); for(int k=0; kprint(); swap_node_for_node_list(tfs[k], newBlock); //printf("AFTER SWAP\n"); newBlock->print(); } } //printf("AFTER REDUCE\n"); swapped->print(); if(static_cast(ir)->init_code()){ tree_node_list* orig_init_code = static_cast(static_cast(ir)->init_code())->GetCode(); for(int i=0; ikind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr) { in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr(); //expect the structure: cpy( _ = min(grab_me, _)) if(inst->opcode() == io_cpy && inst->dst_op().is_symbol()){ //printf("looking at instruction: "); //inst->print(); var_sym* dest = inst->dst_op().symbol(); if(dest == dim_vars[i].first) { if(inst->src1_op().is_instr() && inst->src1_op().instr()->format() == inf_ldc){ value = ((in_ldc*)inst->src1_op().instr())->value().integer(); } } } } } if(value < 0){ fprintf(stderr, "ERROR: Could not find initializing statement for variable used in upper_bound of array type"); } CG_outputRepr *lhs = new CG_suifRepr(operand(dim_vars[i].second)); //body = ocg->StmtListAppend(body, ocg->CreateStmtList(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)))); body = ocg->StmtListAppend(body, ocg->StmtListAppend(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)), NULL)); } } body = ocg->StmtListAppend(body, new CG_suifRepr(swapped)); //protonu--lets try creating our function definiton here var_sym *tsym = NULL; std::vector refs = ir->FindArrayRef(body); for(int i=0; iis_array_tex_mapped(refs[i]->name().c_str())) { //protonu--our new tex lookup function in_cal *tex_lookup = new in_cal(type_f32, operand(), operand(new in_ldc(float_func->ptr_to(), operand(), immed(tex1D))), 2); //printf("name of the array to be mapped is %s\n", refs[i]->name().c_str()); tsym = tex_ref_map[refs[i]->name()]; tex_lookup->set_argument(0, operand(tsym)); int array_dims = ((IR_suifArrayRef *)refs[i])->ia_->dims(); if (array_dims == 1){ tex_lookup->set_argument(1, ((IR_suifArrayRef *)refs[i])->ia_->index(0).clone()); }else if (array_dims > 2) { printf(" \n we don't handle more than 2D arrays mapped to textures yet\n"); }else if (array_dims == 2) { IR_ArraySymbol *sym = refs[i]->symbol(); CG_outputRepr *sz = sym->size(1); delete sym; // free the wrapper object only // find the builder ocg CG_outputRepr *expr = ocg->CreateTimes(sz->clone(),refs[i]->index(0)); delete sz; // free the wrapper object only expr = ocg->CreatePlus(expr, refs[i]->index(1)); // expr holds the 1D access expression and take it out tex_lookup->set_argument(1, ((CG_suifRepr *)expr)->GetExpression()); } //using chun's function to replace the array look up with the function call ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(tex_lookup))); } } tsym = NULL; //protonu--now let's try what we did above for constant memory for(int i=0; iis_array_cons_mapped(refs[i]->name().c_str())) { //printf("name of the array to be cons mapped is %s\n", refs[i]->name().c_str()); tsym = cons_ref_map[refs[i]->name()]; //we should create a IR_SuifArray here IR_ArraySymbol *ar_sym = new IR_suifArraySymbol(ir,tsym); std::vector ar_index; ar_index.push_back(((IR_suifArrayRef *)refs[i])->index(0)); IR_ArrayRef *ar_ref = ((IR_suifCode *)ir)->CreateArrayRef(ar_sym, ar_index); //using chun's function to replace the array look up with the function call ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(((IR_suifArrayRef *)ar_ref)->ia_))); } } tree_proc *new_body = new tree_proc(static_cast(body)->GetCode(), new_proc_syms); //globals->add_child(new_proc_syms); new_psym->set_block(new_body); new_procs.push_back(new_psym); return swapped; } //Order taking out dummy variables std::vector cleanOrder(std::vector idxNames){ std::vector results; for(int j=0; j& curOrder) { //printf("curOrder: "); //printVs(curOrder); //printf("idxNames: "); //printVS(idxNames[stmt]); std::vector cIdxNames = cleanOrder(idxNames[stmt]); bool same=true; std::vector pi; for(int i=0; i &pi) { // check for sanity of parameters if (stmt_num >= stmt.size() || stmt_num < 0) throw std::invalid_argument("invalid statement " + to_string(stmt_num)); const int n = stmt[stmt_num].xform.n_out(); if (pi.size() > (n-1)/2) throw std::invalid_argument("iteration space dimensionality does not match permute dimensionality"); int first_level = 0; int last_level = 0; for (int i = 0; i < pi.size(); i++) { if (pi[i] > (n-1)/2 || pi[i] <= 0) throw std::invalid_argument("invalid loop level " + to_string(pi[i]) + " in permuation"); if (pi[i] != i+1) { if (first_level == 0) first_level = i+1; last_level = i+1; } } if (first_level == 0) return true; std::vector lex = getLexicalOrder(stmt_num); std::set active = getStatements(lex, 2*first_level-2); Loop::permute(active, pi); } void LoopCuda::tile_cuda(int stmt, int level, int outer_level) { tile_cuda(stmt,level,1,outer_level,"","",CountedTile); } void LoopCuda::tile_cuda(int level, int tile_size, int outer_level, std::string idxName, std::string ctrlName, TilingMethodType method){ tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method); } void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level, std::string idxName, std::string ctrlName, TilingMethodType method){ //Do regular tile but then update the index and control loop variable //names as well as the idxName to reflect the current state of things. //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level); //printf("idxNames before: "); //printVS(idxNames[stmt]); tile(stmt, level, tile_size, outer_level, method); if(idxName.size()) idxNames[stmt][level-1] = idxName.c_str(); if(tile_size == 1){ //potentially rearrange loops if(outer_level < level){ std::string tmp = idxNames[stmt][level-1]; for(int i=level-1; i>outer_level-1; i--){ if(i-1 >= 0) idxNames[stmt][i] = idxNames[stmt][i-1]; } idxNames[stmt][outer_level-1] = tmp; } //TODO: even with a tile size of one, you need a insert (of a dummy loop) idxNames[stmt].insert(idxNames[stmt].begin()+(level),""); }else{ if(!ctrlName.size()) throw std::runtime_error("No ctrl loop name for tile"); //insert idxNames[stmt].insert(idxNames[stmt].begin()+(outer_level-1),ctrlName.c_str()); } //printf("idxNames after: "); //printVS(idxNames[stmt]); } bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level, const std::string &array_name, const std::vector &privatized_levels, bool allow_extra_read , int fastest_changing_dimension , int padding_stride , int padding_alignment , bool cuda_shared) { int old_stmts =stmt.size(); //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared); if(cuda_shared) datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 1); else datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 0); //Adjust idxNames to reflect updated state std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); int new_stmts = stmt.size(); for(int i=old_stmts; i idxs; //protonu-making sure the vector of nonSplitLevels grows along with //the statement structure stmt_nonSplitLevels.push_back(omega::Tuple()); //Indexes up to level will be the same for(int j=0; j new_idxs, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, bool cuda_shared) { int old_stmts =stmt.size(); //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared); if(cuda_shared) datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 1); else datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 0); //Adjust idxNames to reflect updated state std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); int new_stmts = stmt.size(); for(int i=old_stmts; i idxs; //protonu-making sure the vector of nonSplitLevels grows along with //the statement structure stmt_nonSplitLevels.push_back(omega::Tuple()); //protonu--lets dump out the code from each statement here //printf("\n dumping statement :%d", i); //stmt[i].code->Dump(); //Indexes up to level will be the same for(int j=0; j lex = getLexicalOrder(stmt_num); std::set same_loop = getStatements(lex, dim-1); level = nonDummyLevel(stmt_num,level); //printf("unrolling %d at level %d\n", stmt_num,level); //protonu--using the new version of unroll, which returns //a set of ints instead of a bool. To keep Gabe's logic //I'll check the size of the set, if it's 0 return true //bool b= unroll(stmt_num, level, unroll_amount); std::set b_set= unroll(stmt_num, level, unroll_amount); bool b = false; if (b_set.size() == 0) b = true; //end--protonu //Adjust idxNames to reflect updated state std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); std::vector origSource = idxNames[stmt_num];; //Drop index names at level if(unroll_amount == 0){ //For all statements that were in this unroll together, drop index name for unrolled level idxNames[stmt_num][level-1] = ""; for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { //printf("in same loop as %d is %d\n", stmt_num, (*i)); //idxNames[(*i)][level-1] = ""; idxNames[(*i)] = idxNames[stmt_num]; } } lex = getLexicalOrder(stmt_num); same_loop = getStatements(lex, dim-1); bool same_as_source = false; int new_stmts = stmt.size(); for(int i=old_stmts; i()); //We expect that new statements have a constant for the variable in //stmt[i].IS at level (as seen with print_with_subs), otherwise there //will be a for loop at level and idxNames should match stmt's //idxNames pre-unrolled Relation IS = stmt[i].IS; //Ok, if you know how the hell to get anything out of a Relation, you //should probably be able to do this more elegantly. But for now, I'm //hacking it. std::string s = IS.print_with_subs_to_string(); //s looks looks like //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128} //where level == 5, you see a integer in the input set //If that's not an integer and this is the first new statement, then //we think codegen will have a loop at that level. It's not perfect, //not sure if it can be determined without round-tripping to codegen. int sIdx = 0; int eIdx = 0; for(int j=0; j 0){ eIdx = s.find("]"); int tmp = s.find(",",sIdx+1); if(tmp > 0 && tmp < eIdx) eIdx = tmp; //", before ]" if(eIdx > 0){ sIdx++; std::string var = s.substr(sIdx,eIdx-sIdx); //printf("%s\n", s.c_str()); //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str()); if(atoi(var.c_str()) == 0 && i ==old_stmts){ //TODO:Maybe do see if this new statement would be in the same //group as the original and if it would, don't say //same_as_source if(same_loop.find(i) == same_loop.end()){ printf("stmt %d level %d, newly created unroll statement should have same level indexes as source\n", i, level); same_as_source = true; } } } } //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1); if(same_as_source) idxNames.push_back(origSource); else idxNames.push_back(idxNames[stmt_num]); } return b; } void LoopCuda::copy_to_texture(const char *array_name) { //protonu--placeholder for now //set the bool for using cuda memory as true //in a vector of strings, put the names of arrays to tex mapped if ( !texture ) texture = new texture_memory_mapping(true, array_name); else texture->add(array_name); } void LoopCuda::copy_to_constant(const char *array_name) { //protonu--placeholder for now //set the bool for using cuda memory as true //in a vector of strings, put the names of arrays to tex mapped if ( !constant_mem ) constant_mem = new constant_memory_mapping(true, array_name); else constant_mem->add(array_name); } //protonu--moving this from Loop tree_node_list* LoopCuda::codegen() { if(code_gen_flags & GenCudaizeV2) return cudaize_codegen_v2(); //Do other flagged codegen methods, return plain vanilla generated code return getCode(); } //These three are in Omega code_gen.cc and are used as a massive hack to //get out some info from MMGenerateCode. Yea for nasty side-effects. namespace omega{ extern int checkLoopLevel; extern int stmtForLoopCheck; extern int upperBoundForLevel; extern int lowerBoundForLevel; } void LoopCuda::extractCudaUB(int stmt_num, int level, int &outUpperBound, int &outLowerBound){ // check for sanity of parameters const int m = stmt.size(); if (stmt_num >= m || stmt_num < 0) throw std::invalid_argument("invalid statement " + to_string(stmt_num)); const int n = stmt[stmt_num].xform.n_out(); if (level > (n-1)/2 || level <= 0) throw std::invalid_argument("invalid loop level " + to_string(level)); int dim = 2*level-1; std::vector lex = getLexicalOrder(stmt_num); std::set same_loop = getStatements(lex, dim-1); // extract the intersection of the iteration space to be considered Relation hull; { hull = Relation::True(n); for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { hull = Intersection(hull, project_onto_levels(getNewIS(*i), dim+1, true)); hull.simplify(2, 4); } for (int i = 2; i <= dim+1; i+=2) { //std::string name = std::string("_t") + to_string(t_counter++); std::string name = std::string("_t") + to_string(tmp_loop_var_name_counter++); hull.name_set_var(i, name); } hull.setup_names(); } // extract the exact loop bound of the dimension to be unrolled if (is_single_iteration(hull, dim)){ throw std::runtime_error("No loop availabe at level to extract upper bound."); } Relation bound = get_loop_bound(hull, dim); if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology()) throw loop_error("loop error: unable to extract loop bound for cudaize"); // extract the loop stride EQ_Handle stride_eq; int stride = 1; { bool simple_stride = true; int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(dim+1), stride_eq, simple_stride); if (strides > 1) throw loop_error("loop error: too many strides"); else if (strides == 1) { int sign = stride_eq.get_coef(bound.set_var(dim+1)); // assert(sign == 1 || sign == -1); Constr_Vars_Iter it(stride_eq, true); stride = abs((*it).coef/sign); } } if(stride != 1){ char buf[1024]; sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d", level, stride); throw std::runtime_error(buf); } //Use code generation system to build tell us our bound information. We //need a hard upper bound a 0 lower bound. checkLoopLevel = level*2; stmtForLoopCheck = stmt_num; upperBoundForLevel = -1; lowerBoundForLevel = -1; printCode(1,false); checkLoopLevel = 0; outUpperBound = upperBoundForLevel; outLowerBound = lowerBoundForLevel; return; } void LoopCuda::printCode(int effort, bool actuallyPrint) const { const int m = stmt.size(); if (m == 0) return; const int n = stmt[0].xform.n_out(); Tuple IS(m); Tuple xform(m); Tuple nonSplitLevels(m); for (int i = 0; i < m; i++) { IS[i+1] = stmt[i].IS; xform[i+1] = stmt[i].xform; nonSplitLevels[i+1] = stmt_nonSplitLevels[i]; //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; } Tuple< Tuple > idxTupleNames; if(useIdxNames){ for(int i=0; i idxs; for(int j=0; jknown), n - this->known.n_set()); CG_stringBuilder *ocg = new CG_stringBuilder(); Tuple nameInfo; for (int i = 1; i <= m; i++) nameInfo.append(new CG_stringRepr("s" + to_string(i))); CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort); if(actuallyPrint) std::cout << GetString(repr); /* for (int i = 1; i <= m; i++) delete nameInfo[i]; */ delete ocg; } void LoopCuda::printRuntimeInfo() const { for(int i=0; i(stmt[i].code)->GetCode()->print_expr(); } } void LoopCuda::printIndexes() const { for(int i=0; i0) printf(","); printf("%s", idxNames[i][j].c_str()); } printf("\n"); } } tree_node_list* LoopCuda::getCode(int effort) const { const int m = stmt.size(); if (m == 0) return new tree_node_list; const int n = stmt[0].xform.n_out(); Tuple ni(m); Tuple IS(m); Tuple xform(m); Tuple< IntTuple > nonSplitLevels(m); for (int i = 0; i < m; i++) { ni[i+1] = stmt[i].code; IS[i+1] = stmt[i].IS; xform[i+1] = stmt[i].xform; nonSplitLevels[i+1] = stmt_nonSplitLevels[i]; //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; } Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); #ifdef DEBUG // std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, known, effort)); #endif Tuple< Tuple > idxTupleNames; if(useIdxNames){ for(int i=0; i idxs; for(int j=0; jbuilder(); CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, nonSplitLevels, syncs, idxTupleNames, effort); //CG_outputRepr *overflow_initialization = ocg->CreateStmtList(); //protonu--using the new function CG_suifBuilder::StmtListAppend CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL); for (std::map >::const_iterator i = overflow.begin(); i != overflow.end(); i++) for (std::vector::const_iterator j = i->second.begin(); j != i->second.end(); j++) //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)))); overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->StmtListAppend(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)), NULL)); repr = ocg->StmtListAppend(overflow_initialization, repr); tree_node_list *tnl = static_cast(repr)->GetCode(); delete repr; /* for (int i = 1; i <= m; i++) delete ni[i]; */ return tnl; } //protonu--adding constructors for the new derived class LoopCuda::LoopCuda():Loop(), code_gen_flags(GenInit){} LoopCuda::LoopCuda(IR_Control *irc, int loop_num) :Loop(irc) { setup_code = NULL; teardown_code = NULL; code_gen_flags = 0; cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1; cu_num_reduce = 0; cu_mode = GlobalMem; texture = NULL; constant_mem = NULL; int m=stmt.size(); //printf("\n the size of stmt(initially) is: %d\n", stmt.size()); for(int i=0; i()); //protonu--setting up //proc_symtab *symtab //global_symtab *globals globals = ((IR_cudasuifCode *)ir)->gsym_ ; std::vector tf = ((IR_cudasuifCode *)ir)->get_loops(); symtab = tf[loop_num]->proc()->block()->proc_syms(); std::vector deepest = find_deepest_loops(tf[loop_num]); for (int i = 0; i < deepest.size(); i++){ index.push_back(deepest[i]->index()->name()); //reflects original code index names } for(int i=0; i< stmt.size(); i++) idxNames.push_back(index); //refects prefered index names (used as handles in cudaize v2) useIdxNames=false; }