author     dhuth <derickhuth@gmail.com>    2014-08-27 09:52:06 -0600
committer  dhuth <derickhuth@gmail.com>    2014-08-27 09:52:06 -0600
commit     bff810cc371a38f493d688c54f71013f5a7d53bf (patch)
tree       fbe86954bb3c01deb21da9e41ebff5baa2889a45 /loop_cuda.cc
Initial commit
Diffstat (limited to 'loop_cuda.cc')
-rw-r--r--   loop_cuda.cc   2123
1 file changed, 2123 insertions(+), 0 deletions(-)
diff --git a/loop_cuda.cc b/loop_cuda.cc
new file mode 100644
index 0000000..a23990d
--- /dev/null
+++ b/loop_cuda.cc
@@ -0,0 +1,2123 @@
+/*****************************************************************************
+ Copyright (C) 2009 University of Utah
+ All Rights Reserved.
+
+ Purpose:
+ Cudaize methods
+
+ Notes:
+
+ History:
+ 1/7/10 Created by Gabe Rudy by migrating code from loop.cc
+ 31/1/11 Modified by Protonu Basu
+*****************************************************************************/
+
+#include <code_gen/code_gen.h>
+#include <code_gen/CG_stringBuilder.h>
+#include <code_gen/output_repr.h>
+#include <code_gen/CG_outputRepr.h>
+#include "loop_cuda.hh"
+#include "loop.hh"
+#include <math.h>
+#include <useful.h>
+#include "omegatools.hh"
+#include "ir_cudasuif.hh"
+#include "ir_suif.hh"
+#include "ir_suif_utils.hh"
+#include "chill_error.hh"
+#include <vector>
+
+using namespace omega;
+char *k_cuda_texture_memory; //protonu--added to track texture memory type
+char *k_cuda_constant_memory; //protonu--added to track constant memory type
+//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type
+extern char *omega::k_ocg_comment;
+
+
+static int cudaDebug;
+class CudaStaticInit {
+public:
+  CudaStaticInit() {
+    cudaDebug = 0; //Change this to 1 for debug
+  }
+};
+static CudaStaticInit junkInitInstance__;
+
+
+
+std::string& upcase(std::string& s)
+{
+ for(int i=0; i<s.size(); i++)
+ s[i] = toupper(s[i]);
+ return s;
+}
+
+void printVs(const std::vector<std::string>& curOrder){
+ if(!cudaDebug) return;
+ for(int i=0; i<curOrder.size(); i++){
+ if(i>0)
+ printf(",");
+ printf("%s", curOrder[i].c_str());
+ }
+ printf("\n");
+}
+
+void printVS(const std::vector<std::string>& curOrder){
+ //if(!cudaDebug) return;
+ for(int i=0; i<curOrder.size(); i++){
+ if(i>0)
+ printf(",");
+ printf("%s", curOrder[i].c_str());
+ }
+ printf("\n");
+}
+
+LoopCuda::~LoopCuda() {
+ const int m = stmt.size();
+ for (int i = 0; i < m; i++)
+ stmt[i].code->clear();
+}
+
+bool LoopCuda::symbolExists(std::string s){
+ if(symtab->lookup_sym(s.c_str(), SYM_VAR, false))
+ return true;
+ if(globals->lookup_sym(s.c_str(), SYM_VAR, false))
+ return true;
+ for(int i=0; i<idxNames.size(); i++)
+ for(int j=0; j<idxNames[i].size(); j++)
+ if(strcmp(idxNames[i][j].c_str(), s.c_str()) == 0)
+ return true;
+ return false;
+}
+
+void LoopCuda::addSync(int stmt_num, std::string idxName)
+{
+ //we store these and code-gen inserts a sync (via omega comments) where the
+ //stmt in the loop that has idxName is generated
+ syncs.push_back(make_pair(stmt_num,idxName));
+}
+
+void LoopCuda::renameIndex(int stmt_num, std::string idx, std::string newName)
+{
+ int level = findCurLevel(stmt_num, idx);
+ if(idxNames.size() <= stmt_num || idxNames[stmt_num].size() < level)
+ throw std::runtime_error("Invalid statement number or index");
+ idxNames[stmt_num][level-1] = newName.c_str();
+}
+
+
+
+enum Type{ Int };
+
+struct VarDefs{
+ std::string name;
+ std::string secondName;
+ operand size_expr; //array size as an expression (can be a product of other variables etc)
+ type_node * type;
+ var_sym* in_data; //Variable of array to copy data in from (before kernel call)
+ var_sym* out_data; //Variable of array to copy data out to (after kernel call)
+ int size_2d; //-1 if linearized, the constant size N, of a NxN 2D array otherwise
+ bool tex_mapped; //protonu-- true if this variable will be texture mapped, so no need to pass it as an argument
+ bool cons_mapped; //protonu-- true if this variable will be constant mem mapped, so no need to pass it as an argument
+ std::string original_name; //this is a hack: store the original name so we can keep a table of the textures used
+ int var_ref_size ;
+};
+
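+//Wrap then_part in a guard when the loop's upper bound comes from a min()
+//expression (e.g. after tiling): scan the loop's upper-bound list for the
+//pattern cpy(_ = min(a, _)) and, if found, emit if(bound_sym <= a){ then_part };
+//otherwise return then_part unchanged.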
+tree_node_list* wrapInIfFromMinBound(tree_node_list* then_part, tree_for* loop, base_symtab* symtab, var_sym* bound_sym)
+{
+ tree_node_list* ub = loop->ub_list();
+ tree_node_list_iter upli(ub);
+ while(!upli.is_empty()){
+ tree_node *node = upli.step();
+ if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr)
+ {
+ in_rrr* ins = (in_rrr*)((tree_instr*)node)->instr();
+ //expect the structure: cpy( _ = min(grab_me, _))
+ if(ins->opcode() == io_cpy && ins->src1_op().is_instr()){
+ ins = (in_rrr*)ins->src1_op().instr();
+ if(ins->opcode() == io_min){
+ tree_node_list* tnl = new tree_node_list;
+ tnl->append(if_node(symtab, fold_sle(operand(bound_sym), ins->src1_op().instr()->clone()), then_part));
+ return tnl;
+ }
+ }
+ }
+ }
+ return then_part; //Failed to go to proper loop level
+}
+
+/**
+ * This would be better if it was done by a CHiLL xformation instead of at codegen
+ *
+ * state:
+ * for(...)
+ * for(...)
+ * cur_body
+ * stmt1
+ *
+ * stmt1 is in between two loops that are going to be reduced. The
+ * solution is to put stmt1 at the end of cur_body but conditionally run
+ * it on the last step of the for loop.
+ *
+ * A CHiLL command that would work better:
+ *
+ * for(...)
+ * stmt0
+ * for(int i=0; i<n; i++)
+ * cur_body
+ * stmt1
+ * =>
+ * for(...)
+ * for(int i=0; i<n; i++)
+ * if(i==0) stmt0
+ * cur_body
+ * if(i==n-1) stmt1
+ */
+
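+//Recursively collect the tree_for loops that directly follow a "~cuda~ ...
+//preferredIdx: <name>" omega comment marker whose preferred index matches
+//the given index name.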
+std::vector<tree_for*> findCommentedFors(const char* index, tree_node_list* tnl){
+ std::vector<tree_for *> result;
+
+ tree_node_list_iter iter(tnl);
+ bool next_loop_ok = false;
+ while (!iter.is_empty()) {
+ tree_node *tn = iter.step();
+ if (tn->kind() == TREE_INSTR && ((tree_instr*)tn)->instr()->opcode() == io_mrk)
+ {
+ instruction* inst = ((tree_instr*)tn)->instr();
+ std::string comment;
+ if ((inst->peek_annote(k_ocg_comment) != NULL))
+ {
+ immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment));
+ immed_list_iter data_iter(data);
+ if(!data_iter.is_empty()){
+ immed first_immed = data_iter.step();
+ if(first_immed.kind() == im_string)
+ comment = first_immed.string();
+ }
+ }
+ if(comment.find("~cuda~") != std::string::npos
+ && comment.find("preferredIdx: ") != std::string::npos){
+ std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos);
+ if(idx.find(" ") != std::string::npos)
+ idx = idx.substr(0,idx.find(" "));
+ if(strcmp(idx.c_str(),index) == 0)
+ next_loop_ok = true;
+ }
+ }
+ if (tn->kind() == TREE_FOR){
+ if(next_loop_ok){
+ //printf("found loop %s\n", static_cast<tree_for *>(tn)->index()->name());
+ result.push_back(static_cast<tree_for *>(tn));
+ }
+ else{
+ //printf("looking down for loop %s\n", static_cast<tree_for *>(tn)->index()->name());
+ std::vector<tree_for*> t = findCommentedFors(index, static_cast<tree_for *>(tn)->body());
+ std::copy(t.begin(), t.end(), back_inserter(result));
+ }
+ next_loop_ok = false;
+ }
+ if (tn->kind() == TREE_IF) {
+ //printf("looking down if\n");
+ tree_if *tni = static_cast<tree_if *>(tn);
+ std::vector<tree_for*> t = findCommentedFors(index, tni->then_part());
+ std::copy(t.begin(), t.end(), back_inserter(result));
+ }
+ }
+
+ return result;
+}
+
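+//Return the body of 'loop' one level in (its index is assumed to have
+//already been renamed to reduceIndex), wrapped in a min-bound guard via
+//wrapInIfFromMinBound when the loop's upper bound is a min() expression.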
+tree_node_list* forReduce(tree_for* loop, var_sym* reduceIndex, proc_symtab* proc_syms)
+{
+ //We did the replacements all at once with recursiveFindPreferedIdxs
+ //replacements r;
+ //r.oldsyms.append(loop->index());
+ //r.newsyms.append(reduceIndex);
+ //tree_for* new_loop = (tree_for*)loop->clone_helper(&r, true);
+ tree_for* new_loop = loop;
+
+ //return the body one loop level in
+ tree_node_list* tnl = loop_body_at_level(new_loop, 1);
+ //wrap in conditional if necessary
+ tnl = wrapInIfFromMinBound(tnl, new_loop, proc_syms, reduceIndex);
+ return tnl;
+}
+
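+//Recursively collect into 'r' the symbols and types that 'code' references
+//but does not define (exposed refs), so they can be recreated in the
+//kernel's new symbol table.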
+void recursiveFindRefs(tree_node_list* code, proc_symtab* proc_syms, replacements* r)
+{
+ if(code->parent() && code->scope()->is_block())
+ ((block_symtab*)code->scope())->find_exposed_refs(proc_syms, r);
+ tree_node_list_iter tnli(code);
+ while (!tnli.is_empty()) {
+ tree_node *node = tnli.step();
+ //printf("node kind: %d\n", node->kind());
+ if(node->is_instr())
+ {
+ tree_instr* t_instr = (tree_instr*)node;
+ t_instr->find_exposed_refs(proc_syms, r);
+ }
+ if(node->is_block()){
+ recursiveFindRefs(static_cast<tree_block *>(node)->body(), proc_syms, r);
+ }
+ else if(node->is_for()){
+ tree_for* tn_for = static_cast<tree_for *>(node);
+ //Find refs in statements and body
+ tn_for->find_exposed_refs(proc_syms, r);
+ //recursiveFindRefs(tn_for->body(), proc_syms, r);
+ }
+ }
+}
+
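+//Rebuild 'code' while honoring "~cuda~ preferredIdx: ..." comment markers:
+//the next loop's index variable is replaced with the named symbol (created
+//in proc_syms if needed and cached in loop_idxs), and a call to cudaSync is
+//appended after that loop when the marker also requests a sync.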
+tree_node_list* recursiveFindReplacePreferedIdxs(tree_node_list* code, proc_symtab* proc_syms,
+ proc_sym* cudaSync, func_type* unkown_func,
+ std::map<std::string, var_sym*>& loop_idxs)
+{
+ tree_node_list* tnl = new tree_node_list;
+ tree_node_list_iter tnli(code);
+ var_sym* idxSym=0;
+ bool sync = false;
+ std::vector<tree_node*> r1;
+ std::vector<tree_node_list*> r2;
+ while (!tnli.is_empty()) {
+ tree_node *node = tnli.step();
+ //printf("node kind: %d\n", node->kind());
+ if(node->is_instr())
+ {
+ if(((tree_instr*)node)->instr()->format() == inf_rrr){
+ in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr();
+ if(inst->opcode() == io_mrk){
+ std::string comment;
+ if ((inst->peek_annote(k_ocg_comment) != NULL))
+ {
+ immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment));
+ immed_list_iter data_iter(data);
+ if(!data_iter.is_empty()){
+ immed first_immed = data_iter.step();
+ if(first_immed.kind() == im_string)
+ comment = first_immed.string();
+ }
+ }
+ if(comment.find("~cuda~") != std::string::npos
+ && comment.find("preferredIdx: ") != std::string::npos){
+ std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos);
+ if(idx.find(" ") != std::string::npos)
+ idx = idx.substr(0,idx.find(" "));
+ //printf("sym_tab preferred index: %s\n", idx.c_str());
+ if(loop_idxs.find(idx) != loop_idxs.end())
+ idxSym = loop_idxs.find(idx)->second;
+ //Get the proc variable symbol for this preferred index
+ if(idxSym == 0){
+ idxSym = (var_sym*)proc_syms->lookup_sym(idx.c_str(), SYM_VAR, false);
+ //printf("idx not found: lookup %p\n", idxSym);
+ if(!idxSym){
+ idxSym = new var_sym(type_s32, (char*)idx.c_str());
+ proc_syms->add_sym(idxSym);
+ //printf("idx created and inserted\n");
+ }
+ //Now insert into our map for future
+ loop_idxs.insert(make_pair(idx, idxSym));
+ }
+ //See if we have a sync as well
+ if(comment.find("sync") != std::string::npos){
+ //printf("Inserting sync after current block\n");
+ sync = true;
+ }
+ }
+ }
+ }
+ tnl->append(node);
+ }
+ else if(node->is_block()){
+ tree_block* b = static_cast<tree_block *>(node);
+ b->set_body(recursiveFindReplacePreferedIdxs(b->body(), proc_syms, cudaSync, unkown_func, loop_idxs));
+ tnl->append(b);
+ }
+ else if(node->is_for()){
+ tree_for* tn_for = static_cast<tree_for *>(node);
+ if(idxSym){
+ //Replace the current tn_for's index variable with idxSym
+ //printf("replacing sym %s -> %s\n", tn_for->index()->name(), idxSym->name());
+ replacements r;
+ r.oldsyms.append(tn_for->index());
+ r.newsyms.append(idxSym);
+ tree_for* new_loop = (tree_for*)tn_for->clone_helper(&r, true);
+ idxSym = 0; //Reset for more loops in this tnl
+ new_loop->set_body(recursiveFindReplacePreferedIdxs(new_loop->body(), proc_syms, cudaSync, unkown_func, loop_idxs));
+ tnl->append(new_loop);
+
+ if(sync){
+ in_cal *the_call =
+ new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaSync))), 0);
+ tnl->append(new tree_instr(the_call));
+ //tnl->print();
+ sync = false; //Reset for more loops in this tnl
+ }
+ }else{
+ tn_for->set_body(recursiveFindReplacePreferedIdxs(tn_for->body(), proc_syms, cudaSync, unkown_func, loop_idxs));
+ tnl->append(tn_for);
+ }
+ }else if (node->kind() == TREE_IF) {
+ tree_if *tni = static_cast<tree_if *>(node);
+ tni->set_then_part(recursiveFindReplacePreferedIdxs(tni->then_part(), proc_syms, cudaSync, unkown_func, loop_idxs));
+ tnl->append(tni);
+ }
+ }
+ //Do this after the loop to not screw up the pointer iterator
+ /*
+ for(int i=0; i<r1.size(); i++){
+ swap_node_for_node_list(r1[i],r2[i]);
+ }*/
+ return tnl;
+}
+
+// loop_vars -> array references
+// loop_idxs -> <idx_name,idx_sym> map for when we encounter a loop with a different preferredIndex
+// dim_vars -> out param, fills with <old,new> var_sym pair for 2D array dimensions (messy stuff)
+tree_node_list* swapVarReferences(tree_node_list* code, replacements* r, CG_suifBuilder *ocg,
+ std::map<std::string, var_sym*>& loop_vars,
+ proc_symtab *proc_syms,
+ std::vector< std::pair<var_sym*,var_sym*> >& dim_vars)
+{
+ //Iterate over every expression, looking up each variable and type
+ //reference used and possibly replacing it or adding it to our symbol
+ //table
+ //
+ //We use the built-in cloning helper methods to seriously help us with this!
+
+ //Need to do a recursive mark
+ recursiveFindRefs(code, proc_syms, r);
+
+
+ //We can't rely on type_node->clone() to do the heavy lifting when the
+ //old type is a two-dimensional array with variable upper bounds, as
+ //that requires creating and saving variable references to the upper
+ //bounds. So we do one pass over the oldtypes doing this type of
+ //conversion, putting results in the fixed_types map for a second pass
+ //to pick up.
+ std::map<type_node*,type_node*> fixed_types; //array_types needing their upper bound installed
+ type_node_list_iter tlip(&r->oldtypes);
+ while(!tlip.is_empty())
+ {
+ type_node* old_tn = tlip.step();
+ type_node* new_tn = 0;
+ type_node* base_type = old_tn;
+ std::vector< std::pair<var_sym*, type_node*> > variable_upper_bouneds;
+ if(old_tn->is_ptr()){
+ while (base_type->is_array() || base_type->is_ptr()) {
+ if (base_type->is_array()){
+ array_bound ub = ((array_type*)base_type)->upper_bound();
+ if(ub.is_variable()){
+ var_sym* old_ub = (var_sym*)ub.variable();
+ var_sym *new_ub = proc_syms->new_unique_var(type_s32);
+ dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub));
+ variable_upper_bouneds.push_back( std::pair<var_sym*, type_node*>(new_ub, base_type) );
+ }
+ base_type = static_cast<array_type *>(base_type)->elem_type();
+ }
+ else if (base_type->is_ptr())
+ base_type = static_cast<ptr_type *>(base_type)->ref_type();
+ }
+ }
+ for (int i = variable_upper_bouneds.size()-1; i >= 0; i--) {
+ var_sym *var_ub = variable_upper_bouneds[i].first;
+ type_node* old_tn = variable_upper_bouneds[i].second;
+ if(new_tn == 0)
+ new_tn = new array_type(base_type, array_bound(1), array_bound(var_ub));
+ else
+ new_tn = new array_type(new_tn, array_bound(1), array_bound(var_ub));
+ proc_syms->add_type(new_tn);
+ fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn));
+ }
+ if(new_tn){
+ if(old_tn->is_ptr()){
+ new_tn = new ptr_type(new_tn);
+ proc_syms->add_type(new_tn);
+ }
+ fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn));
+ }
+ }
+
+ //Quickly look for modifiers on our array types (__shared__ float [][])
+ type_node_list_iter tliq(&r->oldtypes);
+ while(!tliq.is_empty())
+ {
+ type_node* old_tn = tliq.step();
+ if(old_tn->is_modifier()){
+ type_node* base_type = static_cast<modifier_type *>(old_tn)->base();
+ if(fixed_types.find(base_type) != fixed_types.end()){
+ type_node* fixed_base = (*fixed_types.find(base_type)).second;
+ //printf("Fix modifier with fixed base\n");
+ //This should work to copy over the annotations, but apparently doesn't work so well
+ type_node* new_tn = new modifier_type(static_cast<modifier_type*>(old_tn)->op(), fixed_base);
+ old_tn->copy_annotes(new_tn);
+ fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn));
+ }
+ }
+ }
+
+ //Run through the types and create entries in r->newtypes but don't install
+ type_node_list_iter tli(&r->oldtypes);
+ while(!tli.is_empty())
+ {
+ type_node* old_tn = tli.step();
+ type_node* new_tn = 0;
+
+ //If we recorded this as fixed by our special case, use that type
+ //instead of cloning.
+ if(fixed_types.find(old_tn) != fixed_types.end()){
+ new_tn = (*fixed_types.find(old_tn)).second;
+ //printf("Reusing fixed typ %u: ", new_tn->type_id());
+ }else{
+ new_tn = old_tn->clone();
+ //printf("Cloning type %u: ", old_tn->type_id());
+ }
+ new_tn = proc_syms->install_type(new_tn);
+
+ //Ok, there is a weird case where an array type that has var_sym as
+ //their upper bounds can't be covered fully in this loop or the
+ //var_sym loop, so we need special code.
+ /*
+ if(old_tn->op() == TYPE_PTR && ((ptr_type*)old_tn)->ref_type()->op() == TYPE_ARRAY){
+ array_type* outer_array = (array_type*)((ptr_type*)old_tn)->ref_type();
+ array_bound ub = outer_array->upper_bound();
+ if(ub.is_variable()){
+ var_sym* old_ub = (var_sym*)ub.variable();
+ var_sym* new_ub = (var_sym*)((array_type*)((ptr_type*)new_tn)->ref_type())->upper_bound().variable();
+ //r->oldsyms.append(old_ub);
+ fix_ub.insert(std::pair<var_sym*,array_type*>(old_ub, (array_type*)((ptr_type*)new_tn)->ref_type()));
+ dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub));
+ printf("array var_sym: %p\n", new_ub);
+ }
+ if(outer_array->elem_type()->op() == TYPE_ARRAY)
+ {
+ array_type* inner_array = (array_type*)outer_array->elem_type();
+ array_bound ub = inner_array->upper_bound();
+ if(ub.is_variable()){
+ var_sym* old_ub = (var_sym*)ub.variable();
+ var_sym* new_ub = (var_sym*)((array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())->upper_bound().variable();
+ dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub));
+ printf("array var_sym: %p\n", new_ub);
+ //r->oldsyms.append(old_ub);
+ fix_ub.insert(std::pair<var_sym*,array_type*>(old_ub, (array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type()));
+ }
+ }
+ }
+ */
+ r->newtypes.append(new_tn);
+ }
+
+ //printf("proc_syms symbol run through\n");
+ //proc_syms->print();
+
+ //Run through the syms creating new copies
+ sym_node_list_iter snli(&r->oldsyms);
+ while(!snli.is_empty())
+ {
+ sym_node *old_sn = snli.step();
+
+ if(loop_vars.count(std::string(old_sn->name())) > 0)
+ {
+ r->newsyms.append(loop_vars[std::string(old_sn->name())]);
+ //printf("def exists: %s\n", old_sn->name());
+ }else{
+ sym_node *new_sn = old_sn->copy();
+ if(new_sn->is_var()){
+ var_sym* var = (var_sym*)new_sn;
+ type_node* new_type = var->type()->clone_helper(r);
+
+ //TODO: Have a tagged list of variables to make shared
+ //Make local 2D arrays __shared__
+ if(new_type->op() == TYPE_ARRAY && ((array_type*)new_type)->elem_type()->op() == TYPE_ARRAY){
+ //protonu--changes suggested by Malik
+ //printf("Adding __shared__ annotation to : %s\n", new_sn->name());
+ //new_type = ocg->ModifyType(new_type, "__shared__");
+ //proc_syms->add_type(new_type);
+ }
+ var->set_type(new_type);
+ }
+ proc_syms->add_sym(new_sn);
+ r->newsyms.append(new_sn);
+ //printf("def new: %s\n", new_sn->name());
+ }
+ }
+
+ //printf("proc_syms var runthrough\n");
+ //proc_syms->print();
+ return code->clone_helper(r);
+}
+
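+//Check that every index name in idxs appears in the current index-name list
+//for the given statement.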
+bool LoopCuda::validIndexes(int stmt, const std::vector<std::string>& idxs){
+ for(int i=0; i<idxs.size(); i++){
+ bool found = false;
+ for(int j=0; j<idxNames[stmt].size(); j++){
+ if(strcmp(idxNames[stmt][j].c_str(), idxs[i].c_str()) == 0){
+ found=true;
+ }
+ }
+ if(!found){
+ return false;
+ }
+ }
+ return true;
+}
+
+
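+//Mark statement 0 for CUDA code generation: map the named loop levels to
+//block (bx, by) and thread (tx, ty, tz) indexes, record their extents in
+//cu_bx/cu_by/cu_tx/cu_ty/cu_tz (normalizing non-zero lower bounds with a
+//unit tile), flag the mapped levels as non-split, and save the kernel name
+//and array dimensions for cudaize_codegen_v2.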
+bool LoopCuda::cudaize_v2(std::string kernel_name, std::map<std::string, int> array_dims,
+ std::vector<std::string> blockIdxs, std::vector<std::string> threadIdxs)
+{
+ int stmt_num = 0;
+ if(cudaDebug){
+ printf("cudaize_v2(%s, {", kernel_name.c_str());
+ //for(
+ printf("}, blocks={"); printVs(blockIdxs); printf("}, thread={"); printVs(threadIdxs); printf("})\n");
+ }
+
+ this->array_dims = array_dims;
+ if(!validIndexes(stmt_num, blockIdxs)){
+ throw std::runtime_error("One of the indexes in the block list was not "
+ "found in the current set of indexes.");
+ }
+ if(!validIndexes(stmt_num, threadIdxs)){
+ throw std::runtime_error("One of the indexes in the thread list was not "
+ "found in the current set of indexes.");
+ }
+ if(blockIdxs.size() ==0)
+ throw std::runtime_error("Cudaize: Need at least one block dimension");
+ int block_level=0;
+ //Now, we will determine the actual size (if possible, otherwise
+ //complain) for the block dimensions and thread dimensions based on our
+ //indexes and the relations for our stmt.
+ for(int i=0; i<blockIdxs.size(); i++){
+ int level = findCurLevel(stmt_num, blockIdxs[i]);
+ int ub,lb;
+ extractCudaUB(stmt_num,level,ub,lb);
+ if(lb!= 0){
+ //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
+ if(cudaDebug) printf("Cudaize: doing tile at level %d to try and normalize lower bounds\n", level);
+ tile(stmt_num,level,1,level,CountedTile);
+ idxNames[stmt_num].insert(idxNames[stmt_num].begin()+(level),"");//TODO: possibly handle this for all sibling stmts
+ extractCudaUB(stmt_num,level,ub,lb);
+ }
+ if(lb != 0){
+ char buf[1024];
+ sprintf(buf, "Cudaize: Loop at level %d does not have 0 as its lower bound", level);
+ throw std::runtime_error(buf);
+ }
+ if(ub < 0){
+ char buf[1024];
+ sprintf(buf, "Cudaize: Loop at level %d does not have a hard upper bound", level);
+ throw std::runtime_error(buf);
+ }
+ if(cudaDebug) printf("block idx %s level %d lb: %d ub %d\n", blockIdxs[i].c_str(), level, lb, ub);
+ if(i == 0){
+ block_level = level;
+ cu_bx = ub+1;
+ idxNames[stmt_num][level-1] = "bx";
+ }
+ else if(i == 1){
+ cu_by = ub+1;
+ idxNames[stmt_num][level-1] = "by";
+ }
+ }
+ if(!cu_by)
+ block_level=0;
+ int thread_level1 = 0;
+ int thread_level2 = 0;
+ for(int i=0; i<threadIdxs.size(); i++){
+ int level = findCurLevel(stmt_num, threadIdxs[i]);
+ int ub,lb;
+ extractCudaUB(stmt_num,level,ub,lb);
+ if(lb!= 0){
+ //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
+ if(cudaDebug) printf("Cudaize: doing tile at level %d to try and normalize lower bounds\n", level);
+ tile(stmt_num,level,1,level,CountedTile);
+ idxNames[stmt_num].insert(idxNames[stmt_num].begin()+(level),"");
+ extractCudaUB(stmt_num,level,ub,lb);
+ }
+ if(lb != 0){
+ char buf[1024];
+ sprintf(buf, "Cudaize: Loop at level %d does not have 0 as its lower bound", level);
+ throw std::runtime_error(buf);
+ }
+ if(ub < 0){
+ char buf[1024];
+ sprintf(buf, "Cudaize: Loop at level %d does not have a hard upper bound", level);
+ throw std::runtime_error(buf);
+ }
+
+ if(cudaDebug) printf("thread idx %s level %d lb: %d ub %d\n", threadIdxs[i].c_str(), level, lb, ub);
+ if(i == 0){
+ thread_level1 = level;
+ cu_tx = ub+1;
+ idxNames[stmt_num][level-1] = "tx";
+ }
+ else if(i == 1){
+ thread_level2 = level;
+ cu_ty = ub+1;
+ idxNames[stmt_num][level-1] = "ty";
+ }
+ else if(i == 2){
+ cu_tz = ub+1;
+ idxNames[stmt_num][level-1] = "tz";
+ }
+ }
+ if(!cu_ty)
+ thread_level1 = 0;
+ if(!cu_tz)
+ thread_level2 = 0;
+
+ //Make changes to nonsplitlevels
+ const int m = stmt.size();
+ for (int i = 0; i < m; i++) {
+ if(block_level){
+ //stmt[i].nonSplitLevels.append((block_level)*2);
+ stmt_nonSplitLevels[i].append((block_level)*2);
+ }
+ if(thread_level1){
+ //stmt[i].nonSplitLevels.append((thread_level1)*2);
+ stmt_nonSplitLevels[i].append((thread_level1)*2);
+ }
+ if(thread_level2){
+ //stmt[i].nonSplitLevels.append((thread_level1)*2);
+ stmt_nonSplitLevels[i].append((thread_level2)*2);
+ }
+ }
+
+ if(cudaDebug) {
+ printf("Codegen: current names: ");
+ printVS(idxNames[stmt_num]);
+ }
+ //Set codegen flag
+ code_gen_flags |= GenCudaizeV2;
+
+ //Save array dimension sizes
+ this->array_dims = array_dims;
+ cu_kernel_name = kernel_name.c_str();
+
+ return true;
+}
+
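+//Emit the host-side setup and teardown code (cudaMalloc/cudaMemcpy, texture
+//binds, constant-memory copies, dim3 setup, the kernel launch and cudaFree)
+//and build the __global__ kernel procedure from the transformed loop body.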
+tree_node_list* LoopCuda::cudaize_codegen_v2()
+{
+ //printf("cudaize codegen V2\n");
+ CG_suifBuilder *ocg = dynamic_cast<CG_suifBuilder*>(ir->builder());
+ if(!ocg) return NULL;
+
+ //protonu--adding an annote to track texture memory type
+ ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE);
+ ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE);
+ int tex_mem_on = 0;
+ int cons_mem_on = 0;
+
+
+
+ CG_outputRepr* repr;
+ std::vector<VarDefs> arrayVars;
+ std::vector<VarDefs> localScopedVars;
+
+ std::vector<IR_ArrayRef *> ro_refs;
+ std::vector<IR_ArrayRef *> wo_refs;
+ std::set<std::string> uniqueRefs;
+ std::set<std::string> uniqueWoRefs;
+ //protonu--let's try a much simpler approach of a map instead
+ //we also keep a map for constant memories
+ std::map<std::string , var_sym *>tex_ref_map;
+ std::map<std::string , var_sym *>cons_ref_map;
+
+ for(int j=0; j<stmt.size(); j++)
+ {
+ std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[j].code);
+ for (int i = 0; i < refs.size(); i++)
+ {
+ //printf("ref %s wo %d\n", static_cast<const char*>(refs[i]->name()), refs[i]->is_write());
+ var_sym* var = symtab->lookup_var((char*)refs[i]->name().c_str(),false);
+ //If the array is not a parameter, then it's a local array and we
+ //want to recreate it as a stack variable in the kernel as opposed to
+ //passing it in.
+ if(!var->is_param())
+ continue;
+ if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end())
+ {
+ uniqueRefs.insert(refs[i]->name());
+ if(refs[i]->is_write()){
+ uniqueWoRefs.insert(refs[i]->name());
+ wo_refs.push_back(refs[i]);
+ }
+ else
+ ro_refs.push_back(refs[i]);
+ }
+ if (refs[i]->is_write() && uniqueWoRefs.find(refs[i]->name()) == uniqueWoRefs.end()){
+ uniqueWoRefs.insert(refs[i]->name());
+ wo_refs.push_back(refs[i]);
+ //printf("adding %s to wo\n", static_cast<const char*>(refs[i]->name()));
+ }
+ }
+ }
+
+ // printf("reading from array ");
+ // for(int i=0; i<ro_refs.size(); i++)
+ // printf("'%s' ", ro_refs[i]->name().c_str());
+ // printf("and writting to array ");
+ // for(int i=0; i<wo_refs.size(); i++)
+ // printf("'%s' ", wo_refs[i]->name().c_str());
+ // printf("\n");
+
+ const char* gridName = "dimGrid";
+ const char* blockName = "dimBlock";
+
+ //TODO: Could allow for array_dims_vars to be a mapping from array
+ //references to variable names that define their length.
+ var_sym* dim1 = 0;
+ var_sym* dim2 = 0;
+
+ for(int i=0; i<wo_refs.size(); i++)
+ {
+ //TODO: Currently assume all arrays are floats of one or two dimensions
+ var_sym* outArray = 0;
+ std::string name = wo_refs[i]->name();
+ outArray = symtab->lookup_var((char*)name.c_str(),false);
+
+ VarDefs v;
+ v.size_2d = -1;
+ char buf[32];
+ snprintf(buf, 32, "devO%dPtr", i+1);
+ v.name = buf;
+ if(outArray->type()->is_ptr())
+ if(((ptr_type *)(outArray->type()))->ref_type()->is_array())
+ v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type();
+ else
+ v.type = ((ptr_type *)(outArray->type()))->ref_type();
+ else
+ v.type = type_f32;
+ v.tex_mapped = false;
+ v.cons_mapped = false;
+ v.original_name = wo_refs[i]->name();
+ //Size of the array = dim1 * dim2 * num bytes of our array type
+
+ //If our input array is 2D (non-linearized), we want the actual
+ //dimensions of the array
+ CG_outputRepr* size;
+ //Lookup in array_dims
+ std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
+ if(outArray->type()->is_ptr() && outArray->type()->ref_type(0)->is_array())
+ {
+ array_type* t = (array_type*)outArray->type()->ref_type(0);
+ v.size_2d = t->upper_bound().constant()+1;
+ printf("Detected 2D array size of %d for %s\n", v.size_2d, (char*)wo_refs[i]->name().c_str());
+ size = ocg->CreateInt(v.size_2d * v.size_2d);
+ }else if(it != array_dims.end()){
+ int ref_size = it->second;
+ v.var_ref_size = ref_size;
+ size = ocg->CreateInt(ref_size);
+ }
+ else{
+ if(dim1){
+ size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)),
+ new CG_suifRepr(operand(dim2)));
+ }else{
+ char buf[1024];
+ sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a "
+ "detectable size or specified dimensions", name.c_str());
+ throw std::runtime_error(buf);
+ }
+ }
+ v.size_expr = operand(static_cast<CG_suifRepr*>(ocg->CreateTimes(
+ size,
+ ocg->CreateInt(v.type->size()/8)))->GetExpression());
+ v.in_data = 0;
+ v.out_data = outArray;
+ //Check for in ro_refs and remove it at this point
+ std::vector<IR_ArrayRef *>::iterator it_;
+ for(it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++)
+ {
+ if((*it_)->name() == wo_refs[i]->name()){
+ break;
+ }
+ }
+ if(it_ != ro_refs.end())
+ {
+ v.in_data = outArray;
+ ro_refs.erase(it_);
+ }
+
+ arrayVars.push_back(v);
+
+ }
+
+ //protonu-- assuming that all texture mapped memories were originally read only mems
+ //there should be safety checks for that, will implement those later
+
+ int cs_ref_size = 0;
+
+ for(int i=0; i<ro_refs.size(); i++)
+ {
+ var_sym* inArray = 0;
+ std::string name = ro_refs[i]->name();
+ inArray = symtab->lookup_var((char*)name.c_str(),false);
+ VarDefs v;
+ v.size_2d = -1;
+ char buf[32];
+ snprintf(buf, 32, "devI%dPtr", i+1);
+ v.name = buf;
+ if(inArray->type()->is_ptr())
+ if(((ptr_type *)(inArray->type()))->ref_type()->is_array())
+ v.type = ((array_type *)(((ptr_type *)(inArray->type()))->ref_type()))->elem_type();
+ else
+ v.type = ((ptr_type *)(inArray->type()))->ref_type();
+ else
+ v.type = type_f32;
+ v.tex_mapped = false;
+ v.cons_mapped = false;
+ v.original_name = ro_refs[i]->name();
+ if ( texture != NULL)
+ v.tex_mapped = (texture->is_array_tex_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
+ if (v.tex_mapped){
+ printf("this variable %s is mapped to texture memory", name.c_str());
+ }
+ if ( constant_mem != NULL)
+ v.cons_mapped = (constant_mem->is_array_cons_mapped(name.c_str()))? true:false; //protonu-track cons mapped vars
+ if (v.cons_mapped){
+ printf("this variable %s is mapped to constant memory", name.c_str());
+ }
+
+ //Size of the array = dim1 * dim2 * num bytes of our array type
+
+ //If our input array is 2D (non-linearized), we want the actual
+ //dimensions of the array (as it might be less than cu_n)
+ CG_outputRepr* size;
+ //Lookup in array_dims
+ std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
+ int ref_size = 0;
+ if(inArray->type()->is_ptr() && inArray->type()->ref_type(0)->is_array())
+ {
+ array_type* t = (array_type*)inArray->type()->ref_type(0);
+ v.size_2d = t->upper_bound().constant()+1;
+ printf("Detected 2D array size of %d for %s\n", v.size_2d, (char*)ro_refs[i]->name().c_str());
+ size = ocg->CreateInt(v.size_2d * v.size_2d);
+ }else if(it != array_dims.end()){
+ ref_size = it->second;
+ v.var_ref_size = ref_size;
+ size = ocg->CreateInt(ref_size);
+ }else{
+ if(dim1){
+ size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)),
+ new CG_suifRepr(operand(dim2)));
+ }else{
+ char buf[1024];
+ sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a "
+ "detectable size or specified dimensions", name.c_str());
+ throw std::runtime_error(buf);
+ }
+ }
+
+
+
+ v.size_expr = operand(static_cast<CG_suifRepr*>(ocg->CreateTimes(
+ size,
+ ocg->CreateInt(v.type->size()/8)))->GetExpression());
+
+ v.in_data = inArray;
+ v.out_data = 0;
+ arrayVars.push_back(v);
+ }
+
+
+ if(arrayVars.size() < 2)
+ {
+ fprintf(stderr, "cudaize error: Did not find two arrays being accessed\n");
+ return NULL;
+ }
+
+ //protonu--debugging tool--the printf statement
+ //tex_mem_on signals use of tex mem
+ for(int i=0; i<arrayVars.size(); i++)
+ {
+ //printf("var name %s, tex_mem used %s\n", arrayVars[i].name.c_str(), (arrayVars[i].tex_mapped)?"true":"false");
+ if (arrayVars[i].tex_mapped ) tex_mem_on ++;
+ if (arrayVars[i].cons_mapped ) cons_mem_on ++;
+ }
+
+ //Add CUDA function extern prototypes and function types
+ func_type* unkown_func = new func_type(type_s32); //function on unknown args that returns an i32
+ unkown_func = (func_type*)symtab->install_type(unkown_func);
+ func_type* void_func = new func_type(type_void); //function on unknown args that returns void
+ void_func = (func_type*)globals->install_type(void_func);
+ func_type* float_func = new func_type(type_f32); //function on unknown args that returns a float
+ float_func = (func_type*)globals->install_type(float_func);
+
+ type_node* result = ocg->ModifyType(type_void, "__global__");
+ result = globals->install_type(result);
+ func_type* kernel_type = new func_type(result); //function returns a '__global__ void'
+
+ int numArgs = arrayVars.size() + (dim1 ? 2 : 0) + localScopedVars.size();
+ //protonu--need to account for texture memory here, reduce the #args
+ if( tex_mem_on ) numArgs -= tex_mem_on;
+ if( cons_mem_on ) numArgs -= cons_mem_on;
+ kernel_type->set_num_args(numArgs);
+ int argCount = 0;
+ for(int i=0; i<arrayVars.size(); i++)
+ {
+ type_node* fptr;
+ if(arrayVars[i].in_data)
+ fptr = arrayVars[i].in_data->type()->clone();
+ else
+ fptr = arrayVars[i].out_data->type()->clone();
+ //protonu--skip this for texture mems
+ if( arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true )
+ kernel_type->set_arg_type(argCount++, fptr);
+ }
+ if(dim1){
+ kernel_type->set_arg_type(argCount++, type_s32); //width x height dimensions
+ kernel_type->set_arg_type(argCount++, type_s32);
+ }
+ kernel_type = (func_type*)globals->install_type(kernel_type);
+
+ proc_sym* cudaMalloc = globals->new_proc(unkown_func, src_c, "cudaMalloc");
+ proc_sym* cudaMemcpy = globals->new_proc(unkown_func, src_c, "cudaMemcpy");
+ proc_sym* cudaFree = globals->new_proc(unkown_func, src_c, "cudaFree");
+ proc_sym* cudaSync = globals->new_proc(void_func, src_c, "__syncthreads");
+ proc_sym* cudaBind = globals->new_proc(unkown_func, src_c, "cudaBindTexture");
+ proc_sym* cudaMemcpySym = globals->new_proc(unkown_func, src_c, "cudaMemcpyToSymbol");
+
+
+ //protonu-removing Gabe's function, introducing mine, this is pretty cosmetic
+ //proc_sym* cudaFetch = globals->new_proc(float_func, src_c, "tex1Dfetch");
+ proc_sym* tex1D = globals->new_proc(float_func, src_c, "tex1Dfetch");
+
+ var_sym *cudaMemcpyHostToDevice = new var_sym(type_s32, "cudaMemcpyHostToDevice");
+ var_sym *cudaMemcpyDeviceToHost = new var_sym(type_s32, "cudaMemcpyDeviceToHost");
+ cudaMemcpyDeviceToHost->set_param();
+ cudaMemcpyHostToDevice->set_param();
+ globals->add_sym(cudaMemcpyHostToDevice);
+ globals->add_sym(cudaMemcpyDeviceToHost);
+
+ //protonu--adding the bool tex_mem to the structure struct_type
+ //to bypass the re-naming of struct texture, this is a hack fix
+ struct_type* texType = new struct_type(TYPE_GROUP, 0, "texture<float, 1, cudaReadModeElementType>", 0, true);
+ immed_list *iml_tex = new immed_list;
+ iml_tex->append(immed("texture memory"));
+ texType->append_annote(k_cuda_texture_memory, iml_tex);
+ //protonu--end my changes
+ texType = (struct_type*)globals->install_type(texType);
+ //protonu--should register the locals later on
+ //when we do the bind operation
+ //var_sym* texRef = new var_sym(texType, "texRef");
+ //globals->add_sym(texRef);
+
+ //Add our mallocs (and input array memcpys)
+ for(int i=0; i<arrayVars.size(); i++)
+ {
+ //protonu--check if the variable is not a tex-mapped variable. If it is tex mapped
+ // allow a malloc and memcpy operation, and a bind, but only if it is tex mapped, but dont call
+ // the kernel with it as an argument.
+
+ //Make a pointer of type a[i].type
+ //type_node* fptr = new ptr_type(arrayVars[i].type->clone());
+ //protonu--temporary change
+ type_node* fptr = new ptr_type(arrayVars[i].type);
+ fptr = symtab->install_type(fptr);
+ var_sym *dvs = new var_sym(fptr, const_cast<char*>(
+ arrayVars[i].name.c_str()));
+ dvs->set_addr_taken();
+ symtab->add_sym(dvs);
+
+ //cudaMalloc args
+ //protonu--no cudaMalloc required for constant memory
+ tree_node_list* tnl = new tree_node_list;
+ if(arrayVars[i].cons_mapped != true )
+ {
+ in_cal *the_call =
+ new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMalloc))), 2);
+ the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to()->ptr_to(), operand(), immed(dvs))));
+ the_call->set_argument(1, arrayVars[i].size_expr);
+
+ tnl->append(new tree_instr(the_call));
+ setup_code = ocg->StmtListAppend(setup_code,
+ new CG_suifRepr(tnl));
+ }
+ if(arrayVars[i].in_data)
+ {
+ //cudaMemcpy args
+ //protonu-- no cudaMemcpy required for constant memory
+ if ( arrayVars[i].cons_mapped != true )
+ {
+ in_cal *the_call =
+ new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4);
+ the_call->set_argument(0, operand(dvs));
+ the_call->set_argument(1, operand(arrayVars[i].in_data));
+ the_call->set_argument(2, arrayVars[i].size_expr.clone());
+ the_call->set_argument(3, operand(cudaMemcpyHostToDevice));
+
+ tnl = new tree_node_list;
+ tnl->append(new tree_instr(the_call));
+ setup_code = ocg->StmtListAppend(setup_code,
+ new CG_suifRepr(tnl));
+ }
+
+ //protonu--check if the arrayvar is tex mapped
+ if(arrayVars[i].tex_mapped == true)
+ {
+ //Need a texture reference variable
+ char buf[32];
+ snprintf(buf, 32, "tex%dRef", i+1);
+ arrayVars[i].secondName = buf;
+
+ var_sym* texRef = new var_sym(texType, buf);
+ //printf("\n putting in %s\n", arrayVars[i].original_name.c_str());
+ tex_ref_map[arrayVars[i].original_name] = texRef;
+ globals->add_sym(texRef);
+ //protonu--added the above two lines
+
+ in_cal *the_call =
+ new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaBind))), 4);
+ in_ldc *ins = new in_ldc(type_s32, operand(), immed(0));
+ the_call->set_argument(0, operand(ins));
+ the_call->set_argument(1, operand(texRef));//protonu--change to add the new sym
+ the_call->set_argument(2, operand(dvs));
+ the_call->set_argument(3, arrayVars[i].size_expr.clone());
+
+ tnl = new tree_node_list;
+ tnl->append(new tree_instr(the_call));
+ setup_code = ocg->StmtListAppend(setup_code,
+ new CG_suifRepr(tnl));
+ }
+
+ //protonu--if arrayvar is mapped to constant memory
+ if(arrayVars[i].cons_mapped == true)
+ {
+ char buf[32];
+ snprintf(buf, 32, "cs%dRef", i+1);
+ //arrayVars[i].secondName = buf;
+ array_bound low (0);
+ array_bound high (arrayVars[i].var_ref_size -1);
+ array_type *arr = new array_type(arrayVars[i].type,low, high);
+ type_node* cons_arr = ocg->ModifyType(arr, "__device__ __constant__");
+ cons_arr = globals->install_type(cons_arr);
+ var_sym* consRef = new var_sym(cons_arr, buf);
+ cons_ref_map[arrayVars[i].original_name] = consRef;
+ globals->add_sym(consRef);
+
+
+
+ in_cal *the_call =
+ new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpySym))), 3);
+ the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to(), operand(), immed(consRef))));
+ the_call->set_argument(1, operand(arrayVars[i].in_data));
+ the_call->set_argument(2, arrayVars[i].size_expr.clone());
+
+ tnl = new tree_node_list;
+ tnl->append(new tree_instr(the_call));
+ setup_code = ocg->StmtListAppend(setup_code,
+ new CG_suifRepr(tnl));
+
+ }
+ }
+ }
+
+ //Build dimGrid dim3 variables based on loop dimensions and ti/tj
+ char blockD1[120];
+ char blockD2[120];
+ if(dim1){
+ snprintf(blockD1, 120, "%s/%d", dim1->name(), cu_tx);
+ snprintf(blockD2, 120, "%s/%d", dim2->name(), cu_ty);
+ }else{
+ snprintf(blockD1, 120, "%d", cu_bx);
+ snprintf(blockD2, 120, "%d", cu_by);
+ //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx);
+ //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty);
+ }
+ repr = ocg->CreateDim3(immed((char*)gridName),
+ immed(blockD1),
+ immed(blockD2));
+ setup_code = ocg->StmtListAppend(setup_code, repr);
+
+ repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx),immed(cu_ty));
+
+ if(cu_tz > 1)
+ repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty), immed(cu_tz));
+ else
+ repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty));
+ setup_code = ocg->StmtListAppend(setup_code, repr);
+
+ //call kernel function with name loop_name
+ //like: transpose_k<<<dimGrid,dimBlock>>>(devOPtr, devIPtr , width, height);
+ char dims[120];
+ snprintf(dims,120,"<<<%s,%s>>>",gridName, blockName);
+ immed_list *iml = new immed_list;
+ iml->append(immed((char*)cu_kernel_name.c_str()));
+ iml->append(immed(dims));
+ //printf("%s %s\n", static_cast<const char*>(cu_kernel_name), dims);
+ for(int i=0; i<arrayVars.size(); i++)
+ //Throw in a type cast if our kernel takes 2D array notation
+ //like (float(*) [1024])
+ {
+ //protonu--throwing in another hack to stop the caller from passing tex mapped
+ //vars to the kernel.
+ if(arrayVars[i].tex_mapped == true || arrayVars[i].cons_mapped == true )
+ continue;
+ if(arrayVars[i].size_2d >= 0)
+ {
+ snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d,
+ const_cast<char*>(arrayVars[i].name.c_str()));
+ //printf("%d %s\n", i, dims);
+ iml->append(immed(dims));
+ }else{
+ //printf("%d %s\n", i, static_cast<const char*>(arrayVars[i].name));
+ iml->append(immed(const_cast<char*>(
+ arrayVars[i].name.c_str())));
+ }
+ }
+ if(dim1){
+ iml->append(immed(dim1));
+ iml->append(immed(dim2));
+ }
+ repr = ocg->CreateKernel(iml);//kernel call
+ setup_code = ocg->StmtListAppend(setup_code, repr);
+
+ //cuda free variables
+ for(int i=0; i<arrayVars.size(); i++)
+ {
+ if(arrayVars[i].out_data)
+ {
+ //cudaMemcpy args
+ in_cal *the_call =
+ new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4);
+ the_call->set_argument(0, operand(arrayVars[i].out_data));
+ the_call->set_argument(1, operand(symtab->lookup_var(const_cast<char*>(
+ arrayVars[i].name.c_str()))));
+ the_call->set_argument(2, arrayVars[i].size_expr.clone());
+ the_call->set_argument(3, operand(cudaMemcpyDeviceToHost));
+
+ tree_node_list* tnl = new tree_node_list;
+ tnl->append(new tree_instr(the_call));
+ teardown_code = ocg->StmtListAppend(teardown_code,
+ new CG_suifRepr(tnl));
+ }
+
+ in_cal *the_call =
+ new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaFree))), 1);
+ the_call->set_argument(0, operand(symtab->lookup_var(const_cast<char*>(
+ arrayVars[i].name.c_str()))));
+
+ tree_node_list* tnl = new tree_node_list;
+ tnl->append(new tree_instr(the_call));
+ teardown_code = ocg->StmtListAppend(teardown_code,
+ new CG_suifRepr(tnl));
+ }
+
+ // ---------------
+ // BUILD THE KERNEL
+ // ---------------
+
+ //Extract out kernel body
+ tree_node_list* code = getCode();
+ //Get rid of the wrapper, if any, that original() added
+ if(code->head()->contents->kind() == TREE_IF)
+ {
+ tree_if* ifn = (tree_if*)code->head()->contents;
+ code = ifn->then_part();
+ }
+
+ //Create kernel function body
+ proc_sym *new_psym = globals->new_proc(kernel_type, src_c, (char*)cu_kernel_name.c_str());
+ proc_symtab *new_proc_syms = new proc_symtab(new_psym->name());
+ globals->add_child(new_proc_syms);
+
+ //Add Params
+ std::map<std::string, var_sym*> loop_vars;
+ //In-Out arrays
+ type_node* fptr;
+ for(int i=0; i<arrayVars.size(); i++)
+ {
+ if(arrayVars[i].in_data)
+ //fptr = arrayVars[i].in_data->type()->clone();
+ fptr = arrayVars[i].in_data->type();
+ else
+ //fptr = arrayVars[i].out_data->type()->clone();
+ fptr = arrayVars[i].out_data->type();
+ fptr = new_proc_syms->install_type(fptr);
+ std::string name = arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name();
+ var_sym* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name());
+ //protonu--adding a check to ensure that texture memories are not passed in as arguments
+ if(arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true )
+ {
+ sym->set_param();
+ new_proc_syms->params()->append(sym);
+ new_proc_syms->add_sym(sym);//protonu--added to suppress the addition of the redundant var in the kernel
+ }
+ if (arrayVars[i].cons_mapped == true)
+ {
+ sym->set_param();
+ new_proc_syms->add_sym(sym);
+ }
+ //printf("inserting name: %s\n", static_cast<const char*>(name));
+ loop_vars.insert(std::pair<std::string, var_sym*>(std::string(name), sym));
+ }
+
+ if(dim1)
+ {
+ //Array dimensions
+ var_sym* kdim1 = new var_sym(dim1->type(), dim1->name());
+ kdim1->set_param();
+ new_proc_syms->add_sym(kdim1);
+ loop_vars.insert(std::pair<std::string, var_sym*>(std::string(dim1->name()), kdim1));
+ var_sym* kdim2 = new var_sym(dim2->type(), dim2->name());
+ kdim2->set_param();
+ new_proc_syms->add_sym(kdim2);
+ loop_vars.insert(std::pair<std::string, var_sym*>(std::string(dim2->name()), kdim2));
+ new_proc_syms->params()->append(kdim1);
+ new_proc_syms->params()->append(kdim2);
+ }
+ //Put block and thread implicit variables into scope
+ std::vector<var_sym *> index_syms;
+ /* Currently we don't use the block dimensions
+ var_sym* blockDim_x = new var_sym(type_s32, "blockDim.x");
+ blockDim_x->set_param();
+ new_proc_syms->add_sym(blockDim_x);
+ var_sym* blockDim_y = new var_sym(type_s32, "blockDim.y");
+ blockDim_y->set_param();
+ new_proc_syms->add_sym(blockDim_y);
+ */
+ if(cu_bx > 1){
+ var_sym* blockIdx_x = new var_sym(type_s32, "blockIdx.x");
+ blockIdx_x->set_param();
+ new_proc_syms->add_sym(blockIdx_x);
+ index_syms.push_back(blockIdx_x);
+ }
+ if(cu_by > 1){
+ var_sym* blockIdx_y = new var_sym(type_s32, "blockIdx.y");
+ blockIdx_y->set_param();
+ new_proc_syms->add_sym(blockIdx_y);
+ index_syms.push_back(blockIdx_y);
+ }
+ if(cu_tx > 1){
+ var_sym* threadIdx_x = new var_sym(type_s32, "threadIdx.x");
+ threadIdx_x->set_param();
+ new_proc_syms->add_sym(threadIdx_x);
+ index_syms.push_back(threadIdx_x);
+ }
+ if(cu_ty > 1){
+ var_sym* threadIdx_y = new var_sym(type_s32, "threadIdx.y");
+ threadIdx_y->set_param();
+ new_proc_syms->add_sym(threadIdx_y);
+ index_syms.push_back(threadIdx_y);
+ }
+
+ if(cu_tz > 1){
+ var_sym* threadIdx_z = new var_sym(type_s32, "threadIdx.z");
+ threadIdx_z->set_param();
+ new_proc_syms->add_sym(threadIdx_z);
+ index_syms.push_back(threadIdx_z);
+ }
+
+ //Figure out which loop variables will be our thread and block dimension variables
+ std::vector<var_sym *> loop_syms;
+ //Get our indexes
+ std::vector<const char*> indexes;// = get_loop_indexes(code,cu_num_reduce);
+ int threadsPos=0;
+ if(cu_bx > 1)
+ indexes.push_back("bx");
+ if(cu_by > 1)
+ indexes.push_back("by");
+ if(cu_tx > 1){
+ threadsPos = indexes.size();
+ indexes.push_back("tx");
+ }
+ if(cu_ty > 1)
+ indexes.push_back("ty");
+ if(cu_tz > 1)
+ indexes.push_back("tz");
+ for(int i=0; i<indexes.size(); i++)
+ {
+ //printf("indexes[%d] = %s\n", i, (char*)indexes[i]);
+ loop_syms.push_back(new var_sym(type_s32, (char*)indexes[i]));
+ new_proc_syms->add_sym(loop_syms[i]);
+ //loop_vars.insert(std::pair<std::string, var_sym*>(std::string(indexes[i]), loop_syms[i]));
+ }
+
+ //Generate this code
+ //int bx = blockIdx.x
+ //int by = blockIdx.y
+ //int tx = threadIdx.x
+ //int ty = threadIdx.y
+ CG_outputRepr *body=NULL;
+ for(int i=0; i<indexes.size(); i++){
+ CG_outputRepr *lhs = new CG_suifRepr(operand(loop_syms[i]));
+ //body = ocg->StmtListAppend(body, ocg->CreateStmtList(
+ // ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i])))));
+ body = ocg->StmtListAppend(body, ocg->StmtListAppend(
+ ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))), NULL));
+ }
+
+ //Get our initial code prepped for loop reduction. First we need to swap
+ //out internal SUIF variable references to point to the new local
+ //function symbol table.
+ std::map<std::string, var_sym*> loop_idxs; //map from idx names to their new syms
+ std::vector< std::pair<var_sym*, var_sym*> > dim_vars; //pair is of <old,new> var_sym (for 2D array size initializations)
+ replacements r;
+ tree_node_list* swapped = swapVarReferences(code, &r, ocg, loop_vars, new_proc_syms, dim_vars);
+ //printf("\n code before recursiveFindReplacePreferedIdxs :\n");
+ //swapped->print();
+ swapped = recursiveFindReplacePreferedIdxs(swapped, new_proc_syms, cudaSync, void_func, loop_idxs);//in-place swapping
+ //printf("\n code after recursiveFindReplacePreferedIdxs :\n");
+ //swapped->print();
+
+ for(int i=0; i<indexes.size(); i++){
+ std::vector<tree_for*> tfs = findCommentedFors(indexes[i], swapped);
+ for(int k=0; k<tfs.size(); k++){
+ //printf("replacing %p tfs for index %s\n", tfs[k], indexes[i]);
+ tree_node_list* newBlock = forReduce(tfs[k], loop_idxs[indexes[i]], new_proc_syms);
+ //newBlock->print();
+ swap_node_for_node_list(tfs[k], newBlock);
+ //printf("AFTER SWAP\n"); newBlock->print();
+ }
+ }
+ //printf("AFTER REDUCE\n"); swapped->print();
+
+ if(static_cast<const IR_cudasuifCode *>(ir)->init_code()){
+ tree_node_list* orig_init_code = static_cast<CG_suifRepr *>(static_cast<const IR_cudasuifCode *>(ir)->init_code())->GetCode();
+ for(int i=0; i<dim_vars.size(); i++){
+ //We have a map of var_sym from the original function body and we know
+ //that these var_syms have initialization statements which define the
+ //array size. We need to mimic these initialization statements.
+
+ //First find the assignment and pull out the constant initialization
+ //value
+ int value = -1;
+ tree_node_list_iter tnli(orig_init_code);
+ while (!tnli.is_empty()) {
+ tree_node *node = tnli.step();
+ if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr)
+ {
+ in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr();
+ //expect the structure: cpy( _ = min(grab_me, _))
+ if(inst->opcode() == io_cpy && inst->dst_op().is_symbol()){
+ //printf("looking at instruction: ");
+ //inst->print();
+ var_sym* dest = inst->dst_op().symbol();
+ if(dest == dim_vars[i].first)
+ {
+ if(inst->src1_op().is_instr() && inst->src1_op().instr()->format() == inf_ldc){
+ value = ((in_ldc*)inst->src1_op().instr())->value().integer();
+ }
+ }
+ }
+ }
+ }
+ if(value < 0){
+ fprintf(stderr, "ERROR: Could not find initializing statement for variable used in upper_bound of array type");
+ }
+ CG_outputRepr *lhs = new CG_suifRepr(operand(dim_vars[i].second));
+ //body = ocg->StmtListAppend(body, ocg->CreateStmtList(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value))));
+ body = ocg->StmtListAppend(body, ocg->StmtListAppend(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)), NULL));
+ }
+ }
+
+
+ body = ocg->StmtListAppend(body, new CG_suifRepr(swapped));
+
+ //protonu--let's try creating our function definition here
+ var_sym *tsym = NULL;
+
+
+ std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(body);
+ for(int i=0; i<refs.size(); i++)
+ {
+ //check if the array is tex mapped
+ if(texture != NULL && texture->is_array_tex_mapped(refs[i]->name().c_str()))
+ {
+ //protonu--our new tex lookup function
+ in_cal *tex_lookup =
+ new in_cal(type_f32, operand(), operand(new in_ldc(float_func->ptr_to(), operand(), immed(tex1D))), 2);
+
+ //printf("name of the array to be mapped is %s\n", refs[i]->name().c_str());
+ tsym = tex_ref_map[refs[i]->name()];
+ tex_lookup->set_argument(0, operand(tsym));
+
+
+ int array_dims = ((IR_suifArrayRef *)refs[i])->ia_->dims();
+
+ if (array_dims == 1){
+ tex_lookup->set_argument(1, ((IR_suifArrayRef *)refs[i])->ia_->index(0).clone());
+ }else if (array_dims > 2) {
+ printf(" \n we don't handle more than 2D arrays mapped to textures yet\n");
+ }else if (array_dims == 2) {
+
+ IR_ArraySymbol *sym = refs[i]->symbol();
+ CG_outputRepr *sz = sym->size(1);
+ delete sym; // free the wrapper object only
+ // find the builder ocg
+ CG_outputRepr *expr = ocg->CreateTimes(sz->clone(),refs[i]->index(0));
+ delete sz; // free the wrapper object only
+ expr = ocg->CreatePlus(expr, refs[i]->index(1));
+ // expr holds the 1D access expression and take it out
+ tex_lookup->set_argument(1, ((CG_suifRepr *)expr)->GetExpression());
+ }
+
+ //using chun's function to replace the array look up with the function call
+ ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(tex_lookup)));
+ }
+
+ }
+
+
+ tsym = NULL;
+ //protonu--now let's try what we did above for constant memory
+ for(int i=0; i<refs.size(); i++)
+ {
+ //check if the array is constant-memory mapped
+ if(constant_mem != NULL && constant_mem->is_array_cons_mapped(refs[i]->name().c_str()))
+ {
+
+ //printf("name of the array to be cons mapped is %s\n", refs[i]->name().c_str());
+ tsym = cons_ref_map[refs[i]->name()];
+ //we should create a IR_SuifArray here
+ IR_ArraySymbol *ar_sym = new IR_suifArraySymbol(ir,tsym);
+ std::vector<CG_outputRepr *> ar_index;
+ ar_index.push_back(((IR_suifArrayRef *)refs[i])->index(0));
+ IR_ArrayRef *ar_ref = ((IR_suifCode *)ir)->CreateArrayRef(ar_sym, ar_index);
+ //using chun's function to replace the array look up with the function call
+ ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(((IR_suifArrayRef *)ar_ref)->ia_)));
+
+ }
+ }
+
+
+ tree_proc *new_body = new tree_proc(static_cast<CG_suifRepr*>(body)->GetCode(), new_proc_syms);
+ //globals->add_child(new_proc_syms);
+ new_psym->set_block(new_body);
+ new_procs.push_back(new_psym);
+
+ return swapped;
+}
+
+//Return the index order with dummy (empty) entries removed
+std::vector<std::string> cleanOrder(std::vector<std::string> idxNames){
+ std::vector<std::string> results;
+ for(int j=0; j<idxNames.size(); j++){
+ if(idxNames[j].length() != 0)
+ results.push_back(idxNames[j]);
+ }
+ return results;
+}
+
+//First non-dummy level in ascending order
+int LoopCuda::nonDummyLevel(int stmt, int level){
+ //level comes in 1-based and should leave 1-based
+ for(int j=level-1; j<idxNames[stmt].size(); j++){
+ if(idxNames[stmt][j].length() != 0){
+ //printf("found non dummy level of %d with idx: %s when searching for %d\n", j+1, (const char*) idxNames[stmt][j], level);
+ return j+1;
+ }
+ }
+ char buf[128]; sprintf(buf, "%d", level);
+ throw std::runtime_error(std::string("Unable to find a non-dummy level starting from ") + std::string(buf));
+}
+
+int LoopCuda::findCurLevel(int stmt, std::string idx){
+ for(int j=0; j<idxNames[stmt].size(); j++){
+ if(strcmp(idxNames[stmt][j].c_str(),idx.c_str()) == 0)
+ return j+1;
+ }
+ throw std::runtime_error(std::string("Unable to find index ") + idx + std::string(" in current list of indexes"));
+}
+
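+//Permute the loop nest of 'stmt' so its non-dummy indexes appear in the
+//order given by curOrder, then update the stored index names to match.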
+void LoopCuda::permute_cuda(int stmt, const std::vector<std::string>& curOrder)
+{
+ //printf("curOrder: ");
+ //printVs(curOrder);
+ //printf("idxNames: ");
+ //printVS(idxNames[stmt]);
+ std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt]);
+ bool same=true;
+ std::vector<int> pi;
+ for(int i=0; i<curOrder.size(); i++){
+ bool found = false;
+ for(int j=0; j<cIdxNames.size(); j++){
+ if(strcmp(cIdxNames[j].c_str(), curOrder[i].c_str()) == 0){
+ pi.push_back(j+1);
+ found=true;
+ if(j!=i)
+ same=false;
+ }
+ }
+ if(!found){
+ throw std::runtime_error("One of the indexes in the permute order was not "
+ "found in the current set of indexes.");
+ }
+ }
+ for(int i=curOrder.size(); i<cIdxNames.size(); i++){
+ pi.push_back(i);
+ }
+ if(same)
+ return;
+ permute(stmt, pi);
+ //Set old index names as new
+ for(int i=0; i<curOrder.size(); i++){
+ idxNames[stmt][i] = curOrder[i].c_str(); //what about sibling stmts?
+ }
+}
+
+
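+//Validate the permutation vector and apply Loop::permute to all statements
+//that share the first permuted loop level.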
+bool LoopCuda::permute(int stmt_num, const std::vector<int> &pi)
+{
+// check for sanity of parameters
+ if (stmt_num >= stmt.size() || stmt_num < 0)
+ throw std::invalid_argument("invalid statement " + to_string(stmt_num));
+ const int n = stmt[stmt_num].xform.n_out();
+ if (pi.size() > (n-1)/2)
+ throw std::invalid_argument("iteration space dimensionality does not match permute dimensionality");
+ int first_level = 0;
+ int last_level = 0;
+ for (int i = 0; i < pi.size(); i++) {
+ if (pi[i] > (n-1)/2 || pi[i] <= 0)
+ throw std::invalid_argument("invalid loop level " + to_string(pi[i]) + " in permutation");
+
+ if (pi[i] != i+1) {
+ if (first_level == 0)
+ first_level = i+1;
+ last_level = i+1;
+ }
+ }
+ if (first_level == 0)
+ return true;
+
+ std::vector<int> lex = getLexicalOrder(stmt_num);
+ std::set<int> active = getStatements(lex, 2*first_level-2);
+ Loop::permute(active, pi);
+ return true;
+}
+
+
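+//tile_cuda overloads: perform a regular CHiLL tile and then patch up the
+//per-statement index-name bookkeeping (idxNames) that cudaize relies on.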
+void LoopCuda::tile_cuda(int stmt, int level, int outer_level)
+{
+ tile_cuda(stmt,level,1,outer_level,"","",CountedTile);
+}
+void LoopCuda::tile_cuda(int level, int tile_size, int outer_level, std::string idxName,
+ std::string ctrlName, TilingMethodType method){
+ tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method);
+}
+
+void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level, std::string idxName,
+ std::string ctrlName, TilingMethodType method){
+ //Do regular tile but then update the index and control loop variable
+ //names as well as the idxName to reflect the current state of things.
+ //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level);
+ //printf("idxNames before: ");
+ //printVS(idxNames[stmt]);
+
+ tile(stmt, level, tile_size, outer_level, method);
+
+ if(idxName.size())
+ idxNames[stmt][level-1] = idxName.c_str();
+ if(tile_size == 1){
+ //potentially rearrange loops
+ if(outer_level < level){
+ std::string tmp = idxNames[stmt][level-1];
+ for(int i=level-1; i>outer_level-1; i--){
+ if(i-1 >= 0)
+ idxNames[stmt][i] = idxNames[stmt][i-1];
+ }
+ idxNames[stmt][outer_level-1] = tmp;
+ }
+    //TODO: even with a tile size of one, you need an insert (of a dummy loop)
+ idxNames[stmt].insert(idxNames[stmt].begin()+(level),"");
+ }else{
+ if(!ctrlName.size())
+ throw std::runtime_error("No ctrl loop name for tile");
+ //insert
+ idxNames[stmt].insert(idxNames[stmt].begin()+(outer_level-1),ctrlName.c_str());
+ }
+
+ //printf("idxNames after: ");
+ //printVS(idxNames[stmt]);
+}
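+
+//Bookkeeping example (made-up names): tile_cuda(0, 2, 16, 1, "tx", "bx", CountedTile)
+//applied while idxNames[0] == {"i","j"} renames level 2 to "tx" and inserts the
+//control loop name at outer_level, leaving idxNames[0] == {"bx","i","tx"}.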
+
+
+bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level, const std::string &array_name, const std::vector<int> &privatized_levels, bool allow_extra_read , int fastest_changing_dimension , int padding_stride , int padding_alignment , bool cuda_shared)
+{
+ int old_stmts =stmt.size();
+ //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared);
+ if(cuda_shared)
+ datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 1);
+ else
+ datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 0);
+
+
+ //Adjust idxNames to reflect updated state
+ std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
+ int new_stmts = stmt.size();
+ for(int i=old_stmts; i<new_stmts; i++){
+ //printf("fixing up statement %d\n", i);
+ std::vector<std::string> idxs;
+
+
+ //protonu-making sure the vector of nonSplitLevels grows along with
+ //the statement structure
+ stmt_nonSplitLevels.push_back(omega::Tuple<int>());
+
+ //Indexes up to level will be the same
+ for(int j=0; j<level-1; j++)
+ idxs.push_back(cIdxNames[j]);
+
+ //Expect privatized_levels to match
+ for(int j=0; j<privatized_levels.size(); j++)
+ idxs.push_back(cIdxNames[privatized_levels[j]-1]);//level is one-based
+
+    //all further levels should match the order they were in originally
+ if(privatized_levels.size()){
+ int last_privatized = privatized_levels.back();
+ int top_level = last_privatized + (stmt[i].IS.n_set()-idxs.size());
+ //printf("last privatized_levels: %d top_level: %d\n", last_privatized, top_level);
+ for(int j=last_privatized; j<top_level; j++){
+ idxs.push_back(cIdxNames[j]);
+ //printf("pushing back: %s\n", (const char*)cIdxNames[j]);
+ }
+ }
+ idxNames.push_back(idxs);
+  }
+  return true;
+}
+
+bool LoopCuda::datacopy_cuda(int stmt_num, int level, const std::string &array_name, std::vector<std::string> new_idxs, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, bool cuda_shared)
+{
+
+ int old_stmts =stmt.size();
+ //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared);
+ if(cuda_shared)
+ datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 1);
+ else
+ datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 0);
+ //Adjust idxNames to reflect updated state
+ std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
+ int new_stmts = stmt.size();
+ for(int i=old_stmts; i<new_stmts; i++){
+ //printf("fixing up statement %d\n", i);
+ std::vector<std::string> idxs;
+
+ //protonu-making sure the vector of nonSplitLevels grows along with
+ //the statement structure
+ stmt_nonSplitLevels.push_back(omega::Tuple<int>());
+
+    //protonu--let's dump out the code from each statement here
+ //printf("\n dumping statement :%d", i);
+ //stmt[i].code->Dump();
+
+ //Indexes up to level will be the same
+ for(int j=0; j<level-1; j++)
+ idxs.push_back(cIdxNames[j]);
+
+ //all further levels should get names from new_idxs
+ int top_level = stmt[i].IS.n_set();
+ //printf("top_level: %d level: %d\n", top_level, level);
+ if(new_idxs.size() < top_level-level+1)
+ throw std::runtime_error("Need more new index names for new datacopy loop levels");
+
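+    //e.g. (hypothetical sizes): copying at level 3 of a statement whose
+    //iteration space is 5-deep needs top_level-level+1 = 3 names, consumed
+    //from new_idxs[0..2] by the loop below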
+ for(int j=level-1; j<top_level; j++){
+ idxs.push_back(new_idxs[j-level+1].c_str());
+ //printf("pushing back: %s\n", new_idxs[j-level+1].c_str());
+ }
+ idxNames.push_back(idxs);
+  }
+  return true;
+}
+
+bool LoopCuda::unroll_cuda(int stmt_num, int level, int unroll_amount)
+{
+ int old_stmts =stmt.size();
+ //bool b= unroll(stmt_num, , unroll_amount);
+
+
+ int dim = 2*level-1;
+ std::vector<int> lex = getLexicalOrder(stmt_num);
+ std::set<int> same_loop = getStatements(lex, dim-1);
+
+ level = nonDummyLevel(stmt_num,level);
+ //printf("unrolling %d at level %d\n", stmt_num,level);
+
+ //protonu--using the new version of unroll, which returns
+ //a set of ints instead of a bool. To keep Gabe's logic
+  //I'll check the size of the set; if it's 0, return true
+ //bool b= unroll(stmt_num, level, unroll_amount);
+ std::set<int> b_set= unroll(stmt_num, level, unroll_amount);
+ bool b = false;
+ if (b_set.size() == 0) b = true;
+ //end--protonu
+
+ //Adjust idxNames to reflect updated state
+ std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
+  std::vector<std::string> origSource = idxNames[stmt_num];
+ //Drop index names at level
+ if(unroll_amount == 0){
+ //For all statements that were in this unroll together, drop index name for unrolled level
+ idxNames[stmt_num][level-1] = "";
+ for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
+ //printf("in same loop as %d is %d\n", stmt_num, (*i));
+ //idxNames[(*i)][level-1] = "";
+ idxNames[(*i)] = idxNames[stmt_num];
+ }
+ }
+
+ lex = getLexicalOrder(stmt_num);
+ same_loop = getStatements(lex, dim-1);
+
+ bool same_as_source = false;
+ int new_stmts = stmt.size();
+ for(int i=old_stmts; i<new_stmts; i++){
+    //Check whether we had a sync for the statement we are unrolling; if
+    //so, propagate that to newly created statements so that if they are
+    //in a different loop structure, they will also get a syncthreads
+ int size = syncs.size();
+ for(int j=0; j<size; j++){
+ if(syncs[j].first == stmt_num)
+ syncs.push_back(make_pair(i,syncs[j].second));
+ }
+
+ //protonu-making sure the vector of nonSplitLevels grows along with
+ //the statement structure
+ stmt_nonSplitLevels.push_back(omega::Tuple<int>());
+
+
+ //We expect that new statements have a constant for the variable in
+    //stmt[i].IS at level (as seen with print_with_subs); otherwise there
+    //will be a for loop at that level and idxNames should match stmt's
+ //idxNames pre-unrolled
+ Relation IS = stmt[i].IS;
+ //Ok, if you know how the hell to get anything out of a Relation, you
+ //should probably be able to do this more elegantly. But for now, I'm
+ //hacking it.
+ std::string s = IS.print_with_subs_to_string();
+    //s looks like
+ //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128}
+    //where level == 5, you see an integer in the input set
+
+ //If that's not an integer and this is the first new statement, then
+ //we think codegen will have a loop at that level. It's not perfect,
+ //not sure if it can be determined without round-tripping to codegen.
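+    //For the sample set above with level == 5, the scan below skips four
+    //commas and extracts the token "128" sitting between the last ',' and ']';
+    //a non-numeric token (i.e. a loop variable) parses to 0 via atoi().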
+ int sIdx = 0;
+ int eIdx = 0;
+ for(int j=0; j<level-1; j++){
+ sIdx = s.find(",",sIdx+1);
+ if(sIdx < 0) break;
+ }
+ if(sIdx > 0){
+ eIdx = s.find("]");
+ int tmp = s.find(",",sIdx+1);
+ if(tmp > 0 && tmp < eIdx)
+        eIdx = tmp; //a "," appears before "]"
+ if(eIdx > 0){
+ sIdx++;
+ std::string var = s.substr(sIdx,eIdx-sIdx);
+ //printf("%s\n", s.c_str());
+ //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str());
+ if(atoi(var.c_str()) == 0 && i ==old_stmts){
+        //TODO: Maybe check whether this new statement would be in the same
+        //group as the original and, if it would, don't mark it
+        //same_as_source
+ if(same_loop.find(i) == same_loop.end()){
+ printf("stmt %d level %d, newly created unroll statement should have same level indexes as source\n", i, level);
+ same_as_source = true;
+ }
+ }
+ }
+ }
+
+
+ //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1);
+ if(same_as_source)
+ idxNames.push_back(origSource);
+ else
+ idxNames.push_back(idxNames[stmt_num]);
+ }
+
+ return b;
+}
+
+void LoopCuda::copy_to_texture(const char *array_name)
+{
+ //protonu--placeholder for now
+ //set the bool for using cuda memory as true
+  //in a vector of strings, put the names of arrays to be texture mapped
+ if ( !texture )
+ texture = new texture_memory_mapping(true, array_name);
+ else
+ texture->add(array_name);
+
+
+}
+
+
+void LoopCuda::copy_to_constant(const char *array_name)
+{
+ //protonu--placeholder for now
+ //set the bool for using cuda memory as true
+  //in a vector of strings, put the names of arrays to be mapped to constant memory
+ if ( !constant_mem )
+ constant_mem = new constant_memory_mapping(true, array_name);
+ else
+ constant_mem->add(array_name);
+}
+
+//protonu--moving this from Loop
+tree_node_list* LoopCuda::codegen()
+{
+ if(code_gen_flags & GenCudaizeV2)
+ return cudaize_codegen_v2();
+  //Handle other flagged codegen methods here; otherwise return plain vanilla generated code
+ return getCode();
+}
+
+//These globals live in Omega's code_gen.cc and are used as a massive hack to
+//get out some info from MMGenerateCode. Yea for nasty side-effects.
+namespace omega{
+ extern int checkLoopLevel;
+ extern int stmtForLoopCheck;
+ extern int upperBoundForLevel;
+ extern int lowerBoundForLevel;
+}
+
+
+void LoopCuda::extractCudaUB(int stmt_num, int level, int &outUpperBound, int &outLowerBound){
+ // check for sanity of parameters
+ const int m = stmt.size();
+ if (stmt_num >= m || stmt_num < 0)
+ throw std::invalid_argument("invalid statement " + to_string(stmt_num));
+ const int n = stmt[stmt_num].xform.n_out();
+ if (level > (n-1)/2 || level <= 0)
+ throw std::invalid_argument("invalid loop level " + to_string(level));
+
+ int dim = 2*level-1;
+
+ std::vector<int> lex = getLexicalOrder(stmt_num);
+ std::set<int> same_loop = getStatements(lex, dim-1);
+
+ // extract the intersection of the iteration space to be considered
+ Relation hull;
+ {
+ hull = Relation::True(n);
+ for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
+ hull = Intersection(hull, project_onto_levels(getNewIS(*i), dim+1, true));
+ hull.simplify(2, 4);
+ }
+
+ for (int i = 2; i <= dim+1; i+=2) {
+ //std::string name = std::string("_t") + to_string(t_counter++);
+ std::string name = std::string("_t") + to_string(tmp_loop_var_name_counter++);
+ hull.name_set_var(i, name);
+ }
+ hull.setup_names();
+ }
+
+  // extract the exact loop bound of the dimension in question
+ if (is_single_iteration(hull, dim)){
+ throw std::runtime_error("No loop availabe at level to extract upper bound.");
+ }
+ Relation bound = get_loop_bound(hull, dim);
+ if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology())
+ throw loop_error("loop error: unable to extract loop bound for cudaize");
+
+ // extract the loop stride
+ EQ_Handle stride_eq;
+ int stride = 1;
+ {
+ bool simple_stride = true;
+ int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(dim+1), stride_eq, simple_stride);
+ if (strides > 1)
+ throw loop_error("loop error: too many strides");
+ else if (strides == 1) {
+ int sign = stride_eq.get_coef(bound.set_var(dim+1));
+// assert(sign == 1 || sign == -1);
+ Constr_Vars_Iter it(stride_eq, true);
+ stride = abs((*it).coef/sign);
+ }
+ }
+ if(stride != 1){
+ char buf[1024];
+ sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d", level, stride);
+ throw std::runtime_error(buf);
+ }
+
+  //Use the code generation system to tell us our bound information. We
+  //need a hard upper bound and a 0 lower bound.
+
+ checkLoopLevel = level*2;
+ stmtForLoopCheck = stmt_num;
+ upperBoundForLevel = -1;
+ lowerBoundForLevel = -1;
+ printCode(1,false);
+ checkLoopLevel = 0;
+
+ outUpperBound = upperBoundForLevel;
+ outLowerBound = lowerBoundForLevel;
+ return;
+}
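+
+//Usage sketch (hypothetical call site): int ub = -1, lb = -1;
+//extractCudaUB(0, 1, ub, lb); afterwards ub/lb hold whatever constant bounds
+//the MMGenerateCode side-effects reported for level 1 of statement 0, or stay
+//at -1 if none were found.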
+
+
+void LoopCuda::printCode(int effort, bool actuallyPrint) const {
+ const int m = stmt.size();
+ if (m == 0)
+ return;
+ const int n = stmt[0].xform.n_out();
+
+
+
+ Tuple<Relation> IS(m);
+ Tuple<Relation> xform(m);
+ Tuple<IntTuple > nonSplitLevels(m);
+ for (int i = 0; i < m; i++) {
+ IS[i+1] = stmt[i].IS;
+ xform[i+1] = stmt[i].xform;
+ nonSplitLevels[i+1] = stmt_nonSplitLevels[i];
+ //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
+ }
+
+ Tuple< Tuple<std::string> > idxTupleNames;
+ if(useIdxNames){
+ for(int i=0; i<idxNames.size(); i++){
+ Tuple<std::string> idxs;
+ for(int j=0; j<idxNames[i].size(); j++)
+ idxs.append(idxNames[i][j]);
+ idxTupleNames.append( idxs );
+ }
+ }
+
+ Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
+ CG_stringBuilder *ocg = new CG_stringBuilder();
+ Tuple<CG_outputRepr *> nameInfo;
+ for (int i = 1; i <= m; i++)
+ nameInfo.append(new CG_stringRepr("s" + to_string(i)));
+ CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort);
+ if(actuallyPrint)
+ std::cout << GetString(repr);
+/*
+ for (int i = 1; i <= m; i++)
+ delete nameInfo[i];
+*/
+
+ delete ocg;
+}
+
+
+
+void LoopCuda::printRuntimeInfo() const {
+ for(int i=0; i<stmt.size(); i++){
+ Relation IS = stmt[i].IS;
+ Relation xform = stmt[i].xform;
+ printf("stmt[%d]\n", i);
+ printf("IS\n");
+ IS.print_with_subs();
+
+ printf("xform[%d]\n", i);
+ xform.print_with_subs();
+
+ //printf("code\n");
+ //static_cast<CG_suifRepr *>(stmt[i].code)->GetCode()->print_expr();
+ }
+}
+
+void LoopCuda::printIndexes() const {
+ for(int i=0; i<stmt.size(); i++){
+ printf("stmt %d nset %d ", i, stmt[i].IS.n_set());
+
+ for(int j=0; j<idxNames[i].size(); j++){
+ if(j>0)
+ printf(",");
+ printf("%s", idxNames[i][j].c_str());
+ }
+ printf("\n");
+ }
+}
+
+tree_node_list* LoopCuda::getCode(int effort) const {
+ const int m = stmt.size();
+ if (m == 0)
+ return new tree_node_list;
+ const int n = stmt[0].xform.n_out();
+
+
+
+ Tuple<CG_outputRepr *> ni(m);
+ Tuple<Relation> IS(m);
+ Tuple<Relation> xform(m);
+ Tuple< IntTuple > nonSplitLevels(m);
+ for (int i = 0; i < m; i++) {
+ ni[i+1] = stmt[i].code;
+ IS[i+1] = stmt[i].IS;
+ xform[i+1] = stmt[i].xform;
+ nonSplitLevels[i+1] = stmt_nonSplitLevels[i];
+ //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
+ }
+
+
+ Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
+#ifdef DEBUG
+// std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, known, effort));
+#endif
+ Tuple< Tuple<std::string> > idxTupleNames;
+ if(useIdxNames){
+ for(int i=0; i<idxNames.size(); i++){
+ Tuple<std::string> idxs;
+ for(int j=0; j<idxNames[i].size(); j++)
+ idxs.append(idxNames[i][j]);
+ idxTupleNames.append( idxs );
+ }
+ }
+
+ CG_outputBuilder *ocg = ir->builder();
+ CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, nonSplitLevels, syncs, idxTupleNames, effort);
+
+ //CG_outputRepr *overflow_initialization = ocg->CreateStmtList();
+ //protonu--using the new function CG_suifBuilder::StmtListAppend
+ CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL);
+ for (std::map<int, std::vector<Free_Var_Decl *> >::const_iterator i = overflow.begin(); i != overflow.end(); i++)
+ for (std::vector<Free_Var_Decl *>::const_iterator j = i->second.begin(); j != i->second.end(); j++)
+ //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0))));
+ overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->StmtListAppend(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)), NULL));
+
+ repr = ocg->StmtListAppend(overflow_initialization, repr);
+ tree_node_list *tnl = static_cast<CG_suifRepr *>(repr)->GetCode();
+
+ delete repr;
+ /*
+ for (int i = 1; i <= m; i++)
+ delete ni[i];
+ */
+
+ return tnl;
+}
+
+
+//protonu--adding constructors for the new derived class
+LoopCuda::LoopCuda():Loop(), code_gen_flags(GenInit){}
+
+LoopCuda::LoopCuda(IR_Control *irc, int loop_num)
+ :Loop(irc)
+{
+ setup_code = NULL;
+ teardown_code = NULL;
+ code_gen_flags = 0;
+ cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1;
+ cu_num_reduce = 0;
+ cu_mode = GlobalMem;
+ texture = NULL;
+ constant_mem = NULL;
+
+ int m=stmt.size();
+ //printf("\n the size of stmt(initially) is: %d\n", stmt.size());
+ for(int i=0; i<m; i++)
+ stmt_nonSplitLevels.push_back(omega::Tuple<int>());
+
+
+ //protonu--setting up
+ //proc_symtab *symtab
+ //global_symtab *globals
+
+ globals = ((IR_cudasuifCode *)ir)->gsym_ ;
+ std::vector<tree_for *> tf = ((IR_cudasuifCode *)ir)->get_loops();
+
+ symtab = tf[loop_num]->proc()->block()->proc_syms();
+
+ std::vector<tree_for *> deepest = find_deepest_loops(tf[loop_num]);
+
+ for (int i = 0; i < deepest.size(); i++){
+ index.push_back(deepest[i]->index()->name()); //reflects original code index names
+ }
+
+ for(int i=0; i< stmt.size(); i++)
+    idxNames.push_back(index); //reflects preferred index names (used as handles in cudaize v2)
+ useIdxNames=false;
+
+}
+