1 files changed, 3734 insertions, 0 deletions
diff --git a/loop_cuda_rose.cc b/loop_cuda_rose.cc
new file mode 100644
index 0000000..c5633ee
--- /dev/null
+++ b/loop_cuda_rose.cc
@@ -0,0 +1,3734 @@
+/*****************************************************************************
+ Copyright (C) 2009 University of Utah
+ All Rights Reserved.
+
+ Purpose:
+ Cudaize methods
+
+ Notes:
+
+ History:
+ 1/7/10 Created by Gabe Rudy by migrating code from loop.cc
+ 31/1/11 Modified by Protonu Basu
+*****************************************************************************/
+#define TRANSFORMATION_FILE_INFO Sg_File_Info::generateDefaultFileInfoForTransformationNode()
+#include <code_gen/CG_stringBuilder.h>
+#include <codegen.h>
+#include <code_gen/CG_utils.h>
+#include <code_gen/CG_outputRepr.h>
+#include "loop_cuda_rose.hh"
+#include "loop.hh"
+#include <math.h>
+//#include <useful.h>
+#include "omegatools.hh"
+#include "ir_cudarose.hh"
+#include "ir_rose.hh"
+#include "ir_rose_utils.hh"
+#include "chill_error.hh"
+#include <vector>
+#include "Outliner.hh"
+//#define DEBUG
+using namespace omega;
+using namespace SageBuilder;
+using namespace SageInterface;
+//using namespace Outliner;
+//using namespace ASTtools;
+char *k_cuda_texture_memory; //protonu--added to track texture memory type
+//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type
+extern char *omega::k_ocg_comment;
+
+static int cudaDebug;
+class CudaStaticInit {
+public:
+  CudaStaticInit() {
+    cudaDebug = 0; //Change this to 1 for debug
+  }
+};
+static CudaStaticInit junkInitInstance__;
+
+std::string& upcase(std::string& s) {
+  for (int i = 0; i < s.size(); i++)
+    s[i] = toupper(s[i]);
+  return s;
+}
+
+void printVs(const std::vector<std::string>& curOrder) {
+  if (!cudaDebug) return;
+  for (int i = 0; i < curOrder.size(); i++) {
+    if (i > 0)
+      printf(",");
+    printf("%s", curOrder[i].c_str());
+  }
+  printf("\n");
+}
+
+void printVS(const std::vector<std::string>& curOrder) {
+  if(!cudaDebug) return;
+  for (int i = 0; i < curOrder.size(); i++) {
+    if (i > 0)
+      printf(",");
+    printf("%s", curOrder[i].c_str());
+  }
+  printf("\n");
+}
+
+LoopCuda::~LoopCuda() {
+  const int m = stmt.size();
+  for (int i = 0; i < m; i++)
+    stmt[i].code->clear();
+}
+
+bool LoopCuda::symbolExists(std::string s) {
+  
+  if (body_symtab->find_variable(SgName(s.c_str()))
+      || parameter_symtab->find_variable(SgName(s.c_str())))
+    return true;
+  if (globals->lookup_variable_symbol(SgName(s.c_str())))
+    return true;
+  for (int i = 0; i < idxNames.size(); i++)
+    for (int j = 0; j < idxNames[i].size(); j++)
+      if (strcmp(idxNames[i][j].c_str(), s.c_str()) == 0)
+        return true;
+  return false;
+}
+
+void LoopCuda::addSync(int stmt_num, std::string idxName) {
+  //we store these and code-gen inserts sync to omega comments where stmt
+  //in loop that has idxName being generated
+  syncs.push_back(make_pair(stmt_num, idxName));
+}
+
+void LoopCuda::renameIndex(int stmt_num, std::string idx, std::string newName) {
+  int level = findCurLevel(stmt_num, idx);
+  if (idxNames.size() <= stmt_num || idxNames[stmt_num].size() < level)
+    throw std::runtime_error("Invalid statment number of index");
+  idxNames[stmt_num][level - 1] = newName.c_str();
+}
+
+enum Type {
+  Int
+};
+
+SgNode* wrapInIfFromMinBound(SgNode* then_part, SgForStatement* loop,
+                             SgScopeStatement* symtab, SgVariableSymbol* bound_sym) {
+  // CG_roseBuilder *ocg = new CG_roseBuilder(
+  
+  SgBinaryOp* test_expr = isSgBinaryOp(loop->get_test_expr());
+  SgExpression* upperBound;
+  SgExpression* conditional;
+  upperBound = test_expr->get_rhs_operand();
+  CG_outputRepr *ifstmt;
+  
+  SgCallExpression *call;
+  if (call = isSgCallExpression(upperBound))
+    if (isSgVarRefExp(call->get_function())->get_symbol()->get_name().getString()
+        == "__rose_lt") {
+      SgExprListExp* arg_list = call->get_args();
+      SgExpression *if_bound = *(arg_list->get_expressions().begin());
+      /*This relies on the minimum expression being the rhs operand of
+       * the min instruction.
+       */
+      SgIfStmt *ifstmt = buildIfStmt(
+        buildLessOrEqualOp(buildVarRefExp(bound_sym), if_bound),
+        isSgStatement(then_part), NULL);
+      return isSgNode(ifstmt);
+      
+    }
+  
+/*  if (isSgConditionalExp(upperBound)) {
+    conditional = isSgConditionalExp(upperBound)->get_conditional_exp();
+    
+    if (isSgBinaryOp(conditional)) {
+    SgBinaryOp* binop = isSgBinaryOp(conditional);
+    
+    if (isSgLessThanOp(binop) || isSgLessOrEqualOp(binop)) {
+    SgIfStmt *ifstmt = buildIfStmt(
+    buildLessOrEqualOp(buildVarRefExp(bound_sym),
+    test_expr), isSgStatement(then_part), NULL);
+    return isSgNode(ifstmt);
+    }
+    
+    }
+    
+    }
+*/
+  return then_part;
+}
+
+/**
+ * This would be better if it was done by a CHiLL xformation instead of at codegen
+ *
+ * state:
+ * for(...)
+ *   for(...)
+ *     cur_body
+ *   stmt1
+ *
+ * stm1 is in-between two loops that are going to be reduced. The
+ * solution is to put stmt1 at the end of cur_body but conditionally run
+ * in on the last step of the for loop.
+ *
+ * A CHiLL command that would work better:
+ *
+ * for(...)
+ *   stmt0
+ *   for(for i=0; i<n; i++)
+ *     cur_body
+ *   stmt1
+ * =>
+ * for(...)
+ *   for(for i=0; i<n; i++)
+ *     if(i==0) stmt0
+ *     cur_body
+ *     if(i==n-1) stmt1
+ */
+
+std::vector<SgForStatement*> findCommentedFors(const char* index, SgNode* tnl) {
+  std::vector<SgForStatement *> result;
+  bool next_loop_ok = false;
+  
+  if (isSgBasicBlock(tnl)) {
+    
+    SgStatementPtrList& list = isSgBasicBlock(tnl)->get_statements();
+    
+    for (SgStatementPtrList::iterator it = list.begin(); it != list.end();
+         it++) {
+      std::vector<SgForStatement*> t = findCommentedFors(index,
+                                                         isSgNode(*it));
+      std::copy(t.begin(), t.end(), back_inserter(result));
+    }
+  } else if (isSgForStatement(tnl)) {
+    
+    AstTextAttribute* att =
+      (AstTextAttribute*) (isSgNode(tnl)->getAttribute(
+                             "omega_comment"));
+    std::string comment = att->toString();
+    
+    if (comment.find("~cuda~") != std::string::npos
+        && comment.find("preferredIdx: ") != std::string::npos) {
+      std::string idx = comment.substr(
+        comment.find("preferredIdx: ") + 14, std::string::npos);
+      if (idx.find(" ") != std::string::npos)
+        idx = idx.substr(0, idx.find(" "));
+      if (strcmp(idx.c_str(), index) == 0)
+        next_loop_ok = true;
+    }
+    
+    if (next_loop_ok) {
+      //printf("found loop %s\n", static_cast<tree_for *>(tn)->index()->name());
+      result.push_back(isSgForStatement(tnl));
+    } else {
+      //printf("looking down for loop %s\n", static_cast<tree_for *>(tn)->index()->name());
+      std::vector<SgForStatement*> t = findCommentedFors(index,
+                                                         isSgForStatement(tnl)->get_loop_body());
+      std::copy(t.begin(), t.end(), back_inserter(result));
+    }
+    next_loop_ok = false;
+  } else if (isSgIfStmt(tnl)) {
+    //printf("looking down if\n");
+    SgIfStmt *tni = isSgIfStmt(tnl);
+    std::vector<SgForStatement*> t = findCommentedFors(index,
+                                                       tni->get_true_body());
+    std::copy(t.begin(), t.end(), back_inserter(result));
+  }
+  
+  return result;
+}
+
+SgNode* forReduce(SgForStatement* loop, SgVariableSymbol* reduceIndex,
+                  SgScopeStatement* body_syms) {
+  //We did the replacements all at once with recursiveFindPreferedIdxs
+  //replacements r;
+  //r.oldsyms.append(loop->index());
+  //r.newsyms.append(reduceIndex);
+  //tree_for* new_loop = (tree_for*)loop->clone_helper(&r, true);
+  SgForStatement* new_loop = loop;
+  
+  //return body one loops in
+  SgNode* tnl = loop_body_at_level(new_loop, 1);
+  //wrap in conditional if necessary
+  tnl = wrapInIfFromMinBound(tnl, new_loop, body_syms, reduceIndex);
+  return tnl;
+}
+
+void recursiveFindRefs(SgNode* code, std::set<const SgVariableSymbol *>& syms,
+                       SgFunctionDefinition* def) {
+  
+  SgStatement* s = isSgStatement(code);
+  // L = {symbols defined within 's'}, local variables declared within 's'
+  ASTtools::VarSymSet_t L;
+  ASTtools::collectDefdVarSyms(s, L);
+  //dump (L, "L = ");
+  
+  // U = {symbols used within 's'}
+  ASTtools::VarSymSet_t U;
+  ASTtools::collectRefdVarSyms(s, U);
+  //dump (U, "U = ");
+  
+  // U - L = {symbols used within 's' but not defined in 's'}
+  // variable references to non-local-declared variables
+  ASTtools::VarSymSet_t diff_U_L;
+  set_difference(U.begin(), U.end(), L.begin(), L.end(),
+                 inserter(diff_U_L, diff_U_L.begin()));
+  //dump (diff_U_L, "U - L = ");
+  
+  // Q = {symbols defined within the function surrounding 's' that are
+  // visible at 's'}, including function parameters
+  ASTtools::VarSymSet_t Q;
+  ASTtools::collectLocalVisibleVarSyms(def->get_declaration(), s, Q);
+//    dump (Q, "Q = ");
+  
+  // (U - L) \cap Q = {variables that need to be passed as parameters
+  // to the outlined function}
+  // a sub set of variables that are not globally visible (no need to pass at all)
+  // It excludes the variables with a scope between global and the enclosing function
+  set_intersection(diff_U_L.begin(), diff_U_L.end(), Q.begin(), Q.end(),
+                   inserter(syms, syms.begin()));
+  
+  /* std::vector<SgVariableSymbol *> scalars;
+  //SgNode  *tnl = static_cast<const omega::CG_roseRepr *>(repr)->GetCode();
+  SgStatement* stmt;
+  SgExpression* exp;
+  if (tnl != NULL) {
+  if(stmt = isSgStatement(tnl)){
+  if(isSgBasicBlock(stmt)){
+  SgStatementPtrList& stmts = isSgBasicBlock(stmt)->get_statements();
+  for(int i =0; i < stmts.size(); i++){
+  //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgNode(stmts[i]));
+  std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(stmts[i]));
+  //delete r;
+  std::copy(a.begin(), a.end(), back_inserter(scalars));
+  }
+  
+  }
+  else if(isSgForStatement(stmt)){
+  
+  SgForStatement *tnf =  isSgForStatement(stmt);
+  //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgStatement(tnf->get_loop_body()));
+  std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(tnf->get_loop_body()));
+  //delete r;
+  std::copy(a.begin(), a.end(), back_inserter(scalars));
+  }
+  else if(isSgFortranDo(stmt)){
+  SgFortranDo *tfortran =  isSgFortranDo(stmt);
+  omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgStatement(tfortran->get_body()));
+  std::vector<SgVariableSymbol *> a = recursiveFindRefs(r);
+  delete r;
+  std::copy(a.begin(), a.end(), back_inserter(scalars));
+  }
+  
+  else if(isSgIfStmt(stmt) ){
+  SgIfStmt* tni = isSgIfStmt(stmt);
+  //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgNode(tni->get_conditional()));
+  std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(tni->get_conditional()));
+  //delete r;
+  std::copy(a.begin(), a.end(), back_inserter(scalars));
+  //r = new omega::CG_roseRepr(isSgNode(tni->get_true_body()));
+  a = recursiveFindRefs(isSgNode(tni->get_true_body()));
+  //delete r;
+  std::copy(a.begin(), a.end(), back_inserter(scalars));
+  //r = new omega::CG_roseRepr(isSgNode(tni->get_false_body()));
+  a = recursiveFindRefs(isSgNode(tni->get_false_body()));
+  //delete r;
+  std::copy(a.begin(), a.end(), back_inserter(scalars));
+  }
+  else if(isSgExprStatement(stmt)) {
+  //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgExpression(isSgExprStatement(stmt)->get_expression()));
+  std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(isSgExprStatement(stmt)->get_expression()));
+  //delete r;
+  std::copy(a.begin(), a.end(), back_inserter(scalars));
+  
+  }
+  }
+  }
+  else{
+  SgExpression* op = isSgExpression(tnl);
+  if(isSgVarRefExp(op)){
+  
+  scalars.push_back(isSgVarRefExp(op)->get_symbol());
+  
+  }
+  else if( isSgAssignOp(op)){
+  //omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgAssignOp(op)->get_lhs_operand());
+  std::vector<SgVariableSymbol *> a1 = recursiveFindRefs(isSgNode(isSgAssignOp(op)->get_lhs_operand()));
+  //delete r1;
+  std::copy(a1.begin(), a1.end(), back_inserter(scalars));
+  //omega::CG_roseRepr *r2 = new omega::CG_roseRepr(isSgAssignOp(op)->get_rhs_operand());
+  std::vector<SgVariableSymbol *> a2 = recursiveFindRefs(isSgNode(isSgAssignOp(op)->get_rhs_operand()));
+  //delete r2;
+  std::copy(a2.begin(), a2.end(), back_inserter(scalars));
+  
+  }
+  else if(isSgBinaryOp(op)){
+  // omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgBinaryOp(op)->get_lhs_operand());
+  std::vector<SgVariableSymbol *> a1 = recursiveFindRefs(isSgNode(isSgBinaryOp(op)->get_lhs_operand()));
+  //delete r1;
+  std::copy(a1.begin(), a1.end(), back_inserter(scalars));
+  //omega::CG_roseRepr *r2 = new omega::CG_roseRepr(isSgBinaryOp(op)->get_rhs_operand());
+  std::vector<SgVariableSymbol *> a2 = recursiveFindRefs((isSgBinaryOp(op)->get_rhs_operand()));
+  //delete r2;
+  std::copy(a2.begin(), a2.end(), back_inserter(scalars));
+  }
+  else if(isSgUnaryOp(op)){
+  //omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgUnaryOp(op)->get_operand());
+  std::vector<SgVariableSymbol *> a1 = recursiveFindRefs(isSgNode(isSgUnaryOp(op)->get_operand()));
+  //delete r1;
+  std::copy(a1.begin(), a1.end(), back_inserter(scalars));
+  }
+  
+  }
+  return scalars;
+  
+  
+  */
+  
+}
+
+SgNode* recursiveFindReplacePreferedIdxs(SgNode* code, SgSymbolTable* body_syms,
+                                         SgSymbolTable* param_syms, SgScopeStatement* body,
+                                         std::map<std::string, SgVariableSymbol*>& loop_idxs,
+                                         SgGlobal* globalscope, bool sync = false) {
+  //tree_node_list* tnl = new tree_node_list;
+  //tree_node_list_iter tnli(code);
+  SgVariableSymbol* idxSym = 0;
+  std::vector<SgStatement*> r1;
+  std::vector<SgNode*> r2;
+  SgNode* tnli;
+  SgNode* tnli1;
+  SgNode* tnli2;
+  SgBasicBlock * clone;
+  
+  if (isSgForStatement(code)) {
+    AstTextAttribute* att =
+      (AstTextAttribute*) (isSgNode(code)->getAttribute(
+                             "omega_comment"));
+    
+    std::string comment;
+    if (att != NULL)
+      comment = att->toString();
+    
+    if (comment.find("~cuda~") != std::string::npos
+        && comment.find("preferredIdx: ") != std::string::npos) {
+      std::string idx = comment.substr(
+        comment.find("preferredIdx: ") + 14, std::string::npos);
+      if (idx.find(" ") != std::string::npos)
+        idx = idx.substr(0, idx.find(" "));
+      if (loop_idxs.find(idx) != loop_idxs.end())
+        idxSym = loop_idxs.find(idx)->second;
+      //Get the proc variable sybol for this preferred index
+      if (idxSym == 0) {
+        idxSym = body_syms->find_variable(idx.c_str());
+        if (!idxSym)
+          idxSym = param_syms->find_variable(idx.c_str());
+        //printf("idx not found: lookup %p\n", idxSym);
+        if (!idxSym) {
+          SgVariableDeclaration* defn = buildVariableDeclaration(
+            SgName((char*) idx.c_str()), buildIntType());
+          //idxSym = new var_sym(type_s32, (char*)idx.c_str());
+          SgInitializedNamePtrList& variables = defn->get_variables();
+          SgInitializedNamePtrList::const_iterator i =
+            variables.begin();
+          SgInitializedName* initializedName = *i;
+          SgVariableSymbol* vs = new SgVariableSymbol(
+            initializedName);
+          prependStatement(defn, body);
+          vs->set_parent(body_syms);
+          body_syms->insert(SgName((char*) idx.c_str()), vs);
+          idxSym = vs;
+          //printf("idx created and inserted\n");
+        }
+        //Now insert into our map for future
+	if (cudaDebug)
+	    std::cout << idx << "\n\n";
+        loop_idxs.insert(make_pair(idx, idxSym));
+      }
+      //See if we have a sync as well
+      if (comment.find("sync") != std::string::npos) {
+        //printf("Inserting sync after current block\n");
+        sync = true;
+      }
+      
+    }
+    if (idxSym) {
+      SgForInitStatement* list =
+        isSgForStatement(code)->get_for_init_stmt();
+      SgStatementPtrList& initStatements = list->get_init_stmt();
+      SgStatementPtrList::const_iterator j = initStatements.begin();
+      const SgVariableSymbol* index;
+      
+      if (SgExprStatement *expr = isSgExprStatement(*j))
+        if (SgAssignOp* op = isSgAssignOp(expr->get_expression()))
+          if (SgVarRefExp* var_ref = isSgVarRefExp(
+                op->get_lhs_operand()))
+            index = var_ref->get_symbol();
+      
+      std::vector<SgVarRefExp *> array = substitute(code, index, NULL,
+                                                    isSgNode(body_syms));
+      
+      for (int j = 0; j < array.size(); j++)
+        array[j]->set_symbol(idxSym);
+    }
+    
+    SgStatement* body_ = isSgStatement(
+      recursiveFindReplacePreferedIdxs(
+        isSgNode((isSgForStatement(code)->get_loop_body())),
+        body_syms, param_syms, body, loop_idxs, globalscope));
+    
+    omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code);
+    omega::CG_outputRepr* block = tnl->clone();
+    tnli = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
+    
+    isSgForStatement(tnli)->set_loop_body(body_);
+    body_->set_parent(tnli);
+    
+    if (idxSym) {
+      SgForInitStatement* list =
+        isSgForStatement(tnli)->get_for_init_stmt();
+      SgStatementPtrList& initStatements = list->get_init_stmt();
+      SgStatementPtrList::const_iterator j = initStatements.begin();
+      const SgVariableSymbol* index;
+      
+      if (SgExprStatement *expr = isSgExprStatement(*j))
+        if (SgAssignOp* op = isSgAssignOp(expr->get_expression()))
+          if (SgVarRefExp* var_ref = isSgVarRefExp(
+                op->get_lhs_operand()))
+            index = var_ref->get_symbol();
+      
+      std::vector<SgVarRefExp *> array = substitute(tnli, index, NULL,
+                                                    isSgNode(body_syms));
+      
+      for (int j = 0; j < array.size(); j++)
+        array[j]->set_symbol(idxSym);
+    }
+    //  std::cout << isSgNode(body_)->unparseToString() << "\n\n";
+    if (att != NULL)
+      tnli->setAttribute("omega_comment", att);
+    
+    if (sync) {
+      SgName name_syncthreads("__syncthreads");
+      SgFunctionSymbol * syncthreads_symbol =
+        globalscope->lookup_function_symbol(name_syncthreads);
+      
+      // Create a call to __syncthreads():
+      SgFunctionCallExp * syncthreads_call = buildFunctionCallExp(
+        syncthreads_symbol, buildExprListExp());
+      
+      SgExprStatement* stmt = buildExprStatement(syncthreads_call);
+      
+      /*    if (SgBasicBlock* bb = isSgBasicBlock(
+            isSgForStatement(code)->get_loop_body()))
+            appendStatement(isSgStatement(stmt), bb);
+            
+            else if (SgStatement* ss = isSgStatement(
+            isSgForStatement(code)->get_loop_body())) {
+            SgBasicBlock* bb2 = buildBasicBlock();
+            
+            isSgNode(ss)->set_parent(bb2);
+            appendStatement(ss, bb2);
+            
+            appendStatement(isSgStatement(stmt), bb2);
+            isSgNode(stmt)->set_parent(bb2);
+            isSgForStatement(code)->set_loop_body(bb2);
+            isSgNode(bb2)->set_parent(code);
+            }
+      */
+      
+      SgBasicBlock* bb2 = buildBasicBlock();
+      
+      bb2->append_statement(isSgStatement(tnli));
+      bb2->append_statement(stmt);
+      /* SgNode* parent = code->get_parent();
+         if(!isSgStatement(parent))
+         throw loop_error("Parent not a statement");
+         
+         if(isSgForStatement(parent)){
+         if(SgStatement *ss = isSgForStatement(isSgForStatement(parent)->get_loop_body())){
+         omega::CG_roseRepr * tnl = new omega::CG_roseRepr(ss);
+         omega::CG_outputRepr* block= tnl->clone();
+         
+         SgNode *new_ss = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
+         SgBasicBlock* bb2 = buildBasicBlock();
+         
+         isSgNode(new_ss)->set_parent(bb2);
+         appendStatement(isSgStatement(new_ss), bb2);
+         appendStatement(isSgStatement(stmt), bb2);
+         isSgNode(stmt)->set_parent(bb2);
+         
+         isSgStatement(parent)->replace_statement_from_basicBlock(ss, isSgStatement(bb2));
+         
+         }else if(isSgBasicBlock(isSgForStatement(parent)->get_loop_body()))
+         isSgStatement(isSgForStatement(parent)->get_loop_body())->insert_statement(isSgStatement(code), stmt, false);
+         else
+         throw loop_error("parent statement type undefined!!");
+         
+         }
+         else if(isSgBasicBlock(parent))
+         isSgStatement(parent)->insert_statement(isSgStatement(code), stmt, false);
+         else
+         throw loop_error("parent statement type undefined!!");
+         
+         //tnl->print();
+         *
+         *
+         */
+      sync = true;
+      return isSgNode(bb2);
+      
+    } else
+      return tnli;
+  } else if (isSgIfStmt(code)) {
+    SgStatement* body_ = isSgStatement(
+      recursiveFindReplacePreferedIdxs(
+        isSgNode((isSgIfStmt(code)->get_true_body())),
+        body_syms, param_syms, body, loop_idxs, globalscope));
+    
+    omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code);
+    omega::CG_outputRepr* block = tnl->clone();
+    tnli = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
+    
+    isSgIfStmt(tnli)->set_true_body(body_);
+    
+    if ((isSgIfStmt(code)->get_false_body()))
+      isSgIfStmt(tnli)->set_false_body(
+        isSgStatement(
+          recursiveFindReplacePreferedIdxs(
+            isSgNode(
+              (isSgIfStmt(code)->get_false_body())),
+            body_syms, param_syms, body, loop_idxs,
+            globalscope)));
+    
+    return tnli;
+  } else if (isSgStatement(code) && !isSgBasicBlock(code)) {
+    omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code);
+    omega::CG_outputRepr* block = tnl->clone();
+    tnli = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
+    
+    return tnli;
+    
+  } else if (isSgBasicBlock(code)) {
+    SgStatementPtrList& tnl = isSgBasicBlock(code)->get_statements();
+    
+    SgStatementPtrList::iterator temp;
+    clone = buildBasicBlock();
+    bool sync_found = false;
+    for (SgStatementPtrList::const_iterator it = tnl.begin();
+         it != tnl.end(); it++) {
+      
+      if (isSgForStatement(*it)) {
+        AstTextAttribute* att =
+          (AstTextAttribute*) (isSgNode(*it)->getAttribute(
+                                 "omega_comment"));
+        
+        std::string comment;
+        if (att != NULL)
+          comment = att->toString();
+        
+        if (comment.find("~cuda~") != std::string::npos
+            && comment.find("preferredIdx: ")
+            != std::string::npos) {
+          std::string idx = comment.substr(
+            comment.find("preferredIdx: ") + 14,
+            std::string::npos);
+          if (idx.find(" ") != std::string::npos)
+            idx = idx.substr(0, idx.find(" "));
+          //printf("sym_tab preferred index: %s\n", idx.c_str());
+          if (loop_idxs.find(idx) != loop_idxs.end())
+            idxSym = loop_idxs.find(idx)->second;
+          //Get the proc variable sybol for this preferred index
+          if (idxSym == 0) {
+            idxSym = body_syms->find_variable(idx.c_str());
+            if (!idxSym)
+              idxSym = param_syms->find_variable(idx.c_str());
+            //printf("idx not found: lookup %p\n", idxSym);
+            if (!idxSym) {
+              SgVariableDeclaration* defn =
+                buildVariableDeclaration(
+                  SgName((char*) idx.c_str()),
+                  buildIntType());
+              //idxSym = new var_sym(type_s32, (char*)idx.c_str());
+              SgInitializedNamePtrList& variables =
+                defn->get_variables();
+              SgInitializedNamePtrList::const_iterator i =
+                variables.begin();
+              SgInitializedName* initializedName = *i;
+              SgVariableSymbol* vs = new SgVariableSymbol(
+                initializedName);
+              prependStatement(defn, body);
+              vs->set_parent(body_syms);
+              body_syms->insert(SgName((char*) idx.c_str()), vs);
+              //printf("idx created and inserted\n");
+              idxSym = vs;
+            }
+            //Now insert into our map for future
+	    if (cudaDebug)
+		std::cout << idx << "\n\n";
+            loop_idxs.insert(make_pair(idx, idxSym));
+            
+          }
+          //See if we have a sync as well
+          if (comment.find("sync") != std::string::npos) {
+            //printf("Inserting sync after current block\n");
+            sync = true;
+          }
+          
+        }
+        if (idxSym) {
+          SgForInitStatement* list =
+            isSgForStatement(*it)->get_for_init_stmt();
+          SgStatementPtrList& initStatements = list->get_init_stmt();
+          SgStatementPtrList::const_iterator j =
+            initStatements.begin();
+          const SgVariableSymbol* index;
+          
+          if (SgExprStatement *expr = isSgExprStatement(*j))
+            if (SgAssignOp* op = isSgAssignOp(
+                  expr->get_expression()))
+              if (SgVarRefExp* var_ref = isSgVarRefExp(
+                    op->get_lhs_operand()))
+                index = var_ref->get_symbol();
+          
+          std::vector<SgVarRefExp *> array = substitute(*it, index,
+                                                        NULL, isSgNode(body_syms));
+          
+          for (int j = 0; j < array.size(); j++)
+            array[j]->set_symbol(idxSym);
+          
+        }
+        
+        SgStatement* body_ =
+          isSgStatement(
+            recursiveFindReplacePreferedIdxs(
+              isSgNode(
+                (isSgForStatement(*it)->get_loop_body())),
+              body_syms, param_syms, body, loop_idxs,
+              globalscope));
+        
+        omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it);
+        omega::CG_outputRepr* block = tnl->clone();
+        tnli =
+          static_cast<const omega::CG_roseRepr *>(block)->GetCode();
+        
+        isSgForStatement(tnli)->set_loop_body(body_);
+        body_->set_parent(tnli);
+        if (idxSym) {
+          SgForInitStatement* list =
+            isSgForStatement(tnli)->get_for_init_stmt();
+          SgStatementPtrList& initStatements = list->get_init_stmt();
+          SgStatementPtrList::const_iterator j =
+            initStatements.begin();
+          const SgVariableSymbol* index;
+          
+          if (SgExprStatement *expr = isSgExprStatement(*j))
+            if (SgAssignOp* op = isSgAssignOp(
+                  expr->get_expression()))
+              if (SgVarRefExp* var_ref = isSgVarRefExp(
+                    op->get_lhs_operand()))
+                index = var_ref->get_symbol();
+          
+          std::vector<SgVarRefExp *> array = substitute(tnli, index,
+                                                        NULL, isSgNode(body_syms));
+          
+          for (int j = 0; j < array.size(); j++)
+            array[j]->set_symbol(idxSym);
+        }
+        idxSym = 0;
+        //  std::cout << isSgNode(body_)->unparseToString() << "\n\n";
+        if (att != NULL)
+          tnli->setAttribute("omega_comment", att);
+        clone->append_statement(isSgStatement(tnli));
+        if (sync) {
+          SgName name_syncthreads("__syncthreads");
+          SgFunctionSymbol * syncthreads_symbol =
+            globalscope->lookup_function_symbol(
+              name_syncthreads);
+          
+          // Create a call to __syncthreads():
+          SgFunctionCallExp * syncthreads_call = buildFunctionCallExp(
+            syncthreads_symbol, buildExprListExp());
+          
+          SgExprStatement* stmt = buildExprStatement(
+            syncthreads_call);
+          
+          /*    if (SgBasicBlock* bb = isSgBasicBlock(
+                isSgForStatement(code)->get_loop_body()))
+                appendStatement(isSgStatement(stmt), bb);
+                
+                else if (SgStatement* ss = isSgStatement(
+                isSgForStatement(code)->get_loop_body())) {
+                SgBasicBlock* bb2 = buildBasicBlock();
+                
+                isSgNode(ss)->set_parent(bb2);
+                appendStatement(ss, bb2);
+                
+                appendStatement(isSgStatement(stmt), bb2);
+                isSgNode(stmt)->set_parent(bb2);
+                isSgForStatement(code)->set_loop_body(bb2);
+                isSgNode(bb2)->set_parent(code);
+                }
+          */
+          
+          //SgBasicBlock* bb2 = buildBasicBlock();
+          clone->append_statement(stmt);
+          /* SgNode* parent = code->get_parent();
+             if(!isSgStatement(parent))
+             throw loop_error("Parent not a statement");
+             
+             if(isSgForStatement(parent)){
+             if(SgStatement *ss = isSgForStatement(isSgForStatement(parent)->get_loop_body())){
+             omega::CG_roseRepr * tnl = new omega::CG_roseRepr(ss);
+             omega::CG_outputRepr* block= tnl->clone();
+             
+             SgNode *new_ss = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
+             SgBasicBlock* bb2 = buildBasicBlock();
+             
+             isSgNode(new_ss)->set_parent(bb2);
+             appendStatement(isSgStatement(new_ss), bb2);
+             appendStatement(isSgStatement(stmt), bb2);
+             isSgNode(stmt)->set_parent(bb2);
+             
+             isSgStatement(parent)->replace_statement_from_basicBlock(ss, isSgStatement(bb2));
+             
+             }else if(isSgBasicBlock(isSgForStatement(parent)->get_loop_body()))
+             isSgStatement(isSgForStatement(parent)->get_loop_body())->insert_statement(isSgStatement(code), stmt, false);
+             else
+             throw loop_error("parent statement type undefined!!");
+             
+             }
+             else if(isSgBasicBlock(parent))
+             isSgStatement(parent)->insert_statement(isSgStatement(code), stmt, false);
+             else
+             throw loop_error("parent statement type undefined!!");
+             
+             //tnl->print();
+             *
+             *
+             */
+          sync = true;
+          //    return isSgNode(bb2);
+          
+        }
+        
+        //  return tnli;
+      } else if (isSgIfStmt(*it)) {
+        SgStatement* body_ = isSgStatement(
+          recursiveFindReplacePreferedIdxs(
+            isSgNode((isSgIfStmt(*it)->get_true_body())),
+            body_syms, param_syms, body, loop_idxs,
+            globalscope));
+        
+        omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it);
+        omega::CG_outputRepr* block = tnl->clone();
+        tnli1 =
+          static_cast<const omega::CG_roseRepr *>(block)->GetCode();
+        
+        isSgIfStmt(tnli1)->set_true_body(body_);
+        
+        if ((isSgIfStmt(*it)->get_false_body()))
+          isSgIfStmt(tnli1)->set_false_body(
+            isSgStatement(
+              recursiveFindReplacePreferedIdxs(
+                isSgNode(
+                  (isSgIfStmt(*it)->get_false_body())),
+                body_syms, param_syms, body,
+                loop_idxs, globalscope)));
+        
+        clone->append_statement(isSgStatement(tnli1));
+        //return tnli;
+      } else if (isSgStatement(*it)) {
+        omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it);
+        omega::CG_outputRepr* block = tnl->clone();
+        tnli2 =
+          static_cast<const omega::CG_roseRepr *>(block)->GetCode();
+        
+        clone->append_statement(isSgStatement(tnli2));
+        //return tnli;
+        
+      }
+    }
+    
+    return isSgNode(clone);
+    
+  }
+  
+  /*    if (!isSgBasicBlock(
+        recursiveFindReplacePreferedIdxs(isSgNode(*it), body_syms,
+        param_syms, body, loop_idxs, globalscope))) {
+        SgStatement *to_push = isSgStatement(
+        recursiveFindReplacePreferedIdxs(isSgNode(*it),
+        body_syms, param_syms, body, loop_idxs,
+        globalscope, sync));
+        clone->append_statement(to_push);
+        
+        if ((sync_found) && isSgForStatement(to_push)) {
+        SgName name_syncthreads("__syncthreads");
+        SgFunctionSymbol * syncthreads_symbol =
+        globalscope->lookup_function_symbol(
+        name_syncthreads);
+        
+        // Create a call to __syncthreads():
+        SgFunctionCallExp * syncthreads_call = buildFunctionCallExp(
+        syncthreads_symbol, buildExprListExp());
+        
+        SgExprStatement* stmt = buildExprStatement(
+        syncthreads_call);
+        
+        clone->append_statement(isSgStatement(stmt));
+        }
+        //  std::cout<<isSgNode(*it)->unparseToString()<<"\n\n";
+        } else {
+        
+        SgStatementPtrList& tnl2 = isSgBasicBlock(
+        recursiveFindReplacePreferedIdxs(isSgNode(*it),
+        body_syms, param_syms, body, loop_idxs,
+        globalscope))->get_statements();
+        for (SgStatementPtrList::const_iterator it2 = tnl2.begin();
+        it2 != tnl2.end(); it2++) {
+        clone->append_statement(*it2);
+        
+        sync_found = true;
+        //  std::cout<<isSgNode(*it2)->unparseToString()<<"\n\n";
+        }
+        }
+        
+        }
+        return isSgNode(clone);
+        }
+  */
+//  return tnl;
+}
+
+// loop_vars -> array references
+// loop_idxs -> <idx_name,idx_sym> map for when we encounter a loop with a different preferredIndex
+// dim_vars -> out param, fills with <old,new> var_sym pair for 2D array dimentions (messy stuff)
+SgNode* swapVarReferences(SgNode* code,
+                          std::set<const SgVariableSymbol *>& syms, SgSymbolTable* param,
+                          SgSymbolTable* body, SgScopeStatement* body_stmt) {
+  //Iterate over every expression, looking up each variable and type
+  //reference used and possibly replacing it or adding it to our symbol
+  //table
+  //
+  //We use the built-in cloning helper methods to seriously help us with this!
+  
+  //Need to do a recursive mark
+  
+  std::set<const SgVariableSymbol *>::iterator myIterator;
+  for (myIterator = syms.begin(); myIterator != syms.end(); myIterator++) {
+    SgName var_name = (*myIterator)->get_name();
+    std::string x = var_name.getString();
+    
+    if ((param->find_variable(var_name) == NULL)
+        && (body->find_variable(var_name) == NULL)) {
+      SgInitializedName* decl = (*myIterator)->get_declaration();
+      
+      SgVariableSymbol* dvs = new SgVariableSymbol(decl);
+      SgVariableDeclaration* var_decl = buildVariableDeclaration(
+        dvs->get_name(), dvs->get_type());
+      
+      AstTextAttribute* att = (AstTextAttribute*) (isSgNode(
+                                                     decl->get_declaration())->getAttribute("__shared__"));
+      if (isSgNode(decl->get_declaration())->attributeExists(
+            "__shared__"))
+        var_decl->get_declarationModifier().get_storageModifier().setCudaShared();
+      
+      appendStatement(var_decl, body_stmt);
+      
+      dvs->set_parent(body);
+      body->insert(var_name, dvs);
+    }
+    
+    std::vector<SgVarRefExp *> array = substitute(code, *myIterator, NULL,
+                                                  isSgNode(body));
+    
+    SgVariableSymbol* var = (SgVariableSymbol*) (*myIterator);
+    for (int j = 0; j < array.size(); j++)
+      array[j]->set_symbol(var);
+  }
+  
+  return code;
+}
+
+bool LoopCuda::validIndexes(int stmt, const std::vector<std::string>& idxs) {
+  for (int i = 0; i < idxs.size(); i++) {
+    bool found = false;
+    for (int j = 0; j < idxNames[stmt].size(); j++) {
+      if (strcmp(idxNames[stmt][j].c_str(), idxs[i].c_str()) == 0) {
+        found = true;
+      }
+    }
+    if (!found) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool LoopCuda::cudaize_v2(std::string kernel_name,
+                          std::map<std::string, int> array_dims,
+                          std::vector<std::string> blockIdxs,
+                          std::vector<std::string> threadIdxs) {
+  CG_outputBuilder *ocg = ir->builder();
+  int stmt_num = 0;
+  if (cudaDebug) {
+    printf("cudaize_v2(%s, {", kernel_name.c_str());
+    //for(
+    printf("}, blocks={");
+    printVs(blockIdxs);
+    printf("}, thread={");
+    printVs(threadIdxs);
+    printf("})\n");
+  }
+  
+  this->array_dims = array_dims;
+  if (!validIndexes(stmt_num, blockIdxs)) {
+    throw std::runtime_error("One of the indexes in the block list was not "
+                             "found in the current set of indexes.");
+  }
+  if (!validIndexes(stmt_num, threadIdxs)) {
+    throw std::runtime_error(
+      "One of the indexes in the thread list was not "
+      "found in the current set of indexes.");
+  }
+  if (blockIdxs.size() == 0)
+    throw std::runtime_error("Cudaize: Need at least one block dimention");
+  int block_level = 0;
+  //Now, we will determine the actual size (if possible, otherwise
+  //complain) for the block dimentions and thread dimentions based on our
+  //indexes and the relations for our stmt;
+  for (int i = 0; i < blockIdxs.size(); i++) {
+    int level = findCurLevel(stmt_num, blockIdxs[i]);
+    int ub, lb;
+    CG_outputRepr* ubrepr = extractCudaUB(stmt_num, level, ub, lb);
+    if (lb != 0) {
+      //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
+      if (cudaDebug)
+        printf(
+          "Cudaize: doing tile at level %d to try and normalize lower bounds\n",
+          level);
+      tile(stmt_num, level, 1, level, CountedTile);
+      idxNames[stmt_num].insert(idxNames[stmt_num].begin() + (level), ""); //TODO: possibly handle this for all sibling stmts
+      ubrepr = extractCudaUB(stmt_num, level, ub, lb);
+    }
+    if (lb != 0) {
+      char buf[1024];
+      sprintf(buf,
+              "Cudaize: Loop at level %d does not have 0 as it's lower bound",
+              level);
+      throw std::runtime_error(buf);
+    }
+    if (ub < 0) {
+      char buf[1024];
+      sprintf(buf,
+              "Cudaize: Loop at level %d does not have a hard upper bound",
+              level);
+      //Anand: Commenting out error indication for lack of constant upper bound
+      //throw std::runtime_error(buf);
+    }
+    if (cudaDebug)
+      printf("block idx %s level %d lb: %d ub %d\n", blockIdxs[i].c_str(),
+             level, lb, ub);
+    if (i == 0) {
+      block_level = level;
+      if (ubrepr == NULL) {
+        cu_bx = ub + 1;
+        cu_bx_repr = NULL;
+      } else {
+        cu_bx = 0;
+        cu_bx_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
+      }
+      idxNames[stmt_num][level - 1] = "bx";
+    } else if (i == 1) {
+      if (ubrepr == NULL) {
+        cu_by = ub + 1;
+        cu_by_repr = NULL;
+      } else {
+        cu_by = 0;
+        cu_by_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
+      }
+      idxNames[stmt_num][level - 1] = "by";
+    }
+  }
+  if (!cu_by && !cu_by_repr)
+    block_level = 0;
+  int thread_level1 = 0;
+  int thread_level2 = 0;
+  for (int i = 0; i < threadIdxs.size(); i++) {
+    int level = findCurLevel(stmt_num, threadIdxs[i]);
+    int ub, lb;
+    CG_outputRepr* ubrepr = extractCudaUB(stmt_num, level, ub, lb);
+    if (lb != 0) {
+      //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
+      if (cudaDebug)
+        printf(
+          "Cudaize: doing tile at level %d to try and normalize lower bounds\n",
+          level);
+      tile(stmt_num, level, 1, level, CountedTile);
+      idxNames[stmt_num].insert(idxNames[stmt_num].begin() + (level), "");
+      ubrepr = extractCudaUB(stmt_num, level, ub, lb);
+    }
+    if (lb != 0) {
+      char buf[1024];
+      sprintf(buf,
+              "Cudaize: Loop at level %d does not have 0 as it's lower bound",
+              level);
+      throw std::runtime_error(buf);
+    }
+    if (ub < 0) {
+      char buf[1024];
+      sprintf(buf,
+              "Cudaize: Loop at level %d does not have a hard upper bound",
+              level);
+      //Anand: Commenting out error indication for lack of constant upper bound
+      //throw std::runtime_error(buf);
+    }
+    
+    if (cudaDebug)
+      printf("thread idx %s level %d lb: %d ub %d\n",
+             threadIdxs[i].c_str(), level, lb, ub);
+    if (i == 0) {
+      thread_level1 = level;
+      if (ubrepr == NULL) {
+        cu_tx = ub + 1;
+        cu_tx_repr = NULL;
+      } else {
+        cu_tx = 0;
+        cu_tx_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
+      }
+      idxNames[stmt_num][level - 1] = "tx";
+    } else if (i == 1) {
+      thread_level2 = level;
+      if (ubrepr == NULL) {
+        cu_ty = ub + 1;
+        cu_ty_repr = NULL;
+      } else {
+        cu_ty = 0;
+        cu_ty_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
+      }
+      idxNames[stmt_num][level - 1] = "ty";
+    } else if (i == 2) {
+      if (ubrepr == NULL) {
+        cu_tz = ub + 1;
+        cu_tz_repr = NULL;
+      } else {
+        cu_tz = 0;
+        cu_tz_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
+      }
+      idxNames[stmt_num][level - 1] = "tz";
+    }
+  }
+  if (!cu_ty && !cu_ty_repr)
+    thread_level1 = 0;
+  if (!cu_tz && !cu_tz_repr)
+    thread_level2 = 0;
+  
+  //Make changes to nonsplitlevels
+  const int m = stmt.size();
+  for (int i = 0; i < m; i++) {
+    if (block_level) {
+      //stmt[i].nonSplitLevels.append((block_level)*2);
+      stmt_nonSplitLevels[i].push_back((block_level) * 2);
+    }
+    if (thread_level1) {
+      //stmt[i].nonSplitLevels.append((thread_level1)*2);
+      stmt_nonSplitLevels[i].push_back((thread_level1) * 2);
+    }
+    if (thread_level2) {
+      //stmt[i].nonSplitLevels.append((thread_level1)*2);
+      stmt_nonSplitLevels[i].push_back((thread_level1) * 2);
+    }
+  }
+  
+  if (cudaDebug) {
+    printf("Codegen: current names: ");
+    printVS(idxNames[stmt_num]);
+  }
+  //Set codegen flag
+  code_gen_flags |= GenCudaizeV2;
+  
+  //Save array dimention sizes
+  this->array_dims = array_dims;
+  cu_kernel_name = kernel_name.c_str();
+  
+}
+
+/*
+ * setupConstantVar
+ * handles constant variable declaration
+ * and adds a global constant variable
+ * parameters:
+ *   constant - the constant_memory_mapping object for this loop
+ *   arr_def  - the VarDefs object for the mapped variable
+ *   globals  - Rose Global variables
+ *   i        - an index to keep new variable names unique
+ *   symtab   - global symbol table
+ */
+static void setupConstantVar(constant_memory_mapping* constant, VarDefs* arr_def, SgGlobal* globals, int i, SgSymbolTable* symtab) {
+  char* buf1 = new char[32];
+  snprintf(buf1, 32, "cs%dRef", i+1);
+  arr_def->secondName = buf1;
+  
+  char buf2[64];
+  snprintf(buf2, 64, "__device__ __constant__ float");
+  
+  SgVariableDeclaration* consvar_decl = buildVariableDeclaration(
+                     SgName(std::string(buf1)), buildArrayType(
+                                 buildOpaqueType(SgName(buf2),globals),
+                                 arr_def->size_expr));
+  SgInitializedNamePtrList& variables = consvar_decl->get_variables();
+  SgInitializedNamePtrList::const_iterator j = variables.begin();
+  SgInitializedName* initializedName = *j;
+  SgVariableSymbol* consvar_sym = new SgVariableSymbol(initializedName);
+  prependStatement(consvar_decl, globals);
+
+  consvar_sym->set_parent(symtab);
+  symtab->insert(SgName(std::string(buf1)), consvar_sym);
+  
+  constant->set_mapped_symbol(arr_def->original_name.c_str(), consvar_sym);
+  constant->set_vardef(arr_def->original_name.c_str(), arr_def);
+}
+
+/*
+ * cudaBindConstantVar
+ * allocs a variable to constant memory
+ *   constant  - the constant mapping object
+ *   arr_def   - the VarDefs abject
+ *   globals   - global symbol table
+ *   stmt_list - the GPU functions' statement list
+ */
+static void cudaBindConstantVar(constant_memory_mapping* constant, VarDefs* arr_def, SgGlobal* globals, SgStatementPtrList* stmt_list) {
+  SgName cudaMemcpyToSymbol_name("cudaMemcpyToSymbol");
+  SgFunctionDeclaration* cudaMemcpyToSymbol_decl = buildNondefiningFunctionDeclaration(
+    cudaMemcpyToSymbol_name, buildVoidType(), buildFunctionParameterList(), globals);
+  SgExprListExp* args = buildExprListExp();
+  args->append_expression(buildCastExp(constant->get_mapped_symbol_exp(arr_def->original_name.c_str()),
+                                       buildPointerType(buildVoidType())));
+  args->append_expression(buildVarRefExp(arr_def->in_data));
+  args->append_expression(arr_def->size_expr);
+  stmt_list->push_back(buildExprStatement(
+    buildFunctionCallExp(buildFunctionRefExp(cudaMemcpyToSymbol_decl), args)));
+}
+
+static void consmapArrayRefs(constant_memory_mapping* constant, std::vector<IR_ArrayRef*>* refs, SgGlobal* globals, IR_Code* ir, CG_roseBuilder* ocg) {
+  // if constant mapping is not being used, ignore this function
+  if(constant == NULL) return;
+  for(int i = 0; i < refs->size(); i++) {
+    IR_ArrayRef* aref = (*refs)[i];
+    if(constant->is_array_mapped(aref->name().c_str())) {
+      // get array reference dimensions
+      int dims = aref->symbol()->n_dim();
+      if(dims > 2) {
+        printf(" \n CHiLL does not handle constant memory mapping for more than 2D arrays.\n");
+        return;
+      }
+      
+      SgExpression* varexp = constant->get_mapped_symbol_exp(aref->name().c_str());
+      SgExpression* index_exp;
+      // build index expression
+      if(dims == 1) {
+        index_exp = static_cast<omega::CG_roseRepr*>(aref->index(0)->clone())->GetExpression();
+      }
+      if(dims == 2) {
+        VarDefs* arr_def = constant->get_vardef(aref->name().c_str());
+        CG_outputRepr* i0 = aref->index(0)->clone();
+        CG_outputRepr* i1 = aref->index(1)->clone();
+        CG_outputRepr* sz = new CG_roseRepr(buildIntVal(arr_def->size_multi_dim[0]));
+        CG_outputRepr* exp = ocg->CreatePlus(ocg->CreateTimes(sz->clone(), i0), i1);
+        index_exp = static_cast<omega::CG_roseRepr*>(exp->clone())->GetExpression();
+      }
+      ir->ReplaceExpression(aref, new CG_roseRepr(buildPntrArrRefExp(varexp, index_exp)));
+    }
+  }
+}
+
+/*
+ * setupTexmappingVar
+ * handles texture variable declaration
+ * and adds a global texture object 
+ * parameters:
+ *   texture    - the texture_memory_mapping object
+ *   arr_def    - the VarDefs object for the mapped variable
+ *   globals    - Rose Global variables
+ *   i          - an index to keep the new variable names unique
+ *   devptr_sym - the devptr that the original variable is associated with
+ *   symtab     - GPU function symbol table
+ */
+static void setupTexmappingVar(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals, int i, SgVariableSymbol* devptr_sym, SgSymbolTable* symtab) {  
+  char* buf1 = new char[32];
+  snprintf(buf1, 32, "tex%dRef", i+1);
+  arr_def->secondName = buf1;
+  
+  char buf2[64];
+  // single-dimensional 
+  snprintf(buf2, 64, "texture<float, %d, cudaReadModeElementType>", 1);
+  // multi-dimensional
+  // snprintf(buf2, 64, "texture<float, %d, cudaReadModeElemetType>", (int)(arr_def->size_multi_dim.size())); //*/
+  
+  SgVariableDeclaration* texvar_decl = buildVariableDeclaration(SgName(std::string(buf1)), buildOpaqueType(buf2, globals));
+  
+  SgInitializedNamePtrList& variables = texvar_decl->get_variables();
+  SgInitializedNamePtrList::const_iterator j = variables.begin();
+  SgInitializedName* initializedName = *j;
+  SgVariableSymbol* texvar_sym = new SgVariableSymbol(initializedName);
+  prependStatement(texvar_decl, globals);
+  
+  texvar_sym->set_parent(symtab);
+  symtab->insert(SgName(buf1), texvar_sym);
+  
+  texture->set_mapped_symbol(arr_def->original_name.c_str(), texvar_sym);
+  texture->set_devptr_symbol(arr_def->original_name.c_str(), devptr_sym);
+  texture->set_vardef(arr_def->original_name.c_str(), arr_def);
+}
+
+
+/*
+ * One dimensional version of cudaBindTexture
+ * see cudaBindTexture for details
+ */
+static SgFunctionCallExp* cudaBindTexture1D(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals) {
+  SgName cudaBindTexture_name("cudaBindTexture");
+  SgFunctionDeclaration* cudaBindTexture_decl = buildNondefiningFunctionDeclaration(
+      cudaBindTexture_name, buildVoidType(), buildFunctionParameterList(), globals);
+  
+  SgExprListExp* args = buildExprListExp();
+  args->append_expression(buildIntVal(0));
+  args->append_expression(texture->get_mapped_symbol_exp(arr_def->original_name.c_str()));
+  args->append_expression(texture->get_devptr_symbol_exp(arr_def->original_name.c_str()));
+  args->append_expression(arr_def->size_expr);
+  return buildFunctionCallExp(buildFunctionRefExp(cudaBindTexture_decl), args);
+}
+
+/*
+ * Two dimensional version of cudaBindTexture
+ * see cudaBindTexture for details
+ */
+//static SgFunctionCallExp* cudaBindTexture2D(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals) {
+//  SgName cudaBindTexture_name("cudaBindTexture2D");
+//  SgFunctionDeclaration* cudaBindTexture_decl = buildNondefiningFunctionDeclaration(
+//      cudaBindTexture_name, buildVoidType(), buildFunctionParameterList(), globals);
+//  
+//  SgExprListExp* args = buildExprListExp();
+//  args->append_expression(buildIntVal(0));
+//  args->append_expression(texture->get_tex_mapped_symbol_exp(arr_def->original_name.c_str()));
+//  args->append_expression(texture->get_devptr_symbol_exp(arr_def->original_name.c_str()));
+//  args->append_expression(buildIntVal(texture->get_dim_length(arr_def->original_name.c_str(), 0)));
+//  args->append_expression(buildIntVal(texture->get_dim_length(arr_def->original_name.c_str(), 1)));
+//  args->append_expression(arr_def->size_expr);
+//  return buildFunctionCallExp(buildFunctionRefExp(cudaBindTexture_decl), args);
+//}
+
+/*
+ * cudaBindTexture
+ * binds a variable to a texture
+ * parameters:
+ *    texture   - the texture mapping object
+ *    arr_def   - the VarDefs object
+ *    globals   - global symbol table
+ *    stmt_list - the GPU functions' statement list
+ * notes:
+ *    only supports binding 1D textures, may need to consider cudaBindTexture2D for 2D textures
+ */
+static void cudaBindTexture(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals, SgStatementPtrList* stmt_list) {
+  //int dims = (int)(arr_def->size_multi_dim.size());
+  //int dims = texture->get_dims(arr_def->original_name.c_str());
+  //if(dims == 1)
+    stmt_list->push_back(
+      buildExprStatement(cudaBindTexture1D(texture, arr_def, globals)));
+  //if(dims == 2)
+  //  stmt_list->push_back(
+  //    buildExprStatement(cudaBindTexture2D(texture, arr_def, globals)));
+}
+
+/*
+ * texmapArrayRefs
+ * maps array reference expresions of texture mapped variables to the tex1D function
+ * parameters:
+ *    texture - the texture mapping object
+ *    refs    - a list of all array read operations
+ *    globals - global symbol table
+ *    ir      - handles IR_Code operations
+ *    ocg     - handles CG_roseBuilder operations
+**/
+static void texmapArrayRefs(texture_memory_mapping* texture, std::vector<IR_ArrayRef*>* refs, SgGlobal* globals, IR_Code* ir, CG_roseBuilder *ocg) {
+  // if texture mapping is not being used, ignore this function
+  if(texture == NULL) return;
+  for(int i = 0; i < refs->size(); i++) {
+    IR_ArrayRef* aref = (*refs)[i];
+    if(texture->is_array_mapped(aref->name().c_str())) {
+      
+      // get array dimensions
+      VarDefs* arr_def = texture->get_vardef(aref->name().c_str());
+      int dims = aref->symbol()->n_dim();
+      if(dims > 2) {
+        printf(" \n CHiLL does not handle texture mapping for more than 2D arrays.\n");
+        // TODO throw some sort of error. or handle in texture_copy function
+        return;
+      }
+      
+      // build texture lookup function declaration
+      char texNDfetch_strName[16];
+      sprintf(texNDfetch_strName, "tex%dDfetch", 1); // for now, only support tex1Dfetch
+      //sprintf(texNDfetch_strName, "tex%dDfetch", dims);
+      SgFunctionDeclaration* fetch_decl = buildNondefiningFunctionDeclaration(
+          SgName(texNDfetch_strName), buildFloatType(), buildFunctionParameterList(), globals);
+      
+      // build args
+      SgExprListExp* args = buildExprListExp();
+      args->append_expression(texture->get_mapped_symbol_exp(aref->name().c_str()));
+      
+      // set indexing args
+      //for(int i = 0; i < dims; i++) {
+      //  args->append_expression((static_cast<omega::CG_roseRepr*>(aref->index(i)->clone()))->GetExpression());
+      //}
+      if(dims == 1) {
+        args->append_expression(static_cast<omega::CG_roseRepr*>(aref->index(0)->clone())->GetExpression());
+      }
+      else if(dims == 2) {
+        CG_outputRepr* i0 = aref->index(0)->clone();
+        CG_outputRepr* i1 = aref->index(1)->clone();
+        CG_outputRepr* sz = new CG_roseRepr(buildIntVal(arr_def->size_multi_dim[0]));
+        CG_outputRepr* expr = ocg->CreatePlus(ocg->CreateTimes(sz->clone(), i0), i1);
+        args->append_expression(static_cast<omega::CG_roseRepr*>(expr->clone())->GetExpression());
+      }
+      
+      // build function call and replace original array ref
+      SgFunctionCallExp* fetch_call = buildFunctionCallExp(buildFunctionRefExp(fetch_decl), args);
+      ir->ReplaceExpression(aref, new CG_roseRepr(fetch_call));
+    }
+  }
+}
+
+SgNode* LoopCuda::cudaize_codegen_v2() {
+    if(cudaDebug)
+	printf("cudaize codegen V2\n");
+  CG_roseBuilder *ocg = dynamic_cast<CG_roseBuilder*>(ir->builder());
+  if (!ocg)
+    return false;
+
+ //protonu--adding an annote to track texture memory type
+  //ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE);
+  //ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE);
+  int tex_mem_on = 0;
+  int cons_mem_on = 0;
+
+
+  
+  CG_outputRepr* repr;
+  std::vector<VarDefs> arrayVars;
+  std::vector<VarDefs> localScopedVars;
+  
+  std::vector<IR_ArrayRef *> ro_refs;
+  std::vector<IR_ArrayRef *> wo_refs;
+  std::set<std::string> uniqueRefs;
+  std::set<std::string> uniqueWoRefs;
+  std::set<const SgVariableSymbol *> syms;
+  std::set<const SgVariableSymbol *> psyms;
+  std::set<const SgVariableSymbol *> pdSyms;
+  SgStatementPtrList* replacement_list = new SgStatementPtrList;
+  
+  for (int j = 0; j < stmt.size(); j++) {
+    std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[j].code);
+    for (int i = 0; i < refs.size(); i++) {
+      //printf("ref %s wo %d\n", static_cast<const char*>(refs[i]->name()), refs[i]->is_write());
+      SgVariableSymbol* var = body_symtab->find_variable(
+        SgName((char*) refs[i]->name().c_str()));
+      SgVariableSymbol* var2 = parameter_symtab->find_variable(
+        SgName((char*) refs[i]->name().c_str()));
+      
+      //If the array is not a parameter, then it's a local array and we
+      //want to recreate it as a stack variable in the kernel as opposed to
+      //passing it in.
+      if (var != NULL) {
+        //anand-- needs modification, if variable is parameter it wont be part of the
+        // block's symbol table but the functiond definition's symbol table
+        
+        continue;
+      }
+      if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end()) {
+        
+        uniqueRefs.insert(refs[i]->name());
+        if (refs[i]->is_write()) {
+          uniqueWoRefs.insert(refs[i]->name());
+          wo_refs.push_back(refs[i]);
+        } else
+          ro_refs.push_back(refs[i]);
+      }
+      if (refs[i]->is_write()
+          && uniqueWoRefs.find(refs[i]->name())
+          == uniqueWoRefs.end()) {
+        uniqueWoRefs.insert(refs[i]->name());
+        wo_refs.push_back(refs[i]);
+        //printf("adding %s to wo\n", static_cast<const char*>(refs[i]->name()));
+      }
+      pdSyms.insert((const SgVariableSymbol*) var2);
+    }
+  }
+  
+  if (cudaDebug) {
+      printf("reading from array ");
+      for (int i = 0; i < ro_refs.size(); i++)
+	  printf("'%s' ", ro_refs[i]->name().c_str());
+      printf("and writing to array ");
+      for (int i = 0; i < wo_refs.size(); i++)
+	  printf("'%s' ", wo_refs[i]->name().c_str());
+      printf("\n");
+  }  
+  const char* gridName = "dimGrid";
+  const char* blockName = "dimBlock";
+  
+  //TODO: Could allow for array_dims_vars to be a mapping from array
+  //references to to variable names that define their length.
+  SgVariableSymbol* dim1 = 0;
+  SgVariableSymbol* dim2 = 0;
+  
+  for (int i = 0; i < wo_refs.size(); i++) {
+    //TODO: Currently assume all arrays are floats of one or two dimentions
+    SgVariableSymbol* outArray = 0;
+    std::string name = wo_refs[i]->name();
+    outArray = body_symtab->find_variable(SgName((char*) name.c_str()));
+    int size_n_d;
+    if (outArray == NULL)
+      outArray = parameter_symtab->find_variable(
+        SgName((char*) name.c_str()));
+    
+    VarDefs v;
+    v.size_multi_dim = std::vector<int>();
+    char buf[32];
+    snprintf(buf, 32, "devO%dPtr", i + 1);
+    v.name = buf;
+    if (isSgPointerType(outArray->get_type())) {
+      if (isSgArrayType(
+            isSgNode(
+              isSgPointerType(outArray->get_type())->get_base_type()))) {
+        //  v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type();
+        SgType* t =
+          isSgPointerType(outArray->get_type())->get_base_type();
+        /*   SgExprListExp* dimList = t->get_dim_info();
+             SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
+             SgExpression* expr=NULL;
+             for (; j != dimList->get_expressions().end(); j++)
+             expr = *j;
+        */
+        while (isSgArrayType(t))
+          t = isSgArrayType(t)->get_base_type();
+        
+        if (!isSgType(t)) {
+          char buf[1024];
+          sprintf(buf, "CudaizeCodeGen: Array type undetected!");
+          throw std::runtime_error(buf);
+          
+        }
+        
+        v.type = t;
+      } else
+        v.type = isSgPointerType(outArray->get_type())->get_base_type();
+    } else if (isSgArrayType(outArray->get_type())) {
+      if (isSgArrayType(
+            isSgNode(
+              isSgArrayType(outArray->get_type())->get_base_type()))) {
+        //  v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type();
+        SgType* t =
+          isSgArrayType(outArray->get_type())->get_base_type();
+        /*   SgExprListExp* dimList = t->get_dim_info();
+             SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
+             SgExpression* expr=NULL;
+             for (; j != dimList->get_expressions().end(); j++)
+             expr = *j;
+        */
+        while (isSgArrayType(t))
+          t = isSgArrayType(t)->get_base_type();
+        
+        if (!isSgType(t)) {
+          char buf[1024];
+          sprintf(buf, "CudaizeCodeGen: Array type undetected!");
+          throw std::runtime_error(buf);
+          
+        }
+        
+        v.type = t;
+      } else
+        v.type = isSgArrayType(outArray->get_type())->get_base_type();
+    } else
+      v.type = buildFloatType();
+    v.tex_mapped = false;
+    v.cons_mapped = false;
+    v.original_name = wo_refs[i]->name();
+    //Size of the array = dim1 * dim2 * num bytes of our array type
+    
+    //If our input array is 2D (non-linearized), we want the actual
+    //dimentions of the array
+    CG_outputRepr* size;
+    //Lookup in array_dims
+    std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
+    if (isSgPointerType(outArray->get_type())
+        && isSgArrayType(
+          isSgNode(
+            isSgPointerType(outArray->get_type())->get_base_type()))) {
+      SgType* t = isSgPointerType(outArray->get_type())->get_base_type();
+      /*   SgExprListExp* dimList = t->get_dim_info();
+           SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
+           SgExpression* expr=NULL;
+           for (; j != dimList->get_expressions().end(); j++)
+           expr = *j;
+      */
+      if (isSgIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedLongVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgLongLongIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedLongLongIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgAddOp(isSgArrayType(t)->get_index())) {
+        SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
+        
+        SgExpression *lhs = op_add->get_lhs_operand();
+        SgExpression *rhs = op_add->get_rhs_operand();
+        
+        if (isSgIntVal(lhs))
+          size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
+        else if (isSgUnsignedIntVal(lhs))
+          size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
+            + (int) isSgUnsignedIntVal(rhs)->get_value();
+        else if (isSgUnsignedLongVal(lhs))
+          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongIntVal(lhs))
+          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongLongIntVal(lhs))
+          size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongIntVal(lhs))
+          size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
+                            + isSgLongIntVal(rhs)->get_value());
+        else if (isSgUnsignedLongLongIntVal(lhs))
+          size_n_d =
+            (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
+                   + isSgUnsignedLongLongIntVal(rhs)->get_value());
+        
+      }
+      t = isSgArrayType(t)->get_base_type();
+      while (isSgArrayType(t)) {
+        int dim;
+        if (isSgIntVal(isSgArrayType(t)->get_index()))
+          dim =
+            (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedLongVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedLongLongIntVal(
+                   isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedLongLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgAddOp(isSgArrayType(t)->get_index())) {
+          SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
+          
+          SgExpression *lhs = op_add->get_lhs_operand();
+          SgExpression *rhs = op_add->get_rhs_operand();
+          
+          if (isSgIntVal(lhs))
+            dim = (int) isSgIntVal(lhs)->get_value()
+              + (int) (isSgIntVal(rhs)->get_value());
+          else if (isSgUnsignedIntVal(lhs))
+            dim = (int) isSgUnsignedIntVal(lhs)->get_value()
+              + (int) isSgUnsignedIntVal(rhs)->get_value();
+          else if (isSgUnsignedLongVal(lhs))
+            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongIntVal(lhs))
+            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongLongIntVal(lhs))
+            dim = (int) (isSgLongLongIntVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongIntVal(lhs))
+            dim = (int) (isSgLongIntVal(lhs)->get_value()
+                         + isSgLongIntVal(rhs)->get_value());
+          else if (isSgUnsignedLongLongIntVal(lhs))
+            dim =
+              (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
+                     + isSgUnsignedLongLongIntVal(rhs)->get_value());
+          
+        }
+        size_n_d *= dim;
+        v.size_multi_dim.push_back(dim);
+        t = isSgArrayType(t)->get_base_type();
+      }
+      //v.size_2d = (int) (isSgIntVal(t->get_index())->get_value());
+      
+      if (cudaDebug)
+	  printf("Detected Multi-dimensional array sized of %d for %s\n",
+		 size_n_d, (char*) wo_refs[i]->name().c_str());
+      size = ocg->CreateInt(size_n_d);
+    } else if (isSgArrayType(outArray->get_type())
+               && isSgArrayType(
+                 isSgNode(
+                   isSgArrayType(outArray->get_type())->get_base_type()))) {
+      SgType* t = outArray->get_type();
+      /*   SgExprListExp* dimList = t->get_dim_info();
+           SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
+           SgExpression* expr=NULL;
+           for (; j != dimList->get_expressions().end(); j++)
+           expr = *j;
+      */
+      
+      if (isSgIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedLongVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgLongLongIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedLongLongIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgAddOp(isSgArrayType(t)->get_index())) {
+        SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
+        
+        SgExpression *lhs = op_add->get_lhs_operand();
+        SgExpression *rhs = op_add->get_rhs_operand();
+        
+        if (isSgIntVal(lhs))
+          size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
+        else if (isSgUnsignedIntVal(lhs))
+          size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
+            + (int) isSgUnsignedIntVal(rhs)->get_value();
+        else if (isSgUnsignedLongVal(lhs))
+          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongIntVal(lhs))
+          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongLongIntVal(lhs))
+          size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongIntVal(lhs))
+          size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
+                            + isSgLongIntVal(rhs)->get_value());
+        else if (isSgUnsignedLongLongIntVal(lhs))
+          size_n_d =
+            (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
+                   + isSgUnsignedLongLongIntVal(rhs)->get_value());
+        
+      }
+      t = isSgArrayType(t)->get_base_type();
+      while (isSgArrayType(t)) {
+        int dim;
+        if (isSgIntVal(isSgArrayType(t)->get_index()))
+          dim =
+            (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedLongVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedLongLongIntVal(
+                   isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedLongLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgAddOp(isSgArrayType(t)->get_index())) {
+          SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
+          
+          SgExpression *lhs = op_add->get_lhs_operand();
+          SgExpression *rhs = op_add->get_rhs_operand();
+          
+          if (isSgIntVal(lhs))
+            dim = (int) isSgIntVal(lhs)->get_value()
+              + (int) (isSgIntVal(rhs)->get_value());
+          else if (isSgUnsignedIntVal(lhs))
+            dim = (int) isSgUnsignedIntVal(lhs)->get_value()
+              + (int) isSgUnsignedIntVal(rhs)->get_value();
+          else if (isSgUnsignedLongVal(lhs))
+            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongIntVal(lhs))
+            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongLongIntVal(lhs))
+            dim = (int) (isSgLongLongIntVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongIntVal(lhs))
+            dim = (int) (isSgLongIntVal(lhs)->get_value()
+                         + isSgLongIntVal(rhs)->get_value());
+          else if (isSgUnsignedLongLongIntVal(lhs))
+            dim =
+              (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
+                     + isSgUnsignedLongLongIntVal(rhs)->get_value());
+          
+        }
+        size_n_d *= dim;
+        v.size_multi_dim.push_back(dim);
+        t = isSgArrayType(t)->get_base_type();
+      }
+      
+      //v.size_2d = (int) (isSgIntVal(t->get_index())->get_value());
+      
+      if (cudaDebug)
+	  printf("Detected Multi-Dimensional array sized of %d for %s\n",
+		 size_n_d, (char*) wo_refs[i]->name().c_str());
+      size = ocg->CreateInt(size_n_d);
+    } else if (it != array_dims.end()) {
+      int ref_size = it->second;
+      //size =
+      //        ocg->CreateInt(
+      //                isSgIntVal(
+      //                        isSgArrayType(outArray->get_type())->get_index())->get_value());
+      //v.size_2d = isSgArrayType(outArray->get_type())->get_rank();
+      //v.var_ref_size = ref_size;
+      size = ocg->CreateInt(ref_size);
+      
+    } else {
+      if (dim1) {
+        size = ocg->CreateTimes(
+          new CG_roseRepr(isSgExpression(buildVarRefExp(dim1))),
+          new CG_roseRepr(isSgExpression(buildVarRefExp(dim2))));
+      } else {
+        char buf[1024];
+        sprintf(buf,
+                "CudaizeCodeGen: Array reference %s does not have a "
+                "detectable size or specififed dimentions",
+                name.c_str());
+        throw std::runtime_error(buf);
+      }
+    }
+    
+    v.size_expr =
+      static_cast<CG_roseRepr*>(ocg->CreateTimes(size,
+                                                 new omega::CG_roseRepr(
+                                                   isSgExpression(buildSizeOfOp(v.type)))))->GetExpression();
+    
+    v.in_data = 0;
+    v.out_data = outArray;
+    //Check for in ro_refs and remove it at this point
+    std::vector<IR_ArrayRef *>::iterator it_;
+    for (it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++) {
+      if ((*it_)->name() == wo_refs[i]->name()) {
+        break;
+      }
+    }
+    if (it_ != ro_refs.end()) {
+      v.in_data = outArray;
+      ro_refs.erase(it_);
+    }
+    
+    arrayVars.push_back(v);
+    
+  }
+  
+  //protonu-- assuming that all texture mapped memories were originally read only mems
+  //there should be safety checks for that, will implement those later
+  
+  for (int i = 0; i < ro_refs.size(); i++) {
+    SgVariableSymbol* inArray = 0;
+    std::string name = ro_refs[i]->name();
+    inArray = body_symtab->find_variable(SgName((char*) name.c_str()));
+    if (inArray == NULL)
+      inArray = parameter_symtab->find_variable(
+        SgName((char*) name.c_str()));
+    
+    VarDefs v;
+    v.size_multi_dim = std::vector<int>();
+    char buf[32];
+    snprintf(buf, 32, "devI%dPtr", i + 1);
+    v.name = buf;
+    int size_n_d;
+    if (isSgPointerType(inArray->get_type())) {
+      if (isSgArrayType(
+            isSgNode(
+              isSgPointerType(inArray->get_type())->get_base_type()))) {
+        
+        SgType* t =
+          isSgPointerType(inArray->get_type())->get_base_type();
+        
+        while (isSgArrayType(t))
+          t = isSgArrayType(t)->get_base_type();
+        
+        if (!isSgType(t)) {
+          char buf[1024];
+          sprintf(buf, "CudaizeCodeGen: Array type undetected!");
+          throw std::runtime_error(buf);
+          
+        }
+        v.type = t;
+      } else
+        v.type = isSgPointerType(inArray->get_type())->get_base_type();
+    } else if (isSgArrayType(inArray->get_type())) {
+      if (isSgArrayType(
+            isSgNode(
+              isSgArrayType(inArray->get_type())->get_base_type()))) {
+        
+        SgType* t = inArray->get_type();
+        while (isSgArrayType(t))
+          t = isSgArrayType(t)->get_base_type();
+        
+        if (!isSgType(t)) {
+          char buf[1024];
+          sprintf(buf, "CudaizeCodeGen: Array type undetected!");
+          throw std::runtime_error(buf);
+          
+        }
+        v.type = t;
+      } else
+        v.type = isSgArrayType(inArray->get_type())->get_base_type();
+    }
+    
+    else
+      v.type = buildFloatType();
+    
+    v.tex_mapped = false;
+    v.cons_mapped = false;
+    v.original_name = ro_refs[i]->name();
+    
+    //derick -- adding texture and constant mapping
+    if ( texture != NULL)
+      v.tex_mapped = (texture->is_array_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
+    if (v.tex_mapped){
+      printf("this variable  %s is mapped to texture memory", name.c_str());
+    }
+    //derick -- this is commented out until constant memory is implemeted
+    if ( constant_mem != NULL)
+      v.cons_mapped = (constant_mem->is_array_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
+    if (v.cons_mapped){
+      printf("this variable  %s is mapped to constant memory", name.c_str());
+    }
+    
+    //Size of the array = dim1 * dim2 * num bytes of our array type
+    //If our input array is 2D (non-linearized), we want the actual
+    //dimentions of the array (as it might be less than cu_n
+    CG_outputRepr* size;
+    //Lookup in array_dims
+    std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
+    if (isSgPointerType(inArray->get_type())
+        && isSgArrayType(
+          isSgPointerType(inArray->get_type())->get_base_type())) {
+      //SgArrayType* t = isSgArrayType(isSgArrayType(inArray->get_type())->get_base_type());
+      //v.size_2d = t->get_rank();
+      SgType* t = isSgPointerType(inArray->get_type())->get_base_type();
+      /*   SgExprListExp* dimList = t->get_dim_info();
+           SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
+           SgExpression* expr=NULL;
+           for (; j != dimList->get_expressions().end(); j++)
+           expr = *j;
+      */
+      //v.size_2d = 1;
+      if (isSgIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedLongVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgLongLongIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedLongLongIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgAddOp(isSgArrayType(t)->get_index())) {
+        SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
+        
+        SgExpression *lhs = op_add->get_lhs_operand();
+        SgExpression *rhs = op_add->get_rhs_operand();
+        
+        if (isSgIntVal(lhs))
+          size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
+        else if (isSgUnsignedIntVal(lhs))
+          size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
+            + (int) isSgUnsignedIntVal(rhs)->get_value();
+        else if (isSgUnsignedLongVal(lhs))
+          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongIntVal(lhs))
+          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongLongIntVal(lhs))
+          size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongIntVal(lhs))
+          size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
+                            + isSgLongIntVal(rhs)->get_value());
+        else if (isSgUnsignedLongLongIntVal(lhs))
+          size_n_d =
+            (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
+                   + isSgUnsignedLongLongIntVal(rhs)->get_value());
+        
+      }
+      t = isSgArrayType(t)->get_base_type();
+      while (isSgArrayType(t)) {
+        int dim;
+        if (isSgIntVal(isSgArrayType(t)->get_index()))
+          dim =
+            (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedLongVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedLongLongIntVal(
+                   isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedLongLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgAddOp(isSgArrayType(t)->get_index())) {
+          SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
+          
+          SgExpression *lhs = op_add->get_lhs_operand();
+          SgExpression *rhs = op_add->get_rhs_operand();
+          
+          if (isSgIntVal(lhs))
+            dim = (int) isSgIntVal(lhs)->get_value()
+              + (int) (isSgIntVal(rhs)->get_value());
+          else if (isSgUnsignedIntVal(lhs))
+            dim = (int) isSgUnsignedIntVal(lhs)->get_value()
+              + (int) isSgUnsignedIntVal(rhs)->get_value();
+          else if (isSgUnsignedLongVal(lhs))
+            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongIntVal(lhs))
+            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongLongIntVal(lhs))
+            dim = (int) (isSgLongLongIntVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongIntVal(lhs))
+            dim = (int) (isSgLongIntVal(lhs)->get_value()
+                         + isSgLongIntVal(rhs)->get_value());
+          else if (isSgUnsignedLongLongIntVal(lhs))
+            dim =
+              (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
+                     + isSgUnsignedLongLongIntVal(rhs)->get_value());
+          
+        }
+        size_n_d *= dim;
+        v.size_multi_dim.push_back(dim);
+        t = isSgArrayType(t)->get_base_type();
+      }
+      if (cudaDebug)
+	  printf("Detected Multi-dimensional array sized of %d for %s\n",
+		 size_n_d, (char*) ro_refs[i]->name().c_str());
+      size = ocg->CreateInt(size_n_d);
+    } else if (isSgArrayType(inArray->get_type())
+               && isSgArrayType(
+                 isSgArrayType(inArray->get_type())->get_base_type())) {
+      //SgArrayType* t = isSgArrayType(isSgArrayType(inArray->get_type())->get_base_type());
+      //v.size_2d = t->get_rank();
+      SgType* t = inArray->get_type();
+      /*   SgExprListExp* dimList = t->get_dim_info();
+           SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
+           SgExpression* expr=NULL;
+           for (; j != dimList->get_expressions().end(); j++)
+           expr = *j;
+      */
+      
+      if (isSgIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedLongVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgLongLongIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d =
+          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
+      else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
+        size_n_d = (int) (isSgUnsignedLongLongIntVal(
+                            isSgArrayType(t)->get_index())->get_value());
+      else if (isSgAddOp(isSgArrayType(t)->get_index())) {
+        SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
+        
+        SgExpression *lhs = op_add->get_lhs_operand();
+        SgExpression *rhs = op_add->get_rhs_operand();
+        
+        if (isSgIntVal(lhs))
+          size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
+        else if (isSgUnsignedIntVal(lhs))
+          size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
+            + (int) isSgUnsignedIntVal(rhs)->get_value();
+        else if (isSgUnsignedLongVal(lhs))
+          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongIntVal(lhs))
+          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongLongIntVal(lhs))
+          size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
+                            + isSgUnsignedLongVal(rhs)->get_value());
+        else if (isSgLongIntVal(lhs))
+          size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
+                            + isSgLongIntVal(rhs)->get_value());
+        else if (isSgUnsignedLongLongIntVal(lhs))
+          size_n_d =
+            (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
+                   + isSgUnsignedLongLongIntVal(rhs)->get_value());
+        
+      }
+      t = isSgArrayType(t)->get_base_type();
+      while (isSgArrayType(t)) {
+        int dim;
+        if (isSgIntVal(isSgArrayType(t)->get_index()))
+          dim =
+            (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedLongVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
+          dim = (int) (isSgLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgUnsignedLongLongIntVal(
+                   isSgArrayType(t)->get_index()))
+          dim = (int) (isSgUnsignedLongLongIntVal(
+                         isSgArrayType(t)->get_index())->get_value());
+        else if (isSgAddOp(isSgArrayType(t)->get_index())) {
+          SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
+          
+          SgExpression *lhs = op_add->get_lhs_operand();
+          SgExpression *rhs = op_add->get_rhs_operand();
+          
+          if (isSgIntVal(lhs))
+            dim = (int) isSgIntVal(lhs)->get_value()
+              + (int) (isSgIntVal(rhs)->get_value());
+          else if (isSgUnsignedIntVal(lhs))
+            dim = (int) isSgUnsignedIntVal(lhs)->get_value()
+              + (int) isSgUnsignedIntVal(rhs)->get_value();
+          else if (isSgUnsignedLongVal(lhs))
+            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongIntVal(lhs))
+            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongLongIntVal(lhs))
+            dim = (int) (isSgLongLongIntVal(lhs)->get_value()
+                         + isSgUnsignedLongVal(rhs)->get_value());
+          else if (isSgLongIntVal(lhs))
+            dim = (int) (isSgLongIntVal(lhs)->get_value()
+                         + isSgLongIntVal(rhs)->get_value());
+          else if (isSgUnsignedLongLongIntVal(lhs))
+            dim =
+              (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
+                     + isSgUnsignedLongLongIntVal(rhs)->get_value());
+          
+        }
+        size_n_d *= dim;
+        v.size_multi_dim.push_back(dim);
+        t = isSgArrayType(t)->get_base_type();
+      }
+      if (cudaDebug)
+	  printf("Detected Multi-Dimensional array sized of %d for %s\n",
+		 size_n_d, (char*) ro_refs[i]->name().c_str());
+      size = ocg->CreateInt(size_n_d);
+    }
+    
+    else if (it != array_dims.end()) {
+      int ref_size = it->second;
+      //                v.var_ref_size = ref_size;
+      size = ocg->CreateInt(ref_size);
+    } else {
+      if (dim1) {
+        size = ocg->CreateTimes(
+          new CG_roseRepr(isSgExpression(buildVarRefExp(dim1))),
+          new CG_roseRepr(isSgExpression(buildVarRefExp(dim2))));
+      } else {
+        char buf[1024];
+        sprintf(buf,
+                "CudaizeCodeGen: Array reference %s does not have a "
+                "detectable size or specififed dimentions",
+                name.c_str());
+        throw std::runtime_error(buf);
+      }
+    }
+    v.size_expr =
+      static_cast<CG_roseRepr*>(ocg->CreateTimes(size,
+                                                 new omega::CG_roseRepr(
+                                                   isSgExpression(buildSizeOfOp(v.type)))))->GetExpression();
+    
+    v.in_data = inArray;
+    v.out_data = 0;
+    arrayVars.push_back(v);
+  }
+  
+  if (arrayVars.size() < 2) {
+    fprintf(stderr,
+            "cudaize error: Did not find two arrays being accessed\n");
+    return false;
+  }
+
+  //protonu--debugging tool--the printf statement
+  //tex_mem_on signals use of tex mem
+  /* derick -- texmapping near malloc mcopy
+  for(int i=0; i<arrayVars.size(); i++)
+  {
+	  //printf("var name %s, tex_mem used %s\n", arrayVars[i].name.c_str(), (arrayVars[i].tex_mapped)?"true":"false");
+	  if (arrayVars[i].tex_mapped  ) tex_mem_on ++;
+	  //if (arrayVars[i].cons_mapped  ) cons_mem_on ++;
+  }
+  */
+  
+  //Add our mallocs (and input array memcpys)
+  for (int i = 0; i < arrayVars.size(); i++) {
+    if(arrayVars[i].cons_mapped) {
+      setupConstantVar(constant_mem, &arrayVars[i], globals, i, symtab);
+      SgStatementPtrList *tnl = new SgStatementPtrList;
+      cudaBindConstantVar(constant_mem, &arrayVars[i], globals, tnl);
+      setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
+    } 
+    else {
+      SgVariableDeclaration* defn = buildVariableDeclaration(
+        SgName(arrayVars[i].name.c_str()),
+        buildPointerType(arrayVars[i].type));
+      SgInitializedNamePtrList& variables = defn->get_variables();
+      SgInitializedNamePtrList::const_iterator j = variables.begin();
+      SgInitializedName* initializedName = *j;
+      SgVariableSymbol* dvs = new SgVariableSymbol(initializedName);
+      prependStatement(defn, func_body);
+      
+      dvs->set_parent(body_symtab);
+      body_symtab->insert(SgName(arrayVars[i].name.c_str()), dvs);
+    
+//    SgVariableSymbol* dvs = body_symtab->find_variable(SgName(arrayVars[i].name.c_str()));
+    
+    //  if(dvs == NULL)
+    //      dvs =  parameter_symtab->find_variable(SgName(arrayVars[i].name.c_str()));
+    
+    //cudaMalloc args
+    // SgBasicBlock* block = buildBasicBlock();
+      SgName name_cuda_malloc("cudaMalloc");
+      SgFunctionDeclaration * decl_cuda_malloc =
+        buildNondefiningFunctionDeclaration(name_cuda_malloc,
+                                            buildVoidType(), buildFunctionParameterList(), globals);
+      
+      SgName name_cuda_copy("cudaMemcpy");
+      SgFunctionDeclaration * decl_cuda_copy =
+        buildNondefiningFunctionDeclaration(name_cuda_copy,
+                                            buildVoidType(), buildFunctionParameterList(), globals);
+      
+      SgExprListExp* args = buildExprListExp();
+      args->append_expression(
+        buildCastExp(buildAddressOfOp(buildVarRefExp(dvs)),
+                     buildPointerType(buildPointerType(buildVoidType()))));
+      args->append_expression(arrayVars[i].size_expr);
+    
+//    decl_cuda_malloc->get_parameterList()->append_arg
+      SgFunctionCallExp *the_call = buildFunctionCallExp(
+        buildFunctionRefExp(decl_cuda_malloc), args);
+      
+      SgExprStatement* stmt = buildExprStatement(the_call);
+    
+    //  (*replacement_list).push_back (stmt);
+    
+      SgStatementPtrList* tnl = new SgStatementPtrList;
+      (*tnl).push_back(stmt);
+      setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
+      if (arrayVars[i].in_data) {
+      
+        SgExprListExp * cuda_copy_in_args = buildExprListExp();
+        cuda_copy_in_args->append_expression(
+          isSgExpression(buildVarRefExp(dvs)));
+        cuda_copy_in_args->append_expression(
+          isSgExpression(buildVarRefExp(arrayVars[i].in_data)));
+        CG_roseRepr* size_exp = new CG_roseRepr(arrayVars[i].size_expr);
+        cuda_copy_in_args->append_expression(
+          static_cast<CG_roseRepr*>(size_exp->clone())->GetExpression());
+        cuda_copy_in_args->append_expression(
+          buildOpaqueVarRefExp("cudaMemcpyHostToDevice", globals));
+      
+//                                      cuda_copy_in_args->append_expression(
+//                                              new SgVarRefExp(sourceLocation, )
+//                                      );
+        SgFunctionCallExp * cuda_copy_in_func_call = buildFunctionCallExp(
+          buildFunctionRefExp(decl_cuda_copy), cuda_copy_in_args);
+      
+        SgExprStatement* stmt = buildExprStatement(cuda_copy_in_func_call);
+      
+        SgStatementPtrList *tnl = new SgStatementPtrList;
+        (*tnl).push_back(stmt);
+        setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
+      
+        if(arrayVars[i].tex_mapped) {
+          setupTexmappingVar(texture, &arrayVars[i], globals, i, dvs, symtab);
+          SgStatementPtrList *tnl = new SgStatementPtrList;
+          cudaBindTexture(texture, &arrayVars[i], globals, tnl);
+          setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
+        }
+      }
+    }
+  }
+  
+  //Build dimGrid dim3 variables based on loop dimentions and ti/tj
+  char blockD1[120];
+  char blockD2[120];
+  if (dim1) {
+    snprintf(blockD1, 120, "%s/%d",
+             dim1->get_declaration()->get_name().getString().c_str(), cu_tx);
+    snprintf(blockD2, 120, "%s/%d",
+             dim2->get_declaration()->get_name().getString().c_str(), cu_ty);
+  } else {
+    snprintf(blockD1, 120, "%d", cu_bx);
+    snprintf(blockD2, 120, "%d", cu_by);
+    //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx);
+    //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty);
+  }
+  
+  SgInitializedName* arg1 = buildInitializedName("i", buildIntType());
+  SgInitializedName* arg2 = buildInitializedName("j", buildIntType());
+  SgInitializedName* arg3 = buildInitializedName("k", buildIntType());
+  SgName type_name("dim3");
+  //SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(type_name);
+  
+  //ROSE_ASSERT(type_symbol != NULL);
+  
+  //SgClassDeclaration * dim3classdecl = isSgClassDeclaration(
+  //        type_symbol->get_declaration());
+  
+  SgFunctionDeclaration * funcdecl = buildNondefiningFunctionDeclaration(
+    SgName("dim3"), buildOpaqueType("dim3", globalScope),
+    //isSgType(dim3classdecl->get_type()),
+    buildFunctionParameterList(arg1, arg2, arg3), globalScope);
+  
+  if (cu_bx && cu_by)
+    repr = ocg->CreateDim3((const char*) gridName, ocg->CreateInt(cu_bx),
+                           ocg->CreateInt(cu_by));
+  else if (cu_bx_repr && cu_by_repr)
+    repr = ocg->CreateDim3((const char*) gridName, cu_bx_repr, cu_by_repr);
+  else if (cu_bx_repr)
+    repr = ocg->CreateDim3((const char*) gridName, cu_bx_repr,
+                           ocg->CreateInt(1));
+  setup_code = ocg->StmtListAppend(setup_code, repr);
+  //SgStatementPtrList* dimList = static_cast<CG_roseRepr *>(repr)->GetList();
+  
+  //for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++)
+  //    (*replacement_list).push_back (*it);
+  
+  //  repr = ocg->CreateDim3((const char*)blockName, cu_tx,cu_ty);
+  
+  if (cu_tz > 1 || cu_tz_repr) {
+    
+    if (cu_tx && cu_ty && cu_tz)
+      repr = ocg->CreateDim3((char*) blockName, ocg->CreateInt(cu_tx),
+                             ocg->CreateInt(cu_ty), ocg->CreateInt(cu_tz));
+    else if (cu_tx_repr && cu_ty_repr && cu_tz_repr)
+      repr = ocg->CreateDim3((char*) blockName, cu_tx_repr, cu_ty_repr,
+                             cu_tz_repr);
+    // SgStatementPtrList* dimList = static_cast<CG_roseRepr *>(repr)->GetList();
+    
+    // for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++)
+    //    (*replacement_list).push_back (*it);
+    
+  } else {
+    if (cu_tx && cu_ty)
+      repr = ocg->CreateDim3((char*) blockName, ocg->CreateInt(cu_tx),
+                             ocg->CreateInt(cu_ty));
+    else if (cu_tx_repr && cu_ty_repr)
+      repr = ocg->CreateDim3((char*) blockName, cu_tx_repr, cu_ty_repr);
+    //SgStatementPtrList* dimList = static_cast<CG_roseRepr *>(repr)->GetList();
+    
+    //for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++)
+    //   (*replacement_list).push_back (*it);
+    
+  }
+  
+  setup_code = ocg->StmtListAppend(setup_code, repr);
+  
+  SgCudaKernelExecConfig* config = new SgCudaKernelExecConfig(
+    buildVarRefExp(gridName), buildVarRefExp(blockName), NULL, NULL);
+  //SgCudaKernelExecConfig* config = new SgCudaKernelExecConfig(buildIntVal(cu_bx), , NULL, NULL);
+  SgExprListExp* iml = new SgExprListExp();
+  SgCastExp* dim_s;
+  
+  //Creating Kernel function
+  SgBasicBlock* bb = new SgBasicBlock(TRANSFORMATION_FILE_INFO);
+  SgFunctionDefinition* kernel_defn = new SgFunctionDefinition(
+    TRANSFORMATION_FILE_INFO, bb);
+  SgFunctionDeclaration* kernel_decl_ = new SgFunctionDeclaration(
+    TRANSFORMATION_FILE_INFO, SgName((char*)cu_kernel_name.c_str()),buildFunctionType(buildVoidType(), buildFunctionParameterList()), kernel_defn);
+  SgFunctionDeclaration* kernel_decl = new SgFunctionDeclaration(
+    TRANSFORMATION_FILE_INFO, SgName((char*)cu_kernel_name.c_str()),buildFunctionType(buildVoidType(), buildFunctionParameterList()), kernel_defn);
+  
+  //((kernel_decl->get_declarationModifier()).get_storageModifier()).setStatic();
+  
+  kernel_decl->set_definingDeclaration(kernel_decl);
+  kernel_defn->set_parent(kernel_decl);
+  bb->set_parent(kernel_defn);
+  bb->set_endOfConstruct(TRANSFORMATION_FILE_INFO);
+  bb->get_endOfConstruct()->set_parent(bb);
+  
+  //SgFunctionSymbol* functionSymbol = new SgFunctionSymbol(kernel_decl_);
+  //globals->insert_symbol(SgName((char*) cu_kernel_name.c_str()),
+  //            functionSymbol);
+  SgFunctionSymbol* functionSymbol2 = new SgFunctionSymbol(kernel_decl);
+  
+  globals->insert_symbol(SgName((char*) cu_kernel_name.c_str()),
+                         functionSymbol2);
+  
+  kernel_decl_->set_parent(globals);
+  
+  kernel_decl_->set_scope(globals);
+  
+  kernel_decl_->setForward();
+  
+  globals->prepend_declaration(kernel_decl_);
+  
+  kernel_decl->set_endOfConstruct(TRANSFORMATION_FILE_INFO);
+  kernel_decl->get_endOfConstruct()->set_parent(kernel_decl);
+  
+  kernel_decl->set_parent(globals);
+  kernel_decl->set_scope(globals);
+  
+  kernel_decl->get_definition()->set_endOfConstruct(TRANSFORMATION_FILE_INFO);
+  kernel_decl->get_definition()->get_endOfConstruct()->set_parent(
+    kernel_decl->get_definition());
+  
+  globals->append_statement(kernel_decl);
+  
+  //printf("%s %s\n", static_cast<const char*>(cu_kernel_name), dims);
+  //--derick - kernel function parameters  
+  for (int i = 0; i < arrayVars.size(); i++)
+    //Throw in a type cast if our kernel takes 2D array notation
+    //like (float(*) [1024])
+  {
+    //protonu--throwing in another hack to stop the caller from passing tex mapped
+    //vars to the kernel.
+    if (arrayVars[i].tex_mapped == true || arrayVars[i].cons_mapped)
+      continue;
+    if (!(arrayVars[i].size_multi_dim.empty())) {
+      //snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d,
+      //         const_cast<char*>(arrayVars[i].name.c_str()));
+      
+      SgType* t = arrayVars[i].type;
+      for (int k = arrayVars[i].size_multi_dim.size() - 1; k >= 0; k--) {
+        t = buildArrayType(t,
+                           buildIntVal(arrayVars[i].size_multi_dim[k]));
+      }
+      SgVariableSymbol* temp = body_symtab->find_variable(
+        SgName((char*) arrayVars[i].name.c_str()));
+      if (temp == NULL)
+        temp = parameter_symtab->find_variable(
+          SgName((char*) arrayVars[i].name.c_str()));
+      
+      dim_s = buildCastExp(buildVarRefExp(temp), buildPointerType(t),
+                           SgCastExp::e_C_style_cast);
+      
+      //printf("%d %s\n", i, dims);
+      iml->append_expression(dim_s);
+      
+      SgInitializedName* id = buildInitializedName(
+        (char*) arrayVars[i].original_name.c_str(),
+        buildPointerType(t));
+      kernel_decl->get_parameterList()->append_arg(id);
+      kernel_decl_->get_parameterList()->append_arg(id);
+      id->set_file_info(TRANSFORMATION_FILE_INFO);
+      
+      // DQ (9/8/2007): We now test this, so it has to be set explicitly.
+      id->set_scope(kernel_decl->get_definition());
+      
+      // DQ (9/8/2007): Need to add variable symbol to global scope!
+      //printf ("Fixing up the symbol table in scope = %p = %s for SgInitializedName = %p = %s \n",globalScope,globalScope->class_name().c_str(),var1_init_name,var1_init_name->get_name().str());
+      SgVariableSymbol *var_symbol = new SgVariableSymbol(id);
+      kernel_decl->get_definition()->insert_symbol(id->get_name(),
+                                                   var_symbol);
+      
+      // if(kernel_decl->get_definition()->get_symbol_table()->find((const) id) == NULL)
+      
+    } else {
+      //printf("%d %s\n", i, static_cast<const char*>(arrayVars[i].name));
+      SgVariableSymbol* temp = body_symtab->find_variable(
+        SgName((char*) arrayVars[i].name.c_str()));
+      if (temp == NULL)
+        temp = parameter_symtab->find_variable(
+          SgName((char*) arrayVars[i].name.c_str()));
+      iml->append_expression(buildVarRefExp(temp));
+      SgInitializedName* id = buildInitializedName(
+        (char*) arrayVars[i].original_name.c_str(),
+        buildPointerType(arrayVars[i].type));
+      kernel_decl->get_parameterList()->append_arg(id);
+      kernel_decl_->get_parameterList()->append_arg(id);
+      id->set_file_info(TRANSFORMATION_FILE_INFO);
+      
+      // DQ (9/8/2007): We now test this, so it has to be set explicitly.
+      id->set_scope(kernel_decl->get_definition());
+      
+      // DQ (9/8/2007): Need to add variable symbol to global scope!
+      //printf ("Fixing up the symbol table in scope = %p = %s for SgInitializedName = %p = %s \n"$
+      SgVariableSymbol *var_symbol = new SgVariableSymbol(id);
+      kernel_decl->get_definition()->insert_symbol(id->get_name(),
+                                                   var_symbol);
+      
+    }
+    
+  }
+  if (dim1) {
+    iml->append_expression(buildVarRefExp(dim1));
+    SgInitializedName* id = buildInitializedName(
+      dim1->get_name().getString().c_str(), dim1->get_type());
+    kernel_decl->get_parameterList()->append_arg(id);
+    
+    iml->append_expression(buildVarRefExp(dim2));
+    SgInitializedName* id2 = buildInitializedName(
+      dim2->get_name().getString().c_str(), dim2->get_type());
+    
+    kernel_decl->get_parameterList()->append_arg(id);
+    kernel_decl_->get_parameterList()->append_arg(id);
+  }
+  
+  kernel_decl->get_functionModifier().setCudaKernel();
+  kernel_decl_->get_functionModifier().setCudaKernel();
+  SgCudaKernelCallExp * cuda_call_site = new SgCudaKernelCallExp(
+    TRANSFORMATION_FILE_INFO, buildFunctionRefExp(kernel_decl), iml,config);
+  
+  //  SgStatementPtrList *tnl2 = new SgStatementPtrList;
+  
+  (*replacement_list).push_back(buildExprStatement(cuda_call_site));
+  
+  setup_code = ocg->StmtListAppend(setup_code,
+                                   new CG_roseRepr(replacement_list));
+  
+  //cuda free variables
+  for (int i = 0; i < arrayVars.size(); i++) {
+    if (arrayVars[i].out_data) {
+      
+      SgName name_cuda_copy("cudaMemcpy");
+      SgFunctionDeclaration * decl_cuda_copyout =
+        buildNondefiningFunctionDeclaration(name_cuda_copy,
+                                            buildVoidType(), buildFunctionParameterList(),
+                                            globals);
+      
+      SgExprListExp* args = buildExprListExp();
+      SgExprListExp * cuda_copy_out_args = buildExprListExp();
+      cuda_copy_out_args->append_expression(
+        isSgExpression(buildVarRefExp(arrayVars[i].out_data)));
+      cuda_copy_out_args->append_expression(
+        isSgExpression(buildVarRefExp(arrayVars[i].name)));
+      CG_roseRepr* size_exp = new CG_roseRepr(arrayVars[i].size_expr);
+      cuda_copy_out_args->append_expression(
+        static_cast<CG_roseRepr*>(size_exp->clone())->GetExpression());
+      cuda_copy_out_args->append_expression(
+        buildOpaqueVarRefExp("cudaMemcpyDeviceToHost", globals));
+      
+//                                      cuda_copy_in_args->append_expression(
+//                                              new SgVarRefExp(sourceLocation, )
+//                                      );
+      SgFunctionCallExp * cuda_copy_out_func_call = buildFunctionCallExp(
+        buildFunctionRefExp(decl_cuda_copyout), cuda_copy_out_args);
+      
+      SgFunctionCallExp *the_call = buildFunctionCallExp(
+        buildFunctionRefExp(decl_cuda_copyout), cuda_copy_out_args);
+      
+      SgExprStatement* stmt = buildExprStatement(the_call);
+      
+      SgStatementPtrList* tnl3 = new SgStatementPtrList;
+      
+      (*tnl3).push_back(stmt);
+      
+      //   tree_node_list* tnl = new tree_node_list;
+      //   tnl->append(new tree_instr(the_call));
+      setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl3));
+      
+    }
+    if(!arrayVars[i].cons_mapped) {
+      SgName name_cuda_free("cudaFree");
+      SgFunctionDeclaration * decl_cuda_free =
+        buildNondefiningFunctionDeclaration(name_cuda_free,
+                                            buildVoidType(), buildFunctionParameterList(), globals);
+      
+      SgExprListExp* args3 = buildExprListExp();
+      
+      SgVariableSymbol* tmp = body_symtab->find_variable(
+        SgName(arrayVars[i].name.c_str()));
+      if (tmp == NULL)
+        tmp = parameter_symtab->find_variable(
+          SgName(arrayVars[i].name.c_str()));
+      
+      args3->append_expression(buildVarRefExp(tmp));
+      
+      SgFunctionCallExp *the_call2 = buildFunctionCallExp(
+        buildFunctionRefExp(decl_cuda_free), args3);
+      
+      SgExprStatement* stmt2 = buildExprStatement(the_call2);
+      
+      SgStatementPtrList* tnl4 = new SgStatementPtrList;
+      
+      (*tnl4).push_back(stmt2);
+      //(*replacement_list).push_back (stmt2);
+    
+      setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl4));
+    }
+  }
+  
+  // ---------------
+  // BUILD THE KERNEL
+  // ---------------
+  
+  //Extract out kernel body
+  SgNode* code = getCode();
+  //Create kernel function body
+  //Add Params
+  std::map<std::string, SgVariableSymbol*> loop_vars;
+  //In-Out arrays
+  for (int i = 0; i < arrayVars.size(); i++) {
+    /*   if(arrayVars[i].in_data)
+         fptr = arrayVars[i].in_data->type()->clone();
+         else
+         fptr = arrayVars[i].out_data->type()->clone();
+    */
+    
+    // fptr = new_proc_syms->install_type(fptr);
+    std::string name =
+      arrayVars[i].in_data ?
+      arrayVars[i].in_data->get_declaration()->get_name().getString() :
+      arrayVars[i].out_data->get_declaration()->get_name().getString();
+    //SgVariableSymbol* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name());
+    
+    SgVariableSymbol* sym =
+      kernel_decl->get_definition()->get_symbol_table()->find_variable(
+        (const char*) name.c_str());
+    /* SgVariableDeclaration*  defn = buildVariableDeclaration(SgName(name.c_str()), buildFloatType());
+       SgInitializedNamePtrList& variables = defn->get_variables();
+       SgInitializedNamePtrList::const_iterator i = variables.begin();
+       SgInitializedName* initializedName = *i;
+       SgVariableSymbol* sym = new SgVariableSymbol(initializedName);
+       prependStatement(defn, isSgScopeStatement(root_));
+       
+       vs->set_parent(symtab2_);
+       symtab2_->insert(SgName(_s.c_str()), vs);
+    */
+    
+    if (sym != NULL)
+      loop_vars.insert(
+        std::pair<std::string, SgVariableSymbol*>(std::string(name),
+                                                  sym));
+  }
+  
+  //Figure out which loop variables will be our thread and block dimention variables
+  std::vector<SgVariableSymbol *> loop_syms;
+  //Get our indexes
+  std::vector<const char*> indexes; // = get_loop_indexes(code,cu_num_reduce);
+  int threadsPos = 0;
+  
+  CG_outputRepr *body = NULL;
+  SgFunctionDefinition* func_d = func_definition;
+  //std::vector<SgVariableSymbol *> symbols =  recursiveFindRefs(code);
+  
+  SgName name_sync("__syncthreads");
+  SgFunctionDeclaration * decl_sync = buildNondefiningFunctionDeclaration(
+    name_sync, buildVoidType(), buildFunctionParameterList(),
+    globalScope);
+  
+  recursiveFindRefs(code, syms, func_d);
+  
+  //SgFunctionDeclaration* func = Outliner::generateFunction (code, (char*)cu_kernel_name.c_str(), syms, pdSyms, psyms, NULL, globalScope);
+  
+  if (cu_bx > 1 || cu_bx_repr) {
+    indexes.push_back("bx");
+    SgName type_name("blockIdx.x");
+    SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
+      type_name);
+    SgVariableDeclaration * var_decl = buildVariableDeclaration("bx",
+                                                                buildIntType(), NULL,
+                                                                isSgScopeStatement(kernel_decl->get_definition()->get_body()));
+    SgStatementPtrList *tnl = new SgStatementPtrList;
+    // (*tnl).push_back(isSgStatement(var_decl));
+    appendStatement(var_decl, kernel_decl->get_definition()->get_body());
+    
+    SgVariableSymbol* bx =
+      kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
+        SgName("bx"));
+    SgStatement* assign = isSgStatement(
+      buildAssignStatement(buildVarRefExp(bx),
+                           buildOpaqueVarRefExp("blockIdx.x",
+                                                kernel_decl->get_definition()->get_body())));
+    (*tnl).push_back(assign);
+    // body = ocg->StmtListAppend(body,
+    //                                  new CG_roseRepr(tnl));
+    appendStatement(assign, kernel_decl->get_definition()->get_body());
+    
+  }
+  if (cu_by > 1 || cu_by_repr) {
+    indexes.push_back("by");
+    SgName type_name("blockIdx.y");
+    SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
+      type_name);
+    SgVariableDeclaration * var_decl = buildVariableDeclaration("by",
+                                                                buildIntType(), NULL,
+                                                                isSgScopeStatement(kernel_decl->get_definition()->get_body()));
+    // SgStatementPtrList *tnl = new SgStatementPtrList;
+    // (*tnl).push_back(isSgStatement(var_decl));
+    appendStatement(var_decl, kernel_decl->get_definition()->get_body());
+    
+    SgVariableSymbol* by =
+      kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
+        SgName("by"));
+    SgStatement* assign = isSgStatement(
+      buildAssignStatement(buildVarRefExp(by),
+                           buildOpaqueVarRefExp("blockIdx.y",
+                                                kernel_decl->get_definition()->get_body())));
+    //(*tnl).push_back(assign);
+    // body = ocg->StmtListAppend(body,
+    //                                  new CG_roseRepr(tnl));
+    appendStatement(assign, kernel_decl->get_definition()->get_body());
+    
+  }
+  if (cu_tx_repr || cu_tx > 1) {
+    threadsPos = indexes.size();
+    indexes.push_back("tx");
+    SgName type_name("threadIdx.x");
+    SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
+      type_name);
+    SgVariableDeclaration * var_decl = buildVariableDeclaration("tx",
+                                                                buildIntType(), NULL,
+                                                                isSgScopeStatement(kernel_decl->get_definition()->get_body()));
+    //  SgStatementPtrList *tnl = new SgStatementPtrList;
+    //  (*tnl).push_back(isSgStatement(var_decl));
+    appendStatement(var_decl, kernel_decl->get_definition()->get_body());
+    
+    SgVariableSymbol* tx =
+      kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
+        SgName("tx"));
+    SgStatement* assign = isSgStatement(
+      buildAssignStatement(buildVarRefExp(tx),
+                           buildOpaqueVarRefExp("threadIdx.x",
+                                                kernel_decl->get_definition()->get_body())));
+    //(*tnl).push_back(assign);
+    // body = ocg->StmtListAppend(body,
+    //                                  new CG_roseRepr(tnl));
+    appendStatement(assign, kernel_decl->get_definition()->get_body());
+    
+  }
+  if (cu_ty_repr || cu_ty > 1) {
+    indexes.push_back("ty");
+    SgName type_name("threadIdx.y");
+    SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
+      type_name);
+    SgVariableDeclaration * var_decl = buildVariableDeclaration("ty",
+                                                                buildIntType(), NULL,
+                                                                isSgScopeStatement(kernel_decl->get_definition()->get_body()));
+    appendStatement(var_decl, kernel_decl->get_definition()->get_body());
+    
+    // SgStatementPtrList *tnl = new SgStatementPtrList;
+    // (*tnl).push_back(isSgStatement(var_decl));
+    SgVariableSymbol* ty =
+      kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
+        SgName("ty"));
+    SgStatement* assign = isSgStatement(
+      buildAssignStatement(buildVarRefExp(ty),
+                           buildOpaqueVarRefExp("threadIdx.y",
+                                                kernel_decl->get_definition()->get_body())));
+    // (*tnl).push_back(assign);
+    //  body = ocg->StmtListAppend(body,
+    //                                   new CG_roseRepr(tnl));
+    appendStatement(assign, kernel_decl->get_definition()->get_body());
+    
+  }
+  if (cu_tz_repr || cu_tz > 1) {
+    indexes.push_back("tz");
+    SgName type_name("threadIdx.z");
+    SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
+      type_name);
+    SgVariableDeclaration * var_decl = buildVariableDeclaration("tz",
+                                                                buildIntType(), NULL,
+                                                                isSgScopeStatement(kernel_decl->get_definition()->get_body()));
+    //   SgStatementPtrList *tnl = new SgStatementPtrList;
+    //   (*tnl).push_back(isSgStatement(var_decl));
+    appendStatement(var_decl, kernel_decl->get_definition()->get_body());
+    
+    SgVariableSymbol* tz =
+      kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
+        "tz");
+    SgStatement* assign = isSgStatement(
+      buildAssignStatement(buildVarRefExp(tz),
+                           buildOpaqueVarRefExp("threadIdx.z",
+                                                kernel_decl->get_definition()->get_body())));
+    //    (*tnl).push_back(assign);
+    //     body = ocg->StmtListAppend(body,
+    //                                      new CG_roseRepr(tnl));
+    appendStatement(assign, kernel_decl->get_definition()->get_body());
+    
+  }
+  
+  std::map<std::string, SgVariableSymbol*> loop_idxs; //map from idx names to their new syms
+  
+  SgNode* swapped_ = swapVarReferences(code, syms,
+                                       kernel_decl->get_definition()->get_symbol_table(),
+                                       kernel_decl->get_definition()->get_body()->get_symbol_table(),
+                                       kernel_decl->get_definition()->get_body());
+  
+  //std::cout << swapped_->unparseToString() << std::endl << std::endl;
+  
+  SgNode *swapped = recursiveFindReplacePreferedIdxs(swapped_,
+                                                     kernel_decl->get_definition()->get_body()->get_symbol_table(),
+                                                     kernel_decl->get_definition()->get_symbol_table(),
+                                                     kernel_decl->get_definition()->get_body(), loop_idxs, globalScope); //in-place swapping
+  //swapped->print();
+  
+  if (!isSgBasicBlock(swapped)) {
+    appendStatement(isSgStatement(swapped),
+                    kernel_decl->get_definition()->get_body());
+    swapped->set_parent(
+      isSgNode(kernel_decl->get_definition()->get_body()));
+  } else {
+    
+    for (SgStatementPtrList::iterator it =
+           isSgBasicBlock(swapped)->get_statements().begin();
+         it != isSgBasicBlock(swapped)->get_statements().end(); it++) {
+      appendStatement(*it, kernel_decl->get_definition()->get_body());
+      (*it)->set_parent(
+        isSgNode(kernel_decl->get_definition()->get_body()));
+      
+    }
+    
+  }
+  
+  for (int i = 0; i < indexes.size(); i++) {
+    std::vector<SgForStatement*> tfs = findCommentedFors(indexes[i],
+                                                         swapped);
+    for (int k = 0; k < tfs.size(); k++) {
+      //printf("replacing %p tfs for index %s\n", tfs[k], indexes[i]);
+      SgNode* newBlock = forReduce(tfs[k], loop_idxs[indexes[i]],
+                                   kernel_decl->get_definition());
+      //newBlock->print();
+      swap_node_for_node_list(tfs[k], newBlock);
+      //printf("AFTER SWAP\n");        newBlock->print();
+    }
+  }
+
+  //--derick replace array refs of texture mapped vars here
+  body = new CG_roseRepr(kernel_decl->get_definition()->get_body());
+  std::vector<IR_ArrayRef*> refs = ir->FindArrayRef(body);
+  texmapArrayRefs(texture, &refs, globals, ir, ocg);
+  // do the same for constant mapped vars
+  consmapArrayRefs(constant_mem, &refs, globals, ir, ocg);
+  
+  return swapped;
+}
+
+//Order taking out dummy variables
+std::vector<std::string> cleanOrder(std::vector<std::string> idxNames) {
+  std::vector<std::string> results;
+  for (int j = 0; j < idxNames.size(); j++) {
+    if (idxNames[j].length() != 0)
+      results.push_back(idxNames[j]);
+  }
+  return results;
+}
+
+//First non-dummy level in ascending order
+int LoopCuda::nonDummyLevel(int stmt, int level) {
+  //level comes in 1-basd and should leave 1-based
+  for (int j = level - 1; j < idxNames[stmt].size(); j++) {
+    if (idxNames[stmt][j].length() != 0) {
+      //printf("found non dummy level of %d with idx: %s when searching for %d\n", j+1, (const char*) idxNames[stmt][j], level);
+      return j + 1;
+    }
+  }
+  char buf[128];
+  sprintf(buf, "%d", level);
+  throw std::runtime_error(
+    std::string("Unable to find a non-dummy level starting from ")
+    + std::string(buf));
+}
+
+int LoopCuda::findCurLevel(int stmt, std::string idx) {
+  for (int j = 0; j < idxNames[stmt].size(); j++) {
+    if (strcmp(idxNames[stmt][j].c_str(), idx.c_str()) == 0)
+      return j + 1;
+  }
+  throw std::runtime_error(
+    std::string("Unable to find index ") + idx
+    + std::string(" in current list of indexes"));
+}
+
+void LoopCuda::permute_cuda(int stmt,
+                            const std::vector<std::string>& curOrder) {
+  //printf("curOrder: ");
+  //printVs(curOrder);
+  //printf("idxNames: ");
+  //printVS(idxNames[stmt]);
+  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt]);
+  bool same = true;
+  std::vector<int> pi;
+  for (int i = 0; i < curOrder.size(); i++) {
+    bool found = false;
+    for (int j = 0; j < cIdxNames.size(); j++) {
+      if (strcmp(cIdxNames[j].c_str(), curOrder[i].c_str()) == 0) {
+        pi.push_back(j + 1);
+        found = true;
+        if (j != i)
+          same = false;
+      }
+    }
+    if (!found) {
+      throw std::runtime_error(
+        "One of the indexes in the permute order were not "
+        "found in the current set of indexes.");
+    }
+  }
+  for (int i = curOrder.size(); i < cIdxNames.size(); i++) {
+    pi.push_back(i);
+  }
+  if (same)
+    return;
+  permute(stmt, pi);
+  //Set old indexe names as new
+  for (int i = 0; i < curOrder.size(); i++) {
+    idxNames[stmt][i] = curOrder[i].c_str(); //what about sibling stmts?
+  }
+}
+
+bool LoopCuda::permute(int stmt_num, const std::vector<int> &pi) {
+// check for sanity of parameters
+  if (stmt_num >= stmt.size() || stmt_num < 0)
+    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
+  const int n = stmt[stmt_num].xform.n_out();
+  if (pi.size() > (n - 1) / 2)
+    throw std::invalid_argument(
+      "iteration space dimensionality does not match permute dimensionality");
+  int first_level = 0;
+  int last_level = 0;
+  for (int i = 0; i < pi.size(); i++) {
+    if (pi[i] > (n - 1) / 2 || pi[i] <= 0)
+      throw std::invalid_argument(
+        "invalid loop level " + to_string(pi[i])
+        + " in permuation");
+    
+    if (pi[i] != i + 1) {
+      if (first_level == 0)
+        first_level = i + 1;
+      last_level = i + 1;
+    }
+  }
+  if (first_level == 0)
+    return true;
+  
+  std::vector<int> lex = getLexicalOrder(stmt_num);
+  std::set<int> active = getStatements(lex, 2 * first_level - 2);
+  Loop::permute(active, pi);
+}
+
+void LoopCuda::tile_cuda(int stmt, int level, int outer_level) {
+  tile_cuda(stmt, level, 1, outer_level, "", "", CountedTile);
+}
+void LoopCuda::tile_cuda(int level, int tile_size, int outer_level,
+                         std::string idxName, std::string ctrlName, TilingMethodType method) {
+  tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method);
+}
+
+void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level,
+                         std::string idxName, std::string ctrlName, TilingMethodType method) {
+  //Do regular tile but then update the index and control loop variable
+  //names as well as the idxName to reflect the current state of things.
+  //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level);
+  //printf("idxNames before: ");
+  //printVS(idxNames[stmt]);
+  
+  tile(stmt, level, tile_size, outer_level, method);
+  
+  if (idxName.size())
+    idxNames[stmt][level - 1] = idxName.c_str();
+  if (tile_size == 1) {
+    //potentially rearrange loops
+    if (outer_level < level) {
+      std::string tmp = idxNames[stmt][level - 1];
+      for (int i = level - 1; i > outer_level - 1; i--) {
+        if (i - 1 >= 0)
+          idxNames[stmt][i] = idxNames[stmt][i - 1];
+      }
+      idxNames[stmt][outer_level - 1] = tmp;
+    }
+    //TODO: even with a tile size of one, you need a insert (of a dummy loop)
+    idxNames[stmt].insert(idxNames[stmt].begin() + (level), "");
+  } else {
+    if (!ctrlName.size())
+      throw std::runtime_error("No ctrl loop name for tile");
+    //insert
+    idxNames[stmt].insert(idxNames[stmt].begin() + (outer_level - 1),
+                          ctrlName.c_str());
+  }
+  
+  //printf("idxNames after: ");
+  //printVS(idxNames[stmt]);
+}
+
+bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level,
+                                        const std::string &array_name,
+                                        const std::vector<int> &privatized_levels, bool allow_extra_read,
+                                        int fastest_changing_dimension, int padding_stride,
+                                        int padding_alignment, bool cuda_shared) {
+  int old_stmts = stmt.size();
+  //  printf("before datacopy_privatized:\n");
+  printIS();
+  //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared);
+  if (cuda_shared)
+    datacopy_privatized(stmt_num, level, array_name, privatized_levels,
+                        allow_extra_read, fastest_changing_dimension, padding_stride,
+                        padding_alignment, 1);
+  else
+    datacopy_privatized(stmt_num, level, array_name, privatized_levels,
+                        allow_extra_read, fastest_changing_dimension, padding_stride,
+                        padding_alignment, 0);
+  //  printf("after datacopy_privatized:\n");
+  printIS();
+  
+  //Adjust idxNames to reflect updated state
+  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
+  int new_stmts = stmt.size();
+  for (int i = old_stmts; i < new_stmts; i++) {
+    //printf("fixing up statement %d\n", i);
+    std::vector<std::string> idxs;
+    
+    //protonu-making sure the vector of nonSplitLevels grows along with
+    //the statement structure
+    stmt_nonSplitLevels.push_back(std::vector<int>());
+    
+    //Indexes up to level will be the same
+    for (int j = 0; j < level - 1; j++)
+      idxs.push_back(cIdxNames[j]);
+    
+    //Expect privatized_levels to match
+    for (int j = 0; j < privatized_levels.size(); j++)
+      idxs.push_back(cIdxNames[privatized_levels[j] - 1]);//level is one-based
+    
+    //all further levels should match order they are in originally
+    if (privatized_levels.size()) {
+      int last_privatized = privatized_levels.back();
+      int top_level = last_privatized
+        + (stmt[i].IS.n_set() - idxs.size());
+      //printf("last privatized_levels: %d top_level: %d\n", last_privatized, top_level);
+      for (int j = last_privatized; j < top_level; j++) {
+        idxs.push_back(cIdxNames[j]);
+        //printf("pushing back: %s\n", (const char*)cIdxNames[j]);
+      }
+    }
+    idxNames.push_back(idxs);
+  }
+}
+
+bool LoopCuda::datacopy_cuda(int stmt_num, int level,
+                             const std::string &array_name, 
+                             const std::vector<std::string> new_idxs,
+                             bool allow_extra_read, int fastest_changing_dimension,
+                             int padding_stride, int padding_alignment, bool cuda_shared) {
+  
+  int old_stmts = stmt.size();
+  //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared);
+  //  printf("before datacopy:\n");
+  //  printIS();
+  if (cuda_shared)
+    datacopy(stmt_num, level, array_name, allow_extra_read,
+             fastest_changing_dimension, padding_stride, padding_alignment,
+             1);
+  else
+    datacopy(stmt_num, level, array_name, allow_extra_read,
+             fastest_changing_dimension, padding_stride, padding_alignment,
+             0);
+  //  printf("after datacopy:\n");
+  printIS();
+
+  //Adjust idxNames to reflect updated state
+  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
+  int new_stmts = stmt.size();
+  for (int i = old_stmts; i < new_stmts; i++) {
+    //printf("fixing up statement %d\n", i);
+    std::vector<std::string> idxs;
+    
+    //protonu-making sure the vector of nonSplitLevels grows along with
+    //the statement structure
+    stmt_nonSplitLevels.push_back(std::vector<int>());
+    
+    //Indexes up to level will be the same
+    for (int j = 0; j < level - 1; j++)
+      idxs.push_back(cIdxNames[j]);
+    
+    //all further levels should get names from new_idxs
+    int top_level = stmt[i].IS.n_set();
+    //printf("top_level: %d level: %d\n", top_level, level);
+    if (new_idxs.size() < top_level - level + 1)
+      throw std::runtime_error(
+        "Need more new index names for new datacopy loop levels");
+    
+    for (int j = level - 1; j < top_level; j++) {
+      idxs.push_back(new_idxs[j - level + 1].c_str());
+      //printf("pushing back: %s\n", new_idxs[j-level+1].c_str());
+    }
+    idxNames.push_back(idxs);
+  }
+}
+
+bool LoopCuda::unroll_cuda(int stmt_num, int level, int unroll_amount) {
+  int old_stmts = stmt.size();
+  //bool b= unroll(stmt_num, , unroll_amount);
+  
+  int dim = 2 * level - 1;
+  std::vector<int> lex = getLexicalOrder(stmt_num);
+  std::set<int> same_loop = getStatements(lex, dim - 1);
+  
+  level = nonDummyLevel(stmt_num, level);
+  //printf("unrolling %d at level %d\n", stmt_num,level);
+  
+  //protonu--using the new version of unroll, which returns
+  //a set of ints instead of a bool. To keep Gabe's logic
+  //I'll check the size of the set, if it's 0 return true
+  //bool b= unroll(stmt_num, level, unroll_amount);
+  std::set<int> b_set = unroll(stmt_num, level, unroll_amount, idxNames);
+  bool b = false;
+  if (b_set.size() == 0)
+    b = true;
+  //end--protonu
+  
+  //Adjust idxNames to reflect updated state
+  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
+  std::vector<std::string> origSource = idxNames[stmt_num];
+  ;
+  //Drop index names at level
+  if (unroll_amount == 0) {
+    //For all statements that were in this unroll together, drop index name for unrolled level
+    idxNames[stmt_num][level - 1] = "";
+    for (std::set<int>::iterator i = same_loop.begin();
+         i != same_loop.end(); i++) {
+      //printf("in same loop as %d is %d\n", stmt_num, (*i));
+      //idxNames[(*i)][level-1] = "";
+      idxNames[(*i)] = idxNames[stmt_num];
+    }
+  }
+  
+  lex = getLexicalOrder(stmt_num);
+  same_loop = getStatements(lex, dim - 1);
+  
+  bool same_as_source = false;
+  int new_stmts = stmt.size();
+  for (int i = old_stmts; i < new_stmts; i++) {
+    //Check whether we had a sync for the statement we are unrolling, if
+    //so, propogate that to newly created statements so that if they are
+    //in a different loop structure, they will also get a syncthreads
+    int size = syncs.size();
+    for (int j = 0; j < size; j++) {
+      if (syncs[j].first == stmt_num)
+        syncs.push_back(make_pair(i, syncs[j].second));
+    }
+    
+    //protonu-making sure the vector of nonSplitLevels grows along with
+    //the statement structure
+    stmt_nonSplitLevels.push_back(std::vector<int>());
+    
+    //We expect that new statements have a constant for the variable in
+    //stmt[i].IS at level (as seen with print_with_subs), otherwise there
+    //will be a for loop at level and idxNames should match stmt's
+    //idxNames pre-unrolled
+    Relation IS = stmt[i].IS;
+    //Ok, if you know how the hell to get anything out of a Relation, you
+    //should probably be able to do this more elegantly. But for now, I'm
+    //hacking it.
+    std::string s = IS.print_with_subs_to_string();
+    //s looks looks like
+    //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128}
+    //where level == 5, you see a integer in the input set
+    
+    //If that's not an integer and this is the first new statement, then
+    //we think codegen will have a loop at that level. It's not perfect,
+    //not sure if it can be determined without round-tripping to codegen.
+    int sIdx = 0;
+    int eIdx = 0;
+    for (int j = 0; j < level - 1; j++) {
+      sIdx = s.find(",", sIdx + 1);
+      if (sIdx < 0)
+        break;
+    }
+    if (sIdx > 0) {
+      eIdx = s.find("]");
+      int tmp = s.find(",", sIdx + 1);
+      if (tmp > 0 && tmp < eIdx)
+        eIdx = tmp; //", before ]"
+      if (eIdx > 0) {
+        sIdx++;
+        std::string var = s.substr(sIdx, eIdx - sIdx);
+        //printf("%s\n", s.c_str());
+        //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str());
+        if (atoi(var.c_str()) == 0 && i == old_stmts) {
+          //TODO:Maybe do see if this new statement would be in the same
+          //group as the original and if it would, don't say
+          //same_as_source
+          if (same_loop.find(i) == same_loop.end()) {
+            printf(
+              "stmt %d level %d, newly created unroll statement should have same level indexes as source\n",
+              i, level);
+            same_as_source = true;
+          }
+        }
+      }
+    }
+    
+    //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1);
+    if (same_as_source)
+      idxNames.push_back(origSource);
+    else
+      idxNames.push_back(idxNames[stmt_num]);
+  }
+  
+  return b;
+}
+
+void LoopCuda::copy_to_texture(const char *array_name) {
+  //protonu--placeholder for now
+  //set the bool for using cuda memory as true
+  //in a vector of strings, put the names of arrays to tex mapped
+  if (!texture)
+    texture = new texture_memory_mapping(true, array_name);
+  else
+    texture->add(array_name);
+  
+}
+
+//void LoopCuda::copy_to_texture_2d(const char *array_name, int width, int height) {
+//  if (!texture)
+//    texture = new texture_memory_mapping(true, array_name, width, height);
+//  else
+//    texture->add(array_name, width, height);
+//}
+
+void LoopCuda::copy_to_constant(const char *array_name) {
+  if(!constant_mem)
+    constant_mem = new constant_memory_mapping(true, array_name);
+  else
+    constant_mem->add(array_name);
+}
+
+//protonu--moving this from Loop
+SgNode* LoopCuda::codegen() {
+  if (code_gen_flags & GenCudaizeV2)
+    return cudaize_codegen_v2();
+  //Do other flagged codegen methods, return plain vanilla generated code
+  return getCode();
+}
+
+//These three are in Omega code_gen.cc and are used as a massive hack to
+//get out some info from MMGenerateCode. Yea for nasty side-effects.
+namespace omega {
+  extern int checkLoopLevel;
+  extern int stmtForLoopCheck;
+  extern int upperBoundForLevel;
+  extern int lowerBoundForLevel;
+}
+
+CG_outputRepr* LoopCuda::extractCudaUB(int stmt_num, int level,
+                                       int &outUpperBound, int &outLowerBound) {
+  // check for sanity of parameters
+  const int m = stmt.size();
+  if (stmt_num >= m || stmt_num < 0)
+    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
+  const int n = stmt[stmt_num].xform.n_out();
+  if (level > (n - 1) / 2 || level <= 0)
+    throw std::invalid_argument("invalid loop level " + to_string(level));
+  
+  int dim = 2 * level - 1;
+  
+  std::vector<int> lex = getLexicalOrder(stmt_num);
+  std::set<int> same_loop = getStatements(lex, dim - 1);
+  
+  // extract the intersection of the iteration space to be considered
+  Relation hull;
+  {
+    hull = Relation::True(n);
+    for (std::set<int>::iterator i = same_loop.begin();
+         i != same_loop.end(); i++) {
+      Relation r = getNewIS(*i);
+      for (int j = dim + 2; j <= r.n_set(); j++)
+        r = Project(r, r.set_var(j));
+      hull = Intersection(hull, r);
+      hull.simplify(2, 4);
+    }
+    
+    for (int i = 2; i <= dim + 1; i += 2) {
+      //std::string name = std::string("_t") + to_string(t_counter++);
+      std::string name = std::string("_t")
+        + to_string(tmp_loop_var_name_counter++);
+      hull.name_set_var(i, name);
+    }
+    hull.setup_names();
+  }
+  
+  // extract the exact loop bound of the dimension to be unrolled
+  if (is_single_iteration(hull, dim)) {
+    throw std::runtime_error(
+      "No loop availabe at level to extract upper bound.");
+  }
+  Relation bound = get_loop_bound(hull, dim);
+  if (!bound.has_single_conjunct() || !bound.is_satisfiable()
+      || bound.is_tautology())
+    throw loop_error(
+      "loop error: unable to extract loop bound for cudaize");
+  
+  // extract the loop stride
+  EQ_Handle stride_eq;
+  /*int stride = 1;
+    {
+    bool simple_stride = true;
+    int strides = countStrides(bound.query_DNF()->single_conjunct(),
+    bound.set_var(dim + 1), stride_eq, simple_stride);
+    if (strides > 1)
+    throw loop_error("loop error: too many strides");
+    else if (strides == 1) {
+    int sign = stride_eq.get_coef(bound.set_var(dim + 1));
+    //      assert(sign == 1 || sign == -1);
+    Constr_Vars_Iter it(stride_eq, true);
+    stride = abs((*it).coef / sign);
+    }
+    }
+  */
+  int stride = 1;
+  {
+    
+    coef_t stride;
+    std::pair<EQ_Handle, Variable_ID> result = find_simplest_stride(bound,
+                                                                    bound.set_var(dim + 1));
+    if (result.second == NULL)
+      stride = 1;
+    else
+      stride = abs(result.first.get_coef(result.second))
+        / gcd(abs(result.first.get_coef(result.second)),
+              abs(result.first.get_coef(bound.set_var(dim + 1))));
+    
+    if (stride > 1)
+      throw loop_error("loop error: too many strides");
+    /*else if (stride == 1) {
+      int sign = result.first.get_coef(bound.set_var(dim+1));
+      assert(sign == 1 || sign == -1);
+      } */
+  }
+  
+  if (stride != 1) {
+    char buf[1024];
+    sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d",
+            level, stride);
+    throw std::runtime_error(buf);
+  }
+  
+  //Use code generation system to build tell us our bound information. We
+  //need a hard upper bound a 0 lower bound.
+  
+  checkLoopLevel = level * 2;
+  stmtForLoopCheck = stmt_num;
+  upperBoundForLevel = -1;
+  lowerBoundForLevel = -1;
+  printCode(1, false);
+  checkLoopLevel = 0;
+  
+  outUpperBound = upperBoundForLevel;
+  outLowerBound = lowerBoundForLevel;
+  
+  if (outUpperBound == -1) {
+    
+    CG_result* temp = last_compute_cgr_;
+    
+    while (temp) {
+      CG_loop * loop;
+      if (loop = dynamic_cast<CG_loop*>(temp)) {
+        if (loop->level_ == 2 * level) {
+          Relation bound = copy(loop->bounds_);
+          Variable_ID v = bound.set_var(2 * level);
+          for (GEQ_Iterator e(
+                 const_cast<Relation &>(bound).single_conjunct()->GEQs());
+               e; e++) {
+            if ((*e).get_coef(v) < 0
+                && (*e).is_const_except_for_global(v))
+              return output_upper_bound_repr(ir->builder(), *e, v,
+                                             bound,
+                                             std::vector<std::pair<CG_outputRepr *, int> >(
+                                               bound.n_set(),
+                                               std::make_pair(
+                                                 static_cast<CG_outputRepr *>(NULL),
+                                                 0)));
+          }
+        }
+        if (loop->level_ > 2 * level)
+          break;
+        else
+          temp = loop->body_;
+      } else
+        break;
+    }
+  }
+  
+  return NULL;
+}
+
+void LoopCuda::printCode(int effort, bool actuallyPrint) const {
+  const int m = stmt.size();
+  if (m == 0)
+    return;
+  const int n = stmt[0].xform.n_out();
+  
+  /*or (int i = 0; i < m; i++) {
+    IS[i + 1] = stmt[i].IS;
+    xform[i + 1] = stmt[i].xform;
+    
+    //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
+    }
+  */
+  
+  // invalidate saved codegen computation
+  if (last_compute_cgr_ != NULL) {
+    delete last_compute_cgr_;
+    last_compute_cgr_ = NULL;
+  }
+  
+  if (last_compute_cg_ != NULL) {
+    delete last_compute_cg_;
+    last_compute_cg_ = NULL;
+  }
+  
+  //Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
+  /*CG_stringBuilder *ocg = new CG_stringBuilder();
+    Tuple<CG_outputRepr *> nameInfo;
+    for (int i = 1; i <= m; i++)
+    nameInfo.append(new CG_stringRepr("s" + to_string(i)));
+  */
+  
+  // -- replacing MMGenerateCode
+  // -- formally CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort);
+  // -- in the future, these if statements need to be cleaned up.
+  // -- something like check_lastComputeCG might be a decent protected member function
+  // -- and/or something that returns a std::vector<CG_outputRepr*> that also checks last_compute_cg_
+  //if (last_compute_cg_ == NULL) {
+  std::vector<Relation> IS(m);
+  std::vector<Relation> xforms(m);
+  std::vector<std::vector<int> > nonSplitLevels(m);
+  
+  /*    std::vector < std::vector <std::string> > idxTupleNames;
+        if (useIdxNames) {
+        for (int i = 0; i < idxNames.size(); i++) {
+        Tuple<std::string> idxs;
+        for (int j = 0; j < idxNames[i].size(); j++)
+        idxs.append(idxNames[i][j]);
+        idxTupleNames.append(idxs);
+        }
+        }
+  */
+  for (int i = 0; i < m; i++) {
+    IS[i] = stmt[i].IS;
+    xforms[i] = stmt[i].xform;
+    nonSplitLevels[i] = stmt_nonSplitLevels[i];
+  }
+  Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
+  
+  last_compute_cg_ = new CodeGen(xforms, IS, known, nonSplitLevels, idxNames,
+                                 syncs);
+
+  delete last_compute_cgr_;  // this was just done  above? 
+  last_compute_cgr_ = NULL;
+  //}
+  
+  if (last_compute_cgr_ == NULL || last_compute_effort_ != effort) {
+    delete last_compute_cgr_;
+    last_compute_cgr_ = last_compute_cg_->buildAST(effort);
+    last_compute_effort_ = effort;
+  }
+  
+  //std::vector<CG_outputRepr *> stmts(m);
+  //for (int i = 0; i < m; i++)
+  //    stmts[i] = stmt[i].code;
+  //CG_outputRepr* repr = last_compute_cgr_->printRepr(ocg, stmts);
+  // -- end replacing MMGenerateCode
+  std::string repr = last_compute_cgr_->printString();
+  
+  if (actuallyPrint)
+    std::cout << repr << std::endl;
+  //std::cout << static_cast<CG_stringRepr*>(repr)->GetString();
+  /*
+    for (int i = 1; i <= m; i++)
+    delete nameInfo[i];
+  */
+  
+  //delete ocg;
+}
+
+void LoopCuda::printRuntimeInfo() const {
+  for (int i = 0; i < stmt.size(); i++) {
+    Relation IS = stmt[i].IS;
+    Relation xform = stmt[i].xform;
+    printf("stmt[%d]\n", i);
+    printf("IS\n");
+    IS.print_with_subs();
+    
+    printf("xform[%d]\n", i);
+    xform.print_with_subs();
+    
+  }
+}
+
+void LoopCuda::printIndexes() const {
+  for (int i = 0; i < stmt.size(); i++) {
+    printf("stmt %d nset %d ", i, stmt[i].IS.n_set());
+    
+    for (int j = 0; j < idxNames[i].size(); j++) {
+      if (j > 0)
+        printf(",");
+      printf("%s", idxNames[i][j].c_str());
+    }
+    printf("\n");
+  }
+}
+
+SgNode* LoopCuda::getCode(int effort) const {
+  const int m = stmt.size();
+  if (m == 0)
+    return new SgNode;
+  const int n = stmt[0].xform.n_out();
+  /*
+    Tuple<CG_outputRepr *> ni(m);
+    Tuple < Relation > IS(m);
+    Tuple < Relation > xform(m);
+    vector < vector <int> > nonSplitLevels(m);
+    for (int i = 0; i < m; i++) {
+    ni[i + 1] = stmt[i].code;
+    IS[i + 1] = stmt[i].IS;
+    xform[i + 1] = stmt[i].xform;
+    nonSplitLevels[i + 1] = stmt_nonSplitLevels[i];
+    
+    //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
+    }
+  */
+  //Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
+//#ifdef DEBUG
+//#endif
+  //std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, ni, known,
+  //                nonSplitLevels, syncs, idxTupleNames, effort));
+  if (last_compute_cgr_ != NULL) {
+    delete last_compute_cgr_;
+    last_compute_cgr_ = NULL;
+  }
+  
+  if (last_compute_cg_ != NULL) {
+    delete last_compute_cg_;
+    last_compute_cg_ = NULL;
+  }
+  
+  CG_outputBuilder *ocg = ir->builder();
+  // -- replacing MMGenerateCode
+  // -- formally CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort);
+  // -- in the future, these if statements need to be cleaned up.
+  // -- something like check_lastComputeCG might be a decent protected member function
+  // -- and/or something that returns a std::vector<CG_outputRepr*> that also checks last_compute_cg_
+  //if (last_compute_cg_ == NULL) {
+  std::vector<Relation> IS(m);
+  std::vector<Relation> xforms(m);
+  std::vector<std::vector<int> > nonSplitLevels(m);
+  for (int i = 0; i < m; i++) {
+    IS[i] = stmt[i].IS;
+    xforms[i] = stmt[i].xform;
+    nonSplitLevels[i] = stmt_nonSplitLevels[i];
+  }
+  
+  /*std::vector < std::vector<std::string> > idxTupleNames;
+    if (useIdxNames) {
+    for (int i = 0; i < idxNames.size(); i++) {
+    std::vector<std::string> idxs;
+    for (int j = 0; j < idxNames[i].size(); j++)
+    idxs.push_back(idxNames[i][j]);
+    idxTupleNames.push_back(idxs);
+    }
+    }
+  */
+  Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
+  
+  last_compute_cg_ = new CodeGen(xforms, IS, known, nonSplitLevels, idxNames,
+                                 syncs);
+  delete last_compute_cgr_;
+  last_compute_cgr_ = NULL;
+  //}
+  
+  if (last_compute_cgr_ == NULL || last_compute_effort_ != effort) {
+    delete last_compute_cgr_;
+    last_compute_cgr_ = last_compute_cg_->buildAST(effort);
+    last_compute_effort_ = effort;
+  }
+  
+  std::vector<CG_outputRepr *> stmts(m);
+  for (int i = 0; i < m; i++)
+    stmts[i] = stmt[i].code;
+  CG_outputRepr* repr = last_compute_cgr_->printRepr(ocg, stmts);
+  // -- end replacing MMGenerateCode
+  
+  //CG_outputRepr *overflow_initialization = ocg->CreateStmtList();
+  CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL);
+  for (std::map<int, std::vector<Free_Var_Decl *> >::const_iterator i =
+         overflow.begin(); i != overflow.end(); i++)
+    for (std::vector<Free_Var_Decl *>::const_iterator j = i->second.begin();
+         j != i->second.end(); j++)
+      //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0))));
+      overflow_initialization = ocg->StmtListAppend(
+        overflow_initialization,
+        ocg->StmtListAppend(
+          ocg->CreateAssignment(0,
+                                ocg->CreateIdent((*j)->base_name()),
+                                ocg->CreateInt(0)), NULL));
+  
+  repr = ocg->StmtListAppend(overflow_initialization, repr);
+  SgNode *tnl = static_cast<CG_roseRepr *>(repr)->GetCode();
+  SgStatementPtrList *list = static_cast<CG_roseRepr *>(repr)->GetList();
+  
+  if (tnl != NULL)
+    return tnl;
+  else if (tnl == NULL && list != NULL) {
+    SgBasicBlock* bb2 = buildBasicBlock();
+    
+    for (SgStatementPtrList::iterator it = (*list).begin();
+         it != (*list).end(); it++)
+      bb2->append_statement(*it);
+    
+    tnl = isSgNode(bb2);
+  } else
+    throw loop_error("codegen failed");
+  
+  delete repr;
+  /*
+    for (int i = 1; i <= m; i++)
+    delete ni[i];
+  */
+  return tnl;
+  
+}
+
+//protonu--adding constructors for the new derived class
+LoopCuda::LoopCuda() :
+  Loop(), code_gen_flags(GenInit) {
+}
+
+LoopCuda::LoopCuda(IR_Control *irc, int loop_num) :
+  Loop(irc) {
+  setup_code = NULL;
+  teardown_code = NULL;
+  code_gen_flags = 0;
+  cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1;
+  cu_bx_repr = NULL;
+  cu_tx_repr = NULL;
+  cu_by_repr = NULL;
+  cu_ty_repr = NULL;
+  cu_tz_repr = NULL;
+  
+  cu_num_reduce = 0;
+  cu_mode = GlobalMem;
+  texture = NULL;
+  constant_mem = NULL;
+  
+  int m = stmt.size();
+  //printf("\n the size of stmt(initially) is: %d\n", stmt.size());
+  for (int i = 0; i < m; i++)
+    stmt_nonSplitLevels.push_back(std::vector<int>());
+  
+  globals = ((IR_cudaroseCode *) ir)->gsym_;
+  globalScope = ((IR_cudaroseCode *) ir)->first_scope;
+  parameter_symtab = ((IR_cudaroseCode *) ir)->parameter;
+  body_symtab = ((IR_cudaroseCode *) ir)->body;
+  func_body = ((IR_cudaroseCode *) ir)->defn;
+  func_definition = ((IR_cudaroseCode *) ir)->func_defn;
+  std::vector<SgForStatement *> tf = ((IR_cudaroseCode *) ir)->get_loops();
+  
+  symtab = tf[loop_num]->get_symbol_table();
+  
+  std::vector<SgForStatement *> deepest = find_deepest_loops(
+    isSgNode(tf[loop_num]));
+  
+  for (int i = 0; i < deepest.size(); i++) {
+    SgVariableSymbol* vs;
+    SgForInitStatement* list = deepest[i]->get_for_init_stmt();
+    SgStatementPtrList& initStatements = list->get_init_stmt();
+    SgStatementPtrList::const_iterator j = initStatements.begin();
+    if (SgExprStatement *expr = isSgExprStatement(*j))
+      if (SgAssignOp* op = isSgAssignOp(expr->get_expression()))
+        if (SgVarRefExp* var_ref = isSgVarRefExp(op->get_lhs_operand()))
+          vs = var_ref->get_symbol();
+    
+    index.push_back(vs->get_name().getString().c_str()); //reflects original code index names
+  }
+  
+  for (int i = 0; i < stmt.size(); i++)
+    idxNames.push_back(index); //refects prefered index names (used as handles in cudaize v2)
+  useIdxNames = false;
+  
+}
+
+void LoopCuda::printIS() {
+ if (!cudaDebug) return;
+  int k = stmt.size();
+  for (int i = 0; i < k; i++) {
+    printf(" printing statement:%d\n", i);
+    stmt[i].IS.print();
+  }
+}
+