From c285135eb903c31cd221f90f03e288a6b67770cd Mon Sep 17 00:00:00 2001
From: Derick Huth <derickhuth@gmail.com>
Date: Thu, 24 Sep 2015 11:26:53 -0600
Subject: pre-v0.2.1

---
 INSTALL                            |   18 -
 Makefile-Old                       |  251 ---
 chill_run.cc                       |   15 +-
 chillmodule.cc                     |    4 +-
 dep.cc                             |    2 +-
 examples/chill/gemm.c              |   18 -
 examples/chill/gemm.script         |   31 -
 examples/chill/gemv.c              |   15 -
 examples/chill/gemv.script         |    9 -
 examples/chill/jacobi1.c           |   13 -
 examples/chill/jacobi1.script      |   18 -
 examples/chill/jacobi2.c           |   15 -
 examples/chill/jacobi2.script      |   21 -
 examples/chill/unroll.c            |   33 -
 examples/chill/unroll.script       |   35 -
 examples/cuda-chill/cp.c           |   29 -
 examples/cuda-chill/cp.lua         |   46 -
 examples/cuda-chill/cudaize.lua    | 1004 ---------
 examples/cuda-chill/cudaize.py     | 1047 ---------
 examples/cuda-chill/mm.c           |   10 -
 examples/cuda-chill/mm.lua         |   38 -
 examples/cuda-chill/mpeg4.c        |   23 -
 examples/cuda-chill/mpeg4.lua      |   45 -
 examples/cuda-chill/mriq-fh.c      |   38 -
 examples/cuda-chill/mriq-fh.lua    |   73 -
 examples/cuda-chill/mriq.c         |   33 -
 examples/cuda-chill/mriq.lua       |   55 -
 examples/cuda-chill/mv-shadow.c    |    9 -
 examples/cuda-chill/mv-shadow.lua  |   65 -
 examples/cuda-chill/mv.c           |    9 -
 examples/cuda-chill/mv.lua         |   65 -
 examples/cuda-chill/mv_try.c       |    9 -
 examples/cuda-chill/mv_try.lua     |   14 -
 examples/cuda-chill/nbody.c        |   66 -
 examples/cuda-chill/nbody.lua      |   53 -
 examples/cuda-chill/tmv-shadow.c   |    9 -
 examples/cuda-chill/tmv-shadow.lua |   50 -
 examples/cuda-chill/tmv.c          |    9 -
 examples/cuda-chill/tmv.lua        |   50 -
 examples/fortran/README            |   10 -
 examples/fortran/ccd.f             |   32 -
 examples/fortran/ccd.script        |   18 -
 examples/fortran/gemm.f90          |   58 -
 examples/fortran/gemm.script       |   30 -
 examples/fortran/rose_gemm.f90     |  155 --
 graph-test.cc                      |  148 --
 graph.hh                           |    3 +-
 include/ir_suif.hh                 |    1 -
 include/ir_suif_utils.hh           |    1 -
 ir_cuda_rose_utils.cc              |  191 --
 ir_cuda_suif_utils.cc              |   54 -
 ir_cudarose.cc                     |  165 --
 ir_cudarose.hh                     |   46 -
 ir_cudasuif.cc                     |  144 --
 ir_cudasuif.hh                     |   36 -
 loop.cc                            |    1 +
 loop_backup.cc                     | 3311 ----------------------------
 loop_cuda.cc                       | 2123 ------------------
 loop_cuda_rose.cc                  | 3734 -------------------------------
 loop_modified.cc                   | 4234 ------------------------------------
 mem_mapping_utils.cc               |   76 -
 mem_mapping_utils.hh               |   59 -
 omega/INSTALL                      |   34 -
 omega/README                       |   96 -
 omega/ROSE_INSTALL.txt             |   77 -
 omega/bin/oc                       |    1 -
 orig_loop_datacopy.cc              | 1175 ----------
 67 files changed, 12 insertions(+), 19348 deletions(-)
 delete mode 100644 INSTALL
 delete mode 100644 Makefile-Old
 delete mode 100644 examples/chill/gemm.c
 delete mode 100644 examples/chill/gemm.script
 delete mode 100644 examples/chill/gemv.c
 delete mode 100644 examples/chill/gemv.script
 delete mode 100644 examples/chill/jacobi1.c
 delete mode 100644 examples/chill/jacobi1.script
 delete mode 100644 examples/chill/jacobi2.c
 delete mode 100644 examples/chill/jacobi2.script
 delete mode 100644 examples/chill/unroll.c
 delete mode 100644 examples/chill/unroll.script
 delete mode 100644 examples/cuda-chill/cp.c
 delete mode 100644 examples/cuda-chill/cp.lua
 delete mode 100644 examples/cuda-chill/cudaize.lua
 delete mode 100755 examples/cuda-chill/cudaize.py
 delete mode 100644 examples/cuda-chill/mm.c
 delete mode 100644 examples/cuda-chill/mm.lua
 delete mode 100755 examples/cuda-chill/mpeg4.c
 delete mode 100644 examples/cuda-chill/mpeg4.lua
 delete mode 100755 examples/cuda-chill/mriq-fh.c
 delete mode 100755 examples/cuda-chill/mriq-fh.lua
 delete mode 100644 examples/cuda-chill/mriq.c
 delete mode 100644 examples/cuda-chill/mriq.lua
 delete mode 100644 examples/cuda-chill/mv-shadow.c
 delete mode 100644 examples/cuda-chill/mv-shadow.lua
 delete mode 100644 examples/cuda-chill/mv.c
 delete mode 100644 examples/cuda-chill/mv.lua
 delete mode 100644 examples/cuda-chill/mv_try.c
 delete mode 100644 examples/cuda-chill/mv_try.lua
 delete mode 100644 examples/cuda-chill/nbody.c
 delete mode 100644 examples/cuda-chill/nbody.lua
 delete mode 100644 examples/cuda-chill/tmv-shadow.c
 delete mode 100644 examples/cuda-chill/tmv-shadow.lua
 delete mode 100644 examples/cuda-chill/tmv.c
 delete mode 100644 examples/cuda-chill/tmv.lua
 delete mode 100644 examples/fortran/README
 delete mode 100644 examples/fortran/ccd.f
 delete mode 100644 examples/fortran/ccd.script
 delete mode 100644 examples/fortran/gemm.f90
 delete mode 100644 examples/fortran/gemm.script
 delete mode 100644 examples/fortran/rose_gemm.f90
 delete mode 100644 graph-test.cc
 delete mode 120000 include/ir_suif.hh
 delete mode 120000 include/ir_suif_utils.hh
 delete mode 100644 ir_cuda_rose_utils.cc
 delete mode 100644 ir_cuda_suif_utils.cc
 delete mode 100644 ir_cudarose.cc
 delete mode 100644 ir_cudarose.hh
 delete mode 100644 ir_cudasuif.cc
 delete mode 100644 ir_cudasuif.hh
 delete mode 100644 loop_backup.cc
 delete mode 100644 loop_cuda.cc
 delete mode 100644 loop_cuda_rose.cc
 delete mode 100644 loop_modified.cc
 delete mode 100644 mem_mapping_utils.cc
 delete mode 100644 mem_mapping_utils.hh
 delete mode 100644 omega/INSTALL
 delete mode 100644 omega/README
 delete mode 100644 omega/ROSE_INSTALL.txt
 delete mode 120000 omega/bin/oc
 delete mode 100644 orig_loop_datacopy.cc

diff --git a/INSTALL b/INSTALL
deleted file mode 100644
index aef619a..0000000
--- a/INSTALL
+++ /dev/null
@@ -1,18 +0,0 @@
-BUILD
-=====
-
-1. Edit Makefile. Change SUIFHOME and OMEGAHOME to correct paths.
-
-2. Do "make depend" in the chill directory.
-
-3. Optional, do "make clean" or "make veryclean" which removes additional
-   target files and flex/bison generated files.
-
-4. Do "make".
-
-
-INSTALLATION
-============
-
-You can use CHiLL in source directory since all links are already
-created in bin/, lib/ and include/ directories.
diff --git a/Makefile-Old b/Makefile-Old
deleted file mode 100644
index 7f2c8b5..0000000
--- a/Makefile-Old
+++ /dev/null
@@ -1,251 +0,0 @@
-
-.SUFFIXES:
-.PHONY: all depend depend-cuda-chill clean veryclean cuda-chill
-.PHONY: chill 
-
-CC = g++
-CFLAGS = -g -Wno-write-strings
-DEPENDENCE_CFLAGS = -M
-OMEGAHOME=./omega
-
-ifdef TEST_COVERAGE
-  CFLAGS := $(CFLAGS) -fprofile-arcs -ftest-coverage
-endif
-
-# TODO   auto-generate using config.h generated by autoconf?
-CHILLVERSION = "\"0.2.0\""
-PYTHON=python  #=$(shell `which python` ) 
-PYVERSION=$(shell $(PYTHON) -c "import sys; print(sys.version[:3])")  # 2.6
-PYTHONVER = python$(PYVERSION)
-PYTHONINCLUDE = $(shell $(PYTHON) -c "from distutils import sysconfig; print(sysconfig.get_python_inc())")
-PYTHONLIBDIR  = $(shell $(PYTHON) -c "from distutils import sysconfig; print(sysconfig.get_config_var('LIBDIR'))")
-PYTHONCONFIG  = $(shell $(PYTHON) -c "from distutils import sysconfig; print(sysconfig.get_config_var('LIBPL'))")
-# SCRIPT_LANG = lua <-- supplied by the command line
-
-
-# this creates a LUAHOME even if you don't have such a directory
-ifeq ($(strip $(wildcard $(LUAHOME))),)
-LUAHOME = $(HOME)/lua
-endif
-LUA_PATH = -L${LUAHOME}/lib
-
-
-# where do include files live
-INC_PATH = -I${PYTHONINCLUDE} -I${OMEGAHOME}/include -I${LUAHOME}/include
-
-# where do libraries live
-LIB_PATH = -L${OMEGAHOME}/code_gen/obj -L${OMEGAHOME}/omega_lib/obj 
-# seemingly not needed -L${PYTHONCONFIG}
-
-
-
-CORE_LIBS = -lm -lcodegen -lomega
-RUNNER_LIBS = -llua -ldl -lreadline -lhistory -lpthread -ldl -lutil -lm -l${PYTHONVER}
-
-TDLHOME = ${ROSEHOME}/libltdl
-
-BOOST_DATE_TIME_LIB = -lboost_date_time
-BOOST_FILESYSTEM_LIB = -lboost_filesystem
-BOOST_LDFLAGS = -L${BOOSTHOME}/lib
-BOOST_PROGRAM_OPTIONS_LIB = -lboost_program_options
-BOOST_REGEX_LIB = -lboost_regex
-BOOST_SYSTEM_LIB = -lboost_system
-BOOST_THREAD_LIB = -lboost_thread
-BOOST_WAVE_LIB = -lboost_wave
-
-ROSE_LIBS =  -lrose  $(BOOST_LDFLAGS) $(BOOST_DATE_TIME_LIB)\
-             $(BOOST_THREAD_LIB) $(BOOST_FILESYSTEM_LIB) $(BOOST_PROGRAM_OPTIONS_LIB)\
-             $(BOOST_REGEX_LIB)  $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB)  \
-             $(BOOST_WAVE_LIB) -lrt -ldl
-
-
-# Source files common to both chill and cuda-chill
-CORE_SRCS = dep.cc omegatools.cc irtools.cc loop.cc loop_basic.cc loop_datacopy.cc loop_unroll.cc loop_tile.cc loop_extra.cc
-LIB_SRCS = $(CORE_SRCS)
-
-# files that will be generated by bison, flex, and make that need to be removed at clean.
-GENERATED_SRCS = parser.tab.hh parser.tab.cc parse_expr.yy.cc parse_expr.ll.hh parse_expr.tab.cc parse_expr.tab.hh Makefile.deps 
-# object files that are specific to lua or python builds. -- This is used so that SCRIPT_LANG does not need to be specified during clean
-ORPHAN_OBJS = chill_run_util.o chillmodule.o parse_expr.tab.o parse_expr.yy.o
-
-# files used in chill and cuda-chill interfaces
-ifeq ($(SCRIPT_LANG),lua)
-  RUNNER_SRCS = chill_run.cc chill_env.cc
-else
-  ifeq ($(SCRIPT_LANG),python)
-    RUNNER_SRCS = chill_run.cc chillmodule.cc
-  else
-    RUNNER_SRCS = chill_run.cc chill_env.cc
-  endif
-endif
-
-# files used in chill but not cuda-chill
-IR_CHILL_SRCS = ir_rose.cc ir_rose_utils.cc
-ifeq ($(SCRIPT_LANG),lua)
-  YACC_SRCS = parse_expr.yy.cc parse_expr.tab.cc
-  CHILL_RUNNER_SRCS = chill_run_util.cc
-  CHILL_SRCS = $(CORE_SRCS) $(IR_CHILL_SRCS) $(CHILL_RUNNER_SRCS) $(RUNNER_SRCS)
-else
-  ifeq ($(SCRIPT_LANG),python)
-    YACC_SRCS = parse_expr.yy.cc parse_expr.tab.cc
-    CHILL_RUNNER_SRCS = chill_run_util.cc
-    CHILL_SRCS = $(CORE_SRCS) $(IR_CHILL_SRCS) $(CHILL_RUNNER_SRCS) $(RUNNER_SRCS)
-  else
-    YACC_SRCS = lex.yy.cc parser.tab.cc
-    CHILL_RUNNER_SRCS = 
-    CHILL_SRCS = $(CORE_SRCS) $(IR_CHILL_SRCS) $(YACC_SRCS) $(RUNNER_SRCS)
-  endif
-endif
-
-# source files for cuda-chill but not chill
-CUDACHILL_ONLY_SRCS = mem_mapping_utils.cc loop_cuda_rose.cc
-IR_CUDACHILL_SRCS = ir_rose.cc ir_rose_utils.cc ir_cudarose.cc ir_cuda_rose_utils.cc
-CUDACHILL_RUNNER_SRCS =
-CUDACHILL_SRCS = $(CORE_SRCS) $(CUDACHILL_ONLY_SRCS) $(IR_CUDACHILL_SRCS) $(RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS)
-
-# set interface language flags
-ifeq ($(SCRIPT_LANG),lua)
-  RUNNER_EXTRA_CFLAGS = -DLUA
-else
-  ifeq ($(SCRIPT_LANG),python)
-    RUNNER_EXTRA_CFLAGS = -DPYTHON
-  endif
-endif
-
-depend-cuda-chill: CFLAGS := $(CFLAGS) -DCUDACHILL
-cuda-chill: CFLAGS := $(CFLAGS) -DCUDACHILL
-
-ALL_SRCS = $(CORE_SRCS) $(YACC_SRCS) $(IR_CHILL_SRCS) $(CUDACHILL_ONLY_SRCS) $(IR_CUDACHILL_SRCS) $(RUNNER_SRCS) $(CHILL_RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS)
-ALL_OBJS = $(ALL_SRCS:.cc=.o) $(ORPHAN_OBJS)
-
-RUNNER_DEFINES = -DLUA_USE_LINUX -DCHILL_BUILD_VERSION=$(CHILLVERSION) -DCHILL_BUILD_DATE="\"$(CHILL_BUILD_DATE)\""
-
-
-YACC_EXTRA_CFLAGS =
-
-#####################################################################
-# compiler intermediate code specific definitions
-#####################################################################
-
-
-
-#LIBS := $(LIBS) $(ROSE_LIBS)
-LIB_PATH := $(LIB_PATH) -L${ROSEHOME}/lib -L${TDLHOME}
-#LIB_SRCS := $(LIB_SRCS) #  $(IR_SRCS)
-INC_PATH := $(INC_PATH) -I${ROSEHOME}/include -I${BOOSTHOME}/include
-YACC_EXTRA_CFLAGS := -DBUILD_ROSE
-RUNNER_EXTRA_CFLAGS := $(RUNNER_EXTRA_CFLAGS) -DBUILD_ROSE
-
-
-#####################################################################
-# build rules
-#####################################################################
-
-YACC_OBJS = $(YACC_SRCS:.cc=.o)
-RUNNER_OBJS = $(RUNNER_SRCS:.cc=.o)
-CHILL_RUNNER_OBJS = $(CHILL_RUNNER_SRCS:.cc=.o)
-CUDACHILL_RUNNER_OBJS = $(CUDACHILL_RUNNER_SRCS:.cc=.o)
-LIB_OBJS = $(LIB_SRCS:.cc=.o)
-IR_CHILL_OBJS = $(IR_CHILL_SRCS:.cc=.o) 
-IR_CUDACHILL_OBJS = $(IR_CUDACHILL_SRCS:.cc=.o) 
-CUDACHILL_ONLY_OBJS = $(CUDACHILL_ONLY_SRCS:.cc=.o)
-
-CHILL_OBJS     = $(CHILL_SRCS:.cc=.o)
-CUDACHILL_OBJS = $(CUDACHILL_SRCS:.cc=.o)
-
-
-all:
-	$(MAKE) depend-chill
-	$(MAKE) chill
-	$(MAKE) depend-cuda-chill
-	$(MAKE) cuda-chill 
-
-
-# can't these be combined to a superset of all source files?
-depend: depend-cuda-chill
-
-depend-chill: $(LIB_SRCS) $(RUNNER_SRCS) $(CHILL_RUNNER_SRCS) $(YACC_SRCS)
-	$(CC) $(DEPENDENCE_CFLAGS) $(INC_PATH) $(LIB_SRCS) $(RUNNER_SRCS) $(CHILL_RUNNER_SRCS) $(YACC_SRCS) > Makefile.deps
-
-depend-cuda-chill: $(LIB_SRCS) $(RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS)
-	$(CC) $(DEPENDENCE_CFLAGS) $(INC_PATH) $(LIB_SRCS) $(RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS) > Makefile.deps
-
-libchill_xform.a: $(LIB_OBJS) $(IR_CHILL_OBJS)
-	ar -rs $@ $(LIB_OBJS) $(IR_CHILL_OBJS)
-
-libcudachill_xform.a: $(LIB_OBJS) $(IR_CUDACHILL_OBJS) $(CUDACHILL_ONLY_OBJS)
-	ar -rs $@ $(LIB_OBJS) $(IR_CUDACHILL_OBJS) $(CUDACHILL_ONLY_OBJS)
-
-%.o: %.cc
-	$(CC) $(CFLAGS) $(INC_PATH) $< -c -o $@
-
-
-clean:
-	@rm -fr $(ALL_OBJS) $(YACC_SRCS) $(GENERATED_SRCS)
-
-veryclean:
-	@rm -fr $(ALL_OBJS) $(YACC_SRCS) libchill_xform.a libcudachill_xform.a chill cuda-chill
-
-
-cuda-chill: libcudachill_xform.a $(CUDACHILL_RUNNER_OBJS) $(RUNNER_OBJS)
-	$(CC) $(CFLAGS) $(LIB_PATH) $(LUA_PATH) $(CUDACHILL_RUNNER_OBJS) $(RUNNER_OBJS) $< $(CORE_LIBS) $(ROSE_LIBS) $(RUNNER_LIBS) -o $@
-
-ifeq ($(SCRIPT_LANG),lua)
-chill: libchill_xform.a $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $(YACC_OBJS)
-	$(CC) $(CFLAGS) $(LIB_PATH) $(LUA_PATH) $(YACC_OBJS) $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $< $(CORE_LIBS)  $(ROSE_LIBS) $(RUNNER_LIBS) -o $@
-else
-ifeq ($(SCRIPT_LANG),python)
-chill: libchill_xform.a $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $(YACC_OBJS)
-	$(CC) $(CFLAGS) $(LIB_PATH) $(YACC_OBJS) $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $< $(CORE_LIBS) $(ROSE_LIBS) $(RUNNER_LIBS) -o $@
-
-else
-chill: libchill_xform.a $(YACC_OBJS)
-	$(CC) $(CFLAGS) $(LIB_PATH) $(YACC_OBJS) $< $(CORE_LIBS)  $(ROSE_LIBS) -o $@
-endif
-endif
-
-
-lex.yy.cc: parser.ll parser.tab.hh
-	flex++ parser.ll
-
-lex.yy.o: lex.yy.cc
-	$(CC) $(CFLAGS) -c $< -o $@
-
-parser.tab.hh parser.tab.cc: parser.yy
-	bison -t -d $<
-
-parser.tab.o: parser.tab.cc
-	$(CC) $(CFLAGS) $(YACC_EXTRA_CFLAGS) $(INC_PATH) -DCHILL_BUILD_DATE="\"$(CHILL_BUILD_DATE)\"" -c $< -o $@
-
-
-parse_expr.tab.cc: parse_expr.yy
-	bison -t -d parse_expr.yy
-
-parse_expr.tab.o: parse_expr.tab.cc
-	$(CC) $(CFLAGS) $(YACC_CFLAGS) $(INC_PATH) -o $@ -c parse_expr.tab.cc
-
-parse_expr.yy.cc: parse_expr.tab.cc parse_expr.ll
-	flex -o parse_expr.yy.cc parse_expr.ll
-
-parse_expr.yy.o: parse_expr.yy.cc
-	$(CC) $(CFLAGS) $(YACC_CFLAGS) $(INC_PATH) -o $@ -c parse_expr.yy.cc
-
-$(RUNNER_SRCS:.cc=.o): %.o: %.cc
-	$(CC) $(CFLAGS) $(RUNNER_EXTRA_CFLAGS) $(INC_PATH) $(RUNNER_DEFINES) $< -c -o $@
-
-$(CHILL_RUNNER_SRCS:.cc=.o): %.o: %.cc
-	$(CC) $(CFLAGS) $(RUNNER_EXTRA_CFLAGS) $(INC_PATH) $(RUNNER_DEFINES) $< -c -o $@
-
-$(CUDACHILL_RUNNER_SRCS:.cc=.o): %.o %.cc
-	$(CC) $(CFLAGS) $(RUNNER_EXTRA_CFLAGS) $(INC_PATH) $(RUNNER_DEFINES) $< -c -o $@
-
-
-$(IR_SRCS:.cc=.o): %.o: %.cc
-	$(CC) -Wno-write-strings $(CFLAGS) $(INC_PATH) $< -c -o $@
-
-ifeq ($(shell test -f Makefile.deps && echo "true"), true)
-include Makefile.deps
-endif
-
-CHILL_BUILD_DATE = $(shell date +%m/%d/%Y)
-
diff --git a/chill_run.cc b/chill_run.cc
index a3c9180..d33819b 100644
--- a/chill_run.cc
+++ b/chill_run.cc
@@ -281,14 +281,14 @@ int main( int argc, char* argv[] )
     //---
     // Run a CHiLL interpreter
     //---
-    printf("CUDA-CHiLL v0.2.0 (built on %s)\n", CHILL_BUILD_DATE);
+    printf("CHiLL v0.2.1 (built on %s)\n", CHILL_BUILD_DATE);
     printf("Copyright (C) 2008 University of Southern California\n");
     printf("Copyright (C) 2009-2012 University of Utah\n");
     //is_interactive = true; // let the lua interpreter know.
     fflush(stdout);
     // TODO: read lines of python code.
     //Not sure if we should set fail from interactive mode
-    printf("CUDA-CHiLL ending...\n");
+    printf("CHiLL ending...\n");
     fflush(stdout);
   }
 
@@ -336,7 +336,7 @@ int main( int argc, char* argv[] )
     //---
     // Run a CHiLL interpreter
     //---
-    printf("CUDA-CHiLL v0.2.0 (built on %s)\n", CHILL_BUILD_DATE);
+    printf("CUDA-CHiLL v0.2.1 (built on %s)\n", CHILL_BUILD_DATE);
     printf("Copyright (C) 2008 University of Southern California\n");
     printf("Copyright (C) 2009-2012 University of Utah\n");
     is_interactive = true; // let the lua interpreter know.
@@ -359,7 +359,6 @@ int main( int argc, char* argv[] )
     #endif
     #ifdef BUILD_ROSE
     ((IR_cudaroseCode *)(ir_code))->commit_loop(myloop, lnum);
-    ((IR_roseCode*)(ir_code))->finalizeRose();
     #elif BUILD_SUIF
     ((IR_cudasuifCode *)(ir_code))->commit_loop(myloop, lnum);
     #endif
@@ -375,16 +374,14 @@ int main( int argc, char* argv[] )
     lnum_end = get_loop_num_end(L);
     DEBUG_PRINT("calling ROSE code gen?    loop num %d - %d\n", lnum_start, lnum_end);
     #endif
-    
+#endif
     #ifdef BUILD_ROSE
     finalize_loop(lnum_start, lnum_end);
     //((IR_roseCode*)(ir_cide))->commit_loop(myloop, lnum);
     ((IR_roseCode*)(ir_code))->finalizeRose();
-    #elif BUILD_SUIF
-    ((IR_suifCode*)(ir_code))->commit_loop(myloop, lnum);
+    //#elif BUILD_SUIF
+    //((IR_suifCode*)(ir_code))->commit_loop(myloop, lnum);
     #endif
-    
-#endif
     delete ir_code;
   }
 #ifdef PYTHON
diff --git a/chillmodule.cc b/chillmodule.cc
index fa55199..fbeb477 100644
--- a/chillmodule.cc
+++ b/chillmodule.cc
@@ -1431,7 +1431,7 @@ static PyObject* chill_permute(PyObject* self, PyObject* args) {
     int stmt_num = intArg(args, 1);
     int level = intArg(args, 2);
     std::vector<int> pi;
-    if(!tointvector(args, 2, pi))
+    if(!tointvector(args, 3, pi))
       throw std::runtime_error("the third argument in permute(stmt_num, level, pi) must be an int vector");
     myloop->permute(stmt_num, level, pi);
   }
@@ -1750,7 +1750,7 @@ static PyMethodDef ChillMethods[] = {
   {"print_space",         chill_print_space,         METH_VARARGS,    "print something or other "},
   {"add_sync",            chill_add_sync,            METH_VARARGS,    "add sync, whatever that is"},
   {"rename_index",        chill_rename_index,        METH_VARARGS,    "rename a loop index"},
-  {"permute",             chill_permute_v2,          METH_VARARGS,    "change the order of loops?"},
+  {"permute",             chill_permute,             METH_VARARGS,    "change the order of loops?"},
   {"tile3",               chill_tile_v2_3arg,        METH_VARARGS,    "something to do with tile"},
   {"tile7",               chill_tile_v2_7arg,        METH_VARARGS,    "something to do with tile"},
   {"thread_dims",         thread_dims,               METH_VARARGS,    "tx, ty, tz "},
diff --git a/dep.cc b/dep.cc
index 7bf781a..a675d03 100644
--- a/dep.cc
+++ b/dep.cc
@@ -37,7 +37,7 @@ std::ostream& operator<<(std::ostream &os, const DependenceVector &d) {
   
   switch (d.type) {
   case DEP_W2R:
-    os << "flow";
+    os << "true";
     if (d.is_reduction)
       os << "_reduction";
     break;
diff --git a/examples/chill/gemm.c b/examples/chill/gemm.c
deleted file mode 100644
index a565511..0000000
--- a/examples/chill/gemm.c
+++ /dev/null
@@ -1,18 +0,0 @@
-
-#define N 512 
-
-int main() {
-
-	float a[N][N], b[N][N], c[N][N];
-
-	int i, j, k;
-
-	for (j = 0; j < N; j++)
-		for (k = 0; k < N; k++)
-			for (i = 0; i < N; i++) {
-				c[i][j] = c[i][j] + a[i][k] * b[k][j];
-			}
-
-	return 0;
-}
-
diff --git a/examples/chill/gemm.script b/examples/chill/gemm.script
deleted file mode 100644
index ed91567..0000000
--- a/examples/chill/gemm.script
+++ /dev/null
@@ -1,31 +0,0 @@
-#matrix multiply large array size for intel machine
-source: gemm.c
-procedure: main
-format: rose
-loop: 0
-
-TI = 128
-TJ = 8
-TK = 512
-UI = 2
-UJ = 2
-
-permute([3,1,2])
-tile(0,2,TJ)
-#print space
-tile(0,2,TI)
-#print space
-tile(0,5,TK)
-#print space
-
-datacopy(0,3,a,false,1)
-#print space
-
-datacopy(0,4,b)
-print
-unroll(0,4,UI)#print space
-print 
-unroll(0,5,UJ)
-#print space
-print
-
diff --git a/examples/chill/gemv.c b/examples/chill/gemv.c
deleted file mode 100644
index 610d4cb..0000000
--- a/examples/chill/gemv.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#define N 10
-
-int main() {
-	// int n;
-	float a[N];
-	float b[N];
-	float c[N][N];
-
-	int i, j;
-
-	for (i = 1; i < N; i++)
-		for (j = 1; j < N; j++)
-			a[i] = a[i] + c[i][j] * b[j];
-
-}
diff --git a/examples/chill/gemv.script b/examples/chill/gemv.script
deleted file mode 100644
index f1d5f89..0000000
--- a/examples/chill/gemv.script
+++ /dev/null
@@ -1,9 +0,0 @@
-source: gemv.c # matrix-vector multiply
-procedure: main
-format : rose
-loop: 0
-
-
-
-original()
-print
diff --git a/examples/chill/jacobi1.c b/examples/chill/jacobi1.c
deleted file mode 100644
index 0fcaee4..0000000
--- a/examples/chill/jacobi1.c
+++ /dev/null
@@ -1,13 +0,0 @@
-#define N 512
-
-int main() {
-	int i, t;
-
-	float a[N][N];
-
-	for (t = 2; t <= 100; t++)
-		for (i = 2; i <= N - 1; i++)
-			a[t][i] = a[t - 1][i - 1] + a[t - 1][i] + a[t - 1][i + 1];
-
-	return 0;
-}
diff --git a/examples/chill/jacobi1.script b/examples/chill/jacobi1.script
deleted file mode 100644
index c0dec8d..0000000
--- a/examples/chill/jacobi1.script
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# tiling perfect jacobi loop nest with time step, use
-# unimodular transformation first (only applicable to the
-# perfect loop nest) to make tiling legal.
-#
-
-source: jacobi1.c
-procedure: main
-format : rose
-loop: 0
-
-print dep
-
-nonsingular([[1,0],[1,1]])  # unimodular matrix, determinant is one
-tile(0,2,64)
-
-print dep
-print
diff --git a/examples/chill/jacobi2.c b/examples/chill/jacobi2.c
deleted file mode 100644
index b8d8d7b..0000000
--- a/examples/chill/jacobi2.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#define N 512
-
-int main() {
-	double a[N];
-	double b[N];
-	int t, i;
-	for (t = 1; t <= 100; t++) {
-		for (i = 2; i <= N - 1; i++)
-			b[i] = (double) 0.25 * (a[i - 1] + a[i + 1]) + (double) 0.5 * a[i];
-
-		for (i = 2; i <= N - 1; i++)
-			a[i] = b[i];
-	}
-	return 0;
-}
diff --git a/examples/chill/jacobi2.script b/examples/chill/jacobi2.script
deleted file mode 100644
index afe14c6..0000000
--- a/examples/chill/jacobi2.script
+++ /dev/null
@@ -1,21 +0,0 @@
-#
-# tiling imperfect jacobi loop nest, more details in the paper
-# "Automatic Tiling of Iterative Stencil Loops" by Zhiyuan Li and
-# Yonghong Song, TOPLAS, 2004.
-#
-
-source: jacobi2.c
-procedure: main
-format: rose
-loop: 0
-
-print dep
-
-original()
-shift([1], 2, 1)
-fuse([0,1], 2)  # optional
-skew([0,1], 2, [2,1])
-tile(0, 2, 32, 1)
-
-print dep
-print
diff --git a/examples/chill/unroll.c b/examples/chill/unroll.c
deleted file mode 100644
index e74dea3..0000000
--- a/examples/chill/unroll.c
+++ /dev/null
@@ -1,33 +0,0 @@
-
-#define N 14
-#define DT 0.314
-
-void foo(int n, float* x, float* y, float* z, float* f3, float* f1, float* w) {
-
-	int i, j;
-
-	for (i = 1; i <= 14; i++)
-		x[i] = 1.0;
-
-	for (i = 1; i <= 14; i += 3)
-		y[i] = 1.0;
-
-	for (i = N + 1; i <= N + 20; i += 3)
-		z[i] = 1.0;
-
-	for (i = 0; i <= N; i++) {
-		for (j = i; j <= i + N; j++)
-			f3[i] = f3[i] + f1[j] * w[j - i];
-		f3[i] = f3[i] * DT;
-	}
-
-	return 0;
-}
-
-int main() {
-	float x[N], y[N], z[N], f3[N], f1[N], w[N];
-
-	foo(N, x, y, z, f3, f1, w);
-	return 0;
-}
-
diff --git a/examples/chill/unroll.script b/examples/chill/unroll.script
deleted file mode 100644
index e64acb6..0000000
--- a/examples/chill/unroll.script
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Test unroll-and-jam. The last loop adapted from the simple
-# convolution example from p463 of "Optimizing Compilers for
-# Modern Architectures", by Randy Allen and Ken Kennedy.
-#
-
-source: unroll.c
-procedure: foo
-format: rose
-# fully unroll a loop with known iteration count
-loop: 0
-original()
-unroll(0,1,3)
-print
-print space
-
-
-# a strided loop
-loop: 1
-original()
-unroll(0,1,2)
-print
-print space
-
-# lower and upper bounds are not constant
-loop: 2
-original()
-unroll(0,1,20)
-print
-
-# parallelogram iteration space
-loop: 3
-original()
-unroll(0,1,2)
-print
diff --git a/examples/cuda-chill/cp.c b/examples/cuda-chill/cp.c
deleted file mode 100644
index 837d7a6..0000000
--- a/examples/cuda-chill/cp.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#define N 1
-
-#define VOLSIZEY 512
-#define VOLSIZEX 512
-#define VOLSIZEZ 1
-#define ATOMCOUNT 4000
-#define GRIDSPACING 0.1
-#define zDim 0
-
-extern float sqrtf(float);
-
-void cenergy_cpu(float atoms[ATOMCOUNT*4],float *energy,float z)
-{
-int i,j,n;float dx,dy,dz; 
-   
-    for (j=0; j<VOLSIZEY; j++) {
-        for (i=0; i<VOLSIZEX; i++) {
-            	  for (n=0;n<ATOMCOUNT;n+=4) {
-				dx = (GRIDSPACING * i) - atoms[n];
-				dy = (GRIDSPACING * j) - atoms[n+1];
-				dz = z - atoms[n+2];
-        		        energy[(j*VOLSIZEX + i)+VOLSIZEX*VOLSIZEY*zDim] += atoms[n+3]/sqrtf( (dx*dx) + (dy*dy)+ (dz*dz) ) ;
-            }
-              
-
-        }
-    }
-}
-
diff --git a/examples/cuda-chill/cp.lua b/examples/cuda-chill/cp.lua
deleted file mode 100644
index 1ef2264..0000000
--- a/examples/cuda-chill/cp.lua
+++ /dev/null
@@ -1,46 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("cp.c", "cenergy_cpu", 0) 
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
-                     --copy_to_shared methods
-V=512
-N=4000
-N=1
-
-Tj=32
-Ti=16
-Tii=16
-Tjj=16
-
---normalize_index("j")
---normalize_index("i")
-print_code()
-normalize_index("n")
--- TILE COMMANDS ZEROOOOOOOOOOO:3
---permute(0,{"i","j","n"})
---tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","n"})--CU=-1
-tile_by_index({"j","i"},{Tj,Ti},{l1_control="jj",l2_control="ii"},{"jj","ii","j","i","n"})--CU=-1
---tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1
-
---tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3
---tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","i","iii","j","jjj","n"})--CU=3
---tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"})
---tile_by_index({"i"}, {Tii}, {l1_control="iii",l1_tile="i"}, {"ii", "jj", "iii","i","j","n"})
-print_code()
-cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"jj","ii"}, thread={"j","i"}})--CU=3
---cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"ii","jj"}, thread={"i","j"}})--CU=3
-print_code()
-copy_to_shared("tx","atoms",-16)
-copy_to_registers("tx","energy")
---copy_to_texture("atoms")
---unroll_to_depth(1)
---unroll(0,9,0)
---unroll(0,5,0)
-
---unroll(0,8,256)
-print_code()
diff --git a/examples/cuda-chill/cudaize.lua b/examples/cuda-chill/cudaize.lua
deleted file mode 100644
index 7359cca..0000000
--- a/examples/cuda-chill/cudaize.lua
+++ /dev/null
@@ -1,1004 +0,0 @@
-
--- THIS IS CUDAIZE.LUA
-
-function table.contains_key(table, key)
-   for k in pairs(table) do
-      if k == key then
-         return true
-      end
-   end
-   return false
-end
-
-function valid_indices(stmt, indices)
-   --print( "valid_indices() lua calling C cur_indices")
-   --io.flush()
-   cur = cur_indices(stmt) 
-   --print("Cur indices "..list_to_string(cur))
-   for idx in pairs(indices) do
-      if not table.contains_key(cur,idx) then
-         return false
-      end
-   end
-   return true
-end
-
-function next_clean_level(cur_idxs,level)
-   --print("next_clean_level( ..., "..level.." )")
-   --print(string.format("indices_at_each_level %s ",list_to_string(cur_idxs) ))
-   
-   --print("loop to "..#cur_idxs)
-   for i=level+1,#cur_idxs do
-      --print("Checking level "..i.." = '"..cur_idxs[i].."'")
-      if (# cur_idxs[i] > 0) then
-         --print("Good enough"..(# cur_idxs[i]))
-         --print("returning "..i)
-         return i
-      end
-   end
-   return -1 --sentinal that there were no non-dummy indices left
-end
-
-function build_order(final_order, tile_idx_names, ctrl_idx_names, tile_idx_map, cur_level)
-   order = {}
-   --print("\nbuild_order()")
-   --print("build_order(): final_order = ( "..list_to_string(final_order).." )")
-   --print("build_order(): ctrl_idx_names = ("..list_to_string(ctrl_idx_names).." )")
-   --print("cur_level "..cur_level.."")
-   --io.flush()
-   
-   for i,k in ipairs(final_order) do
-      skip = false
-      cur = final_order[i]
-      --print("\ncur "..cur.." = final_order["..i.."] = "..final_order[i].."  ")
-      --control loops below our current level should not be in the current order
-      for j=cur_level+2,# ctrl_idx_names do
-         --print("j "..j.." final_order["..i.."] = "..final_order[i].."  ")
-         if ctrl_idx_names[j] == final_order[i] then
-            skip = true
-            --print("SKIP "..final_order[i].."  ")
-            --io.flush()
-         end
-      end
-      --possibly substitute tile indices ifn necessar
-      if table.contains_key(tile_idx_map,final_order[i]) then
-         approved_sub = false
-         sub_string = tile_idx_map[final_order[i]]
-         for j=cur_level+2,# tile_idx_names do
-            if tile_idx_names[j] == sub_string then
-               approved_sub = true
-            end
-         end
-         if approved_sub then
-            cur = sub_string
-         end
-      end
-      if not skip then
-         table.insert(order,cur)
-      end
-   end
-   return order
-end
-
-function list_to_string(str_list)
-   --Helpful debug output
-   l = ""
-   for i,str in ipairs(str_list) do
-      if i > 1 then
-         l = l .. ", " .. str
-      else
-         l = str
-      end
-   end
-   return l
-end
-
-
-function find_cur_level(stmt,idx)
-   --Search cur_indices for a idx at stmt
-   cur = cur_indices(stmt)
-   --print(string.format("find_cur_level(stmt %d, idx %s)  Cur indices %s", stmt, idx, list_to_string(cur)))
-   for i,cidx in ipairs(cur) do
-      if cidx == idx then
-         --print(string.format("found it at index %d", i))
-         return i
-      end
-   end
-   error("Unable to find "..idx.." in current list of indices")
-end
-
-
-function chk_cur_level(stmt,idx)
-   --Search cur_indices for a idx at stmt
-   cur = cur_indices(stmt)
-   for i,cidx in ipairs(cur) do
-      if cidx == idx then
-         return i
-      end
-   end
-   return -1
-end
-
-
-function find_offset(cur_order, tile, control)
-   --print("Looking for tile '"..tile.."' and control '"..control.."' in ( "..list_to_string(cur_order)..", )")
-   idx1 = -1
-   idx2 = -1
-   for i,cur in ipairs(cur_order) do
-      if(cur == tile) then
-         idx1 = i
-      end
-      if(cur == control) then
-         idx2 = i
-      end
-   end
-   if(idx1 < 0) then
-      error("Unable to find tile " .. tile .. " in current list of indices")
-   end
-   if(idx2 < 0) then
-      error("Unable to find control " .. control .. " in current list of indices")
-   end
-   --print("found at level " .. idx2 .. " and " .. idx1)
-   if(idx2 < idx1) then
-      return idx2-idx1+1
-   else
-      return idx2-idx1
-   end
-end
-
-function tile_by_index(tile_indices, sizes, index_names, final_order, tile_method)
-   --print "STARTING TILE BY INDEX"
-   --io.flush()
-   stmt = 0 --assume stmt 0
-   cur = cur_indices(stmt)
-   --print("Cur indices "..list_to_string(cur))
-   if not valid_indices(stmt,tile_indices) then
-      error('One of the indices in the first parameter were not '..
-            'found in the current set of indices.')
-   end
-   if not tile_method then tile_method = counted end
-   tile_idx_names = {}
-   for i,s in ipairs(tile_indices) do tile_idx_names[i]=s end --shallow copy
-   --print("tile_index_names: ['"..list_to_string(tile_indices).."']")
-   
-   --print("index_names:  ") 
-   --for k,v in pairs(index_names) do print(k,v) end
-   
-   --io.flush()
-   
-   ctrl_idx_names = {}
-   tile_idx_map = {}
-   for k,v in pairs(index_names) do
-      valid = false
-      if(string.sub(k,1,1) == "l") then
-         if string.sub(k,-8) == "_control" then
-            i = tonumber(string.sub(k,2,-9))
-            if i and i >= 1 and i <= (# tile_indices) then
-               ctrl_idx_names[i] = v
-               --print(string.format("Handling control %s for loop level %d",v,i))
-               --print("control "..k.."   name  "..v.." ")
-               valid = true
-            end
-         elseif string.sub(k,-5) == "_tile" then
-            i = tonumber(string.sub(k,2,-6))
-            if i and i >= 1 and i <= (# tile_indices) then
-               --print(string.format("tile %s -> %s",tile_indices[i], v))
-               tile_idx_names[i] = v
-               tile_idx_map[v] = tile_indices[i]
-               --print(string.format("tile %s -> %s",tile_indices[i], v))
-               valid = true
-            end
-         end
-      end
-      if not valid then error(string.format("%s is not a proper key for specifying "..
-                                            "tile or control loop indices\n", k)) end
-   end
-   
-   --filter out control indices (and do name substitution of unprocessed tile indices) for a given level
-   cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, -1)
-   permute(stmt, cur_order)
-   
-   for i,cur_idx in ipairs(tile_indices) do
-      --print(string.format("i %d  cur_idx %s calling build order ********", i-1, cur_idx))
-      cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
-      --Find a offset between tile loop and control loop
-      -- 0   = control loop one level above tile loop
-      -- -1  = control loop two levels above tile loop
-      -- > 0 = tile loop above control loop
-      -- In the last case, we do two extra tile commands to get the control
-      -- above the tile and then rely on the final permute to handle the
-      -- rest
-      level = find_cur_level(stmt,cur_idx)
-      offset = find_offset(cur_order, tile_idx_names[i], ctrl_idx_names[i])
-      --print(string.format("offset %d", offset))
-      
-      if (offset <= 0) then
-         --print(string.format("[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %s)",stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method)) 
-         tile(stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method)
-      else
-         --print(string.format("2tile(%d, %d, %d, %d, %s, %s, %s)", stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method))
-         tile(stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method);--regular level
-         --flip tile and control loop
-         --print(string.format("3tile(%d, %d, %d)",stmt, level+1, level+1))
-         tile(stmt, level+1, level+1);
-         --print(string.format("4tile(%d, %d, %d)",stmt, level+1, level))
-         tile(stmt, level+1, level);
-         --print(string.format("\n[offset>0]tile(%d, %d, %d, %d,%s,%s,%s)",stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method)) 
-	 --print_code()
-         
-      end
-      
-      --Do permutation based on cur_order
-      --print "permute based on build order calling build_order()"
-      --print "cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)"
-      cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
-      --print "permute(stmt, cur_order);"
-      permute(stmt, cur_order);
-      --print "\nafter permute(), code is:"
-      --print_code()
-   end
-   --print "ENDING TILE BY INDEX"
-   --print_code()
-end
-
-function normalize_index(index)
-   stmt = 0 --assume stmt 0cur = cur_indices(stmt)
-   --print("Cur indices "..list_to_string(cur))
-   l = find_cur_level(stmt, index)
-   tile(stmt, l, l)
-   --print(string.format("\n[Normalize]tile(%d, %d, %d)",stmt, l,l)) 
-end
-
-function is_in_indices(stmt, idx)
-   cur = cur_indices(stmt)
-   for i=0,#cur,1 do
-      if(cur[i]==idx) then
-         return true
-      end
-   end
-   return false
-   
-end
-
-
-function copy_to_registers(start_loop, array_name)
-   
-   --print("\n\n****** starting copy to registers")
-   io.flush()
-
-   stmt = 0 --assume stmt 0
-   
-   -- [Malik] first we make sure that tx and ty are consecutive loops in the 2D thread setup, otherwise all levels for subsequent operations are messed up. Start logic.
-   cur = cur_indices(stmt)
-   table_Size = table.getn(cur)
-   
-   --print(string.format("Cur indices %s,",list_to_string(cur)))
-   --print(string.format("The table size is %d", table_Size))
-   --table.foreach(cur, print)
-   --print_code()
-   
-   level_tx = -1
-   level_ty = -1
-   if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end
-   if is_in_indices(stmt,"ty") then level_ty = find_cur_level(stmt,"ty") end
-   --print(string.format("level_tx %d  level_ty %d", level_tx, level_ty))
-   
-   ty_lookup_idx = "" 
-   org_level_ty = level_ty
-   
-   --if(cur[level_tx+1]~=nil and cur[level_tx+1]~="") then ty_lookup = ty_lookup+1 end
-   if(cur[level_ty+1]~=nil and cur[level_ty+1]~="") then 
-      --print(string.format("IF  cur[%d] = %s", level_ty+1, cur[level_ty+1]))
-      ty_lookup_idx = cur[level_ty+1] 
-   else
-      --if cur[level_ty]  ~= nil then print(string.format("ELSE ty_lookup_idx = cur[%d] = %s", level_ty, cur[level_ty])) --   TODO 
-      --else print "ELSE (dangerous)" end
-      ty_lookup_idx = cur[level_ty]  -- may assign nil !?
-   end
-   --if ty_lookup_idx ~= nil then print(string.format("ty_lookup_idx '%s'", ty_lookup_idx))  --  TODO 
-   --else print "ty_lookup_idx is NIL"
-   --end
-   
-   if level_ty > 0 then
-      --print(string.format("\ntile3(%d,%d,%d)",stmt,level_ty,level_tx+1))
-      tile(stmt,level_ty,level_tx+1) 
-   end
-   --print_code()
-   
-   --print("\ntylookup is %d",ty_lookup)
-   --exit(0)
-   --
-   cur = cur_indices(stmt)
-   table_Size = table.getn(cur)
-   --print(string.format("Cur indices %s,",list_to_string(cur)))
-   --print("The table size is "..table.getn(cur))
-   --table.foreach(cur, print)
-   
-   if is_in_indices(stmt,"tx") then   level_tx = find_cur_level(stmt,"tx") end
-   if ty_lookup_idx then
-      if is_in_indices(stmt,ty_lookup_idx) then level_ty = find_cur_level(stmt,ty_lookup_idx) end
-   end
-   
-   ty_lookup = 1
-   idx_flag = -1
-   -- find the level of the next valid index after ty+1
-   --print(string.format("\nlevel_ty %d", level_ty))
-   if level_ty > 0 then
-      --print(string.format("table_Size %d", table_Size))
-      for num= level_ty+ty_lookup,table_Size do
-         --print(string.format("num=%d   cur[num] = '%s'",num, cur[num]))
-         if(cur[num] ~= "") then
-            idx_flag = find_cur_level(stmt,cur[num])
-            --print (string.format("idx_flag = %d", idx_flag))
-            break
-         end
-      end
-   end
-   
-   --print(string.format("\n(first) I am checking all indexes after ty+1 %s",idx_flag))
-   --print_code()
-   --print ""
-   
-   how_many_levels = 1
-   startat = idx_flag + 1
-   if startat == 0 then startat = 1 end  -- avoid attempt to examine an illegal array offset
-   --print(string.format("idx_flag = %d   I will check levels starting with %d", idx_flag, idx_flag+1))
-   
-   for ch_lev = startat,table_Size,1 do    -- was for ch_lev = idx_flag+1,table_Size,1 do
-      --print(string.format("ch_lev %d", ch_lev))
-      if(cur[ch_lev] ~= nil and cur[ch_lev] ~= "") then
-         --print(string.format("cur[%d] = '%s'", ch_lev, cur[ch_lev])) 
-         how_many_levels = how_many_levels+1
-      end
-   end
-   --print("\nHow Many Levels",how_many_levels)
-   
-   -- change this all to reflect the real logic which is to normalize all loops inside the thread loops. 
-   if(how_many_levels <2) then
-      while( idx_flag >= 0) do
-         for num = level_ty+ty_lookup,(table_Size) do
-            --print(string.format("at top of loop, num is %d", num))
-            --print(string.format("num %d", num))
-            --print(string.format("cur[num] = '%s'", cur[num]))
-            if(cur[num] ~= "") then
-               idx=cur[num]
-               --print(string.format("idx '%s'", idx))
-               
-               curlev = find_cur_level(stmt,idx)
-               --print(string.format("curlev %d", curlev))
-               
-               --print_code()
-               --print(string.format("\n[COPYTOREG]tile(%d,%d,%d)",stmt,find_cur_level(stmt,idx),level_tx))
-               tile(stmt,find_cur_level(stmt,idx),find_cur_level(stmt,idx))
-               curlev = find_cur_level(stmt,idx)
-               --print(string.format("curlev %d", curlev))
-               tile(stmt,find_cur_level(stmt,idx),level_tx)
-               --print(string.format("hehe '%s'",cur[num]))
-               
-               cur = cur_indices(stmt)
-               --print("Cur indices INSIDE"..list_to_string(cur))
-               table_Size = table.getn(cur)
-               --print(string.format("Table Size is: %d",table_Size))
-               level_tx = find_cur_level(stmt,"tx")
-               --print(string.format("\n level TX is: %d",level_tx))
-               level_ty = find_cur_level(stmt,ty_lookup_idx)
-               --print(string.format("\n level TY is: %d",level_ty))
-               idx_flag = -1
-               --print "idx_flag = -1"
-               
-               -- find the level of the next valid index after ty+1
-               
-               -- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?)
-               for num= level_ty+ty_lookup,table_Size do
-                  --print(string.format("num mucking num = %d", num))
-                  if(cur[num] ~= nil and cur[num] ~= "") then
-                     idx_flag = find_cur_level(stmt,cur[num])
-                     --print("\n(second) I am checking all indexes after ty+1 %s",cur[num])
-                     break
-                  end
-               end
-               --print(string.format("num mucked to %d     idx_flag = %d", num, idx_flag))
-               
-            end
-            --print(string.format("at bottom of loop, num is %d", num))
-         end
-      end
-   end
-   --print "done with levels"
-   
-   
-   
-   
-   --print "ARE WE SYNCED HERE?"
-   --print_code()
-   --print("\ntile(%d,%d,%d)",stmt,level_k,level_k)
-   --tile(stmt,level_k,level_k)
-   
-   -- [Malik] end logic
-   --print_code()
-   start_level = find_cur_level(stmt, start_loop)
-   --We should hold contant any block or tile loop
-   block_idxs = block_indices()
-   thread_idxs = thread_indices()
-   --print("\nblock indices are")
-   --table.foreach(block_idxs, print)
-   --print("\nthread indices are")
-   --table.foreach(thread_idxs, print)
-   --print(string.format("\nStart Level: %d",start_level))
-   
-   hold_constant = {}
-   --print("\n Now in Blocks")
-   for i,idx in ipairs(block_idxs) do
-      --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx)))
-      if find_cur_level(stmt,idx) >= start_level then
-         table.insert(hold_constant, idx)
-         --print(string.format("\nJust inserted block %s in hold_constant",idx))
-      end
-   end
-   
-   
-   --print("\n Now in Threads")
-   for i,idx in ipairs(thread_idxs) do
-      --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx)))
-      if find_cur_level(stmt,idx) >= start_level then
-         table.insert(hold_constant, idx)
-         --print(string.format("\nJust inserted thread %s in hold_constant",idx))
-      end
-   end
-   
-   --print "\nhold constant table is: "
-   --table.foreach(hold_constant, print)
-   
-   --print("\nbefore datacopy pvt")
-   old_num_stmts = num_statements()
-   --print_code()
-   --print(string.format("\n[DataCopy]datacopy_privatized(%d, %s, %s, vector having privatized levels)",stmt, start_loop, array_name)) 
-   --table.foreach(hold_constant, print)
-   datacopy_privatized(stmt, start_loop, array_name, hold_constant)
-   
-   --print(hold_constant)
-   new_num_stmts = num_statements()
-   --print("\nthe num of statements:%d\n",new_num_stmt)
-   --print_code()
-   --exit(0)
-   -- [Malik] normalize the copy loops created.
-   cur = cur_indices(old_num_stmts)
-   --print("Cur indices "..list_to_string(cur))
-   for cidx,i in ipairs(cur) do
-      if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
-         --tile(old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i))
-         --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i))
-      end
-   end
-   --print_code()
-   --print("\nthe num of statements OLD+1 :",(old_num_stmts+1))  
-
-
---[[ 
-   is this commented out? why yes, yes it is   block comment 
-   if( (old_num_stmts+1) <= new_num_stmts) then
-      cur = cur_indices(old_num_stmts+1)
-      --print("Cur indices+1 "..list_to_string(cur))
-      for cidx,i in ipairs(cur) do
-         if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
-            tile(old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
-	    --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
-         end
-      end
-   end
---]]
-
-
-   --Unroll to the last thread level
-   --for stmt=old_num_stmts,new_num_stmts-1 do
-   -- level = find_cur_level(stmt,thread_idxs[#thread_idxs])--get last thread level
-   --if level < #cur_indices(stmt) then
-   -- unroll(stmt,level+1,0)
-   --print(string.format("\n[Unroll]unroll(%d, %d, 0)",stmt, level+1)) 
-   ----print_code()
-   --end
-   --end
-   io.flush()
-   --print("****** ending copy to registers\n\n")
-   --io.flush()
-end
-
-function copy_to_shared(start_loop, array_name, alignment)
-   --print(string.format("\nstarting copy to shared(%s, %s, %d )",start_loop,array_name,alignment))
-   stmt = 0 --assume stmt 0
-   cur = cur_indices(stmt)
-   --print("Cur indices "..list_to_string(cur))
-   
-   start_level = find_cur_level(stmt, start_loop)
-   --print(string.format("start_level %d", start_level))
-   
-   old_num_stmts = num_statements()
-   --print(string.format("old_num_statements %d", old_num_stmts))
-   
-   --Now, we give it indices for up to two dimentions for copy loop
-   copy_loop_idxs = {"tmp1","tmp2"}
-   --print(string.format("\n[DataCopy]datacopy(%d, %d, %s, {\"tmp1\",\"tmp2\"},false,0,1,%d,true)",stmt, start_level, array_name, alignment)) 
-   datacopy(stmt, start_level, array_name, copy_loop_idxs, false, 0, 1, alignment,true)
-   
-   add_sync(stmt,start_loop)
-   new_num_stmts = num_statements()
-   
-   --This is fairly CUBLAS2 specific, not sure how well it generalizes,
-   --but for a 2D copy, what we want to do is "normalize" the first loop
-   --"tmp1" then get its hard upper bound. We then want to tile it to
-   --make the control loop of that tile "ty". We then tile "tmp2" with a
-   --size of 1 and make it "tx".
-   --print(string.format("fairly CUBLAS2 specific, OLD %d  NEW %d",  old_num_stmts, new_num_stmts ))
-   
-   for stmt=old_num_stmts,new_num_stmts-1 do
-      --print(string.format("for stmt = %d", stmt))
-      was_no_error, level = pcall(find_cur_level, stmt, "tmp2")
-      
-      if was_no_error then 
-         --print_code() 
-         --print("\nCopy to shared: [If was no error]\n")
-         find_cur_level(stmt,"tmp2")
-         tile(stmt, level, level)
-         
-         lower,upper = hard_loop_bounds(stmt, level)
-         upper = upper + 1
-         --print(string.format("lower %d  upper %d", lower, upper))
-         
-         tx,ty = thread_dims()
-         --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx)
-         
-         level = find_cur_level(stmt,"tmp1")
-         --print(string.format("level %d", level))
-         
-         if tx == upper and ty == 1 then
-            --print(string.format("tx = %d    upper = %d     ty = %d", tx, upper, ty))
-            --print "Don't need"
-            
-            --Don't need an extra tile level, just move this loop up
-            second_level = find_cur_level(stmt,"tmp2")
-            --print(string.format("\n[Tile0]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx")) 
-            tile(stmt, second_level, 1, level, "tx", "tx", counted)
-         else
-            --print "DO need?"
-            --print_code()
-            if(ty == 1) then new_ctrl = "tmp3" else new_ctrl = "ty" end
-
-
---[[ Commenting out a block of Gabe's code in this control flow
-               -- level = find_cur_level(stmt,"tmp1")
-               tile(stmt, level, level)
-
-               lower,upper = hard_loop_bounds(stmt, level)
-               upper = upper + 1
-               --print_code()
-               --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx..", level: "..level)
-               if(math.ceil(upper/ty) > 1)then
-                  tile(stmt, level, math.ceil(upper/ty), level, "tmp", new_ctrl, counted)
-                  --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level,  math.ceil(upper/ty),upper,ty, level, "tmp", new_ctrl)) 
-               else
-                  tile(stmt, level, math.ceil(upper/ty), level, "ty", new_ctrl, counted)
-		  --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level,  math.ceil(upper/ty),upper,ty, level, "tx", new_ctrl))
-               end
-               
-               --print_code()    
-               -- [Malik] If here we have the loop upper bound > tx, then we should tile once more after the next tile, to carve out the correct tx. 
-               lower1,upper1 = hard_loop_bounds(stmt,level)
-               level1 = level
-               stmt1 = stmt
-               -- [Malik] Do the tile after the second level tile with if condition. Just to keep the original order, the tile is being pushed to the end. 
-               
-               --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1)
-
-               --print_code()
-               --level = find_cur_level(stmt,"tmp")
-               --tile(stmt,level,level)
-               --print_code() 
-               
-               --[Malik] if you are moving the loop above the level1, you need to update level1 with new position which would be level1+2 or second_level
-               if(level <= level1) then level1 = level1+2 end
- 	       --print(string.format("\n[Tile2]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx")) 
-               --print("\n----------------------------------")
-               --print_code()
-               --print("\n**********************************")
-               --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1)
-               -- [Malik] If the upper bound > tx, we do another tile to carve out the correct tx from a bigger loop. Else just normalize the bounds. 
-               if( upper1 > ty) then
-                  third_level = find_cur_level(stmt1,"tmp")
-                  --print("\n\n\n\t\t\t\tthirdlevel:"..third_level)
-                  tile(stmt1, third_level, ty, third_level, "ty", "tmp", counted)
-                  --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt1, third_level, ty,third_level, "ty", "tmp"))
-                  tile(stmt1,third_level+1,third_level+1)
-                  --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level+1))
-                  tile(stmt1,third_level+1,third_level)
-                  --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level))
-               else
-                  tile(stmt1,level1,level1)
-                  --print(string.format("\n[Tile3ELSE]tile(%d, %d, %d)",stmt1,level1,level1))
-               end
-               
-               --print("\nStarting tmp2\n");--print_code();
-               second_level = find_cur_level(stmt,"tmp2")
-               lower,upper = hard_loop_bounds(stmt,second_level)
-               level = second_level
-               --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..", level:"..level)
-               
-               if(math.ceil(upper/tx) > 1)then
-                  tile(stmt, second_level,math.ceil(upper/tx), level, "tmp", "tx", counted)
-                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tmp", "tx"))
-               else
-                  tile(stmt, second_level,math.ceil(upper/tx), level, "tx", "tx", counted)
-                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tx", "tx"))
-               end
-               --print_code()
-               lower2,upper2 = hard_loop_bounds(stmt,level)
-               level2 = level
-               stmt2 = stmt
-               --print("[Malik]-loop cleanup@tmp2: lower2, upper2: "..lower2..", "..upper2..", tx: "..tx..", level:"..level2)
-               -- now for the second level.
-               if( upper2 > tx) then
-                  forth_level = find_cur_level(stmt2,"tmp")
-                  --print("\n\n\n\t\t\t\tforthlevel:"..forth_level)
-                  --print_code()
-                  tile(stmt2, forth_level, 1, forth_level, "tx", "tmp", counted)
-                  --print(string.format("\n[Tile3B]tile(%d, %d, %d,%d,%s,%s,counted)",stmt2, forth_level, tx,forth_level, "ty", "tmp"))
-                  --print_code()
-                  --tile(stmt2,forth_level+1,forth_level+1)
-                  --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level+1))
-                  --tile(stmt2,forth_level+1,forth_level)
-                  --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level))
-               else
-                  new_level = find_cur_level(stmt2,"ty")
-                  tile(stmt2,level2,1,new_level,"tx","tx",counted)
-                  --print(string.format("\n[Tile3BELSE]tile(%d, %d, %d)",stmt2,level2,level2))
-                  tmp_level = find_cur_level(stmt2,"tmp")
-                  tile(stmt2,tmp_level,tmp_level)
-               end
-               
-               --print_code()
-               --print("\n----------------------------------")
---]]
-               
-               --print_code() 
-               --print("\nStarting tmp2\n");--print_code();
-               first_level = find_cur_level(stmt,"tmp1")
-               second_level = find_cur_level(stmt,"tmp2")
-               lower,upper = hard_loop_bounds(stmt,second_level)
-               
-               --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..",first level:"..first_level..",second_level:"..second_level)
-               
-               -- Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
-               --print(string.format("\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,1,first_level, "tx", "tx"))
-               tile(stmt,second_level,1,first_level,"tx","tx",counted)
-               --print_code()
-               
-               first_level = find_cur_level(stmt,"tmp1")
-               lower_1,upper_1 = hard_loop_bounds(stmt,first_level)
-               tx_level = find_cur_level(stmt,"tx")
-               lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level)
-               --print(string.format("UL_1 %d %d     UL_tx %d %d", lower_1, upper_1, lower_tx, upper_tx))
-               
-               if(math.ceil(upper_tx/tx) > 1)then
-                  --print "ceil I say"
-                  --print(string.format("\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,tx,tx_level, "tx", "tmp1"))
-                  tile(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)
-                  --print_code()
-                  
-                  peat = find_cur_level(stmt,"tx")
-                  --print(string.format("\n[Tile1]tile(%d, %d, %d)",stmt, peat, peat))
-                  tile(stmt, peat, peat )  --find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))
-                  --print_code()
-                  
-                  if (find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx")) then
-                     --print(string.format("\nagain [Tile1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")))
-                     tile(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
-                     --print_code()
-                  end
-                  --else
-                  --tile(stmt, tx_level,1, tx_level, "tx", "tx", counted)
-                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,1,tx_level, "tx", "tx"))
-               end
-               --print_code()
-               --]]  -- this apparently is NOT the end of a block comment
-               
-               --print("\nStarting tmp1\n")
-               -- Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
-               tile(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))     
-               --print_code()  
-               
-               ty_level = find_cur_level(stmt,"tmp1")
-               lower_ty,upper_ty = hard_loop_bounds(stmt,ty_level)
-               
-               tx_level = find_cur_level(stmt,"tx")
-               lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level)
-               --print("[Malik]-loop cleanup@tmp1: lowerty, upperty: "..lower_ty..", "..upper_ty..", ty: "..ty..",ty level:"..ty_level..",tx_level:"..tx_level..", stmt: "..stmt)
-               
-               --print "before ceil"
-               if(math.ceil(upper_ty/ty) > 1)then
-                  --print "CEIL IF"
-                  --print("\n Inside upper_ty/ty > 1\n");
-                  
-                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,ty,ty_level, "ty", "tmp_ty"))
-                  tile(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
-                  --print_code()
-                  
-                  --print(string.format("\n[Tile2-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt  ,"ty"),find_cur_level(stmt,"ty")))
-                  tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))
-                  --print_code()
-                  
-                  -----------------------------------------------------------------------
-                  ----------------------------------------------------------------------
-                  cur_idxs = cur_indices(stmt)
-                  --print("\n cur indexes are "..list_to_string(cur_idxs))
-                  
-                  -- Putting ty before any tmp_tx   
-                  idx_flag = -1
-                  for num= 0,table.getn(cur_idxs) do
-                     if(cur[num] == "tmp_tx") then
-                        idx_flag = find_cur_level(stmt,cur[num])
-                        break
-                     end
-                  end
-                  --print(string.format("\n (1) so i have found out the value of idx flag as %d",idx_flag) )
-                  
-                  if(idx_flag >=0 ) then  
-                     if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then
-                        --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
-                        tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                        --print_code()
-                     end
-                  end
-                  
-                  -- Now Putting ty before any tmp_ty
-                  idx_flag = -1
-                  for num= 0,table.getn(cur_idxs) do
-                     if(cur[num] == "tmp_ty") then
-                        idx_flag = find_cur_level(stmt,cur[num])
-                        break
-                     end
-                  end
-		  --print(string.format("\n IF  so i have found out the value of idx flag as %d",idx_flag) )
-                  if(idx_flag >=0 ) then  
-                     --print "one more test"
-                     if ((find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"))) then
-                        --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
-                        tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                        --print_code()
-                     end
-                  end
-               else
-                  --print "CEIL ELSE"
-                  --cur_idxs = cur_indices(stmt)
-                  --print("\n Inside upper_ty/ty <= 1\n");
-                  
-                  --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,1,ty_level, "ty", "ty"))
-                  tile(stmt, ty_level,1, ty_level, "ty", "ty", counted)
-                  --print_code()
-                  
-                  --print(string.format("\n[Tile3-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1))
-                  tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
-                  --print_code()
-                  
-                  idx_flag = -1
-                  if(cur_idxs) then
-                     --print "CAN NEVER GET HERE?  cur_idxs"
-                     for num= 0,table.getn(cur_idxs) do
-                        if(cur[num] == "tmp_ty") then
-                           idx_flag = find_cur_level(stmt,cur[num])
-                           break
-                        end
-                     end
-                  end
-                  --print(string.format("\n ELSE so i have found out the value of idx flag as %d",idx_flag) )
-                  if(idx_flag >=0 ) then  
-                     if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then
-                        --print(string.format("tile( stmt %d, level ty %d, level ty %d",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) 
-                        tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                        --print(string.format("\n[Tile3-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
-                     end
-                  end
-               end
-               
-               --print_code()
-         end
-         
-         
-         --print "\n\n *** at bottom of if in copy to shared, "
-         --print_code()
-         --print "end of if"
-         
-      else
-         --copy to shared only created one level, not two, so we use a different approach (MV & TMV)
-         --print("\nCopy to shared: [If was error]\n")
-         level = find_cur_level(stmt,"tmp1")
-         tile(stmt, level, level)
-         
-         --print(string.format("\n[Tile]tile(%d, %d, %d)",stmt, level, level)) 
-         tx,ty = thread_dims()
-         lower,upper = hard_loop_bounds(stmt, level)
-         upper = upper+1 --upper bound given as <=, compare to dimensions tx which is <
-         --print("upper "..upper.." tx "..tx)
-         if upper == tx then
-            rename_index(stmt, "tmp1", "tx")
-         else
-            --print("upper is not tx")
-            --TODO: Don't know, maybe do some tileing etc
-            --print_code()
-            --print("upper "..upper.." tx "..tx.." stmt: "..stmt.." level: "..level)
-            tile(stmt, level,tx,level, "tx", "tmp_tx", counted)
-            --print_code()
-            
-            --print("stmt:"..stmt.." level+1: "..level+1)
-            --print("TILE 7")
-            tile(stmt, level+1,1,level+1,"tx", "tx",counted)
-            --print("TILE 3")
-            tile(stmt,level+1,level)
-            --print_code()
-            
-            if(ty > 1) then
-               --print_code()
-               --print("GOING IN")
-               lower,upper = hard_loop_bounds(stmt, level+1)
-               --print(string.format("ty %d  lower %d  upper %d", ty, lower, upper))
-               --upper=125
-               --print("NOW FOR Y: upper "..upper.." ty "..ty.." stmt: "..stmt.." level: "..(level+1).." bound:"..math.ceil(upper/ty))
-               tile(stmt, level+1,math.ceil(upper/ty),level+1, "tmp_ty", "ty", counted)
-               --tile(stmt, level+2,math.ceil(upper/ty),level+2, "tmp_ty", "ty", counted)
-            end
-            --print_code()
-            --rename_index(stmt, "tmp1", "tx")
-            --print("Warning: Need to implement some logic here to tile the single level shared copy loop to match thread dimensions")
-         end
-      end
-      --Always add sync
-      add_sync(stmt,start_loop)
-      
-   end
-   --print("ending copy to shared\n")
-   --print_code()
-end
-
-function unroll_to_depth(max_depth)
-   --print(string.format("\n\nunroll_to_depth(%d)", max_depth ))
-   --print "SYNC UP"
-   
-   cur = cur_indices(0)
-   thread_idxs = thread_indices()
-   guard_idx = thread_idxs[#thread_idxs]
-   
-   --print(string.format("cur    indices %s",list_to_string(cur)))
-   --print(string.format("thread indices %s",list_to_string(thread_idxs)))
-   --print(string.format("#thread_idxs = %d", #thread_idxs))
-   --print(string.format("guard_idx = %s", guard_idx))
-   
-   ---- HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS   
-   common_loops = {}
-   comm_loops_cnt = 0
-   num_stmts = num_statements()
-   --print(string.format("num statements %d", num_stmts))
-   
-   for stmt=0,num_stmts-1 do
-      cur_idxs = cur_indices(stmt)
-      
-      --print(string.format("\nSTMT %d Current Indices: %s",stmt,list_to_string(cur_idxs)))
-      
-      if(chk_cur_level(stmt,"tx")>0) then
-         for ii=1,find_cur_level(stmt,"tx")-1 do    -- started at 0
-            --print(string.format("ii = %d", ii)) -- index starts at 1, what does index 0 do?
-            --if cur_idxs[ii] == nil then print "cur_idxs[i]] is NIL" 
-            --else print(string.format("cur_idxs[%d] = '%s'", ii, cur_idxs[ii])) -- index starts at 1, what does index 0 do?
-            --end
-            
-            if(cur_idxs[ii] ~= "bx" and cur_idxs[ii] ~= "by" and cur_idxs[ii] ~= nil and cur_idxs[ii] ~= "tx" and cur_idxs[ii] ~= "ty" and cur_idxs[ii] ~= "") then 
-               
-               --print(string.format("id %s is not in the list", cur_idxs[ii] ))
-               
-               for stmt1=stmt+1,num_stmts-1 do
-                  --print(string.format("\nii %d stmt1 is %d", ii, stmt1))          
-                  cur_idxs1 = cur_indices(stmt1)
-                  --print("\nstmt1 cur_idxs1 is "..list_to_string(cur_idxs1))   
-                  
-                  --print(string.format("cur level(%d, %s) = %d", stmt, "tx",  find_cur_level(stmt,"tx")))    
-                  
-                  endrange = find_cur_level(stmt,"tx")-1
-                  --print(string.format("for iii=1, %d do", endrange))
-                  
-                  for iii=1,find_cur_level(stmt,"tx")-1 do  -- started at 0
-                     --print(string.format("stmt %d   ii %d   iii %d ", stmt, ii, iii))
-                     --if(cur_idxs1[iii] ~= nil) then 
-                     --   print(string.format("stmt %d   ii %d   iii %d  cur_idxs1[%d] = '%s'", stmt, ii, iii, iii, cur_idxs1[iii]))  
-                     --else 
-                     --   print(string.format("stmt %d   ii %d   iii %d  cur_idxs1[%d] = NIL", stmt, ii, iii, iii))  
-                     --end
-                     
-                     if(cur_idxs1[iii] ~= "bx" and cur_idxs1[iii] ~= "by" and cur_idxs1[iii] ~= nil and cur_idxs1[iii] ~= "tx" and cur_idxs1[iii] ~= "ty" and cur_idxs1[iii] ~= "") then  
-                        if(cur_idxs[ii] == cur_idxs1[iii]) then
-                           --print("\nfound idx:"..cur_idxs[ii])
-			   --if(comm_loops_cnt == 0) then print "\n\n*** WARNING *** assigning to array index ZERO in Lua" end
-                           common_loops[comm_loops_cnt] = cur_idxs[ii]
-                           --print(string.format("cl[%d] = '%s'", comm_loops_cnt,   common_loops[comm_loops_cnt]))
-                           comm_loops_cnt = comm_loops_cnt + 1
-                        end
-                     end  
-                  end
-               end  
-            end
-         end
-      end
-   end
-   ----
-   --if(comm_loops_cnt>0) then 
-   --   print("\n COMM LOOPS :TOTAL "..comm_loops_cnt..", and are "..list_to_string(common_loops).." this loop :"..common_loops[0])
-   --else
-   --   print "UNROLL can't unroll any loops?"
-   --end
-   
-   
-   
-   
-   repeat
-      old_num_stmts = num_statements()
-      --print(string.format("old_num_statements %d", old_num_stmts))
-      
-      for stmt=0,old_num_stmts-1 do
-         cur_idxs = cur_indices(stmt)
-         --print(string.format("stmt %d    cur_idxs = %s", stmt, list_to_string(cur_idxs)))
-         if(#cur_idxs > 0) then 
-            gaurd_level = -1
-            if(chk_cur_level(stmt,guard_idx)>0) then
-               gaurd_level = find_cur_level(stmt,guard_idx)
-            end
-            --print(string.format("guard_level(sp) = %d", gaurd_level))
-            
-            if(gaurd_level>-1) then
-               level = next_clean_level(cur_idxs,gaurd_level)
-               --print(string.format("next clean level %d", level))
-               
-               --need to handle max_depth
-               num_unrolled = 0
-               level_unroll_comm = level
-               level_arr = {}
-               while level >= 0 do
-                  --print(string.format("while: level = %d", level))
-                  
-                  if num_unrolled == max_depth then break end
-                  --print("Unrolling "..stmt.." at level "..(level).." index ".. cur_idxs[gaurd_level+1])
-                  
-                  level_arr[num_unrolled] = level
-                  num_unrolled = num_unrolled + 1
-                  
-                  guard_level = find_cur_level(stmt,guard_idx)
-                  level = next_clean_level(cur_idxs,level+1)
-               end
-               --dies print("How many levels for unroll commands"..table.getn(level_arr).." which is "..level_arr[0].." and "..level_arr[#level_arr])
-               --if(table.getn(level_arr) ~= nil) then
-               
-               --print "OK, NOW WE UNROLL"
-               
-               if(level_unroll_comm >= 0)then
-                  for i = table.getn(level_arr),0,-1 do
-                     --print(string.format("\ni=%d", i))
-                     --print(string.format("[Unroll]unroll(%d, %d, 0)",stmt, level_arr[i]))     
-                     
-                     unroll(stmt,level_arr[i],0)
-                     --print("finished unroll]]\n")
-                     --print_code()
-                  end
-               end
-------
-            end    
---[[
-
-THERE WAS A BIG BLOCK OF COMMENTED OUT CODE HERE 
-
-
---]]
-------
-         end
-      end
-      new_num_stmts = num_statements()
-
-   until old_num_stmts == new_num_stmts
-
-end
-
-
diff --git a/examples/cuda-chill/cudaize.py b/examples/cuda-chill/cudaize.py
deleted file mode 100755
index ffef009..0000000
--- a/examples/cuda-chill/cudaize.py
+++ /dev/null
@@ -1,1047 +0,0 @@
-#! /usr/bin/python
-
-# THIS IS CUDAIZE.PY
-
-import chill
-import sys
-import math 
-
-strided = 0
-counted = 1
-
-def print_code():
-    chill.print_code()
-    print ""
-    sys.stdout.flush()
-
-    
-def table_contains_key( table, key ):  # use a dict for the 'table'?
-    return table.has_key(key) # (key in table)?
-
-def print_array( arr ):  # a useful function to mimic lua output 
-    for a in arr[:-1]:
-        print "%s," % a,
-    print "%s" % arr[-1]
-    sys.stdout.flush()
-
-def valid_indices( statement, indices ):
-    #print "valid_indices() python calling C cur_indices"
-    #print statement
-    cur = chill.cur_indices(statement) # calls C
-    #print "python valid_indices(), cur = ",
-    #print cur
-    #print "indices = ",
-    #print indices
-
-    for index in indices:
-        if not index in cur:
-            return False
-    return True
-
-def next_clean_level( indices_at_each_level, level):
-    #print "next_clean_level( ..., %d )" % level 
-    #print "indices_at_each_level ",
-    print_array( indices_at_each_level )
-
-    numlevels = len(indices_at_each_level)
-    #print "loop to %d" % numlevels
-    for i in range(level+1, numlevels+1):
-        pythoni = i-1 # LUA index starts at 1
-        #print "Checking level %d = '%s'" % (i, indices_at_each_level[pythoni])
-        sys.stdout.flush()
-        if len(indices_at_each_level[pythoni]) > 0: # LUA INDEX STARTS AT 1
-            #print "returning %d" % i
-            return i  # MATCH lua return value, LUA index starts at one
-    return -1  # no non-dummy indices
-
-
-
-
-def build_order(  final_order, tile_index_names, control_index_names, tile_index_map, current_level):
-    order = []   
-    #print "\nbuild_order()"
-    #print "build_order(): final_order = (",
-    count = 0
-    for f in final_order:
-        #if count+1 == len(final_order):
-        #    print "%s )" % f
-        #else:
-        #    print "%s," % f ,
-        count += 1
-
-        keys = control_index_names.keys()
-        keys.sort()
-        #if (2 == len(keys)):
-        #    print "build_order(): ctrl_idx_names = (%s, %s)" % (control_index_names[0], control_index_names[1])
-        #else:
-        #    print "build_order(): ctrl_idx_names = (%s" % control_index_names[0],
-        #    for k in keys[1:]:
-        #        print ", %s" % control_index_names[k],
-        #    print ")"
-
-    #print control_index_names
-    #print "cur_level %d" % current_level
-    
-    #print "tile index map: ",
-    #print tile_index_map
-
-
-    for i in range(len(final_order)):
-        k = final_order[i]  # not used?
-        skip = False
-        cur = final_order[i]  
-        # control loops below our current level should not be in the current order
-
-        # skip = cur in control_index_names[current_level+2:] 
-        #print "\n%d control_index_names, " % len(control_index_names)
-        #print control_index_names
-
-        for j in range(current_level+1, len(control_index_names)):
-            #print "comparing cur %s with cin[%d] %s" % ( cur, j, control_index_names[j])
-            if control_index_names[j] == cur:
-                skip = True 
-                #print "SKIP %s  " % cur
-
-        # possibly substitute tile indices if necessary
-        if tile_index_map.has_key(cur):
-            approved_sub = False
-            sub_string = tile_index_map[cur]
-            #print "sub_string = ",
-            #print sub_string
-
-            # approved_sub = sub_string in tile_index_names[current_level+2:]
-            for j in range(current_level+1, len(tile_index_names)):
-                if tile_index_names[j] == sub_string:
-                    approved_sub = True
-            if approved_sub:
-                cur = sub_string
-
-        if not skip:
-            order.append( cur)  
-    #print "build_order() returning order (",
-    #print order
-    #for o in order:
-    #    print "%s," % o,
-    #print ")"
-    return order
-
-def find_cur_level( stmt, idx ):
-    #print "find_cur_level(stmt %d, idx %s)  Cur indices" % ( stmt, idx ),
-    
-    cur = chill.cur_indices(stmt)
-    #for c in cur[:-1]:
-    #    print "%s," % c,
-    #print "%s" % cur[ -1 ] 
-
-    index = 1 # lua starts indices at 1 !!  
-    for c in cur:
-        if c == idx:
-            #print "found it at index %d" % index
-            #sys.stdout.flush()
-            #print "in find_cur_level, returning ",
-            #print index
-            return index
-        index += 1
-    #print "find_cur_level(), Unable to find index %s in" % idx,
-    #print cur
-    #print "in find_cur_level, returning -1"
-    return -1  # special meaning "it's not there"
-
-def chk_cur_level( stmt, idx ):
-    # search cur_indices for a ind at stmt
-    cur = chill.cur_indices(stmt)
-    if idx in cur:
-       return 1 + cur.index(idx)  # lua index starts at 1 !
-    return -1
-
-def find_offset( cur_order, tile, control):
-    #print "Looking for tile '%s' and control '%s' in (" % (tile, control),
-    #print cur_order
-    #for o in cur_order:
-    #    print "%s," % o,
-    #print ")"
-
-    idx1 = -1
-    idx2 = -1
-    if tile in cur_order: 
-        idx1 = 1 + cur_order.index(tile) # lua indexes from 1!
-    else:
-        print "find_offset(), unable to find tile %s in current list of indices" % tile
-        sys.exit(-1)
-
-    if control in cur_order:
-        idx2 = 1 + cur_order.index(control) # lua indexes from 1!
-    else:
-        print "find_offset(), unable to find control %s in current list of indices" % control
-        sys.exit(-1)
-
-    #print "found at level %d and %d" % ( idx2, idx1 )
-    # this appears horrible
-    if idx2 < idx1:
-        return idx2-idx1+1 # bad ordering
-    else:
-        return idx2-idx1
-
-
-
-def tile_by_index( tile_indices, sizes, index_names, final_order, tile_method):
-    #print "STARTING TILE BY INDEX"
-    #print "tile_by_index() tile_method ",
-    #print tile_method
-    #print "index_names: ",
-    #print index_names
-
-    stmt = 0 # assume statement 0
-    if not valid_indices( stmt, tile_indices):
-        print "python tile_by_index() one or more of ",
-        print tile_indices,
-        print " is not valid"
-        sys.exit(-1)
-
-    if tile_method == None:
-        #print "CREATING tile_method = 1"
-        tile_method = 1 # "counted"
-
-    tile_index_names = []
-    for ti in tile_indices:
-        tile_index_names.append( ti )  # make a copy? 
-    #print "tile_index_names:",
-    #print tile_index_names
-
-    control_index_names = {} # a dictionary?
-    tile_index_map =  {}
-    
-    #print "index_names: "
-    #print index_names
-
-    for pair in index_names:
-        valid = False
-        control = pair[0]
-        name    = pair[1]
-        #print "control %s   name  %s" % ( control, name )
-        
-        if control[0] == "l" and control[1].isdigit():
-            if control.endswith("_control"):
-                index = int(control[1: -8])
-                control_index_names[index-1] = name
-                valid = True
-
-            elif control.endswith("_tile"):
-                index = int(control[1: -5])
-                #print "index %d" % index
-                tile_index_names[index-1] = name # ?? 
-                tile_index_map[name] = tile_indices[index-1]
-                valid = True
-        if not valid:
-            print "%s is not a proper key for specifying tile or control loop indices\n" % control
-
-    #print "control_index_names = ",
-    #print control_index_names
-
-    #print "tile_index_names = ",
-    #print tile_index_names
-
-    #print "before call to build_order(), tile_index_map = ",
-    #print tile_index_map
-
-
-    # filter out control indices (and do name substitution of unprocessed tile indices) for a given level
-    cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1)
-
-    #print "returned from build_order python\n\n"
-
-    # print("permute("..stmt..", {"..list_to_string(cur_order).."})")
-    #print "permute(%d, {" % stmt,
-    #print "cur_order = ",
-    #print cur_order,
-    #print "})"
-
-    cur_order.insert(0, stmt)
-    #print cur_order
-    chill.permute( tuple( cur_order)) 
-    #print "in cudaize.py, returned from C code chill.permute()\n"
-
-    for i in range(len(tile_indices)):
-        cur_idx = tile_indices[i]
-        #print "i %d  cur_idx %s calling build order ********" % (i, cur_idx)
-        cur_order = build_order( final_order, tile_indices, control_index_names, tile_index_map, i)
-        #print "cur_idx %s return from build order" % cur_idx
-        
-        # Find an offset between tile loop and control loop
-        #  0   = control loop one level above tile loop
-        #  -1  = control loop two levels above tile loop
-        #  > 0 = tile loop above control loop
-        #  In the last case, we do two extra tile commands to get the control
-        #  above the tile and then rely on the final permute to handle the
-        #  rest
-        level = find_cur_level(stmt,cur_idx)
-        #print "level %d\n" % level     
-
-        offset = find_offset(cur_order, tile_index_names[i], control_index_names[i])
-        #print "offset %d" % offset
-
-        if offset <= 0:
-            #print "[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method  )
-            chill.tile7( stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method  )
-            #print "in cudaize.py, returned from C code chill.tile7\n"
-
-        else:
-            #print "2tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method  )
-            chill.tile7( stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method  ) # regular level
-
-            # flip and tile control loop
-            #print "3tile(%d, %d, %d)" % ( stmt, level+1, level+1)
-            chill.tile3( stmt, level+1, level+1)
-
-            #print "4tile(%d, %d, %d)" % ( stmt, level+1, level)
-            chill.tile3( stmt, level+1, level)
-
-            #print_code()
-
-        # Do permutation based on cur_order
-        #print("permute based on build order calling build_order()")
-        cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i)
-
-        #print("permute based on build order return from build_order()")
-
-        #  print("permute("..stmt..", {"..list_to_string(cur_order).."})")
-        topermute = cur_order
-        topermute.insert(0, stmt)
-        chill.permute( tuple(topermute) ) 
-        #print "\nafter permute(), code is:"
-        #print_code()
-
-def normalize_index( index ):
-    #print "in cudaize.py, normalize_index( %s )" % index
-    stmt = 0  # assume stmt 0
-    l = find_cur_level( stmt, index )
-    chill.tile3( stmt, l, l )
-
-def is_in_indices( stmt, idx):
-    cur = chill.cur_indices(stmt)
-    return idx in cur
-
-def copy_to_registers( start_loop, array_name ):
-    #print "\n\n****** starting copy to registers"
-    #sys.stdout.flush()
-
-    stmt = 0    # assume stmt 0
-    cur = chill.cur_indices(stmt) # calls C    
-    table_Size = len(cur)
-
-    #print "Cur indices",
-    #print_array(cur)
-    #print "\nThe table size is %d" % table_Size
-    #count=1
-    #for c in cur:
-    #    print "%d\t%s" % (count,c)
-    #    count += 1
-
-    #print_code()
-
-    # would be much cleaner if not translating this code from lua!
-    level_tx = -1
-    level_ty = -1   
-    if is_in_indices(stmt,"tx"):
-        level_tx = find_cur_level(stmt,"tx")
-    if is_in_indices(stmt,"ty"):
-        level_ty = find_cur_level(stmt,"ty")
-    #print "level_tx %d  level_ty %d" % ( level_tx, level_ty )
-    #sys.stdout.flush()
-
-    ty_lookup_idx = "" 
-    org_level_ty = level_ty
-
-    # UGLY logic. Lua index starts at 1, so all tests etc here are off by 1 from the lua code
-    # level_ty initializes to -1 , which is not a valid index, and so there is added code to 
-    # make it not try to acccess offset -1.   -1 IS a valid python array index
-    # to top it off, the else below can assign a NIL to ty_lookup_idx! 
-    if level_ty != -1 and cur[level_ty] != "":
-        #print "IF  cur[%d] = %s" % ( level_ty, cur[level_ty] )
-        ty_lookup_idx = cur[level_ty] 
-    else:
-        #print "ELSE ty_lookup_idx = cur[%d] = %s" % ( level_ty, cur[level_ty-1]) 
-        ty_lookup_idx = cur[level_ty-1] 
-    #print "ty_lookup_idx '%s'" % ty_lookup_idx
-
-    if level_ty > -1:
-        #print "\ntile3(%d,%d,%d)" % (stmt,level_ty,level_tx+1)
-        chill.tile3(stmt,level_ty,level_tx+1) 
-    #print_code()   
-
-    cur = chill.cur_indices(stmt) # calls C 
-    table_Size = len(cur)
-    #print "Cur indices ",
-    #for c in cur:
-    #    print "%s," % c,
-    #print "\nThe table size is %d" % len(cur)
-    #count=1
-    #for c in cur:
-    #    print "%d\t%s" % (count,c)
-    #    count += 1
-    #sys.stdout.flush()
-
-    if is_in_indices(stmt,"tx"):
-        level_tx = find_cur_level(stmt,"tx")
-    if ty_lookup_idx != "":                      # perhaps incorrect test 
-        if is_in_indices(stmt,ty_lookup_idx):
-           level_ty = find_cur_level(stmt,ty_lookup_idx)
-           
-    ty_lookup = 1
-    idx_flag = -1
-    # find the level of the next valid index after ty+1
-    #print "\nlevel_ty %d" % level_ty
-    if level_ty > -1:
-       #print "table_Size %d" % table_Size
-       for num in range(-1 + level_ty+ty_lookup,table_Size):   # ??  off by one?
-           #print "num=%d   cur[num] = '%s'" % (num+1, cur[num]) # num+1 is lua index ????
-           sys.stdout.flush()
-           if cur[num] != "":
-               idx_flag = find_cur_level(stmt,cur[num])
-               #print "idx_flag = %d" % idx_flag
-               break
-               
-    #print "\n(first) I am checking all indexes after ty+1 %s" % idx_flag
-    #print_code()   
-    #print "" 
-
-    how_many_levels = 1
-    
-    #print "idx_flag = %d   I will check levels starting with %d" % (idx_flag, idx_flag+1)
-    # lua arrays start at index 1. the next loop in lua starts at offset 0, since idx_flag can be -1
-    # thus the check for "not equal nil" in lua (bad idea)
-    # python arrays start at 0, so will check for things that lua doesn't (?)
-    startat = idx_flag + 1
-    if idx_flag == -1:
-        startat = 1  # pretend we're lua for now.   TODO: fix the logic
-
-    for ch_lev in range(startat,table_Size+1):       # logic may be wrong (off by one)
-        #print "ch_lev %d" % ch_lev
-        if ch_lev <= table_Size and cur[ch_lev-1] != "":
-           #print "cur[%d] = '%s'" % ( ch_lev, cur[ch_lev-1] )
-           how_many_levels += 1
-
-    #print "\nHow Many Levels %d" % how_many_levels
-    sys.stdout.flush()
-    sys.stdout.flush()
-
-    if how_many_levels< 2:
-        while( idx_flag >= 0):
-            for num in range(level_ty+ty_lookup,table_Size+1):
-                #print "at top of loop, num is %d" % num
-                #print "cur[num] = '%s'" % cur[num-1]
-                if cur[num-1] != "":
-                    idx = cur[num-1]
-                    #print "idx '%s'" % idx
-                    sys.stdout.flush()
-                    curlev = find_cur_level(stmt,idx)
-                    #print "curlev %d" % curlev
-
-                    #print "\n[COPYTOREG]tile(%d,%d,%d)"%(stmt,curlev,level_tx)
-
-                    chill.tile3(stmt, curlev, curlev)
-                    curlev = find_cur_level(stmt,idx)
-                    #print "curlev %d" % curlev
-                    chill.tile3(stmt,curlev,level_tx)
-                    #print "hehe '%s'" % cur[num-1]
-                    
-                    cur = chill.cur_indices(stmt)
-                    #print "Cur indices INSIDE",
-                    #for c in cur:
-                    #    print "%s," % c,
-                    table_Size = len(cur)
-                    #print "\nTable Size is: %d" % len(cur)
-
-                    level_tx = find_cur_level(stmt,"tx")
-                    #print "\n level TX is: %d" % level_tx
-                    level_ty = find_cur_level(stmt,ty_lookup_idx)
-                    #print "\n level TY is: %d" %level_ty
-                    idx_flag = -1
-                    #print "idx_flag = -1"
-
-
-                    #- find the level of the next valid index after ty+1
-                    #- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?)
-                    for num2 in range( -1 + level_ty+ty_lookup ,table_Size): # lua starts index at one
-                        #print "num mucking num = %d" % num2
-                        if(cur[num2] != ""):
-                            #print "cur[%d] = '%s'" % ( num2, cur[num2] )
-                            idx_flag = find_cur_level(stmt,cur[num2])
-                            #print("\n(second) I am checking all indexes after ty+1 %s",cur[num2])
-                            break
-
-                    #print "num mucked to %d     idx_flag = %d" % (num, idx_flag)
-
-                #print "at bottom of loop, num is %d" % num
-          
-    #print "done with levels"
-
-    # this was a block comment ???
-
-#    for num in range(level_ty+1, table_Size+1):
-#        print "num %d" % num
-#        if cur[num-1] != "":
-#            idx_flag = find_cur_level(stmt,cur[num-1])  ## ugly 
-#    print "idx_flag = %d" % idx_flag
-
-    # change this all to reflect the real logic which is to normalize all loops inside the thread loops. 
-#    print "change this all ...\n"
-#    print "level_ty+1 %d  table_Size-1 %d     idx_flag %d" %( level_ty+1, table_Size-1, idx_flag)
-#    sys.stdout.flush()
-#    sys.stdout.flush()
-
-#    while level_ty+1 < (table_Size-1) and idx_flag >= 0:
-#        print "*** level_ty %d" %  level_ty
-#        for num in range(level_ty+2,table_Size+1):  # lua for includes second value
-#            print "num %d   cur[num] %s" % (num, cur[num])
-#            if cur[num] != "":
-#                idx = cur[num]
-#                print "idx='%s'" % idx
-#                #print_code()
-                
-                
-            
-
-    #print "ARE WE SYNCED HERE?"
-    #print_code()
-
-    #  [Malik] end logic
-    start_level = find_cur_level(stmt, start_loop) # start_loop was passed parameter!
-
-    # We should hold constant any block or tile loop
-    block_idxs  = chill.block_indices()
-    thread_idxs = chill.thread_indices()
-    #print"\nblock indices are"
-    #for index, val in enumerate(block_idxs):
-    #    print "%d\t%s" % ( int(index)+1 , val )
-    #print"\nthread indices are"
-    #for index, val in enumerate(thread_idxs):
-    #    print "%d\t%s" % ( int(index)+1 , val )
-    #print "\nStart Level: %d" % start_level
-
-    hold_constant = []
-    #print("\n Now in Blocks")
-    for idx in block_idxs:
-        blocklevel = find_cur_level(stmt,idx)
-        if blocklevel >= start_level:
-           hold_constant.append(idx)
-           #print "\nJust inserted block %s in hold_constant" %idx
-
-    #print("\n Now in Threads")
-    for idx in thread_idxs:
-        blocklevel = find_cur_level(stmt,idx)
-        if blocklevel >= start_level:
-            hold_constant.append(idx)
-            #print "\nJust inserted thread %s in hold_constant" %idx
-    #print "\nhold constant table is: "
-    #for index, val in enumerate(hold_constant):
-    #    print "%d\t%s" % ( int(index)+1 , val )
-    
-    #print("\nbefore datacopy pvt")
-    old_num_stmts = chill.num_statements()
-    #sys.stdout.flush()
-
-    #print "\n[DataCopy]datacopy_privatized(%d, %s, %s, " % (stmt, start_loop, array_name),
-    #print hold_constant,
-    #print ")"
-    passtoC = [stmt, start_loop, array_name ] # a list
-    passtoC.append( len(hold_constant ) )
-    for h in hold_constant:
-        passtoC.append( h )
-    chill.datacopy_privatized( tuple( passtoC ))
-    sys.stdout.flush()
-    sys.stdout.flush()
-    
-    new_num_statements = chill.num_statements()
-    #print "new num statements %d" % new_num_statements    
-
-    # Unroll to the last thread level
-#    for stmt in range(old_num_statements, new_num_statements):
-#        print "unrolling statement %d" % stmt
-#        level = find_cur_level(stmt,thread_idxs[-1]) #get last thread level
-#        print "level is %d" % level
-#        idxs = chill.cur_indices(stmt)
-#        if level < len(idxs):
-#            chill.unroll(stmt,level+1,0)
-
-
-
-def copy_to_shared( start_loop, array_name, alignment ):
-    #print "\nstarting copy to shared( %s, %s, %d)" % (start_loop, array_name, alignment ) 
-    #print "copy_to_shared( %s, %s, %d) in cudaize.py" % ( start_loop, array_name, alignment )
-    stmt = 0 # assume statement 0
-
-    cur = chill.cur_indices(stmt)
-    #print "Cur indices ",
-    #print_array( cur )
-
-    start_level = find_cur_level( stmt, start_loop )
-    #print "start_level %d" % start_level
-
-    old_num_statements = chill.num_statements()
-    #print "old_num_statements %d" % old_num_statements
-    
-
-    # Now, we give it indices for up to two dimensions for copy loop
-    copy_loop_idxs = ["tmp1","tmp2"]
-    #chill.datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs, False, 0, 1, alignment,True)
-    passtoC = [stmt, start_level, array_name]   # a list
-    passtoC.append( len(copy_loop_idxs))
-    for i in copy_loop_idxs:
-        passtoC.append(i)
-    passtoC.append( 0 ) # False
-    passtoC.append( 0 )
-    passtoC.append( 1 )
-    passtoC.append( alignment )
-    passtoC.append( 1 )   # True
-    #print "\n[DataCopy]datacopy( ",
-    #print passtoC,
-    #print ")"
-
-    #if array_name == "b":
-    #    chill.cheat(1)
-    #if array_name == "c":
-    #    chill.cheat(2)
-    
-    chill.datacopy_9arg( tuple( passtoC ))
-
-    #print "back from datacopy_9arg\n\n\n"
-    #sys.stdout.flush()
-
-
-    #print "calling add_sync( %d, %s )" % ( stmt, start_loop )
-    chill.add_sync( stmt, start_loop )
-    #print "back from add_sync()\n\n"
-
-    new_num_statements = chill.num_statements()
-    
-    #  This is fairly CUBLAS2 specific, not sure how well it generalizes,
-    #  but for a 2D copy, what we want to do is "normalize" the first loop
-    #  "tmp1" then get its hard upper bound. We then want to tile it to
-    #  make the control loop of that tile "ty". We then tile "tmp2" with a
-    #  size of 1 and make it "tx".
-
-    #print "fairly CUBLAS2 specific, OLD %d  NEW %d" % ( old_num_statements, new_num_statements)
-    sys.stdout.flush()
-    sys.stdout.flush()
-
-    for stmt in range(old_num_statements, new_num_statements):
-        #print "for stmt = %d" % stmt
-        level = find_cur_level( stmt, "tmp2")
-        #print "FOUND CUR LEVEL?  level '",
-        #print level,
-        #print "'"
-
-        #print "in loop, stmt %d   level %d" % ( stmt, level )
-        if level != -1:
-            #print "\nCopy to shared: [If was no error]\n"
-            find_cur_level(stmt,"tmp2")
-            chill.tile3( stmt, level, level )
-            
-            #print "hard_loop_bounds( %d, %d )" % (stmt, level)
-            bounds = chill.hard_loop_bounds(stmt, level)
-            lower = bounds[0]
-            upper = 1+ bounds[1]
-            #print "lower %d  upper %d" % ( lower, upper )
-
-            dims = chill.thread_dims()
-            #print "in cudaize.py copy_to_shared, dims =",
-            #print dims
-            tx = dims[0]
-            ty = dims[1]
-            #print "2-loop cleanup: lower, upper: %d, %d,  tx: %d" % ( lower, upper, tx)
-
-            level = find_cur_level(stmt,"tmp1")
-            #print "level %d" % level
-            if tx == upper and ty == 1:
-                #print "tx = %d    upper = %d     ty = %d"% (tx, upper, ty)
-                #print "Don't need"
-
-                # Don't need an extra tile level, just move this loop up
-                second_level = find_cur_level(stmt,"tmp2")
-                chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted)
-
-            else:
-                #print "DO need?"
-                if ty == 1:
-                    new_ctrl = "tmp3" 
-                else:
-                    new_ctrl = "ty"
-
-                # LOTS of commented out code here in cudaize.lua 
-
-                #print_code()
-                #print "\nStarting tmp2\n"
-                first_level  = find_cur_level(stmt,"tmp1")
-                second_level = find_cur_level(stmt,"tmp2")
-                bounds = chill.hard_loop_bounds(stmt, second_level)
-                lower = bounds[0]
-                upper = 1 + bounds[1]   # BROKEN?
-                        
-                #print "[Malik]-loop cleanup@tmp2: lower, upper: %d, %d, tx: %d,first level:%d,second_level:%d" % ( lower, upper-1, tx, first_level, second_level) 
-
-                # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
-                #print "\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, second_level,1,first_level, "tx", "tx")
-                chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted)
-                #print_code()
-
-                first_level = find_cur_level(stmt,"tmp1")
-                bounds = chill.hard_loop_bounds(stmt, first_level)
-                lower_1 =     bounds[0]
-                upper_1 = 1 + bounds[1]
-                tx_level = find_cur_level(stmt,"tx")
-                bounds = chill.hard_loop_bounds(stmt,tx_level)
-                lower_tx =   bounds[0]
-                upper_tx = 1+bounds[1]
-                #print "UL_1 %d %d     UL_tx %d %d" % ( lower_1, upper_1-1, lower_tx, upper_tx-1)
-
-                if int(math.ceil( float(upper_tx)/float(tx))) > 1:
-                     #print "ceil I say"
-                     #print "\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, tx_level,tx,tx_level, "tx", "tmp1")
-                     chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)
-                     #print_code()
-
-                     repeat = find_cur_level(stmt,"tx")
-                     #print "\n[Tile1]tile(%d, %d, %d)" % (stmt, repeat, repeat)
-                     chill.tile3(stmt, repeat, repeat)  #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))
-                     #print_code()
-
-                     if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"):
-                        #print "\nagain [Tile1]tile(%d, %d, %d)" % (stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
-                        chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
-                        #print_code()
-
-                #print_code()
-
-                #print "\nStarting tmp1\n"
-                # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
-                chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))      
-                #print_code()
-
-                ty_level = find_cur_level(stmt,"tmp1")
-                bounds = chill.hard_loop_bounds(stmt,ty_level)
-                lower_ty = bounds[0]
-                upper_ty = 1 + bounds[1]
-
-                tx_level = find_cur_level(stmt,"tx")
-                bounds = chill.hard_loop_bounds(stmt,tx_level)
-                lower_tx = bounds[0]
-                upper_tx = 1 + bounds[1]
-
-                #print "[Malik]-loop cleanup@tmp1: lowerty, upperty: %d, %d, ty: %d,ty level:%d,tx_level:%d, stmt: %d" % ( lower_ty, upper_ty-1, ty, ty_level, tx_level, stmt)
-                
-                #print "before ceil"
-                #sys.stdout.flush()
-
-                if(math.ceil(float(upper_ty)/float(ty)) > 1):
-                    #print "CEIL IF"
-                    #print "\n Inside upper_ty/ty > 1\n"
-
-                    #print "\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, ty_level,ty,ty_level, "ty", "tmp_ty")
-                    chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
-                    #print_code()
-
-                    #print "\n[Tile2-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt  ,"ty"),find_cur_level(stmt,"ty"))
-                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))
-                    #print_code()
-
-                    cur_idxs = chill.cur_indices(stmt)
-                    #print "\n cur indexes are ",
-                    #print_array( cur_idxs)
-                    #sys.stdout.flush()
-
-                    # Putting ty before any tmp_tx
-                    idx_flag = -1
-                    if "tmp_tx" in cur_idxs:
-                        idx_flag = 1 + cur_idxs.index("tmp_tx")   # lua index starts at 1
-                    #print "\n (1) so i have found out the value of idx flag as %d" % idx_flag
-                    #sys.stdout.flush()      
-                    
-                    if idx_flag >= 0:
-                         if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"):
-                             #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                             chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                             #print_code()
-                    
-                    
-                    #  Now Putting ty before any tmp_ty
-                    sys.stdout.flush()      
-                    idx_flag = -1
-                    if "tmp_ty" in cur_idxs:
-                        idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1
-                    #print "\n IF  so i have found out the value of idx flag as %d" % idx_flag
-                    #sys.stdout.flush()      
-                                            
-                    if idx_flag >= 0:
-                        #print "one more test"
-                        sys.stdout.flush()
-                        if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"):
-                            #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                            #sys.stdout.flush()
-                            chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                            #print_code()
-
-
-
-                else:
-                    #print "CEIL ELSE"
-                    #print "\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, ty_level,1,ty_level, "ty", "ty")
-                    #sys.stdout.flush()
-                    chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted )
-                    #print_code()
-
-                    #print "\n[Tile3-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
-                    sys.stdout.flush()
-
-                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
-                    #print_code()
-
-
-                    idx_flag = -1
-                    # LUA code checks to see if cur_idxs exists?  it is unused except in the other clause of this is
-                    #if(cur_idxs) then
-                        #print "CAN NEVER GET HERE?  cur_idxs"
-                        #for num= 0,table.getn(cur_idxs) do
-                            #if(cur[num] == "tmp_ty") then
-                            #idx_flag = find_cur_level(stmt,cur[num])
-                            #break
-                        #end
-                    #end
-                    print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag
-                    if idx_flag >= 0:  # can't happen
-                        print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                        #chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-                    
-                        
-                    
-
-                    
-            #print "\n\n *** at bottom of if in copy to shared, "
-            #print_code()
-            #print "end of if"
-
-        else:
-            #  copy to shared only created one level, not two, so we use a different approach (MV & TMV)
-            #print "\nCopy to shared: [If was error]\n"
-            level = find_cur_level(stmt,"tmp1")
-            chill.tile3(stmt, level, level)
-
-            dims = chill.thread_dims()
-            #print dims
-            tx = dims[0]
-            ty = dims[1]
-
-            bounds = chill.hard_loop_bounds(stmt, level)
-            lower = bounds[0]   
-            upper = bounds[1]
-
-            #print "bounds  lower %d    upper %d" % (lower, upper)
-            upper = upper+1 # upper bound given as <=, compare to dimensions tx which is <
-            if upper == tx:
-                #print "upper == tx"
-                chill.rename_index( stmt, "tmp1", "tx")
-            else:
-                #print "upper is not tx"
-                #print "upper %d tx %d stmt: %d level: %d" % ( upper, tx, stmt, level)
-                chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted)
-                #print_code()
-
-                #print "stmt:%d level+1: %d" % ( stmt, level+1) 
-                #print("TILE 7")
-                chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted)
-                #print("TILE 3")
-                chill.tile3( stmt, level+1, level)
-                #print_code()           
-
-
-                if ty > 1:
-                   #print "GOING IN"
-                   bounds = chill.hard_loop_bounds(stmt, level+1)
-                   lower = bounds[0]   
-                   upper = bounds[1]   
-                   #print "ty %d  lower %d  upper %d" % ( ty, lower, upper )
-                   floatdiv = float(upper)/float(ty)
-                   bound =  int(math.ceil(float(upper)/float(ty)))
-                   #print "NOW FOR Y: upper %d ty %d stmt: %d level: %d bound: %d" % ( upper, ty, stmt, level+1,   bound)
-                   chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted)
-
-        # Always add sync
-        chill.add_sync( stmt, start_loop )
-    #print "ending copy to shared\n"
-    #sys.stdout.flush()
-    #print_code()     
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def unroll_to_depth( max_depth ):
-    print "\n\nunroll_to_depth(%d)" % max_depth
-    print "SYNC UP"
-    sys.stdout.flush()
-
-    cur = chill.cur_indices(0)
-    thread_idxs = chill.thread_indices()
-    guard_idx = thread_idxs[-1]  # last one
-
-    print "cur    indices",
-    print_array(cur)
-    print "thread indices", 
-    print_array(thread_idxs)
-    print "guard_idx = %s" % guard_idx
-
-    #print "thread_idxs = ",
-    #print thread_idxs
-    guard_idx = thread_idxs[-1]
-    #print "guard_idx = %s" % guard_idx
-
-    #  HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS
-    common_loops = []
-    comm_loops_cnt = 0
-    num_stmts = chill.num_statements()
-    print "num statements %d" % num_stmts
-
-    for stmt in range(num_stmts):
-        sys.stdout.flush()
-        print "\nSTMT %d" % stmt,
-        cur_idxs = chill.cur_indices(stmt)
-        print "Current Indices:",
-        for c in cur_idxs[:-1]:
-            print "%s," % c,
-        print "%s" % cur_idxs[-1]   # last one
-        sys.stdout.flush()
-        #print_code()
-        
-        if chk_cur_level(stmt, "tx") > 0:
-            
-            for ii in range(find_cur_level(stmt,"tx")-1):
-                print "ii = %d\ncur_idxs[%d] = '%s'" % (ii+1, ii+1, cur_idxs[ii]) # print to match lua
-                id = cur_idxs[ii]
-                if id not in ["bx", "by", "", "tx", "ty"]:
-
-                    print "id %s is not in the list" % id
-
-                    for stmt1 in range(stmt+1, num_stmts):
-                        print "\nii %d stmt1 is %d" % (ii+1, stmt1)  # print to match lua 
-                        cur_idxs1 = chill.cur_indices(stmt1)
-                        print "\nstmt1 cur_idxs1 is ",
-                        for ind in cur_idxs1[:-1]:
-                            print "%s," % ind,
-                        print "%s" % cur_idxs1[-1]
-
-                        print "cur level(%d, %s) = %d" % (stmt, "tx", find_cur_level(stmt,"tx") )
-                        sys.stdout.flush()
-
-                        endrange = find_cur_level(stmt,"tx")-1
-                        print "for iii=1, %d do" % endrange
-                        sys.stdout.flush()
-                        for iii in range(endrange):   # off by one?  TODO 
-                            print "stmt %d   ii %d   iii %d\n" % (stmt, ii+1, iii+1),
-                            sys.stdout.flush()
-                            
-                            if iii >= len(cur_idxs1):
-                                print "stmt %d   ii %d   iii %d  cur_idxs1[%d] = NIL" % (stmt, ii+1, iii+1, iii+1, )  # print to match lua 
-                            else:
-                                print "stmt %d   ii %d   iii %d  cur_idxs1[%d] = '%s'" % (stmt, ii+1, iii+1, iii+1, cur_idxs1[iii])  # print to match lua 
-                            sys.stdout.flush()
-
-                            # this will still probably die 
-                            if iii < len(cur_idxs1) and [iii] not in ["bx", "by", "tx", "ty", ""]:
-                                if cur_idxs[ii] == cur_idxs1[iii]:
-                                    print "\nfound idx:%s" % cur_idxs[ii]
-                                    common_loops.append(cur_idxs[ii])
-                                    print "cl[%d] = '%s'" % ( comm_loops_cnt, cur_idxs[ii] )
-                                    comm_loops_cnt = len(common_loops)
-
-    if len(common_loops) > 0:
-        print "\n COMM LOOPS :TOTAL %d, and are " % comm_loops_cnt,
-        print common_loops, 
-        print " this loop : %s" % common_loops[0]
-    else:
-        print "UNROLL can't unroll any loops?"
-
-
-    while True:  # break at bottom of loop   (repeat in lua)
-        old_num_statements = chill.num_statements()
-        print "old_num_statements %d" % old_num_statements
-
-        for stmt in range(old_num_statements):
-            cur_idxs = chill.cur_indices(stmt)
-            print "stmt %d    cur_idxs =" % stmt,
-            index = 0
-            for i in cur_idxs:
-                index +=1
-                if index == len(cur_idxs):
-                    print "%s" %i
-                else:
-                    print "%s," % i,
-
-            if len(cur_idxs) > 0:
-                guard_level = -1
-                if chk_cur_level(stmt, guard_idx) > 0:
-                    guard_level = find_cur_level(stmt,guard_idx)
-                print "guard_level(sp) = %d" % guard_level
-                if guard_level > -1:
-                    level = next_clean_level(cur_idxs,guard_level)
-                    print "next clean level %d" % level
-
-                    
-                    #print "looking at %d" % stmt
-                    #print "comparing %d and %d in" % (guard_level, level),
-                    #index = 0
-                    #for i in cur_idxs:
-                    #index +=1
-                    #if index == len(cur_idxs):
-                    #    print "%s" %i
-                    #else:
-                    #    print "%s," % i,
-
-                    # need to handle max_depth
-                    num_unrolled = 0
-                    level_unroll_comm = level
-                    level_arr = []
-
-                    #print "before while, level = %d" % level 
-                    while level >= 0:
-                        print "while: level = %d" % level 
-                        if num_unrolled == max_depth:
-                            break
-
-                        print "Unrolling %d at level %d index %s" % ( stmt, level, cur_idxs[guard_level])  # ??? 
-                        level_arr.append(level)
-
-                        guard_level = find_cur_level(stmt,guard_idx)
-                        level = next_clean_level(cur_idxs,level+1)
-
-                    print "OK, NOW WE UNROLL"
-                    if level_unroll_comm >= 0:
-                        level_arr.reverse()  
-                        for i,lev in enumerate(level_arr):
-                            print "\ni=%d" % i
-                            print "[Unroll]unroll(%d, %d, 0)" % (stmt, lev)
-                            chill.unroll(stmt, lev, 0)
-
-
-        new_num_statements = chill.num_statements()
-        if old_num_statements == new_num_statements:
-            break  # exit infinite loop
-
-
-#  all other calls to C have a routine in this file   (?)
-def unroll( statement, level, unroll_amount ):
-    chill.unroll( statement, level, unroll_amount )
-
diff --git a/examples/cuda-chill/mm.c b/examples/cuda-chill/mm.c
deleted file mode 100644
index 0efbeeb..0000000
--- a/examples/cuda-chill/mm.c
+++ /dev/null
@@ -1,10 +0,0 @@
-#define N 1024
-
-void normalMM(float c[N][N], float a[N][N], float b[N][N]) {
-  int i, j, k;
-
-  for (i = 0; i < N; i++)
-    for (j = 0; j < N; j++)
-      for (k = 0; k < N; k++)
-        c[j][i] = c[j][i] + a[k][i] * b[j][k];
-}
diff --git a/examples/cuda-chill/mm.lua b/examples/cuda-chill/mm.lua
deleted file mode 100644
index 5bde1b0..0000000
--- a/examples/cuda-chill/mm.lua
+++ /dev/null
@@ -1,38 +0,0 @@
-init("mm.c", "normalMM", 0)
-dofile("cudaize.lua")
-N=1024
-Ti=128
-Tj=64
-Tk=16
-Tii=16
-Tjj=16
-
-
-
-
-N=1024
-
-
-
-
-
-
-
-
-
-
-
-
-
-tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k"})CU=1
-
-tile_by_index({"k"},{Tk},{l1_control="kk"},{"ii","jj","kk","i","j","k"})CU=3
-
-tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","kk","i","iii","j","jjj","k"},1)CU=2
-
-cudaize("mm_GPU",{a=1048576,b=1048576,c=1048576},{block={"ii","jj"}, thread={"i","j"}})CU=2
-copy_to_shared("tx","a",-16)
-copy_to_shared("tx","b",-16)
-copy_to_registers("kk","c")
---print_code()
-unroll_to_depth(2)
diff --git a/examples/cuda-chill/mpeg4.c b/examples/cuda-chill/mpeg4.c
deleted file mode 100755
index 7f83bf7..0000000
--- a/examples/cuda-chill/mpeg4.c
+++ /dev/null
@@ -1,23 +0,0 @@
-#define N1 4096
-#define N2 4096
-#define WINDOW_SIZE 16
-
-void mpeg4_cpu(float result[N1][N2], float prev[N2+WINDOW_SIZE][N2+WINDOW_SIZE], float  curr[WINDOW_SIZE*WINDOW_SIZE])
-{
-	unsigned int i;
-	unsigned int j;
-	unsigned int k;
-	unsigned int l;
-
-	for ( i = 0; i < N1; ++i)    
-		for ( j = 0; j < N2; ++j) 
-                       for ( k = 0; k < WINDOW_SIZE; ++k) 
-				for ( l = 0; l < WINDOW_SIZE; ++l) 
-					result[i][j] += prev[i+k][j+l] * curr[k*WINDOW_SIZE+l];
-				
-			
-
-		
-	
-}
-
diff --git a/examples/cuda-chill/mpeg4.lua b/examples/cuda-chill/mpeg4.lua
deleted file mode 100644
index f025dc0..0000000
--- a/examples/cuda-chill/mpeg4.lua
+++ /dev/null
@@ -1,45 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("mpeg4.c", "mpeg4_cpu", 0) 
-
---dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods
-
-N=4096
-M=4096
-W=16
-
---TI 4ust be <= M
---TJ must be <=TI
-Ti=32
-Tj=32
-Tii=16
-Tjj=16
-Tk=4
---permute(0,{"j","i","k","l"})
-tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k","l"})
---tile_by_index({"k","l"},{Tk*2,Tk*2},{l1_control="kk",l2_control="ll"},{"ii","jj","kk","ll","i","j","k","l"})
---print_code()
---tile_by_index({"k","l"},{Tk,Tk},{l1_control="kk",l2_control="ll"},{"ii","jj","i","j","kk","k","ll","l"})
-tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","iii","i","jjj","j","k","l"})
---print_code()
---normalize_index("j")
---normalize_index("i")
---print_code()
-cudaize("kernel_GPU",{curr=W*W,prev=(N+W)*(M+W),result=N*M},{block={"ii","jj"}, thread={"i","j"}})
---print_code()
-copy_to_shared("iii","prev",16)
-
-copy_to_registers("jjj","result")
-
---print_code()
---copy_to_constant_no_tile("curr")
-unroll_to_depth(2)
-print_code()
-print_space()
-
-
diff --git a/examples/cuda-chill/mriq-fh.c b/examples/cuda-chill/mriq-fh.c
deleted file mode 100755
index 1e924b7..0000000
--- a/examples/cuda-chill/mriq-fh.c
+++ /dev/null
@@ -1,38 +0,0 @@
-#define X 32768
-#define K 256
-struct kValues {
-  float Kx;
-  float Ky;
-  float Kz;
-  float PhiMag;
-};
-extern float sin(float);
-extern float cos(float);
-
-void mriFH_cpu(float *rPhi,float *rRho,float *iRho, float *iPhi, float *rD, float *iD, float *kx, float *ky, float *kz, float *dx, float *dy, float *dz, float *rFHref, float *iFHref)
-{
-
-    	float rfh;
-	float ifh;
-	float exp;
-	float cArg;
-	float sArg;
-    	//float rRho[K];
-	//float iRho[K];
-        unsigned int k;
-	unsigned int x;
- 
-      
-    for (x = 0; x < X; ++x) {
-        for (k = 0; k < K; ++k) {
-            
-	       exp = 2 * 3.14159 * (kx[k]* dx[x] + ky[k]* dy[x] + kz[k]* dz[x]);
-	       cArg = cos(exp);
-	       sArg = sin(exp);
-            rFHref[x] += rRho[k]* cArg - iRho[k]* sArg;
-            iFHref[x] += iRho[k]*cArg + rRho[k]*sArg;
-        }
-         
-    }
-}
-
diff --git a/examples/cuda-chill/mriq-fh.lua b/examples/cuda-chill/mriq-fh.lua
deleted file mode 100755
index 3277bac..0000000
--- a/examples/cuda-chill/mriq-fh.lua
+++ /dev/null
@@ -1,73 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("mriq-fh.c", "mriFH_cpu", 0) 
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
-                      --copy_to_shared methods
-N=32768
-M=256
-Tx=256
-
-
-print_code()
---permute(0,{"j","i"})
---tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
-tile_by_index({"x"},{Tx},{l1_control="xx"},{"xx","x","k"})
---tile_by_index({"x"},{16},{l1_control="xx1"},{"xx","x","xx1","k"})
---tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
-print_code()
-
-normalize_index("x")
---normalize_index("i")
-print_code()
---tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
---print_code()
---cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
-cudaize("kernel_GPU",{dx=N,dy=N,dz=N,iRho=M,kx=M,ky=M,kz=M,rFHref=N,iFHref=N,rRho=M},{block={"xx"}, thread={"x"}})
---copy_to_shared("tx","iRho",-16)
---copy_to_shared("tx","dz",1)
---copy_to_shared("tx","rRho",-16)
---copy_to_registers("tx","rFHref")
---copy_to_registers("tx","rRho")
---copy_to_registers("tx","iRho")
---copy_to_registers("tx","kx")
---copy_to_registers("tx","dx")
---copy_to_registers("tx","ky")
---copy_to_registers("tx","dy")
---copy_to_registers("tx","kz")
---copy_to_registers("tx","dz")
---copy_to_registers("tx","iFHref")
---copy_to_texture("rRho")
---copy_to_texture("kx")
---copy_to_texture("dx")
---copy_to_texture("ky")
---copy_to_texture("dy")
---copy_to_texture("kz")
---copy_to_texture("dz")
---copy_to_texture("iRho")
---print_code()--]]
---unroll(0,4,0)
---copy_to_constant_no_tile("kx")
---copy_to_constant_no_tile("ky")
---copy_to_constant_no_tile("kz")
---copy_to_constant_no_tile("rRho")
---copy_to_constant_no_tile("iRho")
-
---unroll_to_depth(1)
-print_code()
---[[
-copy_to_Texture("rRho")
-copy_to_Texture("kx")
-copy_to_Texture("dx")
-copy_to_Texture("ky")
-copy_to_Texture("dy")
-copy_to_Texture("kz")
-copy_to_Texture("dz")
-copy_to_Texture("iRho")
---unroll_to_depth(2)
---]]
diff --git a/examples/cuda-chill/mriq.c b/examples/cuda-chill/mriq.c
deleted file mode 100644
index ba4b87c..0000000
--- a/examples/cuda-chill/mriq.c
+++ /dev/null
@@ -1,33 +0,0 @@
-#define N 32768
-#define M 3072
-struct kValues {
-  float Kx;
-  float Ky;
-  float Kz;
-  float PhiMag;
-};
-extern float sinf(float);
-extern float cosf(float);
-
-void
-ComputeQCPU(int numK, int numX,struct kValues kVals[M],float x[N], float y[N], float z[N],float Qr[N], float Qi[N]) {
-  float expArg;
-  float cosArg;
-  float sinArg;
-  float phi;
-  int i;
-  int j;
-  numK = M;
-  numX = N;
-  for ( i = 0; i < M; i++) {
-    for ( j = 0; j < N; j++) {
-      expArg = 6.2831853071795864769252867665590058f * (kVals[i].Kx * x[j] +kVals[i].Ky * y[j] +kVals[i].Kz * z[j]);
-      cosArg = cosf(expArg);
-      sinArg = sinf(expArg);
-      phi = kVals[i].PhiMag;
-      Qr[j] += phi * cosArg;
-      Qi[j] += phi * sinArg;
-    }
-  }
-}
-  
diff --git a/examples/cuda-chill/mriq.lua b/examples/cuda-chill/mriq.lua
deleted file mode 100644
index 1170111..0000000
--- a/examples/cuda-chill/mriq.lua
+++ /dev/null
@@ -1,55 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("mriq.c", "ComputeQCPU", 0) 
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
-                      --copy_to_shared methods
-N=32768
-M=3072
-TI=128
-TJ=128
-
-permute(0,{"j","i"})
---tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
-tile_by_index({"i"}, {TJ}, {l1_control="ii",l1_tile="i"}, {"ii", "j","i"})
-tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
-
-normalize_index("j")
-normalize_index("i")
---print_code()
---tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
---print_code()
-cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
-
-copy_to_shared("tx","kVals",1)
---copy_to_shared("tx","x",1)
---copy_to_shared("tx","y",1)
---copy_to_shared("tx","z",1)
-
---copy_to_texture("kVals")
---datacopy(0, 3, "kVals", {"tt","t"},false,0,1,-16,true)
---print_code()
---datacopy_privatized(0,"tx","kVals",{"tx"})
---copy_to_registers("tx","kVals")
-copy_to_registers("ii","x")
-copy_to_registers("ii","y")
-copy_to_registers("ii","z")
-copy_to_registers("ii","Qi")
-copy_to_registers("ii","Qr")
---[[datacopy_privatized(0,"tx","x",{"tx"})
-datacopy_privatized(0,"tx","y",{"tx"})
-datacopy_privatized(0,"tx","z",{"tx"})
-datacopy_privatized(0,"tx","Qi",{"tx"})
-datacopy_privatized(0,"tx","Qr",{"tx"})
-
-
-]]--
---unroll(0,5,64)
-print_code()
---unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
diff --git a/examples/cuda-chill/mv-shadow.c b/examples/cuda-chill/mv-shadow.c
deleted file mode 100644
index 582b187..0000000
--- a/examples/cuda-chill/mv-shadow.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
-  int i, j;
-
-  for (i = 0; i < N; i++)
-    for (j = 0; j < N; j++)
-      a[i] = a[i] + c[j][i] * b[j];
-}
diff --git a/examples/cuda-chill/mv-shadow.lua b/examples/cuda-chill/mv-shadow.lua
deleted file mode 100644
index 43e8491..0000000
--- a/examples/cuda-chill/mv-shadow.lua
+++ /dev/null
@@ -1,65 +0,0 @@
-init("mv-shadow.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
-                      --copy_to_shared methods
-
-N=129
-TI=32
-TJ=64
-
-N=1024
-TI=16
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
---Tile the i and j loop, introducing "ii" as the control loop for the "i"
---tile, "k" for the control loop fo the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("ii")
-normalize_index("i")
-print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-
---copy_to_shared("tx", "b", 1)
---copy_to_shared("tx", "c", -16)
---print_code()
---copy_to_texture("b")
---copy_to_texture("c")
-copy_to_registers("k", "a")
---print_code()
-
-unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
---copy_to_texture("b")
---print_code()
---unroll(0,5,0)
---print_code()
diff --git a/examples/cuda-chill/mv.c b/examples/cuda-chill/mv.c
deleted file mode 100644
index 582b187..0000000
--- a/examples/cuda-chill/mv.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
-  int i, j;
-
-  for (i = 0; i < N; i++)
-    for (j = 0; j < N; j++)
-      a[i] = a[i] + c[j][i] * b[j];
-}
diff --git a/examples/cuda-chill/mv.lua b/examples/cuda-chill/mv.lua
deleted file mode 100644
index ca54501..0000000
--- a/examples/cuda-chill/mv.lua
+++ /dev/null
@@ -1,65 +0,0 @@
-init("mv.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
-                      --copy_to_shared methods
-
-N=129
-TI=32
-TJ=64
-
-N=1024
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
---Tile the i and j loop, introducing "ii" as the control loop for the "i"
---tile, "k" for the control loop fo the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("ii")
-normalize_index("i")
-print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
-
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-
---copy_to_shared("tx", "b", 1)
---copy_to_shared("tx", "c", -16)
---print_code()
---copy_to_texture("b")
---copy_to_texture("c")
-copy_to_registers("k", "a")
---print_code()
-
-unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
---copy_to_texture("b")
---print_code()
---unroll(0,5,0)
---print_code()
diff --git a/examples/cuda-chill/mv_try.c b/examples/cuda-chill/mv_try.c
deleted file mode 100644
index 7781f3b..0000000
--- a/examples/cuda-chill/mv_try.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 4096
-
-void normalMV(int n, float c[N][N], float a[N], float b[N]) {
-  int i, j;
-
-  for (i = 0; i < n; i++)
-    for (j = 0; j < n; j++)
-      a[i] = a[i] + c[i][j] * b[j];
-}
diff --git a/examples/cuda-chill/mv_try.lua b/examples/cuda-chill/mv_try.lua
deleted file mode 100644
index db4d9ad..0000000
--- a/examples/cuda-chill/mv_try.lua
+++ /dev/null
@@ -1,14 +0,0 @@
-init("mv_try.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
-                      --copy_to_shared methods
-
-TI=96
-
-N=4096
-
-
-tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
-cudaize("mv_GPU", {a=N, b=N, c=N*N},
-        {block={"ii"}, thread={"i"}})
-
-print_code()
diff --git a/examples/cuda-chill/nbody.c b/examples/cuda-chill/nbody.c
deleted file mode 100644
index 57899b6..0000000
--- a/examples/cuda-chill/nbody.c
+++ /dev/null
@@ -1,66 +0,0 @@
-#define NBODIES 16384
-#define SOFTENINGSQUARED 0.01f
-#define DELTATIME 0.001f
-#define DAMPING 1.0f
-
-#define NBLOCKSY 1
-#define NBLOCKSX (NBODIES/NTHREADSX)
-#define NTHREADSY 1 
-#define NTHREADSX 64
-
-#define BLOCKSIZE 128
-
-#define SHARED 1
-#define TIMER 1
-#define VERIFY 1
-
-extern float sqrtf(float);
-
-void nbody_cpu(float* oldpos,float* oldpos1, float *newpos, float *oldvel, float *newvel, float *force)
-{
-    float r0,r1,r2;
-    float invDist, invDistCube, mass, invMass;
-    unsigned int i,j;
-    for(i = 0; i < NBODIES; ++i) {
-        //force[i*4  ] = 0;
-        //force[i*4+1] = 0;
-        //force[i*4+2] = 0;
-        //force[i*4+3] = 0;
-        for(j = 0; j < NBODIES; ++j) {
-	    r0 = oldpos[j*4]-oldpos1[i*4];
-	    r1 = oldpos[j*4+1]-oldpos1[i*4+1];
-	    r2 = oldpos[j*4+2]-oldpos1[i*4+2];
-
-	    invDist = 1.0/sqrtf(r0 * r0 + r1 * r1 + r2 * r2 + SOFTENINGSQUARED);
-	    invDistCube =  invDist * invDist * invDist;
-	    mass = oldpos1[i*4+3];
-
-	    force[i*4] = force[i*4] + r0 * mass * invDistCube;
-	    force[i*4+1] = force[i*4+1] + r1 * mass * invDistCube;
-	    force[i*4+2] = force[i*4+2] + r2 * mass * invDistCube;
-
-        }
-    }
-
-/*    for (i = 0; i < NBODIES; ++i) {
-        invMass = oldvel[4*i+3];
-
-        oldvel[4*i] += (force[4*i] * invMass) * DELTATIME * DAMPING;
-        oldvel[4*i+1] += (force[4*i+1] * invMass) * DELTATIME * DAMPING;
-        oldvel[4*i+2] += (force[4*i+2] * invMass) * DELTATIME * DAMPING;
-
-        oldpos[4*i] += oldvel[4*i] * DELTATIME;
-        oldpos[4*i+1] += oldvel[4*i+1] * DELTATIME;
-        oldpos[4*i+2] += oldvel[4*i+2] * DELTATIME;
-
-        newpos[4*i+0] = oldpos[4*i];
-        newpos[4*i+1] = oldpos[4*i+1];
-        newpos[4*i+2] = oldpos[4*i+2];
-        newpos[4*i+3] = oldpos[4*i+3];
-
-        newvel[4*i+0] = oldvel[4*i];
-        newvel[4*i+1] = oldvel[4*i+1];
-        newvel[4*i+2] = oldvel[4*i+2];
-        newvel[4*i+3] = oldvel[4*i+3];
-    }*/
-}
diff --git a/examples/cuda-chill/nbody.lua b/examples/cuda-chill/nbody.lua
deleted file mode 100644
index 08f88a9..0000000
--- a/examples/cuda-chill/nbody.lua
+++ /dev/null
@@ -1,53 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("nbody.c", "nbody_cpu" , 0) 
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
-                     --copy_to_shared methods
-NBODIES=16384
-
-
---Tj=128 CHANGE FOR BEST..... BEST IS 64BLOCKS 128THREADS
---Ti=256
-Tj=64
-Ti=32
-Tjjj=1
-Tiii=1
-Tn=0.1
---normalize_index("j")
---
---print_code()
---normalize_index("n")
--- TILE COMMANDS ZEROOOOOOOOOOO:3
---tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j"})--CU=-1
-tile_by_index({"i"},{Ti},{l1_control="ii"},{"ii","i","j"})--CU=-1
---normalize_index("i")
---tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1
-
---tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3
---tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"})
---tile_by_index({"i"}, {Ti/2}, {l1_control="iii"}, {"ii","iii", "jj","i","j"})
---print_code()
-cudaize("kernel_GPU",{oldpos=4*NBODIES,oldpos1=4*NBODIES,oldvel=4*NBODIES,force=4*NBODIES,newpos=4*NBODIES,newvel=4*NBODIES},{block={"ii"}, thread={"i"}})--CU=3
-print_code()
---tile(0,6,6)
---copy_to_shared("tx","oldpos",-16)
---copy_to_registers("j","oldpos")
---copy_to_registers("j","oldpos1")
---copy_to_registers("j","force")
-
---copy_to_texture("oldpos")
---tile(1,3,3)
---tile(2,3,3)
-
-print_code()
---unroll_to_depth(1)
---
---tile(2,3,3)
---unroll(2,3,0)
---unroll(0,5,0)
---print_code()
diff --git a/examples/cuda-chill/tmv-shadow.c b/examples/cuda-chill/tmv-shadow.c
deleted file mode 100644
index cb9ea8d..0000000
--- a/examples/cuda-chill/tmv-shadow.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
-  int i, j;
-
-  for (i = 0; i < N; i++)
-    for (j = 0; j < N; j++)
-      a[i] = a[i] + c[i][j] * b[j];
-}
diff --git a/examples/cuda-chill/tmv-shadow.lua b/examples/cuda-chill/tmv-shadow.lua
deleted file mode 100644
index 196b939..0000000
--- a/examples/cuda-chill/tmv-shadow.lua
+++ /dev/null
@@ -1,50 +0,0 @@
-init("tmv-shadow.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
-                      --copy_to_shared methods
-
-N=1024
---N= 8209
---N=129
-TI=64
-N=1024
-TI=32
---tile, "k" for the control loop for the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii",  "i", "j"})
---print_code()
---tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
-
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("i")
---print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}})
-
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-copy_to_shared("tx", "b", 1)
---copy_to_texture("b")
---print_code()
-
-copy_to_shared("tx", "c", -16)
---copy_to_texture("c")
---print_code()
-
-copy_to_registers("k", "a")
-print_code()
---unroll(0,5,0)
---unroll(0,4,0)
---unroll(2,4,16)
-unroll_to_depth(1)
---print_code()
diff --git a/examples/cuda-chill/tmv.c b/examples/cuda-chill/tmv.c
deleted file mode 100644
index cb9ea8d..0000000
--- a/examples/cuda-chill/tmv.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
-  int i, j;
-
-  for (i = 0; i < N; i++)
-    for (j = 0; j < N; j++)
-      a[i] = a[i] + c[i][j] * b[j];
-}
diff --git a/examples/cuda-chill/tmv.lua b/examples/cuda-chill/tmv.lua
deleted file mode 100644
index 5071108..0000000
--- a/examples/cuda-chill/tmv.lua
+++ /dev/null
@@ -1,50 +0,0 @@
-init("tmv.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
-                      --copy_to_shared methods
-
-N=1024
---N= 8209
---N=129
-TI=64
-N=1024
-TI=32
---tile, "k" for the control loop for the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii",  "i", "j"})
---print_code()
---tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
-
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("i")
---print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}})
-
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-copy_to_shared("tx", "b", 1)
---copy_to_texture("b")
---print_code()
-
-copy_to_shared("tx", "c", -16)
---copy_to_texture("c")
---print_code()
-
-copy_to_registers("k", "a")
-print_code()
---unroll(0,5,0)
---unroll(0,4,0)
---unroll(2,4,16)
-unroll_to_depth(1)
---print_code()
diff --git a/examples/fortran/README b/examples/fortran/README
deleted file mode 100644
index 4f23bee..0000000
--- a/examples/fortran/README
+++ /dev/null
@@ -1,10 +0,0 @@
-// Manu
-
-1) Fortran support added to permute, tile, unroll and datacopy. Tested these w.r.t gemm.c using gemm.script. 
-   There might be other issues (like fusion due to unroll, ...) that have not been tested.
-
-2) To incorporate Fortran support I had to modify certain values in omega (include/omega/omega_core/oc.h). 
-   To solve for large number of unknowns, these values have to be reverted back.
-
-3) Tested the existing chill scripts using Derick's python script. 
-   At least the existing chill scripts are not affected by the fortran related changes.
diff --git a/examples/fortran/ccd.f b/examples/fortran/ccd.f
deleted file mode 100644
index 12d834d..0000000
--- a/examples/fortran/ccd.f
+++ /dev/null
@@ -1,32 +0,0 @@
-c
-c These have been separated out from ccsd_t_singles_l.F and ccsd_t_doubles_l.F
-c
-      subroutine clean_sd_t_s1_1(h3d,h2d,h1d,p6d,p5d,p4d,
-     2                     triplesx,t1sub,v2sub)
-      IMPLICIT NONE
-      integer h3d,h2d,h1d,p6d,p5d,p4d
-      integer h3,h2,h1,p6,p5,p4
-      integer N
-			double precision triplesx(16,16,16,16,16,16)
-      double precision t1sub(16,16)
-      double precision v2sub(16,16,16,16)
-      
-      N = 16       
-
-      do p4=1,10
-      do p5=1,10
-      do p6=1,10
-      do h1=1,10
-      do h2=1,10
-      do h3=1,10
-       triplesx(h3,h2,h1,p6,p5,p4)=triplesx(h3,h2,h1,p6,p5,p4)
-     1   + t1sub(p4,h1)*v2sub(h3,h2,p6,p5)
-      enddo
-      enddo
-      enddo
-      enddo
-      enddo
-      enddo
-      return
-      end
-
diff --git a/examples/fortran/ccd.script b/examples/fortran/ccd.script
deleted file mode 100644
index c2af500..0000000
--- a/examples/fortran/ccd.script
+++ /dev/null
@@ -1,18 +0,0 @@
-source: ccd.f
-procedure: clean_sd_t_s1_1
-format : rose
-loop: 0
-
-
-
-original()
-
-UN=4
-
-unroll(0,5,4)
-unroll(0,4,4)
-unroll(0,3,4)
-unroll(0,2,4)
-unroll(0,1,4)
-
-print
diff --git a/examples/fortran/gemm.f90 b/examples/fortran/gemm.f90
deleted file mode 100644
index b65bb58..0000000
--- a/examples/fortran/gemm.f90
+++ /dev/null
@@ -1,58 +0,0 @@
-program matmul
-
-    integer N,i,j,k
-    real*8 a(10,10), b(10,10), c(10,10), ct(10,10),mysum
-
-    do i=1,10,1
-      do j=1,10,1
-        a(i,j) = i+j 
-        b(i,j) = i-j
-        c(i,j) = 0.0
-        ct(i,j) = 0.0
-      end do
-      b(i,i) = 1.0;
-    end do
-
-
-      DO j=1,10,1
-         DO k=1,10,1
-            DO i=1,10,1
-               c(i,j) = c(i,j)+a(i,k)*b(k,j)
-            end do
-        end do
-      end do
-
-
-
-    call gemm(10,a,b,ct)
-
-    mysum = 0.0
-    do i=1,10,1
-      do j=1,10,1
-        mysum = c(i,j) - ct(i,j)
-      end do
-    end do
-
-   if (abs(mysum) >= 0.00001) then
-     write (*,*) "Something wrong"
-   else
-     write (*,*) "Output matches"
-   end if
-    
-end program matmul
-
-      SUBROUTINE gemm(N,A,B,C)
-      INTEGER N
-      REAL*8  A(N,N), B(N,N), C(N,N)
-
-      INTEGER I,J,K
-
-      DO J=1,N,1
-         DO K=1,N,1
-            DO I=1,N,1
-               C(I,J) = C(I,J)+A(I,K)*B(K,J)
-						end do
-				end do
-			end do
-
-      END subroutine
diff --git a/examples/fortran/gemm.script b/examples/fortran/gemm.script
deleted file mode 100644
index 01eb859..0000000
--- a/examples/fortran/gemm.script
+++ /dev/null
@@ -1,30 +0,0 @@
-#matrix multiply large array size for intel machine
-source: gemm.f90
-procedure: gemm
-format: rose
-loop: 0
-
-TI = 128
-#TI = 4
-TJ = 8
-#TK = 3
-TK = 512
-UI = 2
-UJ = 2
-
-permute([3,1,2])
-tile(0,2,TJ)
-#print space
-tile(0,2,TI)
-#print space
-tile(0,5,TK)
-#print space
-
-
-datacopy(0,3,A,false,-1)
-#print space
-
-datacopy(0,4,B)
-unroll(0,4,UI)                                                            
-unroll(0,5,UJ)  
-
diff --git a/examples/fortran/rose_gemm.f90 b/examples/fortran/rose_gemm.f90
deleted file mode 100644
index d150922..0000000
--- a/examples/fortran/rose_gemm.f90
+++ /dev/null
@@ -1,155 +0,0 @@
-PROGRAM matmul
-INTEGER :: N, i, j, k
-REAL(kind=8) :: a(10,10), b(10,10), c(10,10), ct(10,10), mysum
-DO i = 1, 10, 1
-DO j = 1, 10, 1
-a(i,j) = i + j
-b(i,j) = i - j
-c(i,j) = 0.0
-ct(i,j) = 0.0
-END DO
-b(i,i) = 1.0
-END DO
-DO j = 1, 10, 1
-DO k = 1, 10, 1
-DO i = 1, 10, 1
-c(i,j) = c(i,j) + a(i,k) * b(k,j)
-END DO
-END DO
-END DO
-CALL gemm(10,a,b,ct)
-mysum = 0.0
-DO i = 1, 10, 1
-DO j = 1, 10, 1
-mysum = c(i,j) - ct(i,j)
-END DO
-END DO
-IF (abs(mysum) >= 0.00001) THEN
-WRITE (*, FMT=*) "Something wrong"
-ELSE
-WRITE (*, FMT=*) "Output matches"
-END IF
-END PROGRAM matmul
-
-SUBROUTINE gemm(N,A,B,C)
-INTEGER :: t12
-INTEGER :: t10
-INTEGER :: t8
-INTEGER :: t6
-INTEGER :: t4
-INTEGER :: t2
-INTEGER :: chill_t64
-INTEGER :: chill_t63
-INTEGER :: chill_t62
-INTEGER :: chill_t61
-INTEGER :: chill_t60
-INTEGER :: chill_t59
-INTEGER :: chill_t58
-INTEGER :: chill_t57
-INTEGER :: chill_t56
-INTEGER :: chill_t55
-INTEGER :: chill_t54
-INTEGER :: chill_t53
-INTEGER :: chill_t52
-INTEGER :: chill_t51
-INTEGER :: chill_t50
-INTEGER :: chill_t49
-INTEGER :: chill_t48
-INTEGER :: chill_t47
-INTEGER :: over2
-INTEGER :: chill_t46
-INTEGER :: chill_t45
-INTEGER :: chill_t44
-INTEGER :: chill_t43
-INTEGER :: chill_t42
-INTEGER :: chill_t41
-INTEGER :: chill_t40
-INTEGER :: chill_t39
-INTEGER :: chill_t38
-INTEGER :: chill_t37
-INTEGER :: chill_t36
-INTEGER :: chill_t35
-INTEGER :: chill_t34
-INTEGER :: chill_t33
-INTEGER :: chill_t32
-INTEGER :: chill_t31
-INTEGER :: chill_t30
-INTEGER :: chill_t29
-INTEGER :: chill_t28
-INTEGER :: chill_t27
-INTEGER :: chill_t26
-INTEGER :: chill_t25
-INTEGER :: chill_t24
-INTEGER :: chill_t23
-INTEGER :: over1
-INTEGER :: chill_t22
-INTEGER :: chill_t21
-INTEGER :: chill_t20
-INTEGER :: chill_t19
-INTEGER :: chill_t18
-INTEGER :: chill_t17
-INTEGER :: chill_t16
-INTEGER :: chill_t15
-REAL(kind=8), DIMENSION(8,512) :: f_P2
-INTEGER :: chill_t14
-INTEGER :: chill_t13
-INTEGER :: chill_t12
-INTEGER :: chill_t11
-INTEGER :: chill_t10
-INTEGER :: chill_t9
-INTEGER :: chill_t8
-INTEGER :: chill_t7
-REAL(kind=8), DIMENSION(512,128) :: f_P1
-INTEGER :: chill_t1
-INTEGER :: chill_t2
-INTEGER :: chill_t4
-INTEGER :: chill_t6
-INTEGER :: chill_t5
-INTEGER :: N
-REAL(kind=8) :: A(N,N), B(N,N), C(N,N)
-INTEGER :: I, J, K
-over1 = 0
-over2 = 0
-DO t2 = 1, N, 512
-DO t4 = 1, N, 128
-DO t6 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
-DO t8 = t4, merge(t4 + 127,N,t4 + 127 <= N), 1
-f_P1(t8 - t4 + 1,t6 - t2 + 1) = A(t8,t6)
-END DO
-END DO
-DO t6 = 1, N, 8
-DO t8 = t6, merge(N,t6 + 7,N <= t6 + 7), 1
-DO t10 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
-f_P2(t10 - t2 + 1,t8 - t6 + 1) = B(t10,t8)
-END DO
-END DO
-over1 = MOD(N,2)
-DO t8 = t4, merge(-over1 + N,t4 + 126,-over1 + N <= t4 + 126), 2
-over2 = MOD(N,2)
-DO t10 = t6, merge(t6 + 6,N - over2,t6 + 6 <= N - over2), 2
-DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1
-C(t8,t10) = C(t8,t10) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
-C(t8 + 1,t10) = C(t8 + 1,t10) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
-C(t8,t10 + 1) = C(t8,t10 + 1) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1)
-C(t8 + 1,t10 + 1) = C(t8 + 1,t10 + 1) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1)
-END DO
-END DO
-IF (N - 7 <= t6 .AND. 1 <= over2) THEN
-DO t12 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
-C(t8,N) = C(t8,N) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1)
-C(t8 + 1,N) = C(t8 + 1,N) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1)
-END DO
-END IF
-END DO
-IF (N - 127 <= t4 .AND. 1 <= over1) THEN
-DO t10 = t6, merge(t6 + 7,N,t6 + 7 <= N), 1
-DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1
-C(N,t10) = C(N,t10) + f_P1(N - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
-END DO
-END DO
-END IF
-END DO
-END DO
-END DO
-END SUBROUTINE 
-
diff --git a/graph-test.cc b/graph-test.cc
deleted file mode 100644
index 3cdcbee..0000000
--- a/graph-test.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-#include "graph.hh"
-
-using std::cout;
-using std::endl;
-template<typename T>
-struct A {
-};
-
-template struct Graph<Empty,Empty>;
-
-int main() {
-  Graph<> g;
-  
-  for (int i = 0; i < 8; i++)
-    g.insert();
-  
-  std::vector<Empty> t;
-  t.push_back(Empty());
-  t.push_back(Empty());
-  
-  g.connect(0,1);
-  g.connect(1,4);
-  g.connect(4,0);
-  g.connect(4,5);
-  g.connect(1,5);
-  g.connect(1,2);
-  g.connect(2,3);
-  g.connect(3,2);
-  g.connect(2,6);
-  g.connect(5,6);
-  g.connect(6,5);
-  g.connect(6,7);
-  g.connect(3,7);
-  g.connect(7,7,t);
-  
-  g.insert();
-  g.insert();
-  g.connect(9,8);
-  g.connect(8,0);
-  
-  cout << "Graph #1:" << endl;
-  cout << g;
-  
-  std::vector<std::set<int> > r = g.topoSort();
-  
-  cout << "topological order: ";
-  int num_scc = 0;
-  for (int i = 0; i < r.size(); i++) {
-    if (i != 0)
-      cout << ' ';
-    if (r[i].size() > 1) {
-      cout << '(';
-      num_scc++;
-    }
-    for (std::set<int>::iterator j = r[i].begin(); j != r[i].end(); j++) {
-      if (j != r[i].begin())
-        cout << ' ';
-      cout << (*j+1);
-    }
-    if (r[i].size() > 1)
-      cout << ')';
-  }
-  cout << endl;
-  cout << "total number of SCC: " << num_scc << endl;
-  
-  Graph<> g2;
-  
-  for (int i = 0; i < 6; i++)
-    g2.insert();
-  
-  g2.connect(0,1);
-  g2.connect(0,2);
-  g2.connect(3,4);
-  g2.connect(3,5);
-  g2.connect(3,2);
-  g2.connect(5,0);
-  
-  cout << endl << "Graph #2:" << endl;
-  cout << g2;
-  
-  std::vector<std::set<int> > r2 = g2.packed_topoSort();
-  
-  cout << "packed topological order: ";
-  for (int i = 0; i < r2.size(); i++) {
-    if (i != 0)
-      cout << ' ';
-    if (r2[i].size() > 1)
-      cout << '(';
-    for (std::set<int>::iterator j = r2[i].begin(); j != r2[i].end(); j++) {
-      if (j != r2[i].begin())
-        cout << ' ';
-      cout << (*j+1);
-    }
-    if (r2[i].size() > 1)
-      cout << ')';
-  }
-  cout << endl;
-  
-  Graph<> g3;
-  
-  for (int i = 0; i < 6; i++)
-    g3.insert();
-  
-  g3.connect(5,2);
-  g3.connect(5,3);
-  g3.connect(5,4);
-  g3.connect(3,1);
-  g3.connect(1,0);
-  
-  cout << endl << "Graph #3:" << endl;
-  cout << g3;
-  
-  std::vector<std::set<int> > r3 = g3.topoSort();
-  
-  cout << "topological order: ";
-  for (int i = 0; i < r3.size(); i++) {
-    if (i != 0)
-      cout << ' ';
-    if (r3[i].size() > 1)
-      cout << '(';
-    for (std::set<int>::iterator j = r3[i].begin(); j != r3[i].end(); j++) {
-      if (j != r3[i].begin())
-        cout << ' ';
-      cout << (*j+1);
-    }
-    if (r3[i].size() > 1)
-      cout << ')';
-  }
-  cout << endl;
-  
-  r3 = g3.packed_topoSort();
-  
-  cout << "packed topological order: ";
-  for (int i = 0; i < r3.size(); i++) {
-    if (i != 0)
-      cout << ' ';
-    if (r3[i].size() > 1)
-      cout << '(';
-    for (std::set<int>::iterator j = r3[i].begin(); j != r3[i].end(); j++) {
-      if (j != r3[i].begin())
-        cout << ' ';
-      cout << (*j+1);
-    }
-    if (r3[i].size() > 1)
-      cout << ')';
-  }
-  cout << endl;
-}
diff --git a/graph.hh b/graph.hh
index 5d0ff66..f8471df 100644
--- a/graph.hh
+++ b/graph.hh
@@ -76,7 +76,8 @@ template<typename VertexType, typename EdgeType>
 std::ostream& operator<<(std::ostream &os, const Graph<VertexType, EdgeType> &g) {
   for (int i = 0; i < g.vertex.size(); i++)
     for (typename Graph<VertexType,EdgeType>::EdgeList::const_iterator j = g.vertex[i].second.begin(); j != g.vertex[i].second.end(); j++) {
-      os << "s" << i << "->" << "s" << j->first << ":";
+	//      os << i+1 << "->" << j->first+1 << ":";
+	os << "s" << i << "->" << "s" << j->first << ":";
       for (typename std::vector<EdgeType>::const_iterator k = j->second.begin(); k != j->second.end(); k++)
         os << " " << *k;
       os << std::endl;
diff --git a/include/ir_suif.hh b/include/ir_suif.hh
deleted file mode 120000
index 37f4ae8..0000000
--- a/include/ir_suif.hh
+++ /dev/null
@@ -1 +0,0 @@
-../ir_suif.hh
\ No newline at end of file
diff --git a/include/ir_suif_utils.hh b/include/ir_suif_utils.hh
deleted file mode 120000
index 327320d..0000000
--- a/include/ir_suif_utils.hh
+++ /dev/null
@@ -1 +0,0 @@
-../ir_suif_utils.hh
\ No newline at end of file
diff --git a/ir_cuda_rose_utils.cc b/ir_cuda_rose_utils.cc
deleted file mode 100644
index e7b4c37..0000000
--- a/ir_cuda_rose_utils.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2008 University of Southern California
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
-   SUIF interface utilities.
-
- Notes:
-
- Update history:
-   01/2006 created by Chun Chen
-*****************************************************************************/
-
-//#include <suif1.h>
-#include "ir_rose_utils.hh"
-
-
-/**
- * Returns the body of the for loop found by finding the first loop in
- * code, and if level > 1 recursively calling on the body of the found
- * loop and (level-1)
- */
-SgNode* loop_body_at_level(SgNode* tnl, int level) {
-  SgNode *inner_nl = 0;
-  //Now strip out the tnl on the inner level of the for loop
-  //tree_node_list_iter tnli(tnl);
-  
-  if (isSgBasicBlock(tnl)) {
-    
-    SgStatementPtrList& tnli = isSgBasicBlock(tnl)->get_statements();
-    
-    for (SgStatementPtrList::iterator it = tnli.begin(); it != tnli.end();
-         it++) {
-      if (isSgForStatement(*it)) {
-        inner_nl = loop_body_at_level(isSgForStatement(*it), level);
-        break;
-      }
-      
-    }
-    
-  }
-  
-  return inner_nl;
-}
-
-SgNode* loop_body_at_level(SgForStatement* loop, int level) {
-  if (level > 1)
-    return loop_body_at_level(loop->get_loop_body(), level - 1);
-  return loop->get_loop_body();
-}
-
-void swap_node_for_node_list(SgNode* tn, SgNode* new_tnl) {
-  SgStatement *s = isSgStatement(tn);
-  
-  SgStatement* p;
-  if (s != 0) {
-    p = isSgStatement(tn->get_parent());
-    
-    if (p != 0) {
-      
-      if (isSgBasicBlock(new_tnl)) {
-        
-        /*SgStatementPtrList & list_ =
-          isSgBasicBlock(new_tnl)->get_statements();
-          
-          if (isSgForStatement(p)) {
-          if (!isSgBasicBlock(isSgForStatement(p)->get_loop_body()))
-          p->replace_statement(s, isSgStatement(new_tnl));
-          else {
-          p->insert_statement(s, list_, true);
-          p->remove(s);
-          }
-          } else {
-          p->insert_statement(s, list_, true);
-          p->remove(s);
-          }
-        */
-        if (isSgForStatement(p)) {
-          if (!isSgBasicBlock(isSgForStatement(p)->get_loop_body()))
-            p->replace_statement(s, isSgStatement(new_tnl));
-          else {
-            
-            SgStatementPtrList& list_ =
-              isSgBasicBlock(new_tnl)->get_statements();
-            
-            //std::vector<SgStatement*> list;
-            
-            SgStatementPtrList::iterator it = list_.begin();
-            SgStatement* begin = *it;
-            begin->set_parent(p);
-            
-            p->replace_statement(s, begin);
-            it++;
-            //SgStatement* stmt = first;
-            SgStatement* temp = begin;
-            for (; it != list_.end(); it++) {
-              (*it)->set_parent(p);
-              p->insert_statement(temp, *it, false);
-              temp = *it;
-            }
-            
-          }
-          
-        } else {
-          
-          
-          SgStatementPtrList& list_ =
-            isSgBasicBlock(new_tnl)->get_statements();
-          
-          //std::vector<SgStatement*> list;
-          
-          SgStatementPtrList::iterator it = list_.begin();
-          SgStatement* begin = *it;
-          begin->set_parent(p);
-          
-          p->replace_statement(s, begin);
-          it++;
-          //SgStatement* stmt = first;
-          SgStatement* temp = begin;
-          for (; it != list_.end(); it++) {
-            (*it)->set_parent(p);
-            p->insert_statement(temp, *it, false);
-            temp = *it;
-          }
-          
-        }
-        
-        /*  SgStatement* temp = s;
-            
-            SgStatementPtrList::iterator it = list_.begin();
-            p->insert_statement(temp, *it, true);
-            temp = *it;
-            p->remove_statement(s);
-            it++;
-            for (; it != list_.end(); it++) {
-            p->insert_statement(temp, *it, false);
-            temp = *it;
-            }
-            
-            // new_tnl->set_parent(p);
-            //new_tnl->get_statements();
-            SgStatementPtrList& list =
-            isSgBasicBlock(new_tnl)->get_statements();
-            
-            //std::vector<SgStatement*> list;
-            
-            SgStatementPtrList::iterator it = list.begin();
-            SgStatement* begin = *it;
-            begin->set_parent(p);
-            
-            p->replace_statement(s, begin);
-            it++;
-            //SgStatement* stmt = first;
-            SgStatement* temp = begin;
-            for (; it != list.end(); it++) {
-            (*it)->set_parent(p);
-            p->insert_statement(temp, *it, false);
-            temp = *it;
-            }
-        */
-        /*              SgStatementPtrList& stmt_list = isSgBasicBlock(new_tnl)->get_statements();
-                        SgStatement* target =   s;
-                        
-                        for(SgStatementPtrList::iterator it = stmt_list.begin() ; it != stmt_list.end(); it++)
-                        {
-                        isSgNode(*it)->set_parent(p);
-                        p->insert_statement(isSgStateme, *it, false);
-                        target = *it;
-                        }
-                        
-                        p->remove_statement(s);
-                        
-        */
-      }else if(isSgIfStmt(p)) {
-        
-        if(isSgIfStmt(p)->get_true_body() == s)
-          isSgIfStmt(p)->set_true_body(isSgStatement(new_tnl));
-        else if(isSgIfStmt(p)->get_false_body() == s)
-          isSgIfStmt(p)->set_false_body(isSgStatement(new_tnl));
-        new_tnl->set_parent(p);
-      } 
-      else {
-        p->replace_statement(s, isSgStatement(new_tnl));
-        new_tnl->set_parent(p);
-      }
-    }
-    
-  }
-  //    return isSgNode(p);
-}
diff --git a/ir_cuda_suif_utils.cc b/ir_cuda_suif_utils.cc
deleted file mode 100644
index f15c190..0000000
--- a/ir_cuda_suif_utils.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2008 University of Southern California
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
-   SUIF interface utilities.
-
- Notes:
-
- Update history:
-   01/2006 created by Chun Chen
-*****************************************************************************/
-
-#include <suif1.h>
-#include "ir_suif_utils.hh"
-
-
-/**
- * Returns the body of the for loop found by finding the first loop in
- * code, and if level > 1 recursively calling on the body of the found
- * loop and (level-1)
- */
-tree_node_list* loop_body_at_level(tree_node_list* tnl, int level)
-{
-  tree_node_list *inner_nl = 0;
-  //Now strip out the tnl on the inner level of the for loop
-  tree_node_list_iter tnli(tnl);
-  while (!tnli.is_empty()) {
-    tree_node *node = tnli.step();
-    if(node->kind() == TREE_FOR)
-    {
-      //Found the first tree_for, call sibling function
-      inner_nl = loop_body_at_level((tree_for*)node, level);
-      break;
-    }
-  }
-  return inner_nl;
-}
-
-tree_node_list* loop_body_at_level(tree_for* loop, int level)
-{
-  if(level > 1)
-    return loop_body_at_level(loop->body(), level-1);
-  return loop->body();
-}
-
-tree_node_list*  swap_node_for_node_list(tree_node* tn, tree_node_list* new_tnl)
-{
-  tree_node_list* tnl  = tn->parent();
-  tnl->insert_after(new_tnl, tn->list_e());
-  delete tnl->remove(tn->list_e());
-  return tnl;
-}
diff --git a/ir_cudarose.cc b/ir_cudarose.cc
deleted file mode 100644
index 6b31bdd..0000000
--- a/ir_cudarose.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
-   CHiLL's SUIF interface.
-
- Notes:
-   Array supports mixed pointer and array type in a single declaration.
-
- History:
-   2/2/2011 Created by Protonu Basu. 
-*****************************************************************************/
-
-#include <typeinfo>
-#include "ir_cudarose.hh"
-#include "loop.hh"
-#include "loop_cuda_rose.hh"
-//#include "ir_suif_utils.hh"
-
-using namespace SageBuilder;
-using namespace SageInterface;
-
-IR_cudaroseCode::IR_cudaroseCode(const char *filename, const char* proc_name) :
-  IR_roseCode(filename, proc_name) {
-  
-  //std::string file_suffix = StringUtility::fileNameSuffix(filename);
-  
-  //if (CommandlineProcessing::isCFileNameSuffix(file_suffix))
-  //{
-  std::string orig_name = StringUtility::stripPathFromFileName(filename);
-  std::string naked_name = StringUtility::stripFileSuffixFromFileName(
-    orig_name);
-  file->set_unparse_output_filename("rose_" + naked_name + ".cu");
-  
-  //}
-  
-  gsym_ = root;
-  first_scope = firstScope;
-  parameter = symtab2_;
-  body = symtab3_;
-  defn = func->get_definition()->get_body();
-  func_defn = func->get_definition();
-}
-
-
-
-IR_ArraySymbol *IR_cudaroseCode::CreateArraySymbol(const IR_Symbol *sym,
-                                                   std::vector<omega::CG_outputRepr *> &size, int sharedAnnotation) {
-  SgType *tn;
-  SgVariableSymbol* vs;
-  if (typeid(*sym) == typeid(IR_roseScalarSymbol)) {
-    tn = static_cast<const IR_roseScalarSymbol *>(sym)->vs_->get_type();
-  } else if (typeid(*sym) == typeid(IR_roseArraySymbol)) {
-    tn = static_cast<const IR_roseArraySymbol *>(sym)->vs_->get_type();
-    while (isSgArrayType(tn) || isSgPointerType(tn)) {
-      if (isSgArrayType(tn))
-        tn = isSgArrayType(tn)->get_base_type();
-      else if (isSgPointerType(tn))
-        tn = isSgPointerType(tn)->get_base_type();
-      else
-        throw ir_error(
-          "in CreateScalarSymbol: symbol not an array nor a pointer!");
-    }
-  } else
-    throw std::bad_typeid();
-  
-  for (int i = size.size() - 1; i >= 0; i--)
-    tn = buildArrayType(tn,
-                        static_cast<omega::CG_roseRepr *>(size[i])->GetExpression());
-  
-  static int rose_array_counter = 1;
-  std::string s = std::string("_P") + omega::to_string(rose_array_counter++);
-  SgVariableDeclaration* defn2 = buildVariableDeclaration(
-    const_cast<char *>(s.c_str()), tn);
-  SgInitializedNamePtrList& variables2 = defn2->get_variables();
-  
-  SgInitializedNamePtrList::const_iterator i2 = variables2.begin();
-  SgInitializedName* initializedName2 = *i2;
-  vs = new SgVariableSymbol(initializedName2);
-  
-  prependStatement(defn2,
-                   isSgScopeStatement(func->get_definition()->get_body()));
-  
-  vs->set_parent(symtab_);
-  symtab_->insert(SgName(s.c_str()), vs);
-  
-  SgStatementPtrList* tnl5 = new SgStatementPtrList;
-  
-  (*tnl5).push_back(isSgStatement(defn2));
-  
-  omega::CG_roseRepr* stmt = new omega::CG_roseRepr(tnl5);
-  
-  init_code_ = ocg_->StmtListAppend(init_code_,
-                                    static_cast<omega::CG_outputRepr *>(stmt));
-  
-  if (sharedAnnotation == 1)
-    isSgNode(defn2)->setAttribute("__shared__",
-                                  new AstTextAttribute("__shared__"));
-  
-  return new IR_roseArraySymbol(this, vs);
-}
-
-bool IR_cudaroseCode::commit_loop(Loop *loop, int loop_num) {
-  if (loop == NULL)
-    return true;
-  
-  LoopCuda *cu_loop = (LoopCuda *) loop;
-  SgNode *tnl = cu_loop->codegen();
-  if (!tnl)
-    return false;
-  
-  SgStatementPtrList* new_list = NULL;
-  if (isSgBasicBlock(tnl)) {
-    new_list = new SgStatementPtrList;
-    for (SgStatementPtrList::iterator it =
-           isSgBasicBlock(tnl)->get_statements().begin();
-         it != isSgBasicBlock(tnl)->get_statements().end(); it++)
-      (*new_list).push_back(*it);
-  }
-  
-  //Only thing that should be left will be the inserting of the tnl* into the loop
-  omega::CG_outputRepr *repr;
-  if (new_list == NULL)
-    repr = new omega::CG_roseRepr(tnl);
-  else
-    repr = new omega::CG_roseRepr(new_list);
-  if (cu_loop->init_code != NULL)
-    repr = ocg_->StmtListAppend(cu_loop->init_code->clone(), repr);
-  
-  std::vector<SgForStatement *> loops = find_loops(
-    func->get_definition()->get_body());
-  tnl = isSgNode(loops[loop_num])->get_parent();
-  
-  if (cu_loop->setup_code != NULL) {
-    SgStatementPtrList* setup_tnl =
-      static_cast<omega::CG_roseRepr *>(cu_loop->setup_code)->GetList();
-    
-    SgStatement* target = isSgStatement(loops[loop_num]);
-    
-    for (SgStatementPtrList::iterator it = (*setup_tnl).begin();
-         it != (*setup_tnl).end(); it++) {
-      
-      isSgStatement(tnl)->insert_statement(target, *it, false);
-      isSgNode(*it)->set_parent(tnl);
-      target = *it;
-    }
-    
-    //SgStatementPtrList
-    // for SgStatementPtrList::it
-    //TODO: I think this is a hack we can undo if we have loop->codegen()
-    //loo->getCode(), maybe also get rid of setup and teardown...
-    //fix_unfinished_comment(setup_tnl, indexes_string);
-    //isSgStatement(tnl)->replace_statement(isSgStatement(loops[loop_num]), *setup_tnl);
-    isSgStatement(tnl)->remove_statement(isSgStatement(loops[loop_num]));
-  }
-  
-  delete repr;
-  
-  return true;
-}
-
-IR_cudaroseCode::~IR_cudaroseCode() {
-}
-
diff --git a/ir_cudarose.hh b/ir_cudarose.hh
deleted file mode 100644
index 34e0404..0000000
--- a/ir_cudarose.hh
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef IR_CUDA_ROSE
-#define IR_CUDA_ROSE
-
-#include <code_gen/CG_roseRepr.h>
-#include <code_gen/CG_roseBuilder.h>
-#include "ir_rose.hh"
-#include "loop.hh"
-#include "loop_cuda_rose.hh"
-#include "ir_rose_utils.hh"
-
-
-
-class IR_cudaroseCode : public IR_roseCode{
-  
-public:
-  
-  
-  IR_cudaroseCode(const char *filename, const char* proc_name);
-  
-  
-  
-  SgGlobal *gsym_;
-  SgScopeStatement* defn;      
-  SgGlobal* first_scope;
-  SgSymbolTable* parameter;
-  SgSymbolTable* body;
-  SgFunctionDefinition* func_defn;    
-  std::vector<SgSymbolTable*> write_procs;//procs to write  
-  
-  
-  IR_ArraySymbol *CreateArraySymbol(const IR_Symbol *sym, std::vector<omega::CG_outputRepr *> &size,int sharedAnnotation = 1);
-  omega::CG_outputRepr* init_code(){ return init_code_; }
-  bool commit_loop(Loop *loop, int loop_num);
-  std::vector<SgForStatement *> get_loops()
-  { 
-    std::vector<SgForStatement *> loops = find_loops(func->get_definition()->get_body()); 
-    return loops;
-  }
-  
-  ~IR_cudaroseCode();
-  
-};
-
-
-#endif
-
diff --git a/ir_cudasuif.cc b/ir_cudasuif.cc
deleted file mode 100644
index c646e13..0000000
--- a/ir_cudasuif.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
-   CHiLL's SUIF interface.
-
- Notes:
-   Array supports mixed pointer and array type in a single declaration.
-
- History:
-   2/2/2011 Created by Protonu Basu. 
-*****************************************************************************/
-
-#include <typeinfo>
-#include "ir_cudasuif.hh"
-#include "loop.hh"
-#include "loop_cuda.hh"
-#include "ir_suif_utils.hh"
-
-
-IR_cudasuifCode::IR_cudasuifCode(const char *filename, int proc_num)
-  :IR_suifCode(filename, proc_num)
-{
-  //setting up gsym_ here
-  fileset->reset_iter();
-  gsym_ = fileset->globals();
-  
-}
-
-
-
-IR_ArraySymbol *IR_cudasuifCode::CreateArraySymbol(const IR_Symbol *sym,
-                                                   std::vector<omega::CG_outputRepr *> &size,
-                                                   int sharedAnnotation)
-{
-  type_node *tn;
-  
-  if (typeid(*sym) == typeid(IR_suifScalarSymbol)) {
-    tn = static_cast<const IR_suifScalarSymbol *>(sym)->vs_->type();
-  }
-  else if (typeid(*sym) == typeid(IR_suifArraySymbol)) {
-    tn = static_cast<const IR_suifArraySymbol *>(sym)->vs_->type();
-    if (tn->is_modifier())
-      tn = static_cast<modifier_type *>(tn)->base();
-    while (tn->is_array() || tn->is_ptr()) {
-      if (tn->is_array())
-        tn = static_cast<array_type *>(tn)->elem_type();
-      else if (tn->is_ptr())
-        tn = static_cast<ptr_type *>(tn)->ref_type();
-    } 
-  }
-  else
-    throw std::bad_typeid();
-  
-  if (is_fortran_)
-    for (int i = 0; i < size.size(); i++) {
-      var_sym *temporary = symtab_->new_unique_var(type_s32);
-      init_code_ = ocg_->StmtListAppend(init_code_, ocg_->StmtListAppend(ocg_->CreateAssignment(0, new omega::CG_suifRepr(operand(temporary)), size[i]),NULL));
-      
-      tn = new array_type(tn, array_bound(1), array_bound(temporary));
-      symtab_->add_type(tn);
-    }
-  else     
-    for (int i = size.size()-1; i >= 0; i--) {
-      var_sym *temporary = symtab_->new_unique_var(type_s32);
-      //init_code_ = ocg_->StmtListAppend(init_code_, ocg_->CreateStmtList(ocg_->CreateAssignment(0, new omega::CG_suifRepr(operand(temporary)), size[i])));
-      init_code_ = ocg_->StmtListAppend(init_code_, ocg_->StmtListAppend(ocg_->CreateAssignment(0, new omega::CG_suifRepr(operand(temporary)), size[i]), NULL));
-      
-      tn = new array_type(tn, array_bound(1), array_bound(temporary));
-      symtab_->add_type(tn);
-      if(i == 0 && sharedAnnotation == 1){
-        tn = static_cast<omega::CG_suifBuilder*>(ocg_)->ModifyType(tn, "__shared__");
-        symtab_->add_type(tn);
-      }
-    }
-  
-  static int suif_array_counter = 1;
-  std::string s = std::string("_P") + omega::to_string(suif_array_counter++);
-  var_sym *vs = new var_sym(tn, const_cast<char *>(s.c_str()));
-  vs->add_to_table(symtab_);
-  
-  return new IR_suifArraySymbol(this, vs);
-}
-
-
-bool IR_cudasuifCode::commit_loop(Loop *loop, int loop_num) {  
-  if (loop == NULL)
-    return true;
-  
-  //Call code-gen part of any scripting routines that were run.
-  // internally call GetCode
-  // Add stuff before and after (setup, teardown
-  // return a tnl
-  LoopCuda *cu_loop = (LoopCuda *)loop;
-  tree_node_list *tnl = cu_loop->codegen();
-  if(!tnl)
-    return false;
-  
-  //set up our new procs
-  for(int i=0; i<cu_loop->new_procs.size(); i++)
-  {
-    printf("setting proc fse\n");
-    cu_loop->new_procs[i]->set_fse(fse_);
-    write_procs.push_back(cu_loop->new_procs[i]);
-  }
-  
-  //Only thing that should be left will be the inserting of the tnl* into the loop
-  
-  omega::CG_outputRepr *repr = new omega::CG_suifRepr(tnl);
-  if (cu_loop->init_code != NULL)
-    repr = ocg_->StmtListAppend(cu_loop->init_code->clone(), repr);
-  
-  std::vector<tree_for *> loops = find_loops(psym_->block()->body());
-  tnl = loops[loop_num]->parent();
-  
-  if (cu_loop->setup_code != NULL) {
-    tree_node_list *setup_tnl = static_cast<omega::CG_suifRepr *>(cu_loop->setup_code->clone())->GetCode();
-    //TODO: I think this is a hack we can undo if we have loop->codegen()
-    //loo->getCode(), maybe also get rid of setup and teardown...
-    //fix_unfinished_comment(setup_tnl, indexes_string);
-    tnl->insert_before(setup_tnl, loops[loop_num]->list_e());
-  }
-  tnl->insert_before(static_cast<omega::CG_suifRepr *>(repr)->GetCode(), loops[loop_num]->list_e());
-  if (cu_loop->teardown_code != NULL) {
-    tree_node_list *setup_tnl = static_cast<omega::CG_suifRepr *>(cu_loop->teardown_code->clone())->GetCode();
-    tnl->insert_before(setup_tnl, loops[loop_num]->list_e());
-  }
-  
-  tnl->remove(loops[loop_num]->list_e());
-  
-  delete repr;
-  return true;
-}
-
-IR_cudasuifCode::~IR_cudasuifCode()
-{
-  for(int i=0; i<write_procs.size(); i++)
-  {
-    if (!write_procs[i]->is_written())
-      write_procs[i]->write_proc(fse_);
-    write_procs[i]->flush_proc();
-  }
-}
diff --git a/ir_cudasuif.hh b/ir_cudasuif.hh
deleted file mode 100644
index 834778e..0000000
--- a/ir_cudasuif.hh
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef IR_CUDA_SUIF
-#define IR_CUDA_SUIF
-
-#include <code_gen/CG_suifRepr.h>
-#include <code_gen/CG_suifBuilder.h>
-#include "ir_suif.hh"
-#include "loop.hh"
-#include "loop_cuda.hh"
-#include "ir_suif_utils.hh"
-
-
-
-class IR_cudasuifCode : public IR_suifCode{
-  
-public:
-  global_symtab *gsym_;
-  std::vector<proc_sym*> write_procs;//procs to write  
-  
-  
-  IR_cudasuifCode(const char *filename, int proc_num);
-  IR_ArraySymbol *CreateArraySymbol(const IR_Symbol *sym, 
-                                    std::vector<omega::CG_outputRepr *> &size,
-                                    int sharedAnnotation = 1);
-  omega::CG_outputRepr* init_code(){ return init_code_; }
-  bool commit_loop(Loop *loop, int loop_num);
-  std::vector<tree_for *> get_loops()
-  {
-    std::vector<tree_for *> loops = find_loops(psym_->block()->body());
-    return loops;
-  }
-  ~IR_cudasuifCode();
-  
-};
-
-
-#endif
diff --git a/loop.cc b/loop.cc
index ce83006..0a82f7a 100644
--- a/loop.cc
+++ b/loop.cc
@@ -53,6 +53,7 @@ bool Loop::isInitialized() const {
 
 bool Loop::init_loop(std::vector<ir_tree_node *> &ir_tree,
                      std::vector<ir_tree_node *> &ir_stmt) {
+
   ir_stmt = extract_ir_stmts(ir_tree);
   stmt_nesting_level_.resize(ir_stmt.size());
   std::vector<int> stmt_nesting_level(ir_stmt.size());
diff --git a/loop_backup.cc b/loop_backup.cc
deleted file mode 100644
index b361ed4..0000000
--- a/loop_backup.cc
+++ /dev/null
@@ -1,3311 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2008 University of Southern California
- Copyright (C) 2009-2010 University of Utah
- All Rights Reserved.
-
- Purpose:
-   Core loop transformation functionality.
-
- Notes:
-   "level" (starting from 1) means loop level and it corresponds to "dim"
- (starting from 0) in transformed iteration space [c_1,l_1,c_2,l_2,....,
- c_n,l_n,c_(n+1)], e.g., l_2 is loop level 2 in generated code, dim 3
- in transformed iteration space, and variable 4 in Omega relation.
- All c's are constant numbers only and they will not show up as actual loops.
- Formula:
-    dim = 2*level - 1
-    var = dim + 1
-
- History:
-   10/2005 Created by Chun Chen.
-   09/2009 Expand tile functionality, -chun
-   10/2009 Initialize unfusible loop nest without bailing out, -chun
-*****************************************************************************/
-
-#include <limits.h>
-#include <math.h>
-#include <code_gen/code_gen.h>
-#include <code_gen/CG_outputBuilder.h>
-#include <code_gen/output_repr.h>
-#include <iostream>
-#include <map>
-#include "loop.hh"
-#include "omegatools.hh"
-#include "irtools.hh"
-#include "chill_error.hh"
-
-using namespace omega;
-
-const std::string Loop::tmp_loop_var_name_prefix = std::string("_t");
-const std::string Loop::overflow_var_name_prefix = std::string("over");
-
-//-----------------------------------------------------------------------------
-// Class Loop
-//-----------------------------------------------------------------------------
-
-bool Loop::init_loop(std::vector<ir_tree_node *> &ir_tree, std::vector<ir_tree_node *> &ir_stmt) {
-  ir_stmt = extract_ir_stmts(ir_tree);
-  std::vector<int> stmt_nesting_level(ir_stmt.size());
-  for (int i = 0; i < ir_stmt.size(); i++) {
-    ir_stmt[i]->payload = i;
-    int t = 0;
-    ir_tree_node *itn = ir_stmt[i];
-    while (itn->parent != NULL) {
-      itn = itn->parent;
-      if (itn->content->type() == IR_CONTROL_LOOP)
-        t++;
-    }
-    stmt_nesting_level[i] = t;
-  }
-  
-  stmt = std::vector<Statement>(ir_stmt.size());
-  int n_dim = -1;
-  int max_loc;
-  std::vector<std::string> index;
-  for (int i = 0; i < ir_stmt.size(); i++) {
-    int max_nesting_level = -1;
-    int loc;
-    for (int j = 0; j < ir_stmt.size(); j++)
-      if (stmt_nesting_level[j] > max_nesting_level) {
-        max_nesting_level = stmt_nesting_level[j];
-        loc = j;
-      }
-    
-    // most deeply nested statement acting as a reference point
-    if (n_dim == -1) {
-      n_dim = max_nesting_level;
-      max_loc = loc;
-      
-      index = std::vector<std::string>(n_dim);
-      
-      ir_tree_node *itn = ir_stmt[loc];
-      int cur_dim = n_dim-1;
-      while (itn->parent != NULL) {
-        itn = itn->parent;
-        if (itn->content->type() == IR_CONTROL_LOOP) {
-          index[cur_dim] = static_cast<IR_Loop *>(itn->content)->index()->name();
-          itn->payload = cur_dim--;
-        }
-      }
-    }
-    
-    // align loops by names, temporary solution
-    ir_tree_node *itn = ir_stmt[loc];
-    while (itn->parent != NULL) {
-      itn = itn->parent;
-      if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == -1) {
-        std::string name = static_cast<IR_Loop *>(itn->content)->index()->name();
-        for (int j = 0; j < n_dim; j++)
-          if (index[j] == name) {
-            itn->payload = j;
-            break;
-          }
-        if (itn->payload == -1)
-          throw loop_error("no complex alignment yet");
-      }
-    }
-    
-    // set relation variable names
-    Relation r(n_dim);
-    F_And *f_root = r.add_and();
-    itn = ir_stmt[loc];
-    while (itn->parent != NULL) {
-      itn = itn->parent;
-      if (itn->content->type() == IR_CONTROL_LOOP)
-        r.name_set_var(itn->payload+1, static_cast<IR_Loop *>(itn->content)->index()->name());
-    }
-    
-    // extract information from loop/if structures
-    std::vector<bool> processed(n_dim, false);
-    Tuple<std::string> vars_to_be_reversed;
-    itn = ir_stmt[loc];
-    while (itn->parent != NULL) {
-      itn = itn->parent;
-      
-      switch (itn->content->type()) {
-      case IR_CONTROL_LOOP: {
-        IR_Loop *lp = static_cast<IR_Loop *>(itn->content);
-        Variable_ID v = r.set_var(itn->payload+1);
-        int c;
-        
-        try {
-          c = lp->step_size();
-          if (c > 0) {
-            CG_outputRepr *lb = lp->lower_bound();
-            exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_GE, true);
-            CG_outputRepr *ub = lp->upper_bound();
-            IR_CONDITION_TYPE cond = lp->stop_cond();
-            if (cond == IR_COND_LT || cond == IR_COND_LE)
-              exp2formula(ir, r, f_root, freevar, ub, v, 's', cond, true);
-            else
-              throw ir_error("loop condition not supported");
-            
-          }
-          else if (c < 0) {
-            CG_outputBuilder *ocg = ir->builder();
-            CG_outputRepr *lb = lp->lower_bound();
-            lb = ocg->CreateMinus(NULL, lb);
-            exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_GE, true);
-            CG_outputRepr *ub = lp->upper_bound();
-            ub = ocg->CreateMinus(NULL, ub);
-            IR_CONDITION_TYPE cond = lp->stop_cond();
-            if (cond == IR_COND_GE)
-              exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_LE, true);
-            else if (cond == IR_COND_GT)
-              exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_LT, true);
-            else
-              throw ir_error("loop condition not supported");
-            
-            vars_to_be_reversed.append(lp->index()->name());
-          }
-          else
-            throw ir_error("loop step size zero");
-        }
-        catch (const ir_error &e) {
-          for (int i = 0; i < itn->children.size(); i++)
-            delete itn->children[i];
-          itn->children = std::vector<ir_tree_node *>();
-          itn->content = itn->content->convert();
-          return false;
-        }
-        
-        if (abs(c) != 1) {
-          F_Exists *f_exists = f_root->add_exists();
-          Variable_ID e = f_exists->declare();
-          F_And *f_and = f_exists->add_and();
-          Stride_Handle h = f_and->add_stride(abs(c));
-          if (c > 0)
-            h.update_coef(e, 1);
-          else
-            h.update_coef(e, -1);
-          h.update_coef(v, -1);
-          CG_outputRepr *lb = lp->lower_bound();
-          exp2formula(ir, r, f_and, freevar, lb, e, 's', IR_COND_EQ, true);
-        }
-        
-        processed[itn->payload] = true;
-        break;
-      }
-      case IR_CONTROL_IF: {
-        CG_outputRepr *cond = static_cast<IR_If *>(itn->content)->condition();
-        try {
-          if (itn->payload % 2 == 1)
-            exp2constraint(ir, r, f_root, freevar, cond, true);
-          else {
-            F_Not *f_not = f_root->add_not();
-            F_And *f_and = f_not->add_and();
-            exp2constraint(ir, r, f_and, freevar, cond, true);
-          }
-        }
-        catch (const ir_error &e) {
-          std::vector<ir_tree_node *> *t;
-          if (itn->parent == NULL)
-            t = &ir_tree;
-          else
-            t = &(itn->parent->children);
-          int id = itn->payload;
-          int i = t->size() - 1;
-          while (i >= 0) {
-            if ((*t)[i] == itn) {
-              for (int j = 0; j < itn->children.size(); j++)
-                delete itn->children[j];
-              itn->children = std::vector<ir_tree_node *>();
-              itn->content = itn->content->convert();
-            }
-            else if ((*t)[i]->payload >> 1 == id >> 1) {
-              delete (*t)[i];
-              t->erase(t->begin()+i);
-            }
-            i--;
-          }
-          return false;
-        }
-        
-        break;
-      }
-      default:
-        for (int i = 0; i < itn->children.size(); i++)
-          delete itn->children[i];
-        itn->children = std::vector<ir_tree_node *>();
-        itn->content = itn->content->convert();
-        return false;
-      }
-    }
-    
-    // add information for missing loops
-    for (int j = 0; j < n_dim; j++)
-      if (!processed[j]) {
-        ir_tree_node *itn = ir_stmt[max_loc];
-        while (itn->parent != NULL) {
-          itn = itn->parent;
-          if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == j)
-            break;
-        }
-        
-        Variable_ID v = r.set_var(j+1);
-        if (loc < max_loc) {
-          CG_outputRepr *lb = static_cast<IR_Loop *>(itn->content)->lower_bound();
-          exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_EQ, true);
-        }
-        else { // loc > max_loc
-          CG_outputRepr *ub = static_cast<IR_Loop *>(itn->content)->upper_bound();
-          exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_EQ, true);
-        }
-      }
-    
-    r.setup_names();
-    r.simplify();
-    
-    // insert the statement
-    CG_outputBuilder *ocg = ir->builder();
-    Tuple<CG_outputRepr *> reverse_expr;
-    for (int j = 1; j <= vars_to_be_reversed.size(); j++) {
-      CG_outputRepr *repl = ocg->CreateIdent(vars_to_be_reversed[j]);
-      repl = ocg->CreateMinus(NULL, repl);
-      reverse_expr.append(repl);
-    }     
-    CG_outputRepr *code = static_cast<IR_Block *>(ir_stmt[loc]->content)->extract();
-    code = ocg->CreatePlaceHolder(0, code, reverse_expr, vars_to_be_reversed);
-    stmt[loc].code = code;
-    stmt[loc].IS = r;
-    stmt[loc].loop_level = std::vector<LoopLevel>(n_dim);
-    for (int i = 0; i < n_dim; i++) {
-      stmt[loc].loop_level[i].type = LoopLevelOriginal;
-      stmt[loc].loop_level[i].payload = i;
-      stmt[loc].loop_level[i].parallel_level = 0;
-    }
-    
-    stmt_nesting_level[loc] = -1;
-  }
-  
-  return true;
-}  
-
-
-
-Loop::Loop(const IR_Control *control) {
-  ir = const_cast<IR_Code *>(control->ir_);
-  init_code = NULL;
-  cleanup_code = NULL;
-  tmp_loop_var_name_counter = 1;
-  overflow_var_name_counter = 1;
-  known = Relation::True(0);
-  
-  std::vector<ir_tree_node *> ir_tree = build_ir_tree(control->clone(), NULL);
-  std::vector<ir_tree_node *> ir_stmt;
-  
-  while (!init_loop(ir_tree, ir_stmt)) {}
-  
-  // init the dependence graph
-  for (int i = 0; i < stmt.size(); i++)
-    dep.insert();
-  
-  for (int i = 0; i < stmt.size(); i++)
-    for (int j = i; j < stmt.size(); j++) {
-      std::pair<std::vector<DependenceVector>, std::vector<DependenceVector> > dv = test_data_dependences(ir, stmt[i].code, stmt[i].IS, stmt[j].code, stmt[j].IS, freevar);
-      
-      for (int k = 0; k < dv.first.size(); k++)
-        if (is_dependence_valid(ir_stmt[i], ir_stmt[j], dv.first[k], true))
-          dep.connect(i, j, dv.first[k]);
-        else
-          dep.connect(j, i, dv.first[k].reverse());
-      
-      for (int k = 0; k < dv.second.size(); k++)
-        if (is_dependence_valid(ir_stmt[j], ir_stmt[i], dv.second[k], false))
-          dep.connect(j, i, dv.second[k]);
-        else
-          dep.connect(i, j, dv.second[k].reverse());
-    }
-  
-  // cleanup the IR tree
-  for (int i = 0; i < ir_tree.size(); i++)
-    delete ir_tree[i];
-  
-  // init dumb transformation relations e.g. [i, j] -> [ 0, i, 0, j, 0]
-  for (int i = 0; i < stmt.size(); i++) {
-    int n = stmt[i].IS.n_set();
-    stmt[i].xform = Relation(n, 2*n+1);
-    F_And *f_root = stmt[i].xform.add_and();
-    
-    for (int j = 1; j <= n; j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(stmt[i].xform.output_var(2*j), 1);
-      h.update_coef(stmt[i].xform.input_var(j), -1);
-    }
-    
-    for (int j = 1; j <= 2*n+1; j+=2) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(stmt[i].xform.output_var(j), 1);
-    }
-    stmt[i].xform.simplify();
-  }
-  
-  if (stmt.size() != 0)
-    num_dep_dim = stmt[0].IS.n_set();
-  else
-    num_dep_dim = 0;
-}
-
-
-Loop::~Loop() {
-  for (int i = 0; i < stmt.size(); i++)
-    if (stmt[i].code != NULL) {
-      stmt[i].code->clear();
-      delete stmt[i].code;
-    }
-  if (init_code != NULL) {
-    init_code->clear();
-    delete init_code;
-  }
-  if (cleanup_code != NULL) {
-    cleanup_code->clear();
-    delete cleanup_code;
-  }
-}
-
-
-int Loop::get_dep_dim_of(int stmt_num, int level) const {
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invaid statement " + to_string(stmt_num));
-  
-  if (level < 1 || level > stmt[stmt_num].loop_level.size())
-    return -1;
-  
-  int trip_count = 0;
-  while (true) {
-    switch (stmt[stmt_num].loop_level[level-1].type) {
-    case LoopLevelOriginal:
-      return stmt[stmt_num].loop_level[level-1].payload;
-    case LoopLevelTile:
-      level = stmt[stmt_num].loop_level[level-1].payload;
-      if (level < 1)
-        return -1;
-      if (level > stmt[stmt_num].loop_level.size())
-        throw loop_error("incorrect loop level information for statement " + to_string(stmt_num));
-      break;
-    default:
-      throw loop_error("unknown loop level information for statement " + to_string(stmt_num));
-    }
-    trip_count++;
-    if (trip_count >= stmt[stmt_num].loop_level.size())
-      throw loop_error("incorrect loop level information for statement " + to_string(stmt_num));
-  }
-}
-
-
-int Loop::get_last_dep_dim_before(int stmt_num, int level) const {
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invaid statement " + to_string(stmt_num));
-  
-  if (level < 1)
-    return -1;
-  if (level > stmt[stmt_num].loop_level.size())
-    level = stmt[stmt_num].loop_level.size() + 1;
-  
-  for (int i = level-1; i >= 1; i--)
-    if (stmt[stmt_num].loop_level[i-1].type == LoopLevelOriginal)
-      return stmt[stmt_num].loop_level[i-1].payload;
-  
-  return -1;
-}
-
-
-void Loop::print_internal_loop_structure() const {
-  for (int i = 0; i < stmt.size(); i++) {
-    std::vector<int> lex = getLexicalOrder(i);
-    std::cout << "s" << i+1 << ": ";
-    for (int j = 0; j < stmt[i].loop_level.size(); j++) {
-      if (2*j < lex.size())
-        std::cout << lex[2*j];
-      switch (stmt[i].loop_level[j].type) {
-      case LoopLevelOriginal:
-        std::cout << "(dim:" << stmt[i].loop_level[j].payload << ")";
-        break;
-      case LoopLevelTile:
-        std::cout << "(tile:" << stmt[i].loop_level[j].payload << ")";
-        break;
-      default:
-        std::cout << "(unknown)";
-      }
-      std::cout << ' ';
-    }
-    for (int j = 2*stmt[i].loop_level.size(); j < lex.size(); j+=2) {
-      std::cout << lex[j];
-      if (j != lex.size()-1)
-        std::cout << ' ';
-    }
-    std::cout << std::endl;
-  }
-}
-
-
-CG_outputRepr *Loop::getCode(int effort) const {  
-  const int m = stmt.size();
-  if (m == 0)
-    return NULL;
-  const int n = stmt[0].xform.n_out();
-  
-  Tuple<CG_outputRepr *> ni(m);
-  Tuple<Relation> IS(m);
-  Tuple<Relation> xform(m);
-  for (int i = 0; i < m; i++) {
-    ni[i+1] = stmt[i].code;
-    IS[i+1] = stmt[i].IS;
-    xform[i+1] = stmt[i].xform;
-  }
-  
-  Relation known = Extend_Set(copy(this->known), n - this->known.n_set());  
-  CG_outputBuilder *ocg = ir->builder();
-  CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, effort);
-  
-  if (init_code != NULL)
-    repr = ocg->StmtListAppend(init_code->clone(), repr);
-  if (cleanup_code != NULL)
-    repr = ocg->StmtListAppend(repr, cleanup_code->clone());
-  
-  return repr;
-}
-
-
-void Loop::printCode(int effort) const {
-  const int m = stmt.size();
-  if (m == 0)
-    return;
-  const int n = stmt[0].xform.n_out();
-  
-  Tuple<Relation> IS(m);
-  Tuple<Relation> xform(m);
-  for (int i = 0; i < m; i++) {
-    IS[i+1] = stmt[i].IS;
-    xform[i+1] = stmt[i].xform;
-  }
-  
-  Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-  std::cout << MMGenerateCode(xform, IS, known, effort);
-}
-
-
-Relation Loop::getNewIS(int stmt_num) const {
-  Relation result;
-  
-  if (stmt[stmt_num].xform.is_null()) {
-    Relation known = Extend_Set(copy(this->known), stmt[stmt_num].IS.n_set() - this->known.n_set());
-    result = Intersection(copy(stmt[stmt_num].IS), known);
-  }
-  else {
-    Relation known = Extend_Set(copy(this->known), stmt[stmt_num].xform.n_out() - this->known.n_set()); 
-    result = Intersection(Range(Restrict_Domain(copy(stmt[stmt_num].xform), copy(stmt[stmt_num].IS))), known);
-  }
-  
-  result.simplify(2, 4);
-  
-  return result;
-}
-
-std::vector<Relation> Loop::getNewIS() const {
-  const int m = stmt.size();
-  
-  std::vector<Relation> new_IS(m);
-  for (int i = 0; i < m; i++)
-    new_IS[i] = getNewIS(i);
-  
-  return new_IS;
-}
-
-
-void Loop::permute(const std::vector<int> &pi) {
-  std::set<int> active;
-  for (int i = 0; i < stmt.size(); i++)
-    active.insert(i);
-  
-  permute(active, pi);
-}
-
-
-void Loop::original() {
-  std::set<int> active;
-  for (int i = 0; i < stmt.size(); i++)
-    active.insert(i);
-  setLexicalOrder(0, active);
-}
-
-
-void Loop::permute(const std::set<int> &active, const std::vector<int> &pi) {
-  if (active.size() == 0 || pi.size() == 0)
-    return;
-  
-  // check for sanity of parameters
-  int level = pi[0];
-  for (int i = 1; i < pi.size(); i++)
-    if (pi[i] < level)
-      level = pi[i];
-  if (level < 1)
-    throw std::invalid_argument("invalid permuation");
-  std::vector<int> reverse_pi(pi.size(), 0);
-  for (int i = 0; i < pi.size(); i++)
-    if (pi[i] >= level+pi.size())
-      throw std::invalid_argument("invalid permutation");
-    else
-      reverse_pi[pi[i]-level] = i+level;
-  for (int i = 0; i < reverse_pi.size(); i++)
-    if (reverse_pi[i] == 0)
-      throw std::invalid_argument("invalid permuation");
-  int ref_stmt_num;
-  std::vector<int> lex;
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    if (*i < 0 || *i >= stmt.size())
-      throw std::invalid_argument("invalid statement " + to_string(*i));
-    if (i == active.begin()) {
-      ref_stmt_num = *i;
-      lex = getLexicalOrder(*i);
-    }
-    else {
-      if (level+pi.size()-1 > stmt[*i].loop_level.size())
-        throw std::invalid_argument("invalid permuation");
-      std::vector<int> lex2 = getLexicalOrder(*i);
-      for (int j = 0; j < 2*level-3; j+=2)
-        if (lex[j] != lex2[j])
-          throw std::invalid_argument("statements to permute must be in the same subloop");
-      for (int j = 0; j < pi.size(); j++)
-        if (!(stmt[*i].loop_level[level+j-1].type == stmt[ref_stmt_num].loop_level[level+j-1].type &&
-              stmt[*i].loop_level[level+j-1].payload == stmt[ref_stmt_num].loop_level[level+j-1].payload))
-          throw std::invalid_argument("permuted loops must have the same loop level types");
-    }
-  }
-  
-  // Update transformation relations
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    int n = stmt[*i].xform.n_out();
-    Relation mapping(n, n);
-    F_And *f_root = mapping.add_and();
-    for (int j = 1; j <= n; j+= 2) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(j), 1);
-      h.update_coef(mapping.input_var(j), -1);
-    }
-    for (int j = 0; j < pi.size(); j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(2*(level+j)), 1);
-      h.update_coef(mapping.input_var(2*pi[j]), -1);
-    }
-    for (int j = 1; j < level; j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(2*j), 1);
-      h.update_coef(mapping.input_var(2*j), -1);
-    }
-    for (int j = level+pi.size(); j <= stmt[*i].loop_level.size(); j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(2*j), 1);
-      h.update_coef(mapping.input_var(2*j), -1);
-    }
-    
-    stmt[*i].xform = Composition(mapping, stmt[*i].xform);
-    stmt[*i].xform.simplify();
-  }
-  
-  // get the permuation for dependence vectors
-  std::vector<int> t;
-  for (int i = 0; i < pi.size(); i++)
-    if (stmt[ref_stmt_num].loop_level[pi[i]-1].type == LoopLevelOriginal)
-      t.push_back(stmt[ref_stmt_num].loop_level[pi[i]-1].payload);
-  int max_dep_dim = -1;
-  int min_dep_dim = num_dep_dim;
-  for (int i = 0; i < t.size(); i++) {
-    if (t[i] > max_dep_dim)
-      max_dep_dim = t[i];
-    if (t[i] < min_dep_dim)
-      min_dep_dim = t[i];
-  }
-  if (min_dep_dim > max_dep_dim)
-    return;
-  if (max_dep_dim - min_dep_dim + 1 != t.size())
-    throw loop_error("cannot update the dependence graph after permuation");
-  std::vector<int> dep_pi(num_dep_dim);
-  for (int i = 0; i < min_dep_dim; i++)
-    dep_pi[i] = i;
-  for (int i = min_dep_dim; i <= max_dep_dim; i++)
-    dep_pi[i] = t[i-min_dep_dim];
-  for (int i = max_dep_dim+1; i < num_dep_dim; i++)
-    dep_pi[i] = i;  
-  
-  // update the dependence graph
-  DependenceGraph g;
-  for (int i = 0; i < dep.vertex.size(); i++)
-    g.insert();
-  for (int i = 0; i < dep.vertex.size(); i++)
-    for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) {
-      if ((active.find(i) != active.end() && active.find(j->first) != active.end())) {
-        std::vector<DependenceVector> dv = j->second;
-        for (int k = 0; k < dv.size(); k++) {
-          switch (dv[k].type) {
-          case DEP_W2R:
-          case DEP_R2W:
-          case DEP_W2W:
-          case DEP_R2R: {
-            std::vector<coef_t> lbounds(num_dep_dim);
-            std::vector<coef_t> ubounds(num_dep_dim);
-            for (int d = 0; d < num_dep_dim; d++) {
-              lbounds[d] = dv[k].lbounds[dep_pi[d]];
-              ubounds[d] = dv[k].ubounds[dep_pi[d]];
-            }
-            dv[k].lbounds = lbounds;
-            dv[k].ubounds = ubounds;
-            break;
-          }
-          case DEP_CONTROL: {
-            break;
-          }
-          default:
-            throw loop_error("unknown dependence type");
-          }
-        }
-        g.connect(i, j->first, dv);
-      }
-      else if (active.find(i) == active.end() && active.find(j->first) == active.end()) {
-        std::vector<DependenceVector> dv = j->second;
-        g.connect(i, j->first, dv);
-      }
-      else {
-        std::vector<DependenceVector> dv = j->second;
-        for (int k = 0; k < dv.size(); k++)
-          switch (dv[k].type) {
-          case DEP_W2R:
-          case DEP_R2W:
-          case DEP_W2W:
-          case DEP_R2R: {
-            for (int d = 0; d < num_dep_dim; d++)
-              if (dep_pi[d] != d) {
-                dv[k].lbounds[d] = -posInfinity;
-                dv[k].ubounds[d] = posInfinity;
-              }
-            break;
-          }
-          case DEP_CONTROL:
-            break;
-          default:
-            throw loop_error("unknown dependence type");
-          }
-        g.connect(i, j->first, dv);
-      }
-    }
-  dep = g;
-  
-  // update loop level information
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    int cur_dep_dim = min_dep_dim;
-    std::vector<LoopLevel> new_loop_level(stmt[*i].loop_level.size());
-    for (int j = 1; j <= stmt[*i].loop_level.size(); j++)
-      if (j >= level && j < level+pi.size()) {
-        switch (stmt[*i].loop_level[reverse_pi[j-level]-1].type) {
-        case LoopLevelOriginal:
-          new_loop_level[j-1].type = LoopLevelOriginal;
-          new_loop_level[j-1].payload = cur_dep_dim++;
-          new_loop_level[j-1].parallel_level = stmt[*i].loop_level[reverse_pi[j-level]-1].parallel_level;
-          break;
-        case LoopLevelTile: {
-          new_loop_level[j-1].type = LoopLevelTile;
-          int ref_level = stmt[*i].loop_level[reverse_pi[j-level]-1].payload;
-          if (ref_level >= level && ref_level < level+pi.size())
-            new_loop_level[j-1].payload = reverse_pi[ref_level-level];
-          else
-            new_loop_level[j-1].payload = ref_level;
-          new_loop_level[j-1].parallel_level = stmt[*i].loop_level[reverse_pi[j-level]-1].parallel_level;
-          break;
-        }
-        default:
-          throw loop_error("unknown loop level information for statement " + to_string(*i));
-        }
-      }
-      else {
-        switch (stmt[*i].loop_level[j-1].type) {
-        case LoopLevelOriginal:
-          new_loop_level[j-1].type = LoopLevelOriginal;
-          new_loop_level[j-1].payload = stmt[*i].loop_level[j-1].payload;
-          new_loop_level[j-1].parallel_level = stmt[*i].loop_level[j-1].parallel_level;
-          break;
-        case LoopLevelTile: {
-          new_loop_level[j-1].type = LoopLevelTile;
-          int ref_level = stmt[*i].loop_level[j-1].payload;
-          if (ref_level >= level && ref_level < level+pi.size())
-            new_loop_level[j-1].payload = reverse_pi[ref_level-level];
-          else
-            new_loop_level[j-1].payload = ref_level;
-          new_loop_level[j-1].parallel_level = stmt[*i].loop_level[j-1].parallel_level;
-          break;
-        }
-        default:
-          throw loop_error("unknown loop level information for statement " + to_string(*i));          
-        }
-      }
-    stmt[*i].loop_level = new_loop_level;
-  }
-  
-  setLexicalOrder(2*level-2, active);
-}
-
-std::set<int> Loop::split(int stmt_num, int level, const Relation &cond) {
-  // check for sanity of parameters
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  
-  std::set<int> result;
-  int dim = 2*level-1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_loop = getStatements(lex, dim-1);
-  
-  Relation cond2 = copy(cond);
-  cond2.simplify();
-  cond2 = EQs_to_GEQs(cond2);
-  Conjunct *c = cond2.single_conjunct();
-  int cur_lex = lex[dim-1];
-  for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-    int max_level = (*gi).max_tuple_pos();
-    Relation single_cond(max_level);
-    single_cond.and_with_GEQ(*gi);
-    
-    // TODO: should decide where to place newly created statements with
-    // complementary split condition from dependence graph.
-    bool place_after;
-    if (max_level == 0)
-      place_after = true;
-    else if ((*gi).get_coef(cond2.set_var(max_level)) < 0)
-      place_after = true;
-    else
-      place_after = false;
-    
-    // make adjacent lexical number available for new statements
-    if (place_after) {
-      lex[dim-1] = cur_lex+1;
-      shiftLexicalOrder(lex, dim-1, 1);
-    }
-    else {
-      lex[dim-1] = cur_lex-1;
-      shiftLexicalOrder(lex, dim-1, -1);
-    }
-    
-    // original statements with split condition,
-    // new statements with complement of split condition
-    int old_num_stmt = stmt.size();
-    std::map<int, int> what_stmt_num;
-    apply_xform(same_loop);
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-      int n = stmt[*i].IS.n_set();
-      Relation part1, part2;
-      if (max_level > n) {
-        part1 = copy(stmt[*i].IS);
-        part2 = Relation::False(0);
-      }
-      else {
-        part1 = Intersection(copy(stmt[*i].IS), Extend_Set(copy(single_cond), n-max_level));
-        part2 = Intersection(copy(stmt[*i].IS), Extend_Set(Complement(copy(single_cond)), n-max_level));
-      }
-      
-      stmt[*i].IS = part1;
-      
-      if (Intersection(copy(part2), Extend_Set(copy(this->known), n-this->known.n_set())).is_upper_bound_satisfiable()) {
-        Statement new_stmt;
-        new_stmt.code = stmt[*i].code->clone();
-        new_stmt.IS = part2;
-        new_stmt.xform = copy(stmt[*i].xform);
-        if (place_after)
-          assign_const(new_stmt.xform, dim-1, cur_lex+1);
-        else
-          assign_const(new_stmt.xform, dim-1, cur_lex-1);
-        new_stmt.loop_level = stmt[*i].loop_level;
-        stmt.push_back(new_stmt);
-        dep.insert();
-        what_stmt_num[*i] = stmt.size() - 1;
-        if (*i == stmt_num)
-          result.insert(stmt.size() - 1);
-      }
-    }
-    
-    // update dependence graph
-    int dep_dim = get_dep_dim_of(stmt_num, level);
-    for (int i = 0; i < old_num_stmt; i++) {
-      std::vector<std::pair<int, std::vector<DependenceVector> > > D;
-      
-      for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) {
-        if (same_loop.find(i) != same_loop.end()) {
-          if (same_loop.find(j->first) != same_loop.end()) {
-            if (what_stmt_num.find(i) != what_stmt_num.end() && what_stmt_num.find(j->first) != what_stmt_num.end())
-              dep.connect(what_stmt_num[i], what_stmt_num[j->first], j->second);
-            if (place_after && what_stmt_num.find(j->first) != what_stmt_num.end()) {
-              std::vector<DependenceVector> dvs;
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.is_data_dependence() && dep_dim != -1) {
-                  dv.lbounds[dep_dim] = -posInfinity;
-                  dv.ubounds[dep_dim] = posInfinity;
-                }
-                dvs.push_back(dv);
-              }
-              if (dvs.size() > 0)
-                D.push_back(std::make_pair(what_stmt_num[j->first], dvs));
-            }
-            else if (!place_after && what_stmt_num.find(i) != what_stmt_num.end()) {
-              std::vector<DependenceVector> dvs;
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.is_data_dependence() && dep_dim != -1) {
-                  dv.lbounds[dep_dim] = -posInfinity;
-                  dv.ubounds[dep_dim] = posInfinity;
-                }
-                dvs.push_back(dv);
-              }
-              if (dvs.size() > 0)
-                dep.connect(what_stmt_num[i], j->first, dvs);
-              
-            }
-          }
-          else {
-            if (what_stmt_num.find(i) != what_stmt_num.end())
-              dep.connect(what_stmt_num[i], j->first, j->second);
-          }
-        }
-        else if (same_loop.find(j->first) != same_loop.end()) {
-          if (what_stmt_num.find(j->first) != what_stmt_num.end())
-            D.push_back(std::make_pair(what_stmt_num[j->first], j->second));
-        }
-      }
-      
-      for (int j = 0; j < D.size(); j++)
-        dep.connect(i, D[j].first, D[j].second);
-    }
-  }
-  
-  return result;
-}
-
-
-
-void Loop::tile(int stmt_num, int level, int tile_size, int outer_level, TilingMethodType method, int alignment_offset, int alignment_multiple) {
-  // check for sanity of parameters
-  if (tile_size < 0)
-    throw std::invalid_argument("invalid tile size");
-  if (alignment_multiple < 1 || alignment_offset < 0)
-    throw std::invalid_argument("invalid alignment for tile");
-  if (stmt_num < 0  || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  if (level <= 0)
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  if (level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument("there is no loop level " + to_string(level) + " for statement " + to_string(stmt_num));
-  if (outer_level <= 0 || outer_level > level) 
-    throw std::invalid_argument("invalid tile controlling loop level " + to_string(outer_level));
-  
-  int dim = 2*level-1;
-  int outer_dim = 2*outer_level-1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_tiled_loop = getStatements(lex, dim-1);
-  std::set<int> same_tile_controlling_loop = getStatements(lex, outer_dim-1);
-  
-  // special case for no tiling
-  if (tile_size == 0) {
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) {
-      Relation r(stmt[*i].xform.n_out(),stmt[*i].xform.n_out()+2);
-      F_And *f_root = r.add_and();
-      for (int j = 1; j <= 2*outer_level-1; j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.input_var(j), 1);
-        h.update_coef(r.output_var(j), -1);
-      }
-      EQ_Handle h1 = f_root->add_EQ();
-      h1.update_coef(r.output_var(2*outer_level), 1);
-      EQ_Handle h2 = f_root->add_EQ();
-      h2.update_coef(r.output_var(2*outer_level+1), 1);
-      for (int j = 2*outer_level; j <= stmt[*i].xform.n_out(); j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.input_var(j), 1);
-        h.update_coef(r.output_var(j+2), -1);
-      }
-      
-      stmt[*i].xform = Composition(copy(r), stmt[*i].xform);
-    }
-  }
-  // normal tiling
-  else {
-    std::set<int> private_stmt;
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) {
-//     if (same_tiled_loop.find(*i) == same_tiled_loop.end() && !is_single_iteration(getNewIS(*i), dim))
-//       same_tiled_loop.insert(*i);
-      
-      // should test dim's value directly but it is ok for now
-//    if (same_tiled_loop.find(*i) == same_tiled_loop.end() && get_const(stmt[*i].xform, dim+1, Output_Var) == posInfinity)
-      if (same_tiled_loop.find(*i) == same_tiled_loop.end() && overflow.find(*i) != overflow.end())
-        private_stmt.insert(*i);
-    }
-    
-    
-    // extract the union of the iteration space to be considered
-    Relation hull;
-    {
-      Tuple<Relation> r_list;
-      Tuple<int> r_mask;
-      
-      for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++)
-        if (private_stmt.find(*i) == private_stmt.end()) {
-          Relation r = project_onto_levels(getNewIS(*i), dim+1, true);
-          for (int j = outer_dim; j < dim; j++)
-            r = Project(r, j+1, Set_Var);
-          for (int j = 0; j < outer_dim; j += 2)
-            r = Project(r, j+1, Set_Var);
-          r_list.append(r);
-          r_mask.append(1);
-        }
-      
-      hull = Hull(r_list, r_mask, 1, true);
-    }
-    
-    // extract the bound of the dimension to be tiled
-    Relation bound = get_loop_bound(hull, dim);
-    if (!bound.has_single_conjunct()) {
-      // further simplify the bound
-      hull = Approximate(hull);
-      bound = get_loop_bound(hull, dim);
-      
-      int i = outer_dim - 2;
-      while (!bound.has_single_conjunct() && i >= 0) {
-        hull = Project(hull, i+1, Set_Var);
-        bound = get_loop_bound(hull, dim);
-        i -= 2;
-      }
-      
-      if (!bound.has_single_conjunct())
-        throw loop_error("cannot handle tile bounds");
-    }
-    
-    // separate lower and upper bounds
-    std::vector<GEQ_Handle> lb_list, ub_list;
-    {
-      Conjunct *c = bound.query_DNF()->single_conjunct();
-      for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-        int coef = (*gi).get_coef(bound.set_var(dim+1));
-        if (coef < 0)
-          ub_list.push_back(*gi);
-        else if (coef > 0)
-          lb_list.push_back(*gi);
-      }
-    }
-    if (lb_list.size() == 0)
-      throw loop_error("unable to calculate tile controlling loop lower bound");
-    if (ub_list.size() == 0)
-      throw loop_error("unable to calculate tile controlling loop upper bound");
-    
-    // find the simplest lower bound for StridedTile or simplest iteration count for CountedTile
-    int simplest_lb = 0, simplest_ub = 0;
-    if (method == StridedTile) {
-      int best_cost = INT_MAX;
-      for (int i = 0; i < lb_list.size(); i++) {
-        int cost = 0;
-        for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
-          switch ((*ci).var->kind()) {
-          case Input_Var: {
-            cost += 5;
-            break;
-          }
-          case Global_Var: {
-            cost += 2;
-            break;
-          }
-          default:
-            cost += 15;
-            break;
-          }
-        }
-        
-        if (cost < best_cost) {
-          best_cost = cost;
-          simplest_lb = i;
-        }
-      }
-    }
-    else if (method == CountedTile) {
-      std::map<Variable_ID, coef_t> s1, s2, s3;
-      int best_cost = INT_MAX;
-      for (int i = 0; i < lb_list.size(); i++) 
-        for (int j = 0; j < ub_list.size(); j++) {
-          int cost = 0;
-          
-          for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              s1[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Global_Var: {
-              s2[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Exists_Var:
-            case Wildcard_Var: {
-              s3[(*ci).var] += (*ci).coef;
-              break;
-            }
-            default:
-              cost = INT_MAX-2;
-              break;
-            }
-          }
-          
-          for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              s1[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Global_Var: {
-              s2[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Exists_Var:
-            case Wildcard_Var: {
-              s3[(*ci).var] += (*ci).coef;
-              break;
-            }
-            default:
-              if (cost == INT_MAX-2)
-                cost = INT_MAX-1;
-              else
-                cost = INT_MAX-3;
-              break;
-            }
-          }
-          
-          if (cost == 0) {
-            for (std::map<Variable_ID, coef_t>::iterator k = s1.begin(); k != s1.end(); k++)
-              if ((*k).second != 0)
-                cost += 5;
-            for (std::map<Variable_ID, coef_t>::iterator k = s2.begin(); k != s2.end(); k++)
-              if ((*k).second != 0)
-                cost += 2;
-            for (std::map<Variable_ID, coef_t>::iterator k = s3.begin(); k != s3.end(); k++)
-              if ((*k).second != 0)
-                cost += 15;
-          }
-          
-          if (cost < best_cost) {
-            best_cost = cost;
-            simplest_lb = i;
-            simplest_ub = j;
-          }
-        }
-    }
-    
-    // prepare the new transformation relations
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) {
-      Relation r(stmt[*i].xform.n_out(), stmt[*i].xform.n_out()+2);
-      F_And *f_root = r.add_and();
-      for (int j = 0; j < outer_dim-1; j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.output_var(j+1), 1);
-        h.update_coef(r.input_var(j+1), -1);
-      }
-      
-      for (int j = outer_dim-1; j < stmt[*i].xform.n_out(); j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.output_var(j+3), 1);
-        h.update_coef(r.input_var(j+1), -1);
-      }
-      
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(r.output_var(outer_dim), 1);
-      h.update_const(-lex[outer_dim-1]);
-      
-      stmt[*i].xform = Composition(r, stmt[*i].xform);
-    }
-    
-    // add tiling constraints.
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) {    
-      F_And *f_super_root = stmt[*i].xform.and_with_and();
-      F_Exists *f_exists = f_super_root->add_exists();
-      F_And *f_root = f_exists->add_and();
-      
-      // create a lower bound variable for easy formula creation later
-      Variable_ID aligned_lb;
-      {
-        Variable_ID lb = f_exists->declare();
-        coef_t coef = lb_list[simplest_lb].get_coef(bound.set_var(dim+1));
-        if (coef == 1) { // e.g. if i >= m+5, then LB = m+5
-          EQ_Handle h = f_root->add_EQ();
-          h.update_coef(lb, 1);
-          for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos != dim + 1)
-                h.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef);
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g, (*ci).var->function_of());
-              h.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h.update_const(lb_list[simplest_lb].get_const());
-        }
-        else { // e.g. if 2i >= m+5, then m+5 <= 2*LB < m+5+2
-          GEQ_Handle h1 = f_root->add_GEQ();
-          GEQ_Handle h2 = f_root->add_GEQ();
-          for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos == dim + 1) {
-                h1.update_coef(lb, (*ci).coef);
-                h2.update_coef(lb, -(*ci).coef);
-              }
-              else {
-                h1.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef);
-                h2.update_coef(stmt[*i].xform.output_var(pos), -(*ci).coef);
-              }
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g, (*ci).var->function_of());
-              h1.update_coef(v, (*ci).coef);
-              h2.update_coef(v, -(*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h1.update_const(lb_list[simplest_lb].get_const());
-          h2.update_const(-lb_list[simplest_lb].get_const());
-          h2.update_const(coef-1);
-        }
-        
-        Variable_ID offset_lb;
-        if (alignment_offset == 0)
-          offset_lb = lb;
-        else {
-          EQ_Handle h = f_root->add_EQ();
-          offset_lb = f_exists->declare();
-          h.update_coef(offset_lb, 1);
-          h.update_coef(lb, -1);
-          h.update_const(alignment_offset);
-        }
-        
-        if (alignment_multiple == 1) { // trivial
-          aligned_lb = offset_lb;
-        }
-        else { // e.g. to align at 4, aligned_lb = 4*alpha && LB-4 < 4*alpha <= LB
-          aligned_lb = f_exists->declare();
-          Variable_ID e = f_exists->declare();
-          
-          EQ_Handle h = f_root->add_EQ();
-          h.update_coef(aligned_lb, 1);
-          h.update_coef(e, -alignment_multiple);
-          
-          GEQ_Handle h1 = f_root->add_GEQ();
-          GEQ_Handle h2 = f_root->add_GEQ();
-          h1.update_coef(e, alignment_multiple);
-          h2.update_coef(e, -alignment_multiple);
-          h1.update_coef(offset_lb, -1);
-          h2.update_coef(offset_lb, 1);
-          h1.update_const(alignment_multiple-1);
-        }
-      }
-      
-      // create an upper bound variable for easy formula creation later
-      Variable_ID ub = f_exists->declare();
-      {
-        coef_t coef = -ub_list[simplest_ub].get_coef(bound.set_var(dim+1));
-        if (coef == 1) { // e.g. if i <= m+5, then UB = m+5
-          EQ_Handle h = f_root->add_EQ();
-          h.update_coef(ub, -1);
-          for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos != dim + 1)
-                h.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef);
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g, (*ci).var->function_of());
-              h.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h.update_const(ub_list[simplest_ub].get_const());
-        }
-        else { // e.g. if 2i <= m+5, then m+5-2 < 2*UB <= m+5
-          GEQ_Handle h1 = f_root->add_GEQ();
-          GEQ_Handle h2 = f_root->add_GEQ();
-          for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos == dim + 1) {
-                h1.update_coef(ub, -(*ci).coef);
-                h2.update_coef(ub, (*ci).coef);
-              }
-              else {
-                h1.update_coef(stmt[*i].xform.output_var(pos), -(*ci).coef);
-                h2.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef);
-              }
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g, (*ci).var->function_of());
-              h1.update_coef(v, -(*ci).coef);
-              h2.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h1.update_const(-ub_list[simplest_ub].get_const());
-          h2.update_const(ub_list[simplest_ub].get_const());
-          h1.update_const(coef-1);
-        }
-      }
-      
-      // insert tile controlling loop constraints
-      if (method == StridedTile) { // e.g. ii = LB + 32 * alpha && alpha >= 0
-        Variable_ID e = f_exists->declare();
-        GEQ_Handle h1 = f_root->add_GEQ();
-        h1.update_coef(e, 1);
-        
-        EQ_Handle h2 = f_root->add_EQ();
-        h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), 1);
-        h2.update_coef(e, -tile_size);
-        h2.update_coef(aligned_lb, -1);
-      }        
-      else if (method == CountedTile) { // e.g. 0 <= ii < ceiling((UB-LB+1)/32)
-        GEQ_Handle h1 = f_root->add_GEQ();
-        h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), 1);
-        
-        GEQ_Handle h2 = f_root->add_GEQ();
-        h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), -tile_size);
-        h2.update_coef(aligned_lb, -1);
-        h2.update_coef(ub, 1);
-      }
-      
-      // special care for private statements like overflow assignment
-      if (private_stmt.find(*i) != private_stmt.end()) { // e.g. ii <= UB
-        GEQ_Handle h = f_root->add_GEQ();
-        h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); 
-        h.update_coef(ub, 1);
-      }       
-      // if (private_stmt.find(*i) != private_stmt.end()) {
-      //   if (stmt[*i].xform.n_out() > dim+3) { // e.g. ii <= UB && i = ii
-      //     GEQ_Handle h = f_root->add_GEQ();
-      //     h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); 
-      //     h.update_coef(ub, 1);
-      
-      //     stmt[*i].xform = Project(stmt[*i].xform, dim+3, Output_Var);
-      //     f_root = stmt[*i].xform.and_with_and();
-      //     EQ_Handle h1 = f_root->add_EQ();
-      //     h1.update_coef(stmt[*i].xform.output_var(dim+3), 1);
-      //     h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
-      //   }
-      //   else if (method == StridedTile) { // e.g. ii <= UB since i does not exist
-      //     GEQ_Handle h = f_root->add_GEQ();
-      //     h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); 
-      //     h.update_coef(ub, 1);
-      //   } 
-      // }
-      
-      // restrict original loop index inside the tile
-      else {
-        if (method == StridedTile) { // e.g. ii <= i < ii + tile_size
-          GEQ_Handle h1 = f_root->add_GEQ();
-          h1.update_coef(stmt[*i].xform.output_var(dim+3), 1);
-          h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
-          
-          GEQ_Handle h2 = f_root->add_GEQ();
-          h2.update_coef(stmt[*i].xform.output_var(dim+3), -1);
-          h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), 1);
-          h2.update_const(tile_size-1);
-        }
-        else if (method == CountedTile) { // e.g. LB+32*ii <= i < LB+32*ii+tile_size
-          GEQ_Handle h1 = f_root->add_GEQ();
-          h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -tile_size);
-          h1.update_coef(stmt[*i].xform.output_var(dim+3), 1);
-          h1.update_coef(aligned_lb, -1);
-          
-          GEQ_Handle h2 = f_root->add_GEQ();
-          h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), tile_size);
-          h2.update_coef(stmt[*i].xform.output_var(dim+3), -1);
-          h2.update_const(tile_size-1);
-          h2.update_coef(aligned_lb, 1);          
-        }
-      }
-    }
-  }
-  
-  // update loop level information
-  for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) {
-    for (int j = 1; j <= stmt[*i].loop_level.size(); j++)
-      switch (stmt[*i].loop_level[j-1].type) {
-      case LoopLevelOriginal:
-        break;
-      case LoopLevelTile:
-        if (stmt[*i].loop_level[j-1].payload >= outer_level)
-          stmt[*i].loop_level[j-1].payload++;
-        break;
-      default:
-        throw loop_error("unknown loop level type for statement " + to_string(*i));
-      }
-    
-    LoopLevel ll;
-    ll.type = LoopLevelTile;
-    ll.payload = level+1;
-    ll.parallel_level = 0;
-    stmt[*i].loop_level.insert(stmt[*i].loop_level.begin()+(outer_level-1), ll);
-  }
-}
-
-
-
-std::set<int> Loop::unroll(int stmt_num, int level, int unroll_amount) {
-  // check for sanity of parameters
-  if (unroll_amount < 0)
-    throw std::invalid_argument("invalid unroll amount " + to_string(unroll_amount));
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  
-  int dim = 2*level - 1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_loop = getStatements(lex, dim-1);
-  
-  // nothing to do
-  if (unroll_amount == 1)
-    return std::set<int>();
-  
-  // extract the intersection of the iteration space to be considered
-  Relation hull = Relation::True(level);
-  apply_xform(same_loop);
-  for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-    if (stmt[*i].IS.is_upper_bound_satisfiable()) {
-      Relation mapping(stmt[*i].IS.n_set(), level);
-      F_And *f_root = mapping.add_and();
-      for (int j = 1; j <= level; j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(mapping.input_var(j), 1);
-        h.update_coef(mapping.output_var(j), -1);
-      }
-      hull = Intersection(hull, Range(Restrict_Domain(mapping, copy(stmt[*i].IS))));
-      hull.simplify(2, 4);
-    }
-  }
-  for (int i = 1; i <= level; i++) {
-    std::string name = tmp_loop_var_name_prefix + to_string(i);
-    hull.name_set_var(i, name);
-  }
-  hull.setup_names();
-  
-  // extract the exact loop bound of the dimension to be unrolled
-  if (is_single_loop_iteration(hull, level, this->known))
-    return std::set<int>();
-  Relation bound = get_loop_bound(hull, level, this->known);
-  if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology())
-    throw loop_error("unable to extract loop bound for unrolling");
-  
-  // extract the loop stride
-  EQ_Handle stride_eq;
-  int stride = 1;
-  {
-    bool simple_stride = true;
-    int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(level), stride_eq, simple_stride);
-    if (strides > 1)
-      throw loop_error("too many strides");
-    else if (strides == 1) {
-      int sign = stride_eq.get_coef(bound.set_var(level));
-      Constr_Vars_Iter it(stride_eq, true);
-      stride = abs((*it).coef/sign);
-    }
-  }
-  
-  // separate lower and upper bounds
-  std::vector<GEQ_Handle> lb_list, ub_list;
-  {
-    Conjunct *c = bound.query_DNF()->single_conjunct();
-    for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-      int coef = (*gi).get_coef(bound.set_var(level));
-      if (coef < 0)
-        ub_list.push_back(*gi);
-      else if (coef > 0)
-        lb_list.push_back(*gi);
-    }
-  }  
-  
-  // simplify overflow expression for each pair of upper and lower bounds
-  std::vector<std::vector<std::map<Variable_ID, int> > > overflow_table(lb_list.size(), std::vector<std::map<Variable_ID, int> >(ub_list.size(), std::map<Variable_ID, int>()));
-  bool is_overflow_simplifiable = true;
-  for (int i = 0; i < lb_list.size(); i++) {
-    if (!is_overflow_simplifiable)
-      break;
-    
-    for (int j = 0; j < ub_list.size(); j++) {
-      // lower bound or upper bound has non-unit coefficient, can't simplify
-      if (ub_list[j].get_coef(bound.set_var(level)) != -1 || lb_list[i].get_coef(bound.set_var(level)) != 1) {
-        is_overflow_simplifiable = false;
-        break;
-      }
-      
-      for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) {
-        switch((*ci).var->kind()) {
-        case Input_Var:
-        {
-          if ((*ci).var != bound.set_var(level))
-            overflow_table[i][j][(*ci).var] += (*ci).coef;
-          
-          break;
-        }
-        case Global_Var:
-        {
-          Global_Var_ID g = (*ci).var->get_global_var();
-          Variable_ID v;
-          if (g->arity() == 0)
-            v = bound.get_local(g);
-          else
-            v = bound.get_local(g, (*ci).var->function_of());
-          overflow_table[i][j][(*ci).var] += (*ci).coef;
-          break;
-        }
-        default:
-          throw loop_error("failed to calculate overflow amount");
-        }
-      }
-      overflow_table[i][j][NULL] += ub_list[j].get_const();
-      
-      for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
-        switch((*ci).var->kind()) {
-        case Input_Var:
-        {
-          if ((*ci).var != bound.set_var(level)) {
-            overflow_table[i][j][(*ci).var] += (*ci).coef;
-            if (overflow_table[i][j][(*ci).var] == 0)
-              overflow_table[i][j].erase(overflow_table[i][j].find((*ci).var));
-          }
-          break;
-        }
-        case Global_Var:
-        {
-          Global_Var_ID g = (*ci).var->get_global_var();
-          Variable_ID v;
-          if (g->arity() == 0)
-            v = bound.get_local(g);
-          else
-            v = bound.get_local(g, (*ci).var->function_of());
-          overflow_table[i][j][(*ci).var] += (*ci).coef;
-          if (overflow_table[i][j][(*ci).var] == 0)
-            overflow_table[i][j].erase(overflow_table[i][j].find((*ci).var));
-          break;
-        }
-        default:
-          throw loop_error("failed to calculate overflow amount");
-        }
-      }
-      overflow_table[i][j][NULL] += lb_list[i].get_const();
-      
-      overflow_table[i][j][NULL] += stride;
-      if (unroll_amount == 0 || (overflow_table[i][j].size() == 1 && overflow_table[i][j][NULL]/stride < unroll_amount))
-        unroll_amount = overflow_table[i][j][NULL]/stride;
-    }
-  }
-  
-  // loop iteration count can't be determined, bail out gracefully
-  if (unroll_amount == 0)
-    return std::set<int>();
-  
-  // further simply overflow calculation using coefficients' modular
-  if (is_overflow_simplifiable) {
-    for (int i = 0; i < lb_list.size(); i++)
-      for (int j = 0; j < ub_list.size(); j++)
-        if (stride == 1) {
-          for (std::map<Variable_ID, int>::iterator k = overflow_table[i][j].begin(); k != overflow_table[i][j].end(); )
-            if ((*k).first != NULL) {
-              int t = int_mod_hat((*k).second, unroll_amount);
-              if (t == 0) {
-                overflow_table[i][j].erase(k++);
-              }
-              else {
-                int t2 = hull.query_variable_mod((*k).first, unroll_amount);
-                if (t2 != INT_MAX) {
-                  overflow_table[i][j][NULL] += t * t2;
-                  overflow_table[i][j].erase(k++);
-                }
-                else {
-                  (*k).second = t;
-                  k++;
-                }
-              }
-            }
-            else
-              k++;
-          
-          overflow_table[i][j][NULL] = int_mod_hat(overflow_table[i][j][NULL], unroll_amount);
-          
-          // Since we don't have MODULO instruction in SUIF yet (only MOD), make all coef positive in the final formula
-          for (std::map<Variable_ID, int>::iterator k = overflow_table[i][j].begin(); k != overflow_table[i][j].end(); k++)
-            if ((*k).second < 0)
-              (*k).second += unroll_amount;
-        }
-  }
-  
-  
-  // build overflow statement
-  CG_outputBuilder *ocg = ir->builder();
-  CG_outputRepr *overflow_code = NULL;
-  Relation cond_upper(level), cond_lower(level);
-  Relation overflow_constraint(0);
-  F_And *overflow_constraint_root = overflow_constraint.add_and();
-  std::vector<Free_Var_Decl *> over_var_list;
-  if (is_overflow_simplifiable && lb_list.size() == 1) {
-    for (int i = 0; i < ub_list.size(); i++) {
-      if (overflow_table[0][i].size() == 1) {
-        // upper splitting condition
-        GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-        h.update_const(((overflow_table[0][i][NULL]/stride)%unroll_amount) * -stride);
-      }
-      else {
-        // upper splitting condition
-        std::string over_name = overflow_var_name_prefix + to_string(overflow_var_name_counter++);
-        Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-        over_var_list.push_back(over_free_var);
-        GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-        h.update_coef(cond_upper.get_local(over_free_var), -stride);
-        
-        // insert constraint 0 <= overflow < unroll_amount
-        Variable_ID v = overflow_constraint.get_local(over_free_var);
-        GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-        h1.update_coef(v, 1);
-        GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-        h2.update_coef(v, -1);
-        h2.update_const(unroll_amount-1);
-        
-        // create overflow assignment
-        bound.setup_names();
-        CG_outputRepr *rhs = NULL;
-        for (std::map<Variable_ID, int>::iterator j = overflow_table[0][i].begin(); j != overflow_table[0][i].end(); j++)
-          if ((*j).first != NULL) {
-            CG_outputRepr *t = ocg->CreateIdent((*j).first->name());
-            if ((*j).second != 1)
-              t = ocg->CreateTimes(ocg->CreateInt((*j).second), t);
-            rhs = ocg->CreatePlus(rhs, t);
-          }
-          else
-            if ((*j).second != 0)
-              rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second));
-        
-        if (stride != 1)
-          rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride));
-        rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-        
-        CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-        init_code = ocg->StmtListAppend(init_code, ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-        lhs = ocg->CreateIdent(over_name);
-        overflow_code = ocg->StmtListAppend(overflow_code, ocg->CreateAssignment(0, lhs, rhs));
-      }
-    }
-    
-    // lower splitting condition
-    GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[0]);
-  }
-  else if (is_overflow_simplifiable && ub_list.size() == 1) {
-    for (int i = 0; i < lb_list.size(); i++) {
-      
-      if (overflow_table[i][0].size() == 1) {
-        // lower splitting condition
-        GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]);
-        h.update_const(overflow_table[i][0][NULL] * -stride);
-      }
-      else {
-        // lower splitting condition
-        std::string over_name = overflow_var_name_prefix + to_string(overflow_var_name_counter++);
-        Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-        over_var_list.push_back(over_free_var);
-        GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]);
-        h.update_coef(cond_lower.get_local(over_free_var), -stride);
-        
-        // insert constraint 0 <= overflow < unroll_amount
-        Variable_ID v = overflow_constraint.get_local(over_free_var);
-        GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-        h1.update_coef(v, 1);
-        GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-        h2.update_coef(v, -1);
-        h2.update_const(unroll_amount-1);
-        
-        // create overflow assignment
-        bound.setup_names();
-        CG_outputRepr *rhs = NULL;
-        for (std::map<Variable_ID, int>::iterator j = overflow_table[0][i].begin(); j != overflow_table[0][i].end(); j++)
-          if ((*j).first != NULL) {
-            CG_outputRepr *t = ocg->CreateIdent((*j).first->name());
-            if ((*j).second != 1)
-              t = ocg->CreateTimes(ocg->CreateInt((*j).second), t);
-            rhs = ocg->CreatePlus(rhs, t);
-          }
-          else
-            if ((*j).second != 0)
-              rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second));
-        
-        if (stride != 1)
-          rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride));
-        rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-        
-        CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-        init_code = ocg->StmtListAppend(init_code, ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-        lhs = ocg->CreateIdent(over_name);
-        overflow_code = ocg->StmtListAppend(overflow_code, ocg->CreateAssignment(0, lhs, rhs));
-      }
-    }
-    
-    // upper splitting condition
-    GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[0]);
-  }
-  else {
-    std::string over_name = overflow_var_name_prefix + to_string(overflow_var_name_counter++);
-    Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-    over_var_list.push_back(over_free_var);
-    
-    Tuple<CG_outputRepr *> lb_repr_list, ub_repr_list;
-    for (int i = 0; i < lb_list.size(); i++) {
-      //lb_repr_list.append(outputLBasRepr(ocg, lb_list[i], bound, bound.set_var(dim+1), stride, stride_eq, Relation::True(bound.n_set()), std::vector<CG_outputRepr *>(bound.n_set(), NULL)));
-      lb_repr_list.append(outputLBasRepr(ocg, lb_list[i], bound, bound.set_var(dim+1), stride, stride_eq, Relation::True(bound.n_set()), std::vector<CG_outputRepr *>(bound.n_set())));
-      GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]);
-    }
-    for (int i = 0; i < ub_list.size(); i++) {
-      //ub_repr_list.append(outputUBasRepr(ocg, ub_list[i], bound, bound.set_var(dim+1), stride, stride_eq, std::vector<CG_outputRepr *>(bound.n_set(), NULL)));
-      ub_repr_list.append(outputUBasRepr(ocg, ub_list[i], bound, bound.set_var(dim+1), stride, stride_eq, std::vector<CG_outputRepr *>(bound.n_set())));
-      GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-      h.update_coef(cond_upper.get_local(over_free_var), -stride);
-    }
-    
-    CG_outputRepr *lbRepr, *ubRepr;
-    if (lb_repr_list.size() > 1)
-      lbRepr = ocg->CreateInvoke("max", lb_repr_list);
-    else if (lb_repr_list.size() == 1)
-      lbRepr = lb_repr_list[1];
-    
-    if (ub_repr_list.size() > 1)
-      ubRepr = ocg->CreateInvoke("min", ub_repr_list);
-    else if (ub_repr_list.size() == 1)
-      ubRepr = ub_repr_list[1];
-    
-    // create overflow assignment
-    bound.setup_names();
-    CG_outputRepr *rhs = ocg->CreatePlus(ocg->CreateMinus(ubRepr, lbRepr), ocg->CreateInt(1));
-    if (stride != 1)
-      rhs = ocg->CreateIntegerDivide(rhs, ocg->CreateInt(stride));
-    rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-    CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-    init_code = ocg->StmtListAppend(init_code, ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-    lhs = ocg->CreateIdent(over_name);
-    overflow_code = ocg->CreateAssignment(0, lhs, rhs);
-    
-    // insert constraint 0 <= overflow < unroll_amount
-    Variable_ID v = overflow_constraint.get_local(over_free_var);
-    GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-    h1.update_coef(v, 1);
-    GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-    h2.update_coef(v, -1);
-    h2.update_const(unroll_amount-1);
-  }
-  
-  // insert overflow statement
-  int overflow_stmt_num = -1;
-  if (overflow_code != NULL) {
-    // build iteration space for overflow statement
-    Relation mapping(level, level-1);
-    F_And *f_root = mapping.add_and();
-    for (int i = 1; i < level; i++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(i), 1);
-      h.update_coef(mapping.input_var(i), -1);
-    }
-    Relation overflow_IS = Range(Restrict_Domain(mapping, copy(hull)));
-    for (int i = 1; i < level; i++)
-      overflow_IS.name_set_var(i, hull.set_var(i)->name());
-    overflow_IS.setup_names();  
-    
-    // build dumb transformation relation for overflow statement
-    Relation overflow_xform(level-1, 2*(level-1)+1);
-    f_root = overflow_xform.add_and();
-    for (int i = 1; i <= level-1; i++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(overflow_xform.output_var(2*i), 1);
-      h.update_coef(overflow_xform.input_var(i), -1);
-      
-      h = f_root->add_EQ();
-      h.update_coef(overflow_xform.output_var(2*i-1), 1);
-      h.update_const(-lex[2*i-2]);
-    }
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(overflow_xform.output_var(2*(level-1)+1), 1);
-    h.update_const(-lex[2*(level-1)]);
-    
-    shiftLexicalOrder(lex, dim-1, 1);
-    Statement overflow_stmt;
-    overflow_stmt.code = overflow_code;
-    overflow_stmt.IS = overflow_IS;
-    overflow_stmt.xform = overflow_xform;
-    overflow_stmt.loop_level = std::vector<LoopLevel>(level-1);
-    for (int i = 0; i < level-1; i++) {
-      overflow_stmt.loop_level[i].type = stmt[stmt_num].loop_level[i].type;
-      if (stmt[stmt_num].loop_level[i].type == LoopLevelTile &&
-          stmt[stmt_num].loop_level[i].payload >= level)
-        overflow_stmt.loop_level[i].payload = -1;
-      else
-        overflow_stmt.loop_level[i].payload = stmt[stmt_num].loop_level[i].payload;
-      overflow_stmt.loop_level[i].parallel_level = stmt[stmt_num].loop_level[i].parallel_level;
-    }
-    stmt.push_back(overflow_stmt);
-    dep.insert();
-    overflow_stmt_num = stmt.size() - 1;
-    overflow[overflow_stmt_num] = over_var_list;
-    
-    // update the global known information on overflow variable
-    this->known = Intersection(this->known, Extend_Set(copy(overflow_constraint), this->known.n_set()-overflow_constraint.n_set()));
-    
-    // update dependence graph
-    DependenceVector dv;
-    dv.type = DEP_CONTROL;
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      dep.connect(overflow_stmt_num, *i, dv);
-    dv.type = DEP_W2W;
-    {
-      IR_ScalarSymbol *overflow_sym = NULL;
-      std::vector<IR_ScalarRef *> scalars = ir->FindScalarRef(overflow_code);
-      for (int i = scalars.size()-1; i >=0; i--)
-        if (scalars[i]->is_write()) {
-          overflow_sym = scalars[i]->symbol();
-          break;
-        }
-      for (int i = scalars.size()-1; i >=0; i--)
-        delete scalars[i];
-      dv.sym = overflow_sym;
-    }
-    dv.lbounds = std::vector<coef_t>(num_dep_dim, 0);
-    dv.ubounds = std::vector<coef_t>(num_dep_dim, 0);
-    int dep_dim = get_last_dep_dim_before(stmt_num, level);
-    for (int i = dep_dim + 1; i < num_dep_dim; i++) {
-      dv.lbounds[i] = -posInfinity;
-      dv.ubounds[i] = posInfinity;
-    }
-    for (int i = 0; i <= dep_dim; i++) {
-      if (i != 0) {
-        dv.lbounds[i-1] = 0;
-        dv.ubounds[i-1] = 0;
-      }
-      dv.lbounds[i] = 1;
-      dv.ubounds[i] = posInfinity;
-      dep.connect(overflow_stmt_num, overflow_stmt_num, dv);
-    }
-  }
-  
-  // split the loop so it can be fully unrolled
-  std::set<int> result = split(stmt_num, level, cond_upper);
-  std::set<int> result2 = split(stmt_num, level, cond_lower);
-  for (std::set<int>::iterator i = result2.begin(); i != result2.end(); i++)
-    result.insert(*i);
-  
-  // check if unrolled statements can be trivially lumped together as one statement
-  bool can_be_lumped = true;
-  if (can_be_lumped) {
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      if (*i != stmt_num) {
-        if (stmt[*i].loop_level.size() != stmt[stmt_num].loop_level.size()) {
-          can_be_lumped = false;
-          break;
-        }
-        for (int j = 0; j < stmt[stmt_num].loop_level.size(); j++)
-          if (!(stmt[*i].loop_level[j].type == stmt[stmt_num].loop_level[j].type &&
-                stmt[*i].loop_level[j].payload == stmt[stmt_num].loop_level[j].payload)) {
-            can_be_lumped = false;
-            break;
-          }
-        if (!can_be_lumped)
-          break;
-        std::vector<int> lex2 = getLexicalOrder(*i);
-        for (int j = 2*level; j < lex.size()-1; j+=2)
-          if (lex[j] != lex2[j]) {
-            can_be_lumped = false;
-            break;
-          }
-        if (!can_be_lumped)
-          break;
-      }
-  }
-  if (can_be_lumped) {
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      if (is_inner_loop_depend_on_level(stmt[*i].IS, level, known)) {
-        can_be_lumped = false;
-        break;
-      }
-  }
-  if (can_be_lumped) {
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      if (*i != stmt_num) {
-        if (!(Must_Be_Subset(copy(stmt[*i].IS), copy(stmt[stmt_num].IS)) && Must_Be_Subset(copy(stmt[stmt_num].IS), copy(stmt[*i].IS)))) {
-          can_be_lumped = false;
-          break;
-        }
-      }
-  }    
-  if (can_be_lumped) {
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-      for (DependenceGraph::EdgeList::iterator j = dep.vertex[*i].second.begin(); j != dep.vertex[*i].second.end(); j++)
-        if (same_loop.find(j->first) != same_loop.end()) {        
-          for (int k = 0; k < j->second.size(); k++)
-            if (j->second[k].type == DEP_CONTROL || j->second[k].type == DEP_UNKNOWN) {
-              can_be_lumped = false;
-              break;
-            }
-          if (!can_be_lumped)
-            break;
-        }
-      if (!can_be_lumped)
-        break;
-    }
-  }
-  
-  
-  // add strides to original statements
-  // for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-  //   add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride);
-  
-  
-  // std::vector<Free_Var_Decl *> depending_overflow_var;
-  // for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-  //   add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride);
-  //   if (overflow.find(*i) != overflow.end()) {
-  //     // TO DO: It should check whether overflow vaiable depends on
-  //     // this loop index and by how much.  This step is important if
-  //     // you want to unroll loops in arbitrary order.
-  //     depending_overflow_var.insert(depending_overflow_var.end(), overflow[*i].begin(), overflow[*i].end());
-  
-  //     continue;
-  //   }
-  // }
-  
-  
-  
-//   std::map<int, std::vector<Statement> > pending;
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-//     add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride);
-  
-//     if (overflow.find(*i) != overflow.end()) {
-//       // TO DO: It should check whether overflow vaiable depends on
-//       // this loop index and by how much.  This step is important if
-//       // you want to unroll loops in arbitrary order.
-//       depending_overflow_var.insert(depending_overflow_var.end(), overflow[*i].begin(), overflow[*i].end());
-  
-//       continue;
-//     }
-  
-//     // create copy for each unroll amount
-//     for (int j = 1; j < unroll_amount; j++) {
-//       Tuple<CG_outputRepr *> funcList;
-//       Tuple<std::string> loop_vars;
-//       loop_vars.append(stmt[*i].IS.set_var((dim+1)/2)->name());
-//       funcList.append(ocg->CreatePlus(ocg->CreateIdent(stmt[*i].IS.set_var(level)->name()), ocg->CreateInt(j*stride)));
-//       CG_outputRepr *code = ocg->CreatePlaceHolder(0, stmt[*i].code->clone(), funcList, loop_vars);
-  
-//       // prepare the new statment to insert
-//       Statement unrolled_stmt;
-//       unrolled_stmt.IS = copy(stmt[*i].IS);
-// //      adjust_loop_bound(unrolled_stmt.IS, (dim-1)/2, j);
-//       unrolled_stmt.xform = copy(stmt[*i].xform);
-//       unrolled_stmt.code = code;
-//       unrolled_stmt.loop_level = stmt[*i].loop_level;
-//       pending[*i].push_back(unrolled_stmt);
-//     }
-//   }
-  
-//   // adjust iteration space due to loop bounds depending on this loop
-//   // index and affected overflow variables
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-//     for (int j = 0; j < pending[*i].size(); j++) {
-//       adjust_loop_bound(pending[*i][j].IS, (dim-1)/2, j+1, depending_overflow_var);
-//       //pending[*i][j].IS = Intersection(pending[*i][j].IS, Extend_Set(copy(this->known), pending[*i][j].IS.n_set() - this->known.n_set()));
-//     }
-//   }
-  
-  // insert unrolled statements
-  int old_num_stmt = stmt.size();
-  if (!can_be_lumped) {
-    std::map<int, std::vector<int> > what_stmt_num;
-    
-    for (int j = 1; j < unroll_amount; j++) {
-      for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-        Statement new_stmt;
-        
-        Tuple<CG_outputRepr *> funcList;
-        Tuple<std::string> loop_vars;
-        loop_vars.append(stmt[*i].IS.set_var(level)->name());
-        funcList.append(ocg->CreatePlus(ocg->CreateIdent(stmt[*i].IS.set_var(level)->name()), ocg->CreateInt(j*stride)));
-        new_stmt.code = ocg->CreatePlaceHolder(0, stmt[*i].code->clone(), funcList, loop_vars);
-        
-        new_stmt.IS = adjust_loop_bound(stmt[*i].IS, level, j * stride);
-        add_loop_stride(new_stmt.IS, bound, level-1, unroll_amount * stride);
-        
-        new_stmt.xform = copy(stmt[*i].xform);
-        new_stmt.loop_level = stmt[*i].loop_level;
-        stmt.push_back(new_stmt);
-        dep.insert();
-        what_stmt_num[*i].push_back(stmt.size() - 1);
-      }
-    }
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride);      
-    
-    
-    // update dependence graph
-    if (stmt[stmt_num].loop_level[level-1].type == LoopLevelOriginal) {
-      int dep_dim = stmt[stmt_num].loop_level[level-1].payload;
-      int new_stride = unroll_amount * stride;
-      for (int i = 0; i < old_num_stmt; i++) {
-        std::vector<std::pair<int, DependenceVector> > D;
-        
-        for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); ) {
-          if (same_loop.find(i) != same_loop.end()) {
-            if (same_loop.find(j->first) != same_loop.end()) {
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.type == DEP_CONTROL || dv.type == DEP_UNKNOWN) {
-                  D.push_back(std::make_pair(j->first, dv));
-                  for (int kk = 0; kk < unroll_amount - 1; kk++)
-                    if (what_stmt_num[i][kk] != -1 && what_stmt_num[j->first][kk] != -1)
-                      dep.connect(what_stmt_num[i][kk], what_stmt_num[j->first][kk], dv);
-                }
-                else {
-                  coef_t lb = dv.lbounds[dep_dim];
-                  coef_t ub = dv.ubounds[dep_dim];
-                  if (ub == lb && int_mod(lb, static_cast<coef_t>(new_stride)) == 0) {
-                    D.push_back(std::make_pair(j->first, dv));
-                    for (int kk = 0; kk < unroll_amount - 1; kk++)
-                      if (what_stmt_num[i][kk] != -1 && what_stmt_num[j->first][kk] != -1)
-                        dep.connect(what_stmt_num[i][kk], what_stmt_num[j->first][kk], dv);
-                  }
-                  else if (lb == -posInfinity && ub == posInfinity) {
-                    D.push_back(std::make_pair(j->first, dv));
-                    for (int kk = 0; kk < unroll_amount; kk++)
-                      if (kk == 0)
-                        D.push_back(std::make_pair(j->first, dv));
-                      else if (what_stmt_num[j->first][kk-1] != -1)
-                        D.push_back(std::make_pair(what_stmt_num[j->first][kk-1], dv));
-                    for (int t = 0; t < unroll_amount - 1; t++)
-                      if (what_stmt_num[i][t] != -1)
-                        for (int kk = 0; kk < unroll_amount; kk++)
-                          if (kk == 0)
-                            dep.connect(what_stmt_num[i][t], j->first, dv);
-                          else if (what_stmt_num[j->first][kk-1] != -1)
-                            dep.connect(what_stmt_num[i][t], what_stmt_num[j->first][kk-1], dv);
-                  }
-                  else {
-                    for (int kk = 0; kk < unroll_amount; kk++) {
-                      if (lb != -posInfinity) {
-                        if (kk * stride < int_mod(lb, static_cast<coef_t>(new_stride)))
-                          dv.lbounds[dep_dim] = floor(static_cast<double>(lb)/new_stride) * new_stride + new_stride;
-                        else
-                          dv.lbounds[dep_dim] = floor(static_cast<double>(lb)/new_stride) * new_stride;
-                      }
-                      if (ub != posInfinity) {
-                        if (kk * stride > int_mod(ub, static_cast<coef_t>(new_stride)))
-                          dv.ubounds[dep_dim] = floor(static_cast<double>(ub)/new_stride) * new_stride - new_stride;
-                        else
-                          dv.ubounds[dep_dim] = floor(static_cast<double>(ub)/new_stride) * new_stride;
-                      }
-                      if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) {
-                        if (kk == 0)
-                          D.push_back(std::make_pair(j->first, dv));
-                        else if (what_stmt_num[j->first][kk-1] != -1)
-                          D.push_back(std::make_pair(what_stmt_num[j->first][kk-1], dv));
-                      }
-                    }
-                    for (int t = 0; t < unroll_amount-1; t++)
-                      if (what_stmt_num[i][t] != -1)
-                        for (int kk = 0; kk < unroll_amount; kk++) {
-                          if (lb != -posInfinity) {
-                            if (kk * stride < int_mod(lb+t+1, static_cast<coef_t>(new_stride)))
-                              dv.lbounds[dep_dim] = floor(static_cast<double>(lb+(t+1)*stride)/new_stride) * new_stride + new_stride;
-                            else
-                              dv.lbounds[dep_dim] = floor(static_cast<double>(lb+(t+1)*stride)/new_stride) * new_stride;
-                          }
-                          if (ub != posInfinity) {
-                            if (kk * stride > int_mod(ub+t+1, static_cast<coef_t>(new_stride)))
-                              dv.ubounds[dep_dim] = floor(static_cast<double>(ub+(t+1)*stride)/new_stride) * new_stride - new_stride;
-                            else
-                              dv.ubounds[dep_dim] = floor(static_cast<double>(ub+(t+1)*stride)/new_stride) * new_stride;
-                          }
-                          if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) {
-                            if (kk == 0)
-                              dep.connect(what_stmt_num[i][t], j->first, dv);
-                            else if (what_stmt_num[j->first][kk-1] != -1)
-                              dep.connect(what_stmt_num[i][t], what_stmt_num[j->first][kk-1], dv);
-                          }
-                        }
-                  }
-                }
-              }
-              
-              dep.vertex[i].second.erase(j++);
-            }
-            else {
-              for (int kk = 0; kk < unroll_amount - 1; kk++)
-                if (what_stmt_num[i][kk] != -1)
-                  dep.connect(what_stmt_num[i][kk], j->first, j->second);
-              
-              j++;
-            }
-          }
-          else {
-            if (same_loop.find(j->first) != same_loop.end())
-              for (int k = 0; k < j->second.size(); k++)
-                for (int kk = 0; kk < unroll_amount - 1; kk++)
-                  if (what_stmt_num[j->first][kk] != -1)
-                    D.push_back(std::make_pair(what_stmt_num[j->first][kk], j->second[k]));
-            j++;
-          }
-        }
-        
-        for (int j = 0; j < D.size(); j++)
-          dep.connect(i, D[j].first, D[j].second);        
-      }
-    }
-    
-    // reset lexical order for the unrolled loop body
-    std::set<int> new_same_loop;
-    for (std::map<int, std::vector<int> >::iterator i = what_stmt_num.begin(); i != what_stmt_num.end(); i++) {
-      new_same_loop.insert(i->first);
-      for (int j = 0; j < i->second.size(); j++)
-        new_same_loop.insert(i->second[j]);
-    }
-    setLexicalOrder(dim+1, new_same_loop);
-  }
-  else {
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride);
-    
-    int max_level = stmt[stmt_num].loop_level.size();
-    std::vector<std::pair<int, int> > stmt_order;
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      stmt_order.push_back(std::make_pair(get_const(stmt[*i].xform, 2*max_level, Output_Var), *i));
-    sort(stmt_order.begin(), stmt_order.end());
-    
-    Statement new_stmt;
-    new_stmt.code = NULL;
-    for (int j = 1; j < unroll_amount; j++)
-      for (int i = 0; i < stmt_order.size(); i++) {
-        Tuple<CG_outputRepr *> funcList;
-        Tuple<std::string> loop_vars;
-        loop_vars.append(stmt[stmt_order[i].second].IS.set_var(level)->name());
-        funcList.append(ocg->CreatePlus(ocg->CreateIdent(stmt[stmt_order[i].second].IS.set_var(level)->name()), ocg->CreateInt(j*stride)));
-        CG_outputRepr *code = ocg->CreatePlaceHolder(0, stmt[stmt_order[i].second].code->clone(), funcList, loop_vars);
-        new_stmt.code = ocg->StmtListAppend(new_stmt.code, code);
-      }
-    
-    new_stmt.IS = copy(stmt[stmt_num].IS);
-    new_stmt.xform = copy(stmt[stmt_num].xform);
-    assign_const(new_stmt.xform, 2*max_level, stmt_order[stmt_order.size()-1].first+1);
-    new_stmt.loop_level = stmt[stmt_num].loop_level;
-    stmt.push_back(new_stmt);
-    dep.insert();
-    
-    // update dependence graph
-    if (stmt[stmt_num].loop_level[level-1].type == LoopLevelOriginal) {
-      int dep_dim = stmt[stmt_num].loop_level[level-1].payload;
-      int new_stride = unroll_amount * stride;
-      for (int i = 0; i < old_num_stmt; i++) {
-        std::vector<std::pair<int, std::vector<DependenceVector> > > D;
-        
-        for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); ) {
-          if (same_loop.find(i) != same_loop.end()) {
-            if (same_loop.find(j->first) != same_loop.end()) {
-              std::vector<DependenceVector> dvs11, dvs12, dvs22, dvs21;
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.type == DEP_CONTROL || dv.type == DEP_UNKNOWN) {
-                  if (i == j->first) {
-                    dvs11.push_back(dv);
-                    dvs22.push_back(dv);
-                  }
-                  else
-                    throw loop_error("unrolled statements lumped together illegally");
-                }
-                else {
-                  coef_t lb = dv.lbounds[dep_dim];
-                  coef_t ub = dv.ubounds[dep_dim];
-                  if (ub == lb && int_mod(lb, static_cast<coef_t>(new_stride)) == 0) {
-                    dvs11.push_back(dv);
-                    dvs22.push_back(dv);
-                  }
-                  else {
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = ceil(static_cast<double>(lb)/new_stride) * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = floor(static_cast<double>(ub)/new_stride) * new_stride;
-                    if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                      dvs11.push_back(dv);
-                    
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = ceil(static_cast<double>(lb)/new_stride) * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = ceil(static_cast<double>(ub)/new_stride) * new_stride;
-                    if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                      dvs21.push_back(dv);
-                    
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = floor(static_cast<double>(lb)/new_stride) * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = floor(static_cast<double>(ub-stride)/new_stride) * new_stride;
-                    if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                      dvs12.push_back(dv);
-                    
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = floor(static_cast<double>(lb)/new_stride) * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = ceil(static_cast<double>(ub-stride)/new_stride) * new_stride;
-                    if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                      dvs22.push_back(dv);
-                  }
-                }
-              }
-              if (dvs11.size() > 0)
-                D.push_back(std::make_pair(i, dvs11));
-              if (dvs22.size() > 0)
-                dep.connect(old_num_stmt, old_num_stmt, dvs22);
-              if (dvs12.size() > 0)
-                D.push_back(std::make_pair(old_num_stmt, dvs12));
-              if (dvs21.size() > 0)
-                dep.connect(old_num_stmt, i, dvs21);
-              
-              dep.vertex[i].second.erase(j++);
-            }
-            else {
-              dep.connect(old_num_stmt, j->first, j->second);
-              j++;
-            }
-          }
-          else {
-            if (same_loop.find(j->first) != same_loop.end()) 
-              D.push_back(std::make_pair(old_num_stmt, j->second));
-            j++;
-          }
-        }
-        
-        for (int j = 0; j < D.size(); j++)
-          dep.connect(i, D[j].first, D[j].second);
-      }
-    }
-  }
-  
-  return result;
-}
-
-
-std::vector<int> Loop::getLexicalOrder(int stmt_num) const {
-  assert(stmt_num < stmt.size());
-  
-  const int n = stmt[stmt_num].xform.n_out();
-  std::vector<int> lex(n,0);
-  
-  for (int i = 0; i < n; i += 2)
-    lex[i] = get_const(stmt[stmt_num].xform, i, Output_Var);
-  
-  return lex;
-}
-
-std::set<int> Loop::getStatements(const std::vector<int> &lex, int dim) const {
-  const int m = stmt.size();
-  
-  std::set<int> same_loops;
-  for (int i = 0; i < m; i++) {
-    if (dim < 0)
-      same_loops.insert(i);
-    else {
-      std::vector<int> a_lex = getLexicalOrder(i);
-      int j;
-      for (j = 0; j <= dim; j+=2)
-        if (lex[j] != a_lex[j])
-          break;
-      if (j > dim)
-        same_loops.insert(i);
-    }
-  }
-  
-  return same_loops;
-}
-
-
-void Loop::shiftLexicalOrder(const std::vector<int> &lex, int dim, int amount) {
-  const int m = stmt.size();
-  
-  if (amount == 0)
-    return;
-  
-  for (int i = 0; i < m; i++) {
-    std::vector<int> lex2 = getLexicalOrder(i);
-    
-    bool need_shift = true;
-    
-    for (int j = 0; j < dim; j++)
-      if (lex2[j] != lex[j]) {
-        need_shift = false;
-        break;
-      }
-    
-    if (!need_shift)
-      continue;
-    
-    if (amount > 0) {
-      if (lex2[dim] < lex[dim])
-        continue;
-    }
-    else if (amount < 0) {
-      if (lex2[dim] > lex[dim])
-        continue;
-    }
-    
-    assign_const(stmt[i].xform, dim, lex2[dim] + amount);
-  }
-}
-
-
-void Loop::setLexicalOrder(int dim, const std::set<int> &active, int starting_order) {
-  if (active.size() == 0)
-    return;
-  
-  // check for sanity of parameters
-  if (dim < 0 || dim % 2 != 0)
-    throw std::invalid_argument("invalid constant loop level to set lexicographical order");
-  std::vector<int> lex;
-  int ref_stmt_num;
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    if ((*i) < 0 || (*i) >= stmt.size())
-      throw std::invalid_argument("invalid statement number " + to_string(*i));
-    if (dim >= stmt[*i].xform.n_out())
-      throw std::invalid_argument("invalid constant loop level to set lexicographical order");
-    if (i == active.begin()) {
-      lex = getLexicalOrder(*i);
-      ref_stmt_num = *i;
-    }
-    else {
-      std::vector<int> lex2 = getLexicalOrder(*i);
-      for (int j = 0; j < dim; j+=2)
-        if (lex[j] != lex2[j])
-          throw std::invalid_argument("statements are not in the same sub loop nest");
-    }
-  }
-  
-  // sepearate statements by current loop level types
-  int level = (dim+2)/2;
-  std::map<std::pair<LoopLevelType, int>, std::set<int> > active_by_level_type;
-  std::set<int> active_by_no_level;
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    if (level > stmt[*i].loop_level.size())
-      active_by_no_level.insert(*i);
-    else
-      active_by_level_type[std::make_pair(stmt[*i].loop_level[level-1].type, stmt[*i].loop_level[level-1].payload)].insert(*i);
-  }
-  
-  // further separate statements due to control dependences
-  std::vector<std::set<int> > active_by_level_type_splitted;
-  for (std::map<std::pair<LoopLevelType, int>, std::set<int> >::iterator i = active_by_level_type.begin(); i != active_by_level_type.end(); i++)
-    active_by_level_type_splitted.push_back(i->second);
-  for (std::set<int>::iterator i = active_by_no_level.begin(); i != active_by_no_level.end(); i++)
-    for (int j = active_by_level_type_splitted.size() - 1; j >= 0; j--) {
-      std::set<int> controlled, not_controlled;
-      for (std::set<int>::iterator k = active_by_level_type_splitted[j].begin(); k != active_by_level_type_splitted[j].end(); k++) {
-        std::vector<DependenceVector> dvs = dep.getEdge(*i, *k);
-        bool is_controlled = false;
-        for (int kk = 0; kk < dvs.size(); kk++)
-          if (dvs[kk].type = DEP_CONTROL) {
-            is_controlled = true;
-            break;
-          }
-        if (is_controlled)
-          controlled.insert(*k);
-        else
-          not_controlled.insert(*k);
-      }
-      if (controlled.size() != 0 && not_controlled.size() != 0) {
-        active_by_level_type_splitted.erase(active_by_level_type_splitted.begin() + j);
-        active_by_level_type_splitted.push_back(controlled);
-        active_by_level_type_splitted.push_back(not_controlled);
-      }
-    }
-  
-  // set lexical order separating loops with different loop types first
-  if (active_by_level_type_splitted.size() + active_by_no_level.size() > 1) {
-    int dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1;
-    
-    Graph<std::set<int>, Empty> g;
-    for (std::vector<std::set<int> >::iterator i = active_by_level_type_splitted.begin(); i != active_by_level_type_splitted.end(); i++)
-      g.insert(*i);
-    for (std::set<int>::iterator i = active_by_no_level.begin(); i != active_by_no_level.end(); i++) {
-      std::set<int> t;
-      t.insert(*i);
-      g.insert(t);
-    }
-    for (int i = 0; i < g.vertex.size(); i++)
-      for (int j = i+1; j < g.vertex.size(); j++) {
-        bool connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*ii, *jj);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence() ||
-                  (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) {
-                g.connect(i, j);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-        connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*jj, *ii);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence() ||
-                  (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) {
-                g.connect(j, i);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-      }
-    
-    std::vector<std::set<int> > s = g.topoSort();
-    if (s.size() != g.vertex.size())
-      throw loop_error("cannot separate statements with different loop types at loop level " + to_string(level));
-    
-    // assign lexical order
-    int order = starting_order;
-    for (int i = 0; i < s.size(); i++) {
-      std::set<int> &cur_scc = g.vertex[*(s[i].begin())].first;
-      int sz = cur_scc.size();
-      if (sz == 1) {
-        int cur_stmt = *(cur_scc.begin());
-        assign_const(stmt[cur_stmt].xform, dim, order);
-        for (int j = dim+2; j < stmt[cur_stmt].xform.n_out(); j+=2)
-          assign_const(stmt[cur_stmt].xform, j, 0);
-        order++;
-      }
-      else {
-        setLexicalOrder(dim, cur_scc, order);
-        order += sz;
-      }
-    }
-  }
-  // set lexical order seperating single iteration statements and loops
-  else {
-    std::set<int> true_singles;
-    std::set<int> nonsingles;
-    std::map<coef_t, std::set<int> > fake_singles;
-    
-    // sort out statements that do not require loops
-    for(std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-      Relation cur_IS = getNewIS(*i);
-      if (is_single_iteration(cur_IS, dim+1)) {
-        bool is_all_single = true;
-        for (int j = dim+3; j < stmt[*i].xform.n_out(); j+=2)
-          if (!is_single_iteration(cur_IS, j)) {
-            is_all_single = false;
-            break;
-          }
-        if (is_all_single) 
-          true_singles.insert(*i);
-        else {
-          try {
-            fake_singles[get_const(cur_IS, dim+1, Set_Var)].insert(*i);
-          }
-          catch (const std::exception &e) {
-            fake_singles[posInfinity].insert(*i);
-          }
-        }
-      }
-      else
-        nonsingles.insert(*i);
-    }
-    
-    // split nonsingles forcibly according to negative dependences present (loop unfusible)
-    int dep_dim = get_dep_dim_of(ref_stmt_num, level);
-    Graph<int, Empty> g2;
-    for (std::set<int>::iterator i = nonsingles.begin(); i != nonsingles.end(); i++)
-      g2.insert(*i);
-    for (int i = 0; i < g2.vertex.size(); i++)
-      for (int j = i+1; j < g2.vertex.size(); j++) {
-        std::vector<DependenceVector> dvs = dep.getEdge(g2.vertex[i].first, g2.vertex[j].first);
-        for (int k = 0; k < dvs.size(); k++)
-          if (dvs[k].is_control_dependence() ||
-              (dvs[k].is_data_dependence() && dvs[k].has_negative_been_carried_at(dep_dim))) {
-            g2.connect(i, j);
-            break;
-          }
-        dvs = dep.getEdge(g2.vertex[j].first, g2.vertex[i].first);
-        for (int k = 0; k < dvs.size(); k++)
-          if (dvs[k].is_control_dependence() ||
-              (dvs[k].is_data_dependence() && dvs[k].has_negative_been_carried_at(dep_dim))) {
-            g2.connect(j, i);
-            break;
-          }
-      }
-    
-    std::vector<std::set<int> > s2 = g2.packed_topoSort();
-    
-    std::vector<std::set<int> > splitted_nonsingles;
-    for (int i = 0; i < s2.size(); i++) {
-      std::set<int> cur_scc;
-      for (std::set<int>::iterator j = s2[i].begin(); j != s2[i].end(); j++)
-        cur_scc.insert(g2.vertex[*j].first);
-      splitted_nonsingles.push_back(cur_scc);
-    }
-    
-    // convert to dependence graph for grouped statements
-    dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1;
-    Graph<std::set<int>, Empty> g;
-    for (std::set<int>::iterator i = true_singles.begin(); i != true_singles.end(); i++) {
-      std::set<int> t;
-      t.insert(*i);
-      g.insert(t);
-    }
-    for (int i = 0; i < splitted_nonsingles.size(); i++) {
-      g.insert(splitted_nonsingles[i]);
-    }   
-    for (std::map<coef_t, std::set<int> >::iterator i = fake_singles.begin(); i != fake_singles.end(); i++)
-      g.insert((*i).second);
-    
-    for (int i = 0; i < g.vertex.size(); i++)
-      for (int j = i + 1; j < g.vertex.size(); j++) {
-        bool connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*ii, *jj);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence() ||
-                  (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) {
-                g.connect(i, j);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-        connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*jj, *ii);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence() ||
-                  (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) {
-                g.connect(j, i);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;  
-        }
-      }
-    
-    // topological sort according to chun's permute algorithm
-    std::vector<std::set<int> > s = g.topoSort();
-    
-    // assign lexical order
-    int order = starting_order;
-    for (int i = 0; i < s.size(); i++) {
-      // translate each SCC into original statements
-      std::set<int> cur_scc;
-      for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-        copy(g.vertex[*j].first.begin(), g.vertex[*j].first.end(), inserter(cur_scc, cur_scc.begin()));
-      
-      // now assign the constant
-      for(std::set<int>::iterator j = cur_scc.begin(); j != cur_scc.end(); j++)
-        assign_const(stmt[*j].xform, dim, order);
-      
-      if (cur_scc.size() > 1)
-        setLexicalOrder(dim+2, cur_scc);
-      else if (cur_scc.size() == 1) {
-        int cur_stmt =*(cur_scc.begin());
-        for (int j = dim+2; j < stmt[cur_stmt].xform.n_out(); j+=2)
-          assign_const(stmt[cur_stmt].xform, j, 0);
-      }
-      
-      if (cur_scc.size() > 0)
-        order++;
-    }
-  }
-}
-
-
-void Loop::apply_xform() {
-  std::set<int> active;
-  for (int i = 0; i < stmt.size(); i++)
-    active.insert(i);
-  apply_xform(active);
-}
-
-
-void Loop::apply_xform(int stmt_num) {
-  std::set<int> active;
-  active.insert(stmt_num);
-  apply_xform(active);
-}
-
-
-void Loop::apply_xform(std::set<int> &active) {
-  int max_n = 0;
-  
-  CG_outputBuilder *ocg = ir->builder();
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    int n = stmt[*i].loop_level.size();
-    if (n > max_n)
-      max_n = n;
-    
-    std::vector<int> lex = getLexicalOrder(*i);
-    
-    Relation mapping(2*n+1, n);
-    F_And *f_root = mapping.add_and();
-    for (int j = 1; j <= n; j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(j), 1);
-      h.update_coef(mapping.input_var(2*j), -1);
-    }
-    mapping = Composition(mapping, stmt[*i].xform);
-    mapping.simplify();
-    
-    // match omega input/output variables to variable names in the code
-    for (int j = 1; j <= stmt[*i].IS.n_set(); j++)
-      mapping.name_input_var(j, stmt[*i].IS.set_var(j)->name());
-    for (int j = 1; j <= n; j++)
-      mapping.name_output_var(j, tmp_loop_var_name_prefix + to_string(tmp_loop_var_name_counter+j-1));
-    mapping.setup_names();
-    
-    Relation known = Extend_Set(copy(this->known), mapping.n_out() - this->known.n_set());
-    //stmt[*i].code = outputStatement(ocg, stmt[*i].code, 0, mapping, known, std::vector<CG_outputRepr *>(mapping.n_out(), NULL));
-    stmt[*i].code = outputStatement(ocg, stmt[*i].code, 0, mapping, known, std::vector<CG_outputRepr *>(mapping.n_out()));
-    stmt[*i].IS = Range(Restrict_Domain(mapping, stmt[*i].IS));
-    stmt[*i].IS.simplify();
-    
-    // replace original transformation relation with straight 1-1 mapping
-    mapping = Relation(n, 2*n+1);
-    f_root = mapping.add_and();
-    for (int j = 1; j <= n; j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(2*j), 1);
-      h.update_coef(mapping.input_var(j), -1);
-    }
-    for (int j = 1; j <= 2*n+1; j+=2) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(j), 1);
-      h.update_const(-lex[j-1]);
-    }  
-    stmt[*i].xform = mapping;
-  }
-  
-  tmp_loop_var_name_counter += max_n;
-}
-
-
-void Loop::addKnown(const Relation &cond) {
-  int n1 = this->known.n_set();
-  
-  Relation r = copy(cond);
-  int n2 = r.n_set();
-  
-  if (n1 < n2)
-    this->known = Extend_Set(this->known, n2-n1);
-  else if (n1 > n2)
-    r = Extend_Set(r, n1-n2);
-  
-  this->known = Intersection(this->known, r);
-}
-
-
-bool Loop::nonsingular(const std::vector<std::vector<int> > &T) {
-  if (stmt.size() == 0)
-    return true;
-  
-  // check for sanity of parameters
-  for (int i = 0; i < stmt.size(); i++) {
-    if (stmt[i].loop_level.size() != num_dep_dim)
-      throw std::invalid_argument("nonsingular loop transformations must be applied to original perfect loop nest");
-    for (int j = 0; j < stmt[i].loop_level.size(); j++)
-      if (stmt[i].loop_level[j].type != LoopLevelOriginal)
-        throw std::invalid_argument("nonsingular loop transformations must be applied to original perfect loop nest");
-  }
-  if (T.size() != num_dep_dim)
-    throw std::invalid_argument("invalid transformation matrix");
-  for (int i = 0; i < stmt.size(); i++)
-    if (T[i].size() != num_dep_dim + 1 && T[i].size() != num_dep_dim)
-      throw std::invalid_argument("invalid transformation matrix");
-  
-  // build relation from matrix
-  Relation mapping(2*num_dep_dim+1, 2*num_dep_dim+1);
-  F_And *f_root = mapping.add_and();
-  for (int i = 0; i < num_dep_dim; i++) {
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(mapping.output_var(2*(i+1)), -1);
-    for (int j = 0; j < num_dep_dim; j++)
-      if (T[i][j] != 0) 
-        h.update_coef(mapping.input_var(2*(j+1)), T[i][j]);
-    if (T[i].size() == num_dep_dim+1)
-      h.update_const(T[i][num_dep_dim]);
-  }
-  for (int i = 1; i <= 2*num_dep_dim+1; i+=2) {
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(mapping.output_var(i), -1);
-    h.update_coef(mapping.input_var(i), 1);
-  }
-  
-  // update transformation relations
-  for (int i = 0; i < stmt.size(); i++)
-    stmt[i].xform = Composition(copy(mapping), stmt[i].xform);
-  
-  // update dependence graph
-  for (int i = 0; i < dep.vertex.size(); i++)
-    for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) {
-      std::vector<DependenceVector> dvs = j->second;
-      for (int k = 0; k < dvs.size(); k++) {
-        DependenceVector &dv = dvs[k];
-        switch (dv.type) {
-        case DEP_W2R:
-        case DEP_R2W:
-        case DEP_W2W:
-        case DEP_R2R: {
-          std::vector<coef_t> lbounds(num_dep_dim), ubounds(num_dep_dim);
-          for (int p = 0; p < num_dep_dim; p++) {
-            coef_t lb = 0;
-            coef_t ub = 0;
-            for (int q = 0; q < num_dep_dim; q++) {
-              if (T[p][q] > 0) {
-                if (lb == -posInfinity || dv.lbounds[q] == -posInfinity)
-                  lb = -posInfinity;
-                else
-                  lb += T[p][q] * dv.lbounds[q];
-                if (ub == posInfinity || dv.ubounds[q] == posInfinity)
-                  ub = posInfinity;
-                else
-                  ub += T[p][q] * dv.ubounds[q];
-              }
-              else if (T[p][q] < 0) {
-                if (lb == -posInfinity || dv.ubounds[q] == posInfinity)
-                  lb = -posInfinity;
-                else
-                  lb += T[p][q] * dv.ubounds[q];
-                if (ub == posInfinity || dv.lbounds[q] == -posInfinity)
-                  ub = posInfinity;
-                else
-                  ub += T[p][q] * dv.lbounds[q];
-              }
-            }
-            if (T[p].size() == num_dep_dim+1) {
-              if (lb != -posInfinity)
-                lb += T[p][num_dep_dim];
-              if (ub != posInfinity)
-                ub += T[p][num_dep_dim];
-            }
-            lbounds[p] = lb;
-            ubounds[p] = ub;
-          }
-          dv.lbounds = lbounds;
-          dv.ubounds = ubounds;
-          
-          break;
-        }
-        default:
-          ;
-        }
-      }
-      j->second = dvs;
-    }
-  
-  // set constant loop values
-  std::set<int> active;
-  for (int i = 0; i < stmt.size(); i++)
-    active.insert(i);
-  setLexicalOrder(0, active);
-  
-  return true;
-}
-
-
-void Loop::skew(const std::set<int> &stmt_nums, int level, const std::vector<int> &skew_amount) {
-  if (stmt_nums.size() == 0)
-    return;
-  
-  // check for sanity of parameters
-  int ref_stmt_num = *(stmt_nums.begin());
-  for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-    if (*i < 0 || *i >= stmt.size())
-      throw std::invalid_argument("invalid statement number " + to_string(*i));
-    if (level < 1 || level > stmt[*i].loop_level.size())
-      throw std::invalid_argument("invalid loop level " + to_string(level));
-    for (int j = stmt[*i].loop_level.size(); j < skew_amount.size(); j++)
-      if (skew_amount[j] != 0)
-        throw std::invalid_argument("invalid skewing formula");
-  }
-  
-  // set trasformation relations
-  for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-    int n = stmt[*i].xform.n_out();
-    Relation r(n,n);
-    F_And *f_root = r.add_and();
-    for (int j = 1; j <= n; j++)
-      if (j != 2*level) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.input_var(j), 1);
-        h.update_coef(r.output_var(j), -1);
-      }
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(r.output_var(2*level), -1);
-    for (int j = 0; j < skew_amount.size(); j++)
-      if (skew_amount[j] != 0)
-        h.update_coef(r.input_var(2*(j+1)), skew_amount[j]);
-    
-    stmt[*i].xform = Composition(r, stmt[*i].xform);
-    stmt[*i].xform.simplify();
-  }
-  
-  // update dependence graph
-  if (stmt[ref_stmt_num].loop_level[level-1].type == LoopLevelOriginal) {
-    int dep_dim = stmt[ref_stmt_num].loop_level[level-1].payload;
-    for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++)
-      for (DependenceGraph::EdgeList::iterator j = dep.vertex[*i].second.begin(); j != dep.vertex[*i].second.end(); j++)
-        if (stmt_nums.find(j->first) != stmt_nums.end()) {
-          // dependence between skewed statements
-          std::vector<DependenceVector> dvs = j->second;
-          for (int k = 0; k < dvs.size(); k++) {
-            DependenceVector &dv = dvs[k];
-            if (dv.is_data_dependence()) {
-              coef_t lb = 0;
-              coef_t ub = 0;
-              for (int kk = 0; kk < skew_amount.size(); kk++) {
-                int cur_dep_dim = get_dep_dim_of(*i, kk+1);
-                if (skew_amount[kk] > 0) {
-                  if (lb != -posInfinity &&
-                      stmt[*i].loop_level[kk].type == LoopLevelOriginal &&
-                      dv.lbounds[cur_dep_dim] != -posInfinity)
-                    lb += skew_amount[kk] * dv.lbounds[cur_dep_dim];
-                  else {
-                    if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0))
-                      lb = -posInfinity;
-                  }
-                  if (ub != posInfinity &&
-                      stmt[*i].loop_level[kk].type == LoopLevelOriginal &&
-                      dv.ubounds[cur_dep_dim] != posInfinity)
-                    ub += skew_amount[kk] * dv.ubounds[cur_dep_dim];
-                  else {
-                    if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0))
-                      ub = posInfinity;
-                  }
-                }
-                else if (skew_amount[kk] < 0) {
-                  if (lb != -posInfinity &&
-                      stmt[*i].loop_level[kk].type == LoopLevelOriginal &&
-                      dv.ubounds[cur_dep_dim] != posInfinity)
-                    lb += skew_amount[kk] * dv.ubounds[cur_dep_dim];
-                  else {
-                    if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0))
-                      lb = -posInfinity;
-                  }
-                  if (ub != posInfinity &&
-                      stmt[*i].loop_level[kk].type == LoopLevelOriginal &&
-                      dv.lbounds[cur_dep_dim] != -posInfinity)
-                    ub += skew_amount[kk] * dv.lbounds[cur_dep_dim];
-                  else {
-                    if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0))
-                      ub = posInfinity;
-                  }
-                }
-              }
-              dv.lbounds[dep_dim] = lb;
-              dv.ubounds[dep_dim] = ub;
-            }
-          }
-          j->second = dvs;
-        }
-        else {
-          // dependence from skewed statement to unskewed statement becomes jumbled,
-          // put distance value at skewed dimension to unknown
-          std::vector<DependenceVector> dvs = j->second;
-          for (int k = 0; k < dvs.size(); k++) {
-            DependenceVector &dv = dvs[k];
-            if (dv.is_data_dependence()) {
-              dv.lbounds[dep_dim] = -posInfinity;
-              dv.ubounds[dep_dim] = posInfinity;
-            }
-          }
-          j->second = dvs;
-        }
-    for (int i = 0; i < dep.vertex.size(); i++)
-      if (stmt_nums.find(i) == stmt_nums.end())
-        for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++)
-          if (stmt_nums.find(j->first) != stmt_nums.end()) {
-            // dependence from unskewed statement to skewed statement becomes jumbled,
-            // put distance value at skewed dimension to unknown
-            std::vector<DependenceVector> dvs = j->second;
-            for (int k = 0; k < dvs.size(); k++) {
-              DependenceVector &dv = dvs[k];
-              if (dv.is_data_dependence()) {
-                dv.lbounds[dep_dim] = -posInfinity;
-                dv.ubounds[dep_dim] = posInfinity;
-              }
-            }
-            j->second = dvs;
-          }
-  }
-}
-
-
-void Loop::shift(const std::set<int> &stmt_nums, int level, int shift_amount) {
-  if (stmt_nums.size() == 0)
-    return;
-  
-  // check for sanity of parameters
-  int ref_stmt_num = *(stmt_nums.begin());
-  for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-    if (*i < 0 || *i >= stmt.size())
-      throw std::invalid_argument("invalid statement number " + to_string(*i));
-    if (level < 1 || level > stmt[*i].loop_level.size())
-      throw std::invalid_argument("invalid loop level " + to_string(level));
-  }
-  
-  // do nothing
-  if (shift_amount == 0)
-    return;
-  
-  // set trasformation relations
-  for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-    int n = stmt[*i].xform.n_out();
-    
-    Relation r(n, n);
-    F_And *f_root = r.add_and();
-    for (int j = 1; j <= n; j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(r.input_var(j), 1);
-      h.update_coef(r.output_var(j), -1);
-      if (j == 2*level)
-        h.update_const(shift_amount);
-    }
-    
-    stmt[*i].xform = Composition(r, stmt[*i].xform);
-    stmt[*i].xform.simplify();
-  }
-  
-  // update dependence graph
-  if (stmt[ref_stmt_num].loop_level[level-1].type == LoopLevelOriginal) {
-    int dep_dim = stmt[ref_stmt_num].loop_level[level-1].payload;
-    for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++)
-      for (DependenceGraph::EdgeList::iterator j = dep.vertex[*i].second.begin(); j != dep.vertex[*i].second.end(); j++)
-        if (stmt_nums.find(j->first) == stmt_nums.end()) {
-          // dependence from shifted statement to unshifted statement
-          std::vector<DependenceVector> dvs = j->second;
-          for (int k = 0; k < dvs.size(); k++) {
-            DependenceVector &dv = dvs[k];
-            if (dv.is_data_dependence()) {
-              if (dv.lbounds[dep_dim] != -posInfinity)
-                dv.lbounds[dep_dim] -= shift_amount;
-              if (dv.ubounds[dep_dim] != posInfinity)
-                dv.ubounds[dep_dim] -= shift_amount;
-            }
-          }
-          j->second = dvs;
-        }
-    for (int i = 0; i < dep.vertex.size(); i++)
-      if (stmt_nums.find(i) == stmt_nums.end())
-        for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++)
-          if (stmt_nums.find(j->first) != stmt_nums.end()) {
-            // dependence from unshifted statement to shifted statement
-            std::vector<DependenceVector> dvs = j->second;
-            for (int k = 0; k < dvs.size(); k++) {
-              DependenceVector &dv = dvs[k];
-              if (dv.is_data_dependence()) {
-                if (dv.lbounds[dep_dim] != -posInfinity)
-                  dv.lbounds[dep_dim] += shift_amount;
-                if (dv.ubounds[dep_dim] != posInfinity)
-                  dv.ubounds[dep_dim] += shift_amount;
-              }
-            }
-            j->second = dvs;
-          }
-  }
-}
-
-
-
-// bool Loop::fuse(const std::set<int> &stmt_nums, int level) {
-//   if (stmt_nums.size() == 0 || stmt_nums.size() == 1)
-//     return true;
-//   int dim = 2*level-1;
-
-//   // check for sanity of parameters
-//   std::vector<int> ref_lex;
-//   for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-//     if (*i < 0 || *i >= stmt.size())
-//       throw std::invalid_argument("invalid statement number " + to_string(*i));
-//     if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2)
-//       throw std::invalid_argument("invalid loop level " + to_string(level));
-//     if (ref_lex.size() == 0)
-//       ref_lex = getLexicalOrder(*i);
-//     else {
-//       std::vector<int> lex = getLexicalOrder(*i);
-//       for (int j = 0; j < dim-1; j+=2)
-//         if (lex[j] != ref_lex[j])
-//           throw std::invalid_argument("statements for fusion must be in the same level-" + to_string(level-1) + " subloop");
-//     }
-//   }
-
-//   // collect lexicographical order values from to-be-fused statements
-//   std::set<int> lex_values;
-//   for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-//     std::vector<int> lex = getLexicalOrder(*i);
-//     lex_values.insert(lex[dim-1]);
-//   }
-//   if (lex_values.size() == 1)
-//     return true;
-
-//   // negative dependence would prevent fusion
-//   int dep_dim = xform_index[dim].first;
-//   for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++) {
-//     ref_lex[dim-1] = *i;
-//     std::set<int> a = getStatements(ref_lex, dim-1);
-//     std::set<int>::iterator j = i;
-//     j++;
-//     for (; j != lex_values.end(); j++) {
-//       ref_lex[dim-1] = *j;
-//       std::set<int> b = getStatements(ref_lex, dim-1);
-//       for (std::set<int>::iterator ii = a.begin(); ii != a.end(); ii++)
-//         for (std::set<int>::iterator jj = b.begin(); jj != b.end(); jj++) {
-//           std::vector<DependenceVector> dvs;
-//           dvs = dep.getEdge(*ii, *jj);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim))
-//               throw loop_error("loop error: statements " + to_string(*ii) + " and " + to_string(*jj) + " cannot be fused together due to negative dependence");
-//           dvs = dep.getEdge(*jj, *ii);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim))
-//               throw loop_error("loop error: statements " + to_string(*jj) + " and " + to_string(*ii) + " cannot be fused together due to negative dependence");
-//         }
-//     }
-//   }
-
-//   // collect all other lexicographical order values from the subloop
-//   // enclosing these to-be-fused loops
-//   std::set<int> same_loop = getStatements(ref_lex, dim-3);
-//   std::set<int> other_lex_values;
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-//     std::vector<int> lex = getLexicalOrder(*i);
-//     if (lex_values.find(lex[dim-1]) == lex_values.end())
-//       other_lex_values.insert(lex[dim-1]);
-//   }
-
-//   // update to-be-fused loops due to dependence cycle
-//   Graph<std::set<int>, Empty> g;
-//   {
-//     std::set<int> t;
-//     for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++) {
-//       ref_lex[dim-1] = *i;
-//       std::set<int> t2 = getStatements(ref_lex, dim-1);
-//       std::set_union(t.begin(), t.end(), t2.begin(), t2.end(), inserter(t, t.begin()));
-//     }
-//     g.insert(t);
-//   }
-//   for (std::set<int>::iterator i = other_lex_values.begin(); i != other_lex_values.end(); i++) {
-//     ref_lex[dim-1] = *i;
-//     std::set<int> t = getStatements(ref_lex, dim-1);
-//     g.insert(t);
-//   }
-//   for (int i = 0; i < g.vertex.size(); i++)
-//     for (int j = i+1; j < g.vertex.size(); j++)
-//       for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++)
-//         for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
-//           std::vector<DependenceVector> dvs;
-//           dvs = dep.getEdge(*ii, *jj);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim)) {
-//               g.connect(i, j);
-//               break;
-//             }
-//           dvs = dep.getEdge(*jj, *ii);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim)) {
-//               g.connect(j, i);
-//               break;
-//             }
-//         }
-//   std::vector<std::set<int> > s = g.topoSort();
-//   int fused_lex_value = 0;
-//   for (int i = 0; i < s.size(); i++)
-//     if (s[i].find(0) != s[i].end()) {
-//       // now add additional lexicographical order values
-//       for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-//         if (*j != 0) {
-//           int stmt = *(g.vertex[*j].first.begin());
-//           std::vector<int> lex = getLexicalOrder(stmt);
-//           lex_values.insert(lex[dim-1]);
-//         }
-
-//       if (s.size() > 1) {
-//         if (i == 0) {
-//           int min_lex_value;
-//           for (std::set<int>::iterator j = s[i+1].begin(); j != s[i+1].end(); j++) {
-//             int stmt = *(g.vertex[*j].first.begin());
-//             std::vector<int> lex = getLexicalOrder(stmt);
-//             if (j == s[i+1].begin())
-//               min_lex_value = lex[dim-1];
-//             else if (lex[dim-1] < min_lex_value)
-//               min_lex_value = lex[dim-1];
-//           }
-//           fused_lex_value = min_lex_value - 1;
-//         }
-//         else {
-//           int max_lex_value;
-//           for (std::set<int>::iterator j = s[i-1].begin(); j != s[i-1].end(); j++) {
-//             int stmt = *(g.vertex[*j].first.begin());
-//             std::vector<int> lex = getLexicalOrder(stmt);
-//             if (j == s[i-1].begin())
-//               max_lex_value = lex[dim-1];
-//             else if (lex[dim-1] > max_lex_value)
-//               max_lex_value = lex[dim-1];
-//           }
-//           fused_lex_value = max_lex_value + 1;
-//         }
-//       }
-
-//       break;
-//     }
-
-//   // sort the newly updated to-be-fused lexicographical order values
-//   std::vector<int> ordered_lex_values;
-//   for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++)
-//     ordered_lex_values.push_back(*i);
-//   std::sort(ordered_lex_values.begin(), ordered_lex_values.end());
-
-//   // make sure internal loops inside to-be-fused loops have the same
-//   // lexicographical order before and after fusion
-//   std::vector<std::pair<int, int> > inside_lex_range(ordered_lex_values.size());
-//   for (int i = 0; i < ordered_lex_values.size(); i++) {
-//     ref_lex[dim-1] = ordered_lex_values[i];
-//     std::set<int> the_stmts = getStatements(ref_lex, dim-1);
-//     std::set<int>::iterator j = the_stmts.begin();
-//     std::vector<int> lex = getLexicalOrder(*j);
-//     int min_inside_lex_value = lex[dim+1];
-//     int max_inside_lex_value = lex[dim+1];
-//     j++;
-//     for (; j != the_stmts.end(); j++) {
-//       std::vector<int> lex = getLexicalOrder(*j);
-//       if (lex[dim+1] < min_inside_lex_value)
-//         min_inside_lex_value = lex[dim+1];
-//       if (lex[dim+1] > max_inside_lex_value)
-//         max_inside_lex_value = lex[dim+1];
-//     }
-//     inside_lex_range[i].first = min_inside_lex_value;
-//     inside_lex_range[i].second = max_inside_lex_value;
-//   }
-//   for (int i = 1; i < ordered_lex_values.size(); i++)
-//     if (inside_lex_range[i].first <= inside_lex_range[i-1].second) {
-//       int shift_lex_value = inside_lex_range[i-1].second - inside_lex_range[i].first + 1;
-//       ref_lex[dim-1] = ordered_lex_values[i];
-//       ref_lex[dim+1] = inside_lex_range[i].first;
-//       shiftLexicalOrder(ref_lex, dim+1, shift_lex_value);
-//       inside_lex_range[i].first += shift_lex_value;
-//       inside_lex_range[i].second += shift_lex_value;
-//     }
-
-//   // set lexicographical order for fused loops
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-//     std::vector<int> lex = getLexicalOrder(*i);
-//     if (lex_values.find(lex[dim-1]) != lex_values.end())
-//       assign_const(stmt[*i].xform, dim-1, fused_lex_value);      
-//   }
-
-//   // no need to update dependence graph
-//   ;
-
-//   return true;
-// }
-
-
-// bool Loop::distribute(const std::set<int> &stmt_nums, int level) {
-//   if (stmt_nums.size() == 0 || stmt_nums.size() == 1)
-//     return true;
-//   int dim = 2*level-1;
-
-//   // check for sanity of parameters
-//   std::vector<int> ref_lex;
-//   for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-//     if (*i < 0 || *i >= stmt.size())
-//       throw std::invalid_argument("invalid statement number " + to_string(*i));
-//     if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2)
-//       throw std::invalid_argument("invalid loop level " + to_string(level));
-//     if (ref_lex.size() == 0)
-//       ref_lex = getLexicalOrder(*i);
-//     else {
-//       std::vector<int> lex = getLexicalOrder(*i);
-//       for (int j = 0; j <= dim-1; j+=2)
-//         if (lex[j] != ref_lex[j])
-//           throw std::invalid_argument("statements for distribution must be in the same level-" + to_string(level) + " subloop");
-//     }
-//   }
-
-//   // find SCC in the to-be-distributed loop
-//   int dep_dim = xform_index[dim].first;
-//   std::set<int> same_loop = getStatements(ref_lex, dim-1);
-//   Graph<int, Empty> g;
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-//     g.insert(*i);
-//   for (int i = 0; i < g.vertex.size(); i++)
-//     for (int j = i+1; j < g.vertex.size(); j++) {
-//       std::vector<DependenceVector> dvs;
-//       dvs = dep.getEdge(g.vertex[i].first, g.vertex[j].first);
-//       for (int k = 0; k < dvs.size(); k++)
-//         if (dvs[k].isCarried(dep_dim)) {
-//           g.connect(i, j);
-//           break;
-//         }
-//       dvs = dep.getEdge(g.vertex[j].first, g.vertex[i].first);
-//       for (int k = 0; k < dvs.size(); k++)
-//         if (dvs[k].isCarried(dep_dim)) {
-//           g.connect(j, i);
-//           break;
-//         }
-//     }
-//   std::vector<std::set<int> > s = g.topoSort();
-
-//   // find statements that cannot be distributed due to dependence cycle
-//   Graph<std::set<int>, Empty> g2;
-//   for (int i = 0; i < s.size(); i++) {
-//     std::set<int> t;
-//     for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-//       if (stmt_nums.find(g.vertex[*j].first) != stmt_nums.end())
-//         t.insert(g.vertex[*j].first);
-//     if (!t.empty())
-//       g2.insert(t);
-//   }
-//   for (int i = 0; i < g2.vertex.size(); i++)
-//     for (int j = i+1; j < g2.vertex.size(); j++)
-//       for (std::set<int>::iterator ii = g2.vertex[i].first.begin(); ii != g2.vertex[i].first.end(); ii++)
-//         for (std::set<int>::iterator jj = g2.vertex[j].first.begin(); jj != g2.vertex[j].first.end(); jj++) {
-//           std::vector<DependenceVector> dvs;
-//           dvs = dep.getEdge(*ii, *jj);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim)) {
-//               g2.connect(i, j);
-//               break;
-//             }
-//           dvs = dep.getEdge(*jj, *ii);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim)) {
-//               g2.connect(j, i);
-//               break;
-//             }
-//         }
-//   std::vector<std::set<int> > s2 = g2.topoSort();
-
-//   // nothing to distribute
-//   if (s2.size() == 1)
-//     throw loop_error("loop error: no statement can be distributed due to dependence cycle");
-
-//   std::vector<std::set<int> > s3;
-//   for (int i = 0; i < s2.size(); i++) {
-//     std::set<int> t;
-//     for (std::set<int>::iterator j = s2[i].begin(); j != s2[i].end(); j++)
-//       std::set_union(t.begin(), t.end(), g2.vertex[*j].first.begin(), g2.vertex[*j].first.end(), inserter(t, t.begin()));
-//     s3.push_back(t);
-//   }
-
-//   // associate other affected statements with the right distributed statements
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-//     if (stmt_nums.find(*i) == stmt_nums.end()) {
-//       bool is_inserted = false;
-//       int potential_insertion_point = 0;
-//       for (int j = 0; j < s3.size(); j++) {
-//         for (std::set<int>::iterator k = s3[j].begin(); k != s3[j].end(); k++) {
-//           std::vector<DependenceVector> dvs;
-//           dvs = dep.getEdge(*i, *k);
-//           for (int kk = 0; kk < dvs.size(); kk++)
-//             if (dvs[kk].isCarried(dep_dim)) {
-//               s3[j].insert(*i);
-//               is_inserted = true;
-//               break;
-//             }
-//           dvs = dep.getEdge(*k, *i);
-//           for (int kk = 0; kk < dvs.size(); kk++)
-//             if (dvs[kk].isCarried(dep_dim))
-//               potential_insertion_point = j;
-//         }
-//         if (is_inserted)
-//           break;
-//       }
-
-//       if (!is_inserted)
-//         s3[potential_insertion_point].insert(*i);
-//     }
-
-//   // set lexicographical order after distribution
-//   int order = ref_lex[dim-1];
-//   shiftLexicalOrder(ref_lex, dim-1, s3.size()-1);
-//   for (std::vector<std::set<int> >::iterator i = s3.begin(); i != s3.end(); i++) {
-//     for (std::set<int>::iterator j = (*i).begin(); j != (*i).end(); j++)
-//       assign_const(stmt[*j].xform, dim-1, order);
-//     order++;
-//   }
-
-//   // no need to update dependence graph
-//   ;
-
-//   return true;
-// }
-
-
-
-
-
-
-
-
diff --git a/loop_cuda.cc b/loop_cuda.cc
deleted file mode 100644
index a23990d..0000000
--- a/loop_cuda.cc
+++ /dev/null
@@ -1,2123 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
-   Cudaize methods
-
- Notes:
-
- History:
-   1/7/10 Created by Gabe Rudy by migrating code from loop.cc
-     31/1/11 Modified by Protonu Basu
-*****************************************************************************/
-
-#include <code_gen/code_gen.h>
-#include <code_gen/CG_stringBuilder.h>
-#include <code_gen/output_repr.h>
-#include <code_gen/CG_outputRepr.h>
-#include "loop_cuda.hh"
-#include "loop.hh"
-#include <math.h>
-#include <useful.h>
-#include "omegatools.hh"
-#include "ir_cudasuif.hh"
-#include "ir_suif.hh"
-#include "ir_suif_utils.hh"
-#include "chill_error.hh"
-#include <vector>
-
-using namespace omega;
-char *k_cuda_texture_memory; //protonu--added to track texture memory type
-char *k_cuda_constant_memory; //protonu--added to track constant memory type
-//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type
-extern char *omega::k_ocg_comment;
-
-
-static int cudaDebug;
-class CudaStaticInit{ public: CudaStaticInit(){ cudaDebug=0; //Change this to 1 for debug
-}};
-static CudaStaticInit junkInitInstance__;
-
-
-
-std::string& upcase(std::string& s)
-{
-  for(int i=0; i<s.size(); i++)
-    s[i] = toupper(s[i]);
-  return s;
-}
-
-void printVs(const std::vector<std::string>& curOrder){
-  if(!cudaDebug) return;
-  for(int i=0; i<curOrder.size(); i++){
-    if(i>0)
-      printf(",");
-    printf("%s", curOrder[i].c_str());
-  }
-  printf("\n");
-}
-
-void printVS(const std::vector<std::string>& curOrder){
-  //if(!cudaDebug) return;
-  for(int i=0; i<curOrder.size(); i++){
-    if(i>0)
-      printf(",");
-    printf("%s", curOrder[i].c_str());
-  }
-  printf("\n");
-}
-
-LoopCuda::~LoopCuda() {
-  const int m = stmt.size();
-  for (int i = 0; i < m; i++)
-    stmt[i].code->clear();
-}
-
-bool LoopCuda::symbolExists(std::string s){
-  if(symtab->lookup_sym(s.c_str(), SYM_VAR, false))
-    return true;
-  if(globals->lookup_sym(s.c_str(), SYM_VAR, false))
-    return true;
-  for(int i=0; i<idxNames.size(); i++)
-    for(int j=0; j<idxNames[i].size(); j++)
-      if(strcmp(idxNames[i][j].c_str(), s.c_str()) == 0)
-        return true;
-  return false;
-}
-
-void LoopCuda::addSync(int stmt_num, std::string idxName)
-{
-  //we store these and code-gen inserts sync to omega comments where stmt
-  //in loop that has idxName being generated
-  syncs.push_back(make_pair(stmt_num,idxName));
-}
-
-void LoopCuda::renameIndex(int stmt_num, std::string idx, std::string newName)
-{
-  int level = findCurLevel(stmt_num, idx);
-  if(idxNames.size() <= stmt_num || idxNames[stmt_num].size() < level)
-    throw std::runtime_error("Invalid statment number of index");
-  idxNames[stmt_num][level-1] = newName.c_str();
-}
-
-
-
-enum Type{ Int };
-
-struct VarDefs{
-  std::string name;
-  std::string secondName;  
-  operand size_expr; //array size as an expression (can be a product of other variables etc)
-  type_node * type;
-  var_sym* in_data; //Variable of array to copy data in from (before kernel call)
-  var_sym* out_data; //Variable of array to copy data out to (after kernel call)
-  int size_2d; //-1 if linearized, the constant size N, of a NxN 2D array otherwise
-  bool tex_mapped; //protonu-- true if this variable will be texture mapped, so no need to pass it as a argument
-  bool cons_mapped; //protonu-- true if this variable will be constant mem mapped, so no need to pass it as a argument
-  std::string original_name; //this is such a hack, to store the original name, to store a table to textures used
-  int var_ref_size ;
-};
-
-tree_node_list* wrapInIfFromMinBound(tree_node_list* then_part, tree_for* loop, base_symtab* symtab, var_sym* bound_sym)
-{
-  tree_node_list* ub = loop->ub_list();
-  tree_node_list_iter upli(ub);
-  while(!upli.is_empty()){
-    tree_node *node = upli.step();
-    if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr)
-    {
-      in_rrr* ins = (in_rrr*)((tree_instr*)node)->instr();
-      //expect the structure: cpy( _ = min(grab_me, _))
-      if(ins->opcode() == io_cpy && ins->src1_op().is_instr()){
-        ins = (in_rrr*)ins->src1_op().instr();
-        if(ins->opcode() == io_min){
-          tree_node_list* tnl = new tree_node_list;
-          tnl->append(if_node(symtab, fold_sle(operand(bound_sym), ins->src1_op().instr()->clone()), then_part));
-          return tnl;
-        }
-      }
-    }
-  }
-  return then_part; //Failed to go to proper loop level
-}
-
-/**
- * This would be better if it was done by a CHiLL xformation instead of at codegen
- *
- * state:
- * for(...)
- *   for(...)
- *     cur_body
- *   stmt1
- *
- * stm1 is in-between two loops that are going to be reduced. The
- * solution is to put stmt1 at the end of cur_body but conditionally run
- * in on the last step of the for loop.
- *
- * A CHiLL command that would work better:
- *
- * for(...)
- *   stmt0
- *   for(for i=0; i<n; i++)
- *     cur_body
- *   stmt1
- * =>
- * for(...)
- *   for(for i=0; i<n; i++)
- *     if(i==0) stmt0
- *     cur_body
- *     if(i==n-1) stmt1
- */
-
-std::vector<tree_for*> findCommentedFors(const char* index, tree_node_list* tnl){
-  std::vector<tree_for *> result;
-  
-  tree_node_list_iter iter(tnl);
-  bool next_loop_ok = false;
-  while (!iter.is_empty()) {
-    tree_node *tn = iter.step();
-    if (tn->kind() == TREE_INSTR && ((tree_instr*)tn)->instr()->opcode() == io_mrk)
-    {
-      instruction* inst = ((tree_instr*)tn)->instr();
-      std::string comment;
-      if ((inst->peek_annote(k_ocg_comment) != NULL))
-      {
-        immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment));
-        immed_list_iter data_iter(data);
-        if(!data_iter.is_empty()){
-          immed first_immed = data_iter.step();
-          if(first_immed.kind() == im_string)
-            comment = first_immed.string();
-        }
-      }
-      if(comment.find("~cuda~") != std::string::npos
-         && comment.find("preferredIdx: ") != std::string::npos){
-        std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos);
-        if(idx.find(" ") != std::string::npos)
-          idx = idx.substr(0,idx.find(" "));
-        if(strcmp(idx.c_str(),index) == 0)
-          next_loop_ok = true;
-      }
-    }
-    if (tn->kind() == TREE_FOR){
-      if(next_loop_ok){
-        //printf("found loop %s\n", static_cast<tree_for *>(tn)->index()->name());
-        result.push_back(static_cast<tree_for *>(tn));
-      }
-      else{
-        //printf("looking down for loop %s\n", static_cast<tree_for *>(tn)->index()->name());
-        std::vector<tree_for*> t = findCommentedFors(index, static_cast<tree_for *>(tn)->body());
-        std::copy(t.begin(), t.end(), back_inserter(result));
-      }
-      next_loop_ok = false;
-    }
-    if (tn->kind() == TREE_IF) {
-      //printf("looking down if\n");
-      tree_if *tni = static_cast<tree_if *>(tn);
-      std::vector<tree_for*> t = findCommentedFors(index, tni->then_part());
-      std::copy(t.begin(), t.end(), back_inserter(result));
-    }
-  }
-  
-  return result;
-}
-
-tree_node_list* forReduce(tree_for* loop, var_sym* reduceIndex, proc_symtab* proc_syms)
-{
-  //We did the replacements all at once with recursiveFindPreferedIdxs
-  //replacements r;
-  //r.oldsyms.append(loop->index());
-  //r.newsyms.append(reduceIndex);
-  //tree_for* new_loop = (tree_for*)loop->clone_helper(&r, true);
-  tree_for* new_loop = loop;
-  
-  //return body one loops in
-  tree_node_list* tnl = loop_body_at_level(new_loop, 1);
-  //wrap in conditional if necessary
-  tnl = wrapInIfFromMinBound(tnl, new_loop, proc_syms, reduceIndex);
-  return tnl;
-}
-
-void recursiveFindRefs(tree_node_list* code, proc_symtab* proc_syms, replacements* r)
-{
-  if(code->parent() && code->scope()->is_block())
-    ((block_symtab*)code->scope())->find_exposed_refs(proc_syms, r);
-  tree_node_list_iter tnli(code);
-  while (!tnli.is_empty()) {
-    tree_node *node = tnli.step();
-    //printf("node kind: %d\n", node->kind());
-    if(node->is_instr())
-    {
-      tree_instr* t_instr = (tree_instr*)node;
-      t_instr->find_exposed_refs(proc_syms, r);
-    }
-    if(node->is_block()){
-      recursiveFindRefs(static_cast<tree_block *>(node)->body(), proc_syms, r);
-    }
-    else if(node->is_for()){
-      tree_for* tn_for = static_cast<tree_for *>(node);
-      //Find refs in statemetns and body
-      tn_for->find_exposed_refs(proc_syms, r);
-      //recursiveFindRefs(tn_for->body(), proc_syms, r);
-    }
-  }
-}
-
-tree_node_list* recursiveFindReplacePreferedIdxs(tree_node_list* code, proc_symtab* proc_syms,
-                                                 proc_sym* cudaSync, func_type* unkown_func, 
-                                                 std::map<std::string, var_sym*>& loop_idxs)
-{
-  tree_node_list* tnl = new tree_node_list;
-  tree_node_list_iter tnli(code);
-  var_sym* idxSym=0;
-  bool sync = false;
-  std::vector<tree_node*>      r1;
-  std::vector<tree_node_list*> r2;
-  while (!tnli.is_empty()) {
-    tree_node *node = tnli.step();
-    //printf("node kind: %d\n", node->kind());
-    if(node->is_instr())
-    {
-      if(((tree_instr*)node)->instr()->format() == inf_rrr){
-        in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr();
-        if(inst->opcode() == io_mrk){
-          std::string comment;
-          if ((inst->peek_annote(k_ocg_comment) != NULL))
-          {
-            immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment));
-            immed_list_iter data_iter(data);
-            if(!data_iter.is_empty()){
-              immed first_immed = data_iter.step();
-              if(first_immed.kind() == im_string)
-                comment = first_immed.string();
-            }
-          }
-          if(comment.find("~cuda~") != std::string::npos
-             && comment.find("preferredIdx: ") != std::string::npos){
-            std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos);
-            if(idx.find(" ") != std::string::npos)
-              idx = idx.substr(0,idx.find(" "));
-            //printf("sym_tab preferred index: %s\n", idx.c_str());
-            if(loop_idxs.find(idx) != loop_idxs.end())
-              idxSym = loop_idxs.find(idx)->second;
-            //Get the proc variable sybol for this preferred index
-            if(idxSym == 0){
-              idxSym = (var_sym*)proc_syms->lookup_sym(idx.c_str(), SYM_VAR, false);
-              //printf("idx not found: lookup %p\n", idxSym);
-              if(!idxSym){
-                idxSym = new var_sym(type_s32, (char*)idx.c_str());
-                proc_syms->add_sym(idxSym);
-                //printf("idx created and inserted\n");
-              }
-              //Now insert into our map for future
-              loop_idxs.insert(make_pair(idx, idxSym));
-            }
-            //See if we have a sync as well
-            if(comment.find("sync") != std::string::npos){
-              //printf("Inserting sync after current block\n");
-              sync = true;
-            }
-          }
-        }
-      }
-      tnl->append(node);
-    }
-    else if(node->is_block()){
-      tree_block* b = static_cast<tree_block *>(node);
-      b->set_body(recursiveFindReplacePreferedIdxs(b->body(), proc_syms, cudaSync, unkown_func, loop_idxs));
-      tnl->append(b);
-    }
-    else if(node->is_for()){
-      tree_for* tn_for = static_cast<tree_for *>(node);
-      if(idxSym){
-        //Replace the current tn_for's index variable with idxSym
-        //printf("replacing sym %s -> %s\n", tn_for->index()->name(), idxSym->name());
-        replacements r;
-        r.oldsyms.append(tn_for->index());
-        r.newsyms.append(idxSym);
-        tree_for* new_loop = (tree_for*)tn_for->clone_helper(&r, true);
-        idxSym = 0; //Reset for more loops in this tnl
-        new_loop->set_body(recursiveFindReplacePreferedIdxs(new_loop->body(), proc_syms, cudaSync, unkown_func, loop_idxs));
-        tnl->append(new_loop);
-        
-        if(sync){
-          in_cal *the_call =
-            new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaSync))), 0);
-          tnl->append(new tree_instr(the_call));
-          //tnl->print();
-          sync = true;
-        }
-      }else{
-        tn_for->set_body(recursiveFindReplacePreferedIdxs(tn_for->body(), proc_syms, cudaSync, unkown_func, loop_idxs));
-        tnl->append(tn_for);
-      }
-    }else if (node->kind() == TREE_IF) {
-      tree_if *tni = static_cast<tree_if *>(node);
-      tni->set_then_part(recursiveFindReplacePreferedIdxs(tni->then_part(), proc_syms, cudaSync, unkown_func, loop_idxs));
-      tnl->append(tni);
-    }
-  }
-  //Do this after the loop to not screw up the pointer interator
-  /*
-    for(int i=0; i<r1.size(); i++){
-    swap_node_for_node_list(r1[i],r2[i]);
-    }*/
-  return tnl;
-}
-
-// loop_vars -> array references
-// loop_idxs -> <idx_name,idx_sym> map for when we encounter a loop with a different preferredIndex
-// dim_vars -> out param, fills with <old,new> var_sym pair for 2D array dimentions (messy stuff)
-tree_node_list* swapVarReferences(tree_node_list* code, replacements* r, CG_suifBuilder *ocg,
-                                  std::map<std::string, var_sym*>& loop_vars,
-                                  proc_symtab *proc_syms,
-                                  std::vector< std::pair<var_sym*,var_sym*> >& dim_vars)
-{
-  //Iterate over every expression, looking up each variable and type
-  //reference used and possibly replacing it or adding it to our symbol
-  //table
-  //
-  //We use the built-in cloning helper methods to seriously help us with this!
-  
-  //Need to do a recursive mark
-  recursiveFindRefs(code, proc_syms, r);
-  
-  
-  //We can't rely on type_node->clone() to do the heavy lifting when the
-  //old type is a two dimentional array with variable upper bounds as
-  //that requires creating and saveing variable references to the upper
-  //bounds. So we do one pass over the oldtypes doing this type of
-  //conversion, putting results in the fixed_types map for a second pass
-  //to pick up.
-  std::map<type_node*,type_node*> fixed_types; //array_types needing their upper bound installed
-  type_node_list_iter tlip(&r->oldtypes);
-  while(!tlip.is_empty())
-  {
-    type_node* old_tn = tlip.step();
-    type_node* new_tn = 0;
-    type_node* base_type = old_tn;
-    std::vector< std::pair<var_sym*, type_node*> > variable_upper_bouneds;
-    if(old_tn->is_ptr()){
-      while (base_type->is_array() || base_type->is_ptr()) {
-        if (base_type->is_array()){
-          array_bound ub = ((array_type*)base_type)->upper_bound();
-          if(ub.is_variable()){
-            var_sym* old_ub = (var_sym*)ub.variable();
-            var_sym *new_ub = proc_syms->new_unique_var(type_s32);
-            dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub));
-            variable_upper_bouneds.push_back( std::pair<var_sym*, type_node*>(new_ub, base_type) );
-          }
-          base_type = static_cast<array_type *>(base_type)->elem_type();
-        }
-        else if (base_type->is_ptr())
-          base_type = static_cast<ptr_type *>(base_type)->ref_type();
-      }
-    }
-    for (int i = variable_upper_bouneds.size()-1; i >= 0; i--) {
-      var_sym *var_ub = variable_upper_bouneds[i].first;
-      type_node* old_tn = variable_upper_bouneds[i].second;
-      if(new_tn == 0)
-        new_tn = new array_type(base_type, array_bound(1), array_bound(var_ub));
-      else
-        new_tn = new array_type(new_tn, array_bound(1), array_bound(var_ub));
-      proc_syms->add_type(new_tn);
-      fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn));
-    }
-    if(new_tn){
-      if(old_tn->is_ptr()){
-        new_tn = new ptr_type(new_tn);
-        proc_syms->add_type(new_tn);
-      }
-      fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn));
-    }
-  }
-  
-  //Quickly look for modifiers on our our array types (__shared__ float [][])
-  type_node_list_iter tliq(&r->oldtypes);
-  while(!tliq.is_empty())
-  {
-    type_node* old_tn = tliq.step();
-    if(old_tn->is_modifier()){
-      type_node* base_type = static_cast<modifier_type *>(old_tn)->base();
-      if(fixed_types.find(base_type) != fixed_types.end()){
-        type_node* fixed_base = (*fixed_types.find(base_type)).second;
-        //printf("Fix modifier with fixed base\n");
-        //This should work to copy over the annotations, but apparently doesn't work so well
-        type_node* new_tn = new modifier_type(static_cast<modifier_type*>(old_tn)->op(), fixed_base);
-        old_tn->copy_annotes(new_tn);
-        fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn));
-      }
-    }
-  }
-  
-  //Run through the types and create entries in r->newtypes but don't install
-  type_node_list_iter tli(&r->oldtypes);
-  while(!tli.is_empty())
-  {
-    type_node* old_tn = tli.step();
-    type_node* new_tn = 0;
-    
-    //If we recorded this as fixed by our special case, use that type
-    //instead of cloning.
-    if(fixed_types.find(old_tn) != fixed_types.end()){
-      new_tn = (*fixed_types.find(old_tn)).second;
-      //printf("Reusing fixed typ %u: ", new_tn->type_id());
-    }else{
-      new_tn = old_tn->clone();
-      //printf("Cloning type %u: ", old_tn->type_id());
-    }
-    new_tn = proc_syms->install_type(new_tn);
-    
-    //Ok, there is a weird case where an array type that has var_sym as
-    //their upper bounds can't be covered fully in this loop or the
-    //var_sym loop, so we need special code.
-    /*
-      if(old_tn->op() == TYPE_PTR && ((ptr_type*)old_tn)->ref_type()->op() == TYPE_ARRAY){
-      array_type* outer_array = (array_type*)((ptr_type*)old_tn)->ref_type();
-      array_bound ub = outer_array->upper_bound();
-      if(ub.is_variable()){
-      var_sym* old_ub = (var_sym*)ub.variable();
-      var_sym* new_ub = (var_sym*)((array_type*)((ptr_type*)new_tn)->ref_type())->upper_bound().variable();
-      //r->oldsyms.append(old_ub);
-      fix_ub.insert(std::pair<var_sym*,array_type*>(old_ub, (array_type*)((ptr_type*)new_tn)->ref_type()));
-      dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub));
-      printf("array var_sym: %p\n", new_ub);
-      }
-      if(outer_array->elem_type()->op() == TYPE_ARRAY)
-      {
-      array_type* inner_array = (array_type*)outer_array->elem_type();
-      array_bound ub = inner_array->upper_bound();
-      if(ub.is_variable()){
-      var_sym* old_ub = (var_sym*)ub.variable();
-      var_sym* new_ub = (var_sym*)((array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())->upper_bound().variable();
-      dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub));
-      printf("array var_sym: %p\n", new_ub);
-      //r->oldsyms.append(old_ub);
-      fix_ub.insert(std::pair<var_sym*,array_type*>(old_ub, (array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type()));
-      }
-      }
-      }
-    */
-    r->newtypes.append(new_tn);
-  }
-  
-  //printf("proc_syms symbol run through\n");
-  //proc_syms->print();
-  
-  //Run through the syms creating new copies
-  sym_node_list_iter snli(&r->oldsyms);
-  while(!snli.is_empty())
-  {
-    sym_node *old_sn = snli.step();
-    
-    if(loop_vars.count(std::string(old_sn->name())) > 0)
-    {
-      r->newsyms.append(loop_vars[std::string(old_sn->name())]);
-      //printf("def exists: %s\n", old_sn->name());
-    }else{
-      sym_node *new_sn = old_sn->copy();
-      if(new_sn->is_var()){
-        var_sym* var = (var_sym*)new_sn;
-        type_node* new_type = var->type()->clone_helper(r);
-        
-        //TODO: Have a tagged list of variables to make shared
-        //Make local 2D arrays __shared__
-        if(new_type->op() == TYPE_ARRAY && ((array_type*)new_type)->elem_type()->op() == TYPE_ARRAY){
-          //protonu--changes suggested by Malik
-          //printf("Adding __shared__ annotation to : %s\n", new_sn->name());
-          //new_type = ocg->ModifyType(new_type, "__shared__");
-          //proc_syms->add_type(new_type);
-        }
-        var->set_type(new_type);
-      }
-      proc_syms->add_sym(new_sn);
-      r->newsyms.append(new_sn);
-      //printf("def new: %s\n", new_sn->name());
-    }
-  }
-  
-  //printf("proc_syms var runthrough\n");
-  //proc_syms->print();
-  return code->clone_helper(r);
-}
-
-bool LoopCuda::validIndexes(int stmt, const std::vector<std::string>& idxs){
-  for(int i=0; i<idxs.size(); i++){
-    bool found = false;
-    for(int j=0; j<idxNames[stmt].size(); j++){
-      if(strcmp(idxNames[stmt][j].c_str(), idxs[i].c_str()) == 0){
-        found=true;
-      }
-    }
-    if(!found){
-      return false;
-    }
-  }
-  return true;
-}
-
-
-bool LoopCuda::cudaize_v2(std::string kernel_name, std::map<std::string, int> array_dims,
-                          std::vector<std::string> blockIdxs, std::vector<std::string> threadIdxs)
-{
-  int stmt_num = 0;
-  if(cudaDebug){
-    printf("cudaize_v2(%s, {", kernel_name.c_str());
-    //for(
-    printf("}, blocks={"); printVs(blockIdxs); printf("}, thread={"); printVs(threadIdxs); printf("})\n");
-  }
-  
-  this->array_dims = array_dims;
-  if(!validIndexes(stmt_num, blockIdxs)){
-    throw std::runtime_error("One of the indexes in the block list was not "
-                             "found in the current set of indexes.");
-  }
-  if(!validIndexes(stmt_num, threadIdxs)){
-    throw std::runtime_error("One of the indexes in the thread list was not "
-                             "found in the current set of indexes.");
-  }
-  if(blockIdxs.size() ==0)
-    throw std::runtime_error("Cudaize: Need at least one block dimention");
-  int block_level=0;
-  //Now, we will determine the actual size (if possible, otherwise
-  //complain) for the block dimentions and thread dimentions based on our
-  //indexes and the relations for our stmt;
-  for(int i=0; i<blockIdxs.size(); i++){
-    int level = findCurLevel(stmt_num, blockIdxs[i]);
-    int ub,lb;
-    extractCudaUB(stmt_num,level,ub,lb);
-    if(lb!= 0){
-      //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
-      if(cudaDebug) printf("Cudaize: doing tile at level %d to try and normalize lower bounds\n", level);
-      tile(stmt_num,level,1,level,CountedTile);
-      idxNames[stmt_num].insert(idxNames[stmt_num].begin()+(level),"");//TODO: possibly handle this for all sibling stmts
-      extractCudaUB(stmt_num,level,ub,lb);
-    }
-    if(lb != 0){
-      char buf[1024];
-      sprintf(buf, "Cudaize: Loop at level %d does not have 0 as it's lower bound", level);
-      throw std::runtime_error(buf);
-    }
-    if(ub < 0){
-      char buf[1024];
-      sprintf(buf, "Cudaize: Loop at level %d does not have a hard upper bound", level);
-      throw std::runtime_error(buf);
-    }
-    if(cudaDebug) printf("block idx %s level %d lb: %d ub %d\n", blockIdxs[i].c_str(), level, lb, ub);
-    if(i == 0){
-      block_level = level;
-      cu_bx = ub+1;
-      idxNames[stmt_num][level-1] = "bx";
-    }
-    else if(i == 1){
-      cu_by = ub+1;
-      idxNames[stmt_num][level-1] = "by";
-    }
-  }
-  if(!cu_by)
-    block_level=0;
-  int thread_level1 = 0;
-  int thread_level2 = 0;
-  for(int i=0; i<threadIdxs.size(); i++){
-    int level = findCurLevel(stmt_num, threadIdxs[i]);
-    int ub,lb;
-    extractCudaUB(stmt_num,level,ub,lb);
-    if(lb!= 0){
-      //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
-      if(cudaDebug) printf("Cudaize: doing tile at level %d to try and normalize lower bounds\n", level);
-      tile(stmt_num,level,1,level,CountedTile);
-      idxNames[stmt_num].insert(idxNames[stmt_num].begin()+(level),"");
-      extractCudaUB(stmt_num,level,ub,lb);
-    }
-    if(lb != 0){
-      char buf[1024];
-      sprintf(buf, "Cudaize: Loop at level %d does not have 0 as it's lower bound", level);
-      throw std::runtime_error(buf);
-    }
-    if(ub < 0){
-      char buf[1024];
-      sprintf(buf, "Cudaize: Loop at level %d does not have a hard upper bound", level);
-      throw std::runtime_error(buf);
-    }
-    
-    if(cudaDebug) printf("thread idx %s level %d lb: %d ub %d\n", threadIdxs[i].c_str(), level, lb, ub);
-    if(i == 0){
-      thread_level1 = level;
-      cu_tx = ub+1;
-      idxNames[stmt_num][level-1] = "tx";
-    }
-    else if(i == 1){
-      thread_level2 = level;
-      cu_ty = ub+1;
-      idxNames[stmt_num][level-1] = "ty";
-    }
-    else if(i == 2){
-      cu_tz = ub+1;
-      idxNames[stmt_num][level-1] = "tz";
-    }
-  }
-  if(!cu_ty)
-    thread_level1 = 0; 
-  if(!cu_tz)
-    thread_level2 = 0; 
-  
-  //Make changes to nonsplitlevels
-  const int m = stmt.size();
-  for (int i = 0; i < m; i++) {
-    if(block_level){
-      //stmt[i].nonSplitLevels.append((block_level)*2);
-      stmt_nonSplitLevels[i].append((block_level)*2);
-    }
-    if(thread_level1){
-      //stmt[i].nonSplitLevels.append((thread_level1)*2);
-      stmt_nonSplitLevels[i].append((thread_level1)*2);
-    }
-    if(thread_level2){
-      //stmt[i].nonSplitLevels.append((thread_level1)*2);
-      stmt_nonSplitLevels[i].append((thread_level1)*2);
-    }
-  }
-  
-  if(cudaDebug) {
-    printf("Codegen: current names: ");
-    printVS(idxNames[stmt_num]);
-  }
-  //Set codegen flag
-  code_gen_flags |= GenCudaizeV2;
-  
-  //Save array dimention sizes
-  this->array_dims = array_dims;
-  cu_kernel_name = kernel_name.c_str();
-  
-}
-
-tree_node_list* LoopCuda::cudaize_codegen_v2()
-{
-    //printf("cudaize codegen V2\n");
-  CG_suifBuilder *ocg = dynamic_cast<CG_suifBuilder*>(ir->builder());
-  if(!ocg) return false;
-  
-  //protonu--adding an annote to track texture memory type
-  ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE);
-  ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE);
-  int tex_mem_on = 0;
-  int cons_mem_on = 0;
-  
-  
-  
-  CG_outputRepr* repr;
-  std::vector<VarDefs> arrayVars;
-  std::vector<VarDefs> localScopedVars;
-  
-  std::vector<IR_ArrayRef *> ro_refs;
-  std::vector<IR_ArrayRef *> wo_refs;
-  std::set<std::string> uniqueRefs;
-  std::set<std::string> uniqueWoRefs;
-  //protonu--let's try a much simpler approach of a map instead
-  //we also keep a map for constant memories
-  std::map<std::string , var_sym *>tex_ref_map;
-  std::map<std::string , var_sym *>cons_ref_map;
-  
-  for(int j=0; j<stmt.size(); j++)
-  {
-    std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[j].code);
-    for (int i = 0; i < refs.size(); i++)
-    {
-      //printf("ref %s wo %d\n", static_cast<const char*>(refs[i]->name()), refs[i]->is_write());
-      var_sym* var = symtab->lookup_var((char*)refs[i]->name().c_str(),false);
-      //If the array is not a parameter, then it's a local array and we
-      //want to recreate it as a stack variable in the kernel as opposed to
-      //passing it in.
-      if(!var->is_param())
-        continue;
-      if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end())
-      {
-        uniqueRefs.insert(refs[i]->name());
-        if(refs[i]->is_write()){
-          uniqueWoRefs.insert(refs[i]->name());
-          wo_refs.push_back(refs[i]);
-        }
-        else
-          ro_refs.push_back(refs[i]);
-      }
-      if (refs[i]->is_write() && uniqueWoRefs.find(refs[i]->name()) == uniqueWoRefs.end()){
-        uniqueWoRefs.insert(refs[i]->name());
-        wo_refs.push_back(refs[i]);
-        //printf("adding %s to wo\n", static_cast<const char*>(refs[i]->name()));
-      }
-    }
-  }
-  
-  // printf("reading from array ");
-  // for(int i=0; i<ro_refs.size(); i++)
-  //   printf("'%s' ", ro_refs[i]->name().c_str());
-  // printf("and writting to array ");
-  // for(int i=0; i<wo_refs.size(); i++)
-  //   printf("'%s' ", wo_refs[i]->name().c_str());
-  // printf("\n");
-  
-  const char* gridName = "dimGrid";
-  const char* blockName = "dimBlock";
-  
-  //TODO: Could allow for array_dims_vars to be a mapping from array
-  //references to to variable names that define their length.
-  var_sym* dim1 = 0;
-  var_sym* dim2 = 0;
-  
-  for(int i=0; i<wo_refs.size(); i++)
-  {
-    //TODO: Currently assume all arrays are floats of one or two dimentions
-    var_sym* outArray = 0;
-    std::string name = wo_refs[i]->name();
-    outArray = symtab->lookup_var((char*)name.c_str(),false);
-    
-    VarDefs v;
-    v.size_2d = -1;
-    char buf[32];
-    snprintf(buf, 32, "devO%dPtr", i+1);
-    v.name = buf;
-    if(outArray->type()->is_ptr())
-      if(((ptr_type *)(outArray->type()))->ref_type()->is_array())
-        v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type();
-      else
-        v.type = ((ptr_type *)(outArray->type()))->ref_type();
-    else
-      v.type = type_f32;
-    v.tex_mapped = false;
-    v.cons_mapped = false;
-    v.original_name = wo_refs[i]->name();
-    //Size of the array = dim1 * dim2 * num bytes of our array type
-    
-    //If our input array is 2D (non-linearized), we want the actual
-    //dimentions of the array
-    CG_outputRepr* size;
-    //Lookup in array_dims
-    std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
-    if(outArray->type()->is_ptr() && outArray->type()->ref_type(0)->is_array())
-    {
-      array_type* t = (array_type*)outArray->type()->ref_type(0);
-      v.size_2d = t->upper_bound().constant()+1;
-      printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)wo_refs[i]->name().c_str());
-      size = ocg->CreateInt(v.size_2d * v.size_2d);
-    }else if(it != array_dims.end()){
-      int ref_size = it->second;
-      v.var_ref_size = ref_size;
-      size = ocg->CreateInt(ref_size);
-    }
-    else{
-      if(dim1){
-        size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)),
-                                new CG_suifRepr(operand(dim2)));
-      }else{
-        char buf[1024];
-        sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a "
-                "detectable size or specififed dimentions", name.c_str());
-        throw std::runtime_error(buf);
-      }
-    }
-    v.size_expr = operand(static_cast<CG_suifRepr*>(ocg->CreateTimes(
-                                                      size,
-                                                      ocg->CreateInt(v.type->size()/8)))->GetExpression());
-    v.in_data = 0;
-    v.out_data = outArray;
-    //Check for in ro_refs and remove it at this point
-    std::vector<IR_ArrayRef *>::iterator it_;
-    for(it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++)
-    {
-      if((*it_)->name() == wo_refs[i]->name()){
-        break;
-      }
-    }
-    if(it_ != ro_refs.end())
-    {
-      v.in_data = outArray;
-      ro_refs.erase(it_);
-    }
-    
-    arrayVars.push_back(v);
-    
-  }
-  
-  //protonu-- assuming that all texture mapped memories were originally read only mems
-  //there should be safety checks for that, will implement those later
-  
-  int cs_ref_size = 0;
-  
-  for(int i=0; i<ro_refs.size(); i++)
-  {
-    var_sym* inArray = 0;
-    std::string name = ro_refs[i]->name();
-    inArray = symtab->lookup_var((char*)name.c_str(),false);
-    VarDefs v;
-    v.size_2d = -1;
-    char buf[32];
-    snprintf(buf, 32, "devI%dPtr", i+1);
-    v.name = buf;
-    if(inArray->type()->is_ptr())
-      if(((ptr_type *)(inArray->type()))->ref_type()->is_array())
-        v.type = ((array_type *)(((ptr_type *)(inArray->type()))->ref_type()))->elem_type();
-      else
-        v.type = ((ptr_type *)(inArray->type()))->ref_type(); 
-    else
-      v.type = type_f32;
-    v.tex_mapped = false;
-    v.cons_mapped = false;
-    v.original_name = ro_refs[i]->name();
-    if ( texture != NULL)
-      v.tex_mapped = (texture->is_array_tex_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
-    if (v.tex_mapped){
-      printf("this variable  %s is mapped to texture memory", name.c_str());
-    }
-    if ( constant_mem != NULL)
-      v.cons_mapped = (constant_mem->is_array_cons_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
-    if (v.cons_mapped){
-      printf("this variable  %s is mapped to constant memory", name.c_str());
-    }
-    
-    //Size of the array = dim1 * dim2 * num bytes of our array type
-    
-    //If our input array is 2D (non-linearized), we want the actual
-    //dimentions of the array (as it might be less than cu_n
-    CG_outputRepr* size;
-    //Lookup in array_dims
-    std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
-    int ref_size = 0;
-    if(inArray->type()->is_ptr() && inArray->type()->ref_type(0)->is_array())
-    {
-      array_type* t = (array_type*)inArray->type()->ref_type(0);
-      v.size_2d = t->upper_bound().constant()+1;
-      printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)ro_refs[i]->name().c_str());
-      size = ocg->CreateInt(v.size_2d * v.size_2d);
-    }else if(it != array_dims.end()){
-      ref_size = it->second;
-      v.var_ref_size = ref_size;
-      size = ocg->CreateInt(ref_size);
-    }else{
-      if(dim1){
-        size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)),
-                                new CG_suifRepr(operand(dim2)));
-      }else{
-        char buf[1024];
-        sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a "
-                "detectable size or specififed dimentions", name.c_str());
-        throw std::runtime_error(buf);
-      }
-    }
-    
-    
-    
-    v.size_expr = operand(static_cast<CG_suifRepr*>(ocg->CreateTimes(
-                                                      size,
-                                                      ocg->CreateInt(v.type->size()/8)))->GetExpression());
-    
-    v.in_data = inArray;
-    v.out_data = 0;
-    arrayVars.push_back(v);
-  }
-  
-  
-  if(arrayVars.size() < 2)
-  {
-    fprintf(stderr, "cudaize error: Did not find two arrays being accessed\n");
-    return false;
-  }
-  
-  //protonu--debugging tool--the printf statement
-  //tex_mem_on signals use of tex mem
-  for(int i=0; i<arrayVars.size(); i++)
-  {
-    //printf("var name %s, tex_mem used %s\n", arrayVars[i].name.c_str(), (arrayVars[i].tex_mapped)?"true":"false");
-    if (arrayVars[i].tex_mapped  ) tex_mem_on ++;
-    if (arrayVars[i].cons_mapped  ) cons_mem_on ++;
-  }
-  
-  //Add CUDA function extern prototypes and function types
-  func_type* unkown_func = new func_type(type_s32); //function on unkown args that returns a i32
-  unkown_func = (func_type*)symtab->install_type(unkown_func);
-  func_type* void_func = new func_type(type_void); //function on unkown args that returns a void
-  void_func = (func_type*)globals->install_type(void_func);
-  func_type* float_func = new func_type(type_f32); //function on unkown args that returns a float
-  float_func = (func_type*)globals->install_type(float_func);
-  
-  type_node* result = ocg->ModifyType(type_void, "__global__");
-  result = globals->install_type(result);
-  func_type* kernel_type = new func_type(result); //function returns a '__global__ void'
-  
-  int numArgs =  arrayVars.size() + (dim1 ? 2 : 0) + localScopedVars.size();
-  //protonu--need to account for texture memory here, reduce the #args
-  if( tex_mem_on ) numArgs -= tex_mem_on;
-  if( cons_mem_on ) numArgs -= cons_mem_on;
-  kernel_type->set_num_args(numArgs);
-  int argCount = 0;
-  for(int i=0; i<arrayVars.size(); i++)
-  {
-    type_node* fptr;
-    if(arrayVars[i].in_data)
-      fptr = arrayVars[i].in_data->type()->clone();
-    else
-      fptr = arrayVars[i].out_data->type()->clone();
-    //protonu--skip this for texture mems
-    if( arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true )
-      kernel_type->set_arg_type(argCount++, fptr);
-  }
-  if(dim1){
-    kernel_type->set_arg_type(argCount++, type_s32); //width x height dimentions
-    kernel_type->set_arg_type(argCount++, type_s32);
-  }
-  kernel_type = (func_type*)globals->install_type(kernel_type);
-  
-  proc_sym* cudaMalloc = globals->new_proc(unkown_func, src_c, "cudaMalloc");
-  proc_sym* cudaMemcpy = globals->new_proc(unkown_func, src_c, "cudaMemcpy");
-  proc_sym* cudaFree = globals->new_proc(unkown_func, src_c, "cudaFree");
-  proc_sym* cudaSync = globals->new_proc(void_func, src_c, "__syncthreads");
-  proc_sym* cudaBind = globals->new_proc(unkown_func, src_c, "cudaBindTexture");
-  proc_sym* cudaMemcpySym = globals->new_proc(unkown_func, src_c, "cudaMemcpyToSymbol");
-  
-  
-  //protonu-removing Gabe's function, introducing mine, this is pretty cosmetic
-  //proc_sym* cudaFetch = globals->new_proc(float_func, src_c, "tex1Dfetch");
-  proc_sym* tex1D = globals->new_proc(float_func, src_c, "tex1Dfetch");
-  
-  var_sym *cudaMemcpyHostToDevice = new var_sym(type_s32, "cudaMemcpyHostToDevice");
-  var_sym *cudaMemcpyDeviceToHost = new var_sym(type_s32, "cudaMemcpyDeviceToHost");
-  cudaMemcpyDeviceToHost->set_param();
-  cudaMemcpyHostToDevice->set_param();
-  globals->add_sym(cudaMemcpyHostToDevice);
-  globals->add_sym(cudaMemcpyDeviceToHost);
-  
-  //protonu--adding the bool tex_mem to the structure struct_type
-  //to bypass the re-naming of struct texture, this is a hack fix
-  struct_type* texType = new struct_type(TYPE_GROUP, 0, "texture<float, 1, cudaReadModeElementType>", 0, true);
-  immed_list *iml_tex = new immed_list;
-  iml_tex->append(immed("texture memory"));
-  texType->append_annote(k_cuda_texture_memory, iml_tex);
-  //protonu--end my changes
-  texType = (struct_type*)globals->install_type(texType);
-  //protonu--should register the locals later on
-  //when we do the bind operation
-  //var_sym* texRef = new var_sym(texType, "texRef");
-  //globals->add_sym(texRef);
-  
-  //Add our mallocs (and input array memcpys)
-  for(int i=0; i<arrayVars.size(); i++)
-  {
-    //protonu--check if the variable is not a tex-mapped variable. If it is tex mapped
-    // allow a malloc and memcpy operation, and a bind, but only if it is tex mapped, but dont call
-    // the kernel with it as an argument.
-    
-    //Make a pointer of type a[i].type
-    //type_node* fptr = new ptr_type(arrayVars[i].type->clone());
-    //protonu--temporary change 
-    type_node* fptr = new ptr_type(arrayVars[i].type);
-    fptr = symtab->install_type(fptr);
-    var_sym *dvs = new var_sym(fptr, const_cast<char*>(
-                                 arrayVars[i].name.c_str()));
-    dvs->set_addr_taken();
-    symtab->add_sym(dvs);
-    
-    //cudaMalloc args
-    //protonu--no cudaMalloc required for constant memory
-    tree_node_list* tnl = new tree_node_list;
-    if(arrayVars[i].cons_mapped != true )
-    {
-      in_cal *the_call =
-        new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMalloc))), 2);
-      the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to()->ptr_to(), operand(), immed(dvs))));
-      the_call->set_argument(1, arrayVars[i].size_expr);
-      
-      tnl->append(new tree_instr(the_call));
-      setup_code = ocg->StmtListAppend(setup_code,
-                                       new CG_suifRepr(tnl));
-    }
-    if(arrayVars[i].in_data)
-    {
-      //cudaMemcpy args
-      //protonu-- no cudaMemcpy required for constant memory
-      if ( arrayVars[i].cons_mapped != true )
-      {
-        in_cal *the_call =
-          new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4);
-        the_call->set_argument(0, operand(dvs));
-        the_call->set_argument(1, operand(arrayVars[i].in_data));
-        the_call->set_argument(2, arrayVars[i].size_expr.clone());
-        the_call->set_argument(3, operand(cudaMemcpyHostToDevice));
-        
-        tnl = new tree_node_list;
-        tnl->append(new tree_instr(the_call));
-        setup_code = ocg->StmtListAppend(setup_code,
-                                         new CG_suifRepr(tnl));
-      }
-      
-      //protonu--check if the arrayvar is tex mapped
-      if(arrayVars[i].tex_mapped == true)
-      {
-        //Need a texture reference variable
-        char buf[32];
-        snprintf(buf, 32, "tex%dRef", i+1);
-        arrayVars[i].secondName = buf;
-        
-        var_sym* texRef = new var_sym(texType, buf);
-        //printf("\n putting in %s\n", arrayVars[i].original_name.c_str());
-        tex_ref_map[arrayVars[i].original_name] = texRef;
-        globals->add_sym(texRef);
-        //protonu--added the above two lines
-        
-        in_cal *the_call =
-          new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaBind))), 4);
-        in_ldc *ins = new in_ldc(type_s32, operand(), immed(0));
-        the_call->set_argument(0, operand(ins));
-        the_call->set_argument(1, operand(texRef));//protonu--change to add the new sym
-        the_call->set_argument(2, operand(dvs));
-        the_call->set_argument(3, arrayVars[i].size_expr.clone());
-        
-        tnl = new tree_node_list;
-        tnl->append(new tree_instr(the_call));
-        setup_code = ocg->StmtListAppend(setup_code,
-                                         new CG_suifRepr(tnl));
-      }
-      
-      //protonu--if arrayvar is mapped to constant memory
-      if(arrayVars[i].cons_mapped == true)
-      {
-        char buf[32];
-        snprintf(buf, 32, "cs%dRef", i+1);
-        //arrayVars[i].secondName = buf;
-        array_bound low (0);
-        array_bound high (arrayVars[i].var_ref_size -1);
-        array_type *arr = new array_type(arrayVars[i].type,low, high);
-        type_node* cons_arr = ocg->ModifyType(arr, "__device__ __constant__");
-        cons_arr = globals->install_type(cons_arr);
-        var_sym* consRef = new var_sym(cons_arr, buf);
-        cons_ref_map[arrayVars[i].original_name] = consRef;
-        globals->add_sym(consRef);
-        
-        
-        
-        in_cal *the_call =
-          new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpySym))), 3);
-        the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to(), operand(), immed(consRef))));
-        the_call->set_argument(1, operand(arrayVars[i].in_data));
-        the_call->set_argument(2, arrayVars[i].size_expr.clone());
-        
-        tnl = new tree_node_list;
-        tnl->append(new tree_instr(the_call));
-        setup_code = ocg->StmtListAppend(setup_code,
-                                         new CG_suifRepr(tnl));
-        
-      }
-    }
-  }
-  
-  //Build dimGrid dim3 variables based on loop dimentions and ti/tj
-  char blockD1[120];
-  char blockD2[120];
-  if(dim1){
-    snprintf(blockD1, 120, "%s/%d", dim1->name(), cu_tx);
-    snprintf(blockD2, 120, "%s/%d", dim2->name(), cu_ty);
-  }else{
-    snprintf(blockD1, 120, "%d", cu_bx);
-    snprintf(blockD2, 120, "%d", cu_by);
-    //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx);
-    //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty);
-  }
-  repr = ocg->CreateDim3(immed((char*)gridName),
-                         immed(blockD1),
-                         immed(blockD2));
-  setup_code = ocg->StmtListAppend(setup_code, repr);
-  
-  repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx),immed(cu_ty));
-  
-  if(cu_tz > 1)
-    repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty), immed(cu_tz));
-  else
-    repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty));
-  setup_code = ocg->StmtListAppend(setup_code, repr);
-  
-  //call kernel function with name loop_name
-  //like: transpose_k<<<dimGrid,dimBlock>>>(devOPtr, devIPtr , width, height);
-  char dims[120];
-  snprintf(dims,120,"<<<%s,%s>>>",gridName, blockName);
-  immed_list *iml = new immed_list;
-  iml->append(immed((char*)cu_kernel_name.c_str()));
-  iml->append(immed(dims));
-  //printf("%s %s\n", static_cast<const char*>(cu_kernel_name), dims);
-  for(int i=0; i<arrayVars.size(); i++)
-    //Throw in a type cast if our kernel takes 2D array notation
-    //like (float(*) [1024])
-  {
-    //protonu--throwing in another hack to stop the caller from passing tex mapped 
-    //vars to the kernel.
-    if(arrayVars[i].tex_mapped == true || arrayVars[i].cons_mapped == true )
-      continue;     
-    if(arrayVars[i].size_2d >= 0)
-    {
-      snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d,
-               const_cast<char*>(arrayVars[i].name.c_str()));
-      //printf("%d %s\n", i, dims);
-      iml->append(immed(dims));
-    }else{
-      //printf("%d %s\n", i, static_cast<const char*>(arrayVars[i].name));
-      iml->append(immed(const_cast<char*>(
-                          arrayVars[i].name.c_str())));
-    }
-  }
-  if(dim1){
-    iml->append(immed(dim1));
-    iml->append(immed(dim2));
-  }
-  repr = ocg->CreateKernel(iml);//kernel call
-  setup_code = ocg->StmtListAppend(setup_code, repr);
-  
-  //cuda free variables
-  for(int i=0; i<arrayVars.size(); i++)
-  {
-    if(arrayVars[i].out_data)
-    {
-      //cudaMemcpy args
-      in_cal *the_call =
-        new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4);
-      the_call->set_argument(0, operand(arrayVars[i].out_data));
-      the_call->set_argument(1, operand(symtab->lookup_var(const_cast<char*>(
-                                                             arrayVars[i].name.c_str()))));
-      the_call->set_argument(2, arrayVars[i].size_expr.clone());
-      the_call->set_argument(3, operand(cudaMemcpyDeviceToHost));
-      
-      tree_node_list* tnl = new tree_node_list;
-      tnl->append(new tree_instr(the_call));
-      teardown_code = ocg->StmtListAppend(teardown_code,
-                                          new CG_suifRepr(tnl));
-    }
-    
-    in_cal *the_call =
-      new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaFree))), 1);
-    the_call->set_argument(0, operand(symtab->lookup_var(const_cast<char*>(
-                                                           arrayVars[i].name.c_str()))));
-    
-    tree_node_list* tnl = new tree_node_list;
-    tnl->append(new tree_instr(the_call));
-    teardown_code = ocg->StmtListAppend(teardown_code,
-                                        new CG_suifRepr(tnl));
-  }
-  
-  // ---------------
-  // BUILD THE KERNEL
-  // ---------------
-  
-  //Extract out kernel body
-  tree_node_list* code = getCode();
-  //Get rid of wrapper if that original() added
-  if(code->head()->contents->kind() == TREE_IF)
-  {
-    tree_if* ifn = (tree_if*)code->head()->contents;
-    code = ifn->then_part();
-  }
-  
-  //Create kernel function body
-  proc_sym *new_psym = globals->new_proc(kernel_type, src_c, (char*)cu_kernel_name.c_str());
-  proc_symtab *new_proc_syms = new proc_symtab(new_psym->name());
-  globals->add_child(new_proc_syms);
-  
-  //Add Params
-  std::map<std::string, var_sym*> loop_vars;
-  //In-Out arrays 
-  type_node* fptr;
-  for(int i=0; i<arrayVars.size(); i++)
-  {
-    if(arrayVars[i].in_data)
-      //fptr = arrayVars[i].in_data->type()->clone();
-      fptr = arrayVars[i].in_data->type();
-    else
-      //fptr = arrayVars[i].out_data->type()->clone();
-      fptr = arrayVars[i].out_data->type();
-    fptr = new_proc_syms->install_type(fptr);
-    std::string name = arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name();
-    var_sym* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name());
-    //protonu--adding a check to ensure that texture memories are not passed in as arguments
-    if(arrayVars[i].tex_mapped != true     && arrayVars[i].cons_mapped !=true  ) 
-    {
-      sym->set_param();
-      new_proc_syms->params()->append(sym);
-      new_proc_syms->add_sym(sym);//protonu--added to suppress the addition of the redundant var in the kernel
-    }
-    if (arrayVars[i].cons_mapped == true)
-    {       
-      sym->set_param();
-      new_proc_syms->add_sym(sym);
-    }
-    //printf("inserting name: %s\n", static_cast<const char*>(name));
-    loop_vars.insert(std::pair<std::string, var_sym*>(std::string(name), sym));
-  }
-  
-  if(dim1)
-  {
-    //Array dimentions
-    var_sym* kdim1 = new var_sym(dim1->type(), dim1->name());
-    kdim1->set_param();
-    new_proc_syms->add_sym(kdim1);
-    loop_vars.insert(std::pair<std::string, var_sym*>(std::string(dim1->name()), kdim1));
-    var_sym* kdim2 = new var_sym(dim2->type(), dim2->name());
-    kdim2->set_param();
-    new_proc_syms->add_sym(kdim2);
-    loop_vars.insert(std::pair<std::string, var_sym*>(std::string(dim2->name()), kdim2));
-    new_proc_syms->params()->append(kdim1);
-    new_proc_syms->params()->append(kdim2);
-  }
-  //Put block and thread implicit variables into scope
-  std::vector<var_sym *> index_syms;
-  /* Currently we don't use the block dimentions
-     var_sym* blockDim_x = new var_sym(type_s32, "blockDim.x");
-     blockDim_x->set_param();
-     new_proc_syms->add_sym(blockDim_x);
-     var_sym* blockDim_y = new var_sym(type_s32, "blockDim.y");
-     blockDim_y->set_param();
-     new_proc_syms->add_sym(blockDim_y);
-  */
-  if(cu_bx > 1){
-    var_sym* blockIdx_x = new var_sym(type_s32, "blockIdx.x");
-    blockIdx_x->set_param();
-    new_proc_syms->add_sym(blockIdx_x);
-    index_syms.push_back(blockIdx_x);
-  }
-  if(cu_by > 1){
-    var_sym* blockIdx_y = new var_sym(type_s32, "blockIdx.y");
-    blockIdx_y->set_param();
-    new_proc_syms->add_sym(blockIdx_y);
-    index_syms.push_back(blockIdx_y);
-  }
-  if(cu_tx > 1){
-    var_sym* threadIdx_x = new var_sym(type_s32, "threadIdx.x");
-    threadIdx_x->set_param();
-    new_proc_syms->add_sym(threadIdx_x);
-    index_syms.push_back(threadIdx_x);
-  }
-  if(cu_ty > 1){
-    var_sym* threadIdx_y = new var_sym(type_s32, "threadIdx.y");
-    threadIdx_y->set_param();
-    new_proc_syms->add_sym(threadIdx_y);
-    index_syms.push_back(threadIdx_y);
-  }
-  
-  if(cu_tz > 1){
-    var_sym* threadIdx_z = new var_sym(type_s32, "threadIdx.z");
-    threadIdx_z->set_param();
-    new_proc_syms->add_sym(threadIdx_z);
-    index_syms.push_back(threadIdx_z);
-  }
-  
-  //Figure out which loop variables will be our thread and block dimention variables
-  std::vector<var_sym *> loop_syms;
-  //Get our indexes
-  std::vector<const char*> indexes;// = get_loop_indexes(code,cu_num_reduce);
-  int threadsPos=0;
-  if(cu_bx > 1)
-    indexes.push_back("bx");
-  if(cu_by > 1)
-    indexes.push_back("by");
-  if(cu_tx > 1){
-    threadsPos = indexes.size();
-    indexes.push_back("tx");
-  }
-  if(cu_ty > 1)
-    indexes.push_back("ty");
-  if(cu_tz > 1)
-    indexes.push_back("tz");
-  for(int i=0; i<indexes.size(); i++)
-  {
-    //printf("indexes[%d] = %s\n", i, (char*)indexes[i]);
-    loop_syms.push_back(new var_sym(type_s32, (char*)indexes[i]));
-    new_proc_syms->add_sym(loop_syms[i]);
-    //loop_vars.insert(std::pair<std::string, var_sym*>(std::string(indexes[i]), loop_syms[i]));
-  }
-  
-  //Generate this code
-  //int bx = blockIdx.x
-  //int by = blockIdx.y
-  //int tx = threadIdx.x
-  //int ty = threadIdx.y
-  CG_outputRepr *body=NULL;
-  for(int i=0; i<indexes.size(); i++){
-    CG_outputRepr *lhs = new CG_suifRepr(operand(loop_syms[i]));
-    //body = ocg->StmtListAppend(body, ocg->CreateStmtList(
-    //                             ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i])))));
-    body = ocg->StmtListAppend(body, ocg->StmtListAppend(
-                                 ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))), NULL));
-  }
-  
-  //Get our inital code prepped for loop reduction. First we need to swap
-  //out internal SUIF variable references to point to the new local
-  //function symbol table.
-  std::map<std::string, var_sym*> loop_idxs; //map from idx names to their new syms
-  std::vector< std::pair<var_sym*, var_sym*> > dim_vars; //pair is of <old,new> var_sym (for 2D array size initializations)
-  replacements r;
-  tree_node_list* swapped = swapVarReferences(code, &r, ocg, loop_vars, new_proc_syms, dim_vars);
-  //printf("\n code before recursiveFindReplacePreferedIdxs :\n");
-  //swapped->print();
-  swapped = recursiveFindReplacePreferedIdxs(swapped, new_proc_syms, cudaSync, void_func, loop_idxs);//in-place swapping
-  //printf("\n code after recursiveFindReplacePreferedIdxs :\n");
-  //swapped->print();
-  
-  for(int i=0; i<indexes.size(); i++){
-    std::vector<tree_for*> tfs = findCommentedFors(indexes[i], swapped);
-    for(int k=0; k<tfs.size(); k++){
-      //printf("replacing %p tfs for index %s\n", tfs[k], indexes[i]);
-      tree_node_list* newBlock = forReduce(tfs[k], loop_idxs[indexes[i]], new_proc_syms);
-      //newBlock->print();
-      swap_node_for_node_list(tfs[k], newBlock);
-      //printf("AFTER SWAP\n");        newBlock->print();
-    }
-  }
-  //printf("AFTER REDUCE\n"); swapped->print();
-  
-  if(static_cast<const IR_cudasuifCode *>(ir)->init_code()){
-    tree_node_list* orig_init_code = static_cast<CG_suifRepr *>(static_cast<const IR_cudasuifCode *>(ir)->init_code())->GetCode();
-    for(int i=0; i<dim_vars.size(); i++){
-      //We have a map of var_sym from the original function body and we know
-      //that these var_syms have initialization statements which define the
-      //array size. We need to mimic these initialization statements.
-      
-      //First find the assignment and pull out the constant initialization
-      //value
-      int value = -1;
-      tree_node_list_iter tnli(orig_init_code);
-      while (!tnli.is_empty()) {
-        tree_node *node = tnli.step();
-        if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr)
-        {
-          in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr();
-          //expect the structure: cpy( _ = min(grab_me, _))
-          if(inst->opcode() == io_cpy && inst->dst_op().is_symbol()){
-            //printf("looking at instruction: ");
-            //inst->print();
-            var_sym* dest = inst->dst_op().symbol();
-            if(dest == dim_vars[i].first)
-            {
-              if(inst->src1_op().is_instr() && inst->src1_op().instr()->format() == inf_ldc){
-                value = ((in_ldc*)inst->src1_op().instr())->value().integer();
-              }
-            }
-          }
-        }
-      }
-      if(value < 0){
-        fprintf(stderr, "ERROR: Could not find initializing statement for variable used in upper_bound of array type");
-      }
-      CG_outputRepr *lhs = new CG_suifRepr(operand(dim_vars[i].second));
-      //body = ocg->StmtListAppend(body, ocg->CreateStmtList(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value))));
-      body = ocg->StmtListAppend(body, ocg->StmtListAppend(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)), NULL));
-    }
-  }
-  
-  
-  body = ocg->StmtListAppend(body, new CG_suifRepr(swapped));
-  
-  //protonu--lets try creating our function definiton here
-  var_sym *tsym = NULL;
-  
-  
-  std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(body);
-  for(int i=0; i<refs.size(); i++)
-  {
-    //check if the array is tex mapped
-    if(texture != NULL && texture->is_array_tex_mapped(refs[i]->name().c_str()))
-    {
-      //protonu--our new tex lookup function
-      in_cal *tex_lookup =
-        new in_cal(type_f32, operand(), operand(new in_ldc(float_func->ptr_to(), operand(), immed(tex1D))), 2);
-      
-      //printf("name of the array to be mapped is %s\n", refs[i]->name().c_str());
-      tsym = tex_ref_map[refs[i]->name()];
-      tex_lookup->set_argument(0, operand(tsym));
-      
-      
-      int array_dims = ((IR_suifArrayRef *)refs[i])->ia_->dims();
-      
-      if (array_dims == 1){ 
-        tex_lookup->set_argument(1, ((IR_suifArrayRef *)refs[i])->ia_->index(0).clone());
-      }else if (array_dims > 2) {
-        printf(" \n we don't handle more than 2D arrays mapped to textures yet\n");
-      }else if (array_dims == 2) {
-        
-        IR_ArraySymbol *sym = refs[i]->symbol();
-        CG_outputRepr *sz = sym->size(1);
-        delete sym;  // free the wrapper object only
-        // find the builder ocg
-        CG_outputRepr *expr = ocg->CreateTimes(sz->clone(),refs[i]->index(0));
-        delete sz; // free the wrapper object only
-        expr = ocg->CreatePlus(expr, refs[i]->index(1));
-        // expr holds the 1D access expression and take it out
-        tex_lookup->set_argument(1, ((CG_suifRepr *)expr)->GetExpression());
-      }
-      
-      //using chun's function to replace the array look up with the function call
-      ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(tex_lookup)));
-    }
-    
-  }
-  
-  
-  tsym = NULL;
-  //protonu--now let's try what we did above for constant memory
-  for(int i=0; i<refs.size(); i++)
-  {
-    //check if the array is tex mapped
-    if(constant_mem != NULL && constant_mem->is_array_cons_mapped(refs[i]->name().c_str()))
-    {
-      
-      //printf("name of the array to be cons mapped is %s\n", refs[i]->name().c_str());
-      tsym = cons_ref_map[refs[i]->name()];
-      //we should create a IR_SuifArray here
-      IR_ArraySymbol *ar_sym = new IR_suifArraySymbol(ir,tsym);
-      std::vector<CG_outputRepr *> ar_index;
-      ar_index.push_back(((IR_suifArrayRef *)refs[i])->index(0));
-      IR_ArrayRef *ar_ref = ((IR_suifCode *)ir)->CreateArrayRef(ar_sym, ar_index);
-      //using chun's function to replace the array look up with the function call
-      ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(((IR_suifArrayRef *)ar_ref)->ia_)));
-      
-    }
-  }
-  
-  
-  tree_proc *new_body = new tree_proc(static_cast<CG_suifRepr*>(body)->GetCode(), new_proc_syms);
-  //globals->add_child(new_proc_syms);
-  new_psym->set_block(new_body);
-  new_procs.push_back(new_psym);
-  
-  return swapped;
-}
-
-//Order taking out dummy variables
-std::vector<std::string> cleanOrder(std::vector<std::string> idxNames){
-  std::vector<std::string> results;
-  for(int j=0; j<idxNames.size(); j++){
-    if(idxNames[j].length() != 0)
-      results.push_back(idxNames[j]);
-  }
-  return results;
-}
-
-//First non-dummy level in ascending order
-int LoopCuda::nonDummyLevel(int stmt, int level){
-  //level comes in 1-basd and should leave 1-based
-  for(int j=level-1; j<idxNames[stmt].size(); j++){
-    if(idxNames[stmt][j].length() != 0){
-      //printf("found non dummy level of %d with idx: %s when searching for %d\n", j+1, (const char*) idxNames[stmt][j], level);
-      return j+1;
-    }
-  }
-  char buf[128]; sprintf(buf, "%d", level);
-  throw std::runtime_error(std::string("Unable to find a non-dummy level starting from ") + std::string(buf));
-}
-
-int LoopCuda::findCurLevel(int stmt, std::string idx){
-  for(int j=0; j<idxNames[stmt].size(); j++){
-    if(strcmp(idxNames[stmt][j].c_str(),idx.c_str()) == 0)
-      return j+1;
-  }
-  throw std::runtime_error(std::string("Unable to find index ") + idx + std::string(" in current list of indexes"));
-}
-
-void LoopCuda::permute_cuda(int stmt, const std::vector<std::string>& curOrder)
-{
-  //printf("curOrder: ");
-  //printVs(curOrder);
-  //printf("idxNames: ");
-  //printVS(idxNames[stmt]);
-  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt]);
-  bool same=true;
-  std::vector<int> pi;
-  for(int i=0; i<curOrder.size(); i++){
-    bool found = false;
-    for(int j=0; j<cIdxNames.size(); j++){
-      if(strcmp(cIdxNames[j].c_str(), curOrder[i].c_str()) == 0){
-        pi.push_back(j+1);
-        found=true;
-        if(j!=i)
-          same=false;
-      }
-    }
-    if(!found){
-      throw std::runtime_error("One of the indexes in the permute order where not "
-                               "found in the current set of indexes.");
-    }
-  }
-  for(int i=curOrder.size(); i<cIdxNames.size(); i++){
-    pi.push_back(i);
-  }
-  if(same)
-    return;
-  permute(stmt, pi);
-  //Set old indexe names as new
-  for(int i=0; i<curOrder.size(); i++){
-    idxNames[stmt][i] = curOrder[i].c_str(); //what about sibling stmts?
-  }
-}
-
-
-bool LoopCuda::permute(int stmt_num, const std::vector<int> &pi)
-{
-// check for sanity of parameters
-  if (stmt_num >= stmt.size() || stmt_num < 0)
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  const int n = stmt[stmt_num].xform.n_out();
-  if (pi.size() > (n-1)/2)
-    throw std::invalid_argument("iteration space dimensionality does not match permute dimensionality");
-  int first_level = 0;
-  int last_level = 0;
-  for (int i = 0; i < pi.size(); i++) {
-    if (pi[i] > (n-1)/2 || pi[i] <= 0)
-      throw std::invalid_argument("invalid loop level " + to_string(pi[i]) + " in permuation");
-    
-    if (pi[i] != i+1) {
-      if (first_level == 0)
-        first_level = i+1;      
-      last_level = i+1;
-    }
-  }
-  if (first_level == 0)
-    return true;
-  
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> active = getStatements(lex, 2*first_level-2);
-  Loop::permute(active, pi);
-}
-
-
-void LoopCuda::tile_cuda(int stmt, int level, int outer_level)
-{
-  tile_cuda(stmt,level,1,outer_level,"","",CountedTile);
-}
-void LoopCuda::tile_cuda(int level, int tile_size, int outer_level, std::string idxName,
-                         std::string ctrlName, TilingMethodType method){
-  tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method);
-}
-
-void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level, std::string idxName,
-                         std::string ctrlName, TilingMethodType method){
-  //Do regular tile but then update the index and control loop variable
-  //names as well as the idxName to reflect the current state of things.
-  //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level);
-  //printf("idxNames before: ");
-  //printVS(idxNames[stmt]);
-  
-  tile(stmt, level, tile_size, outer_level, method);
-  
-  if(idxName.size())
-    idxNames[stmt][level-1] = idxName.c_str();
-  if(tile_size == 1){
-    //potentially rearrange loops
-    if(outer_level < level){
-      std::string tmp = idxNames[stmt][level-1];
-      for(int i=level-1; i>outer_level-1; i--){
-        if(i-1 >= 0)
-          idxNames[stmt][i] = idxNames[stmt][i-1];
-      }
-      idxNames[stmt][outer_level-1] = tmp;
-    }
-    //TODO: even with a tile size of one, you need a insert (of a dummy loop)
-    idxNames[stmt].insert(idxNames[stmt].begin()+(level),"");
-  }else{
-    if(!ctrlName.size())
-      throw std::runtime_error("No ctrl loop name for tile");
-    //insert
-    idxNames[stmt].insert(idxNames[stmt].begin()+(outer_level-1),ctrlName.c_str());
-  }
-  
-  //printf("idxNames after: ");
-  //printVS(idxNames[stmt]);
-}
-
-
-bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level, const std::string &array_name, const std::vector<int> &privatized_levels, bool allow_extra_read , int fastest_changing_dimension , int padding_stride , int padding_alignment , bool cuda_shared)
-{
-  int old_stmts =stmt.size();
-  //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared);
-  if(cuda_shared)
-    datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 1);
-  else
-    datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 0);
-  
-  
-  //Adjust idxNames to reflect updated state
-  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
-  int new_stmts = stmt.size();
-  for(int i=old_stmts; i<new_stmts; i++){
-    //printf("fixing up statement %d\n", i);
-    std::vector<std::string> idxs;
-    
-    
-    //protonu-making sure the vector of nonSplitLevels grows along with
-    //the statement structure
-    stmt_nonSplitLevels.push_back(omega::Tuple<int>());
-    
-    //Indexes up to level will be the same
-    for(int j=0; j<level-1; j++)
-      idxs.push_back(cIdxNames[j]);
-    
-    //Expect privatized_levels to match
-    for(int j=0; j<privatized_levels.size(); j++)
-      idxs.push_back(cIdxNames[privatized_levels[j]-1]);//level is one-based
-    
-    //all further levels should match order they are in originally
-    if(privatized_levels.size()){
-      int last_privatized = privatized_levels.back();
-      int top_level = last_privatized + (stmt[i].IS.n_set()-idxs.size());
-      //printf("last privatized_levels: %d top_level: %d\n", last_privatized, top_level);
-      for(int j=last_privatized; j<top_level; j++){
-        idxs.push_back(cIdxNames[j]);
-        //printf("pushing back: %s\n", (const char*)cIdxNames[j]);
-      }
-    }
-    idxNames.push_back(idxs);
-  }
-}
-
-bool LoopCuda::datacopy_cuda(int stmt_num, int level, const std::string &array_name, std::vector<std::string> new_idxs, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, bool cuda_shared)
-{
-  
-  int old_stmts =stmt.size();
-  //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared);
-  if(cuda_shared)
-    datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 1);
-  else
-    datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 0);
-  //Adjust idxNames to reflect updated state
-  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
-  int new_stmts = stmt.size();
-  for(int i=old_stmts; i<new_stmts; i++){
-    //printf("fixing up statement %d\n", i);
-    std::vector<std::string> idxs;
-    
-    //protonu-making sure the vector of nonSplitLevels grows along with
-    //the statement structure
-    stmt_nonSplitLevels.push_back(omega::Tuple<int>());
-    
-    //protonu--lets dump out the code from each statement here
-    //printf("\n dumping statement :%d", i);
-    //stmt[i].code->Dump();
-    
-    //Indexes up to level will be the same
-    for(int j=0; j<level-1; j++)
-      idxs.push_back(cIdxNames[j]);
-    
-    //all further levels should get names from new_idxs
-    int top_level = stmt[i].IS.n_set();
-    //printf("top_level: %d level: %d\n", top_level, level);
-    if(new_idxs.size() < top_level-level+1)
-      throw std::runtime_error("Need more new index names for new datacopy loop levels");
-    
-    for(int j=level-1; j<top_level; j++){
-      idxs.push_back(new_idxs[j-level+1].c_str());
-      //printf("pushing back: %s\n", new_idxs[j-level+1].c_str());
-    }
-    idxNames.push_back(idxs);
-  }
-}
-
-bool LoopCuda::unroll_cuda(int stmt_num, int level, int unroll_amount)
-{
-  int old_stmts =stmt.size();
-  //bool b= unroll(stmt_num, , unroll_amount);
-  
-  
-  int dim = 2*level-1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_loop = getStatements(lex, dim-1);
-  
-  level = nonDummyLevel(stmt_num,level);
-  //printf("unrolling %d at level %d\n", stmt_num,level);
-  
-  //protonu--using the new version of unroll, which returns
-  //a set of ints instead of a bool. To keep Gabe's logic
-  //I'll check the size of the set, if it's 0 return true
-  //bool b= unroll(stmt_num, level, unroll_amount);
-  std::set<int> b_set= unroll(stmt_num, level, unroll_amount);
-  bool b = false;
-  if (b_set.size() == 0) b = true;
-  //end--protonu
-  
-  //Adjust idxNames to reflect updated state
-  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
-  std::vector<std::string> origSource = idxNames[stmt_num];;
-  //Drop index names at level
-  if(unroll_amount == 0){
-    //For all statements that were in this unroll together, drop index name for unrolled level
-    idxNames[stmt_num][level-1] = "";
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-      //printf("in same loop as %d is %d\n", stmt_num, (*i));
-      //idxNames[(*i)][level-1] = "";
-      idxNames[(*i)] = idxNames[stmt_num];
-    }
-  }
-  
-  lex = getLexicalOrder(stmt_num);
-  same_loop = getStatements(lex, dim-1);
-  
-  bool same_as_source = false;
-  int new_stmts = stmt.size();
-  for(int i=old_stmts; i<new_stmts; i++){
-    //Check whether we had a sync for the statement we are unrolling, if
-    //so, propogate that to newly created statements so that if they are
-    //in a different loop structure, they will also get a syncthreads
-    int size = syncs.size();
-    for(int j=0; j<size; j++){
-      if(syncs[j].first == stmt_num)
-        syncs.push_back(make_pair(i,syncs[j].second));
-    }
-    
-    //protonu-making sure the vector of nonSplitLevels grows along with
-    //the statement structure
-    stmt_nonSplitLevels.push_back(omega::Tuple<int>());
-    
-    
-    //We expect that new statements have a constant for the variable in
-    //stmt[i].IS at level (as seen with print_with_subs), otherwise there
-    //will be a for loop at level and idxNames should match stmt's
-    //idxNames pre-unrolled
-    Relation IS = stmt[i].IS;
-    //Ok, if you know how the hell to get anything out of a Relation, you
-    //should probably be able to do this more elegantly. But for now, I'm
-    //hacking it.
-    std::string s = IS.print_with_subs_to_string();
-    //s looks looks like
-    //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128}
-    //where level == 5, you see a integer in the input set
-    
-    //If that's not an integer and this is the first new statement, then
-    //we think codegen will have a loop at that level. It's not perfect,
-    //not sure if it can be determined without round-tripping to codegen.
-    int sIdx = 0;
-    int eIdx = 0;
-    for(int j=0; j<level-1; j++){
-      sIdx = s.find(",",sIdx+1);
-      if(sIdx < 0) break;
-    }
-    if(sIdx > 0){
-      eIdx = s.find("]");
-      int tmp = s.find(",",sIdx+1);
-      if(tmp > 0 && tmp < eIdx)
-        eIdx = tmp; //", before ]"
-      if(eIdx > 0){
-        sIdx++;
-        std::string var = s.substr(sIdx,eIdx-sIdx);
-        //printf("%s\n", s.c_str());
-        //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str());
-        if(atoi(var.c_str()) == 0 && i ==old_stmts){
-          //TODO:Maybe do see if this new statement would be in the same
-          //group as the original and if it would, don't say
-          //same_as_source
-          if(same_loop.find(i) == same_loop.end()){
-            printf("stmt %d level %d, newly created unroll statement should have same level indexes as source\n", i, level);
-            same_as_source = true;
-          }
-        }
-      }
-    }
-    
-    
-    //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1);
-    if(same_as_source)
-      idxNames.push_back(origSource);
-    else
-      idxNames.push_back(idxNames[stmt_num]);
-  }
-  
-  return b;
-}
-
-void LoopCuda::copy_to_texture(const char *array_name)
-{
-  //protonu--placeholder for now
-  //set the bool for using cuda memory as true
-  //in a vector of strings, put the names of arrays to tex mapped
-  if ( !texture )
-    texture = new texture_memory_mapping(true, array_name);
-  else
-    texture->add(array_name);
-  
-  
-}
-
-
-void LoopCuda::copy_to_constant(const char *array_name)
-{
-  //protonu--placeholder for now
-  //set the bool for using cuda memory as true
-  //in a vector of strings, put the names of arrays to tex mapped
-  if ( !constant_mem )
-    constant_mem = new constant_memory_mapping(true, array_name);
-  else
-    constant_mem->add(array_name);
-}
-
-//protonu--moving this from Loop
-tree_node_list* LoopCuda::codegen()
-{
-  if(code_gen_flags & GenCudaizeV2)
-    return cudaize_codegen_v2();
-  //Do other flagged codegen methods, return plain vanilla generated code
-  return getCode();
-}
-
-//These three are in Omega code_gen.cc and are used as a massive hack to
-//get out some info from MMGenerateCode. Yea for nasty side-effects.
-namespace omega{
-  extern int checkLoopLevel;
-  extern int stmtForLoopCheck;
-  extern int upperBoundForLevel;
-  extern int lowerBoundForLevel;
-}
-
-
-void LoopCuda::extractCudaUB(int stmt_num, int level, int &outUpperBound, int &outLowerBound){
-  // check for sanity of parameters
-  const int m = stmt.size();
-  if (stmt_num >= m || stmt_num < 0)
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  const int n = stmt[stmt_num].xform.n_out();
-  if (level > (n-1)/2 || level <= 0)
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  
-  int dim = 2*level-1;
-  
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_loop = getStatements(lex, dim-1);
-  
-  // extract the intersection of the iteration space to be considered
-  Relation hull;
-  {
-    hull = Relation::True(n);
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-      hull = Intersection(hull, project_onto_levels(getNewIS(*i), dim+1, true));
-      hull.simplify(2, 4);
-    }
-    
-    for (int i = 2; i <= dim+1; i+=2) {
-      //std::string name = std::string("_t") + to_string(t_counter++);
-      std::string name = std::string("_t") + to_string(tmp_loop_var_name_counter++);
-      hull.name_set_var(i, name);
-    }
-    hull.setup_names();
-  }
-  
-  // extract the exact loop bound of the dimension to be unrolled
-  if (is_single_iteration(hull, dim)){
-    throw std::runtime_error("No loop availabe at level to extract upper bound.");
-  }
-  Relation bound = get_loop_bound(hull, dim);
-  if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology())
-    throw loop_error("loop error: unable to extract loop bound for cudaize");
-  
-  // extract the loop stride
-  EQ_Handle stride_eq;
-  int stride = 1;
-  {
-    bool simple_stride = true;
-    int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(dim+1), stride_eq, simple_stride);
-    if (strides > 1)
-      throw loop_error("loop error: too many strides");
-    else if (strides == 1) {
-      int sign = stride_eq.get_coef(bound.set_var(dim+1));
-//      assert(sign == 1 || sign == -1);
-      Constr_Vars_Iter it(stride_eq, true);
-      stride = abs((*it).coef/sign);
-    }
-  }
-  if(stride != 1){
-    char buf[1024];
-    sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d", level, stride);
-    throw std::runtime_error(buf);
-  }
-  
-  //Use code generation system to build tell us our bound information. We
-  //need a hard upper bound a 0 lower bound.
-  
-  checkLoopLevel = level*2;
-  stmtForLoopCheck = stmt_num;
-  upperBoundForLevel = -1;
-  lowerBoundForLevel = -1;
-  printCode(1,false);
-  checkLoopLevel = 0;
-  
-  outUpperBound = upperBoundForLevel;
-  outLowerBound = lowerBoundForLevel;
-  return;
-}
-
-
-void LoopCuda::printCode(int effort, bool actuallyPrint) const {
-  const int m = stmt.size();
-  if (m == 0)
-    return;
-  const int n = stmt[0].xform.n_out();
-  
-  
-  
-  Tuple<Relation> IS(m);
-  Tuple<Relation> xform(m);
-  Tuple<IntTuple > nonSplitLevels(m);
-  for (int i = 0; i < m; i++) {
-    IS[i+1] = stmt[i].IS;
-    xform[i+1] = stmt[i].xform;
-    nonSplitLevels[i+1] = stmt_nonSplitLevels[i];
-    //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
-  }
-  
-  Tuple< Tuple<std::string> > idxTupleNames;
-  if(useIdxNames){
-    for(int i=0; i<idxNames.size(); i++){
-      Tuple<std::string> idxs;
-      for(int j=0; j<idxNames[i].size(); j++)
-        idxs.append(idxNames[i][j]);
-      idxTupleNames.append( idxs );
-    }
-  }
-  
-  Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-  CG_stringBuilder *ocg = new CG_stringBuilder();
-  Tuple<CG_outputRepr *> nameInfo;
-  for (int i = 1; i <= m; i++)
-    nameInfo.append(new CG_stringRepr("s" + to_string(i)));
-  CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort);
-  if(actuallyPrint)
-    std::cout << GetString(repr);
-/*
-  for (int i = 1; i <= m; i++)
-  delete nameInfo[i];
-*/
-  
-  delete ocg;
-}
-
-
-
-void LoopCuda::printRuntimeInfo() const {
-  for(int i=0; i<stmt.size(); i++){
-    Relation IS = stmt[i].IS;
-    Relation xform = stmt[i].xform;
-    printf("stmt[%d]\n", i);
-    printf("IS\n");
-    IS.print_with_subs();
-    
-    printf("xform[%d]\n", i);
-    xform.print_with_subs();
-    
-    //printf("code\n");
-    //static_cast<CG_suifRepr *>(stmt[i].code)->GetCode()->print_expr();
-  }
-}
-
-void LoopCuda::printIndexes() const {
-  for(int i=0; i<stmt.size(); i++){
-    printf("stmt %d nset %d ", i, stmt[i].IS.n_set());
-    
-    for(int j=0; j<idxNames[i].size(); j++){
-      if(j>0)
-        printf(",");
-      printf("%s", idxNames[i][j].c_str());
-    }
-    printf("\n");
-  }
-}
-
-tree_node_list* LoopCuda::getCode(int effort) const {
-  const int m = stmt.size();
-  if (m == 0)
-    return new tree_node_list;
-  const int n = stmt[0].xform.n_out();
-  
-  
-  
-  Tuple<CG_outputRepr *> ni(m);
-  Tuple<Relation> IS(m);
-  Tuple<Relation> xform(m);
-  Tuple< IntTuple > nonSplitLevels(m);
-  for (int i = 0; i < m; i++) {
-    ni[i+1] = stmt[i].code;
-    IS[i+1] = stmt[i].IS;
-    xform[i+1] = stmt[i].xform;
-    nonSplitLevels[i+1] = stmt_nonSplitLevels[i];
-    //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
-  }
-  
-  
-  Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-#ifdef DEBUG
-//  std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, known, effort));
-#endif
-  Tuple< Tuple<std::string> > idxTupleNames;
-  if(useIdxNames){
-    for(int i=0; i<idxNames.size(); i++){
-      Tuple<std::string> idxs;
-      for(int j=0; j<idxNames[i].size(); j++)
-        idxs.append(idxNames[i][j]);
-      idxTupleNames.append( idxs );
-    }
-  }
-  
-  CG_outputBuilder *ocg = ir->builder();
-  CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, nonSplitLevels, syncs, idxTupleNames, effort);
-  
-  //CG_outputRepr *overflow_initialization = ocg->CreateStmtList();
-  //protonu--using the new function CG_suifBuilder::StmtListAppend
-  CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL);
-  for (std::map<int, std::vector<Free_Var_Decl *> >::const_iterator i = overflow.begin(); i != overflow.end(); i++)
-    for (std::vector<Free_Var_Decl *>::const_iterator j = i->second.begin(); j != i->second.end(); j++)
-      //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0))));
-      overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->StmtListAppend(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)), NULL));
-  
-  repr = ocg->StmtListAppend(overflow_initialization, repr);
-  tree_node_list *tnl = static_cast<CG_suifRepr *>(repr)->GetCode();
-  
-  delete repr;
-  /*
-    for (int i = 1; i <= m; i++)
-    delete ni[i];
-  */
-  
-  return tnl;
-}
-
-
-//protonu--adding constructors for the new derived class
-LoopCuda::LoopCuda():Loop(), code_gen_flags(GenInit){}
-
-LoopCuda::LoopCuda(IR_Control *irc, int loop_num)
-  :Loop(irc)
-{
-    setup_code = NULL;
-  teardown_code = NULL;
-  code_gen_flags = 0;
-  cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1;
-  cu_num_reduce = 0;
-  cu_mode = GlobalMem;
-  texture = NULL;
-  constant_mem = NULL;
-  
-  int m=stmt.size();
-  //printf("\n the size of stmt(initially) is: %d\n", stmt.size());
-  for(int i=0; i<m; i++)
-    stmt_nonSplitLevels.push_back(omega::Tuple<int>());
-  
-  
-  //protonu--setting up
-  //proc_symtab *symtab
-  //global_symtab *globals
-  
-  globals =  ((IR_cudasuifCode *)ir)->gsym_ ;
-  std::vector<tree_for *> tf = ((IR_cudasuifCode *)ir)->get_loops();
-  
-  symtab = tf[loop_num]->proc()->block()->proc_syms();
-  
-  std::vector<tree_for *> deepest = find_deepest_loops(tf[loop_num]);
-  
-  for (int i = 0; i < deepest.size(); i++){
-    index.push_back(deepest[i]->index()->name()); //reflects original code index names
-  }
-  
-  for(int i=0; i< stmt.size(); i++)
-    idxNames.push_back(index); //refects prefered index names (used as handles in cudaize v2)
-  useIdxNames=false;
-  
-}
-
diff --git a/loop_cuda_rose.cc b/loop_cuda_rose.cc
deleted file mode 100644
index c5633ee..0000000
--- a/loop_cuda_rose.cc
+++ /dev/null
@@ -1,3734 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
- Cudaize methods
-
- Notes:
-
- History:
- 1/7/10 Created by Gabe Rudy by migrating code from loop.cc
- 31/1/11 Modified by Protonu Basu
-*****************************************************************************/
-#define TRANSFORMATION_FILE_INFO Sg_File_Info::generateDefaultFileInfoForTransformationNode()
-#include <code_gen/CG_stringBuilder.h>
-#include <codegen.h>
-#include <code_gen/CG_utils.h>
-#include <code_gen/CG_outputRepr.h>
-#include "loop_cuda_rose.hh"
-#include "loop.hh"
-#include <math.h>
-//#include <useful.h>
-#include "omegatools.hh"
-#include "ir_cudarose.hh"
-#include "ir_rose.hh"
-#include "ir_rose_utils.hh"
-#include "chill_error.hh"
-#include <vector>
-#include "Outliner.hh"
-//#define DEBUG
-using namespace omega;
-using namespace SageBuilder;
-using namespace SageInterface;
-//using namespace Outliner;
-//using namespace ASTtools;
-char *k_cuda_texture_memory; //protonu--added to track texture memory type
-//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type
-extern char *omega::k_ocg_comment;
-
-static int cudaDebug;
-class CudaStaticInit {
-public:
-  CudaStaticInit() {
-    cudaDebug = 0; //Change this to 1 for debug
-  }
-};
-static CudaStaticInit junkInitInstance__;
-
-std::string& upcase(std::string& s) {
-  for (int i = 0; i < s.size(); i++)
-    s[i] = toupper(s[i]);
-  return s;
-}
-
-void printVs(const std::vector<std::string>& curOrder) {
-  if (!cudaDebug) return;
-  for (int i = 0; i < curOrder.size(); i++) {
-    if (i > 0)
-      printf(",");
-    printf("%s", curOrder[i].c_str());
-  }
-  printf("\n");
-}
-
-void printVS(const std::vector<std::string>& curOrder) {
-  if(!cudaDebug) return;
-  for (int i = 0; i < curOrder.size(); i++) {
-    if (i > 0)
-      printf(",");
-    printf("%s", curOrder[i].c_str());
-  }
-  printf("\n");
-}
-
-LoopCuda::~LoopCuda() {
-  const int m = stmt.size();
-  for (int i = 0; i < m; i++)
-    stmt[i].code->clear();
-}
-
-bool LoopCuda::symbolExists(std::string s) {
-  
-  if (body_symtab->find_variable(SgName(s.c_str()))
-      || parameter_symtab->find_variable(SgName(s.c_str())))
-    return true;
-  if (globals->lookup_variable_symbol(SgName(s.c_str())))
-    return true;
-  for (int i = 0; i < idxNames.size(); i++)
-    for (int j = 0; j < idxNames[i].size(); j++)
-      if (strcmp(idxNames[i][j].c_str(), s.c_str()) == 0)
-        return true;
-  return false;
-}
-
-void LoopCuda::addSync(int stmt_num, std::string idxName) {
-  //we store these and code-gen inserts sync to omega comments where stmt
-  //in loop that has idxName being generated
-  syncs.push_back(make_pair(stmt_num, idxName));
-}
-
-void LoopCuda::renameIndex(int stmt_num, std::string idx, std::string newName) {
-  int level = findCurLevel(stmt_num, idx);
-  if (idxNames.size() <= stmt_num || idxNames[stmt_num].size() < level)
-    throw std::runtime_error("Invalid statment number of index");
-  idxNames[stmt_num][level - 1] = newName.c_str();
-}
-
-enum Type {
-  Int
-};
-
-SgNode* wrapInIfFromMinBound(SgNode* then_part, SgForStatement* loop,
-                             SgScopeStatement* symtab, SgVariableSymbol* bound_sym) {
-  // CG_roseBuilder *ocg = new CG_roseBuilder(
-  
-  SgBinaryOp* test_expr = isSgBinaryOp(loop->get_test_expr());
-  SgExpression* upperBound;
-  SgExpression* conditional;
-  upperBound = test_expr->get_rhs_operand();
-  CG_outputRepr *ifstmt;
-  
-  SgCallExpression *call;
-  if (call = isSgCallExpression(upperBound))
-    if (isSgVarRefExp(call->get_function())->get_symbol()->get_name().getString()
-        == "__rose_lt") {
-      SgExprListExp* arg_list = call->get_args();
-      SgExpression *if_bound = *(arg_list->get_expressions().begin());
-      /*This relies on the minimum expression being the rhs operand of
-       * the min instruction.
-       */
-      SgIfStmt *ifstmt = buildIfStmt(
-        buildLessOrEqualOp(buildVarRefExp(bound_sym), if_bound),
-        isSgStatement(then_part), NULL);
-      return isSgNode(ifstmt);
-      
-    }
-  
-/*  if (isSgConditionalExp(upperBound)) {
-    conditional = isSgConditionalExp(upperBound)->get_conditional_exp();
-    
-    if (isSgBinaryOp(conditional)) {
-    SgBinaryOp* binop = isSgBinaryOp(conditional);
-    
-    if (isSgLessThanOp(binop) || isSgLessOrEqualOp(binop)) {
-    SgIfStmt *ifstmt = buildIfStmt(
-    buildLessOrEqualOp(buildVarRefExp(bound_sym),
-    test_expr), isSgStatement(then_part), NULL);
-    return isSgNode(ifstmt);
-    }
-    
-    }
-    
-    }
-*/
-  return then_part;
-}
-
-/**
- * This would be better if it was done by a CHiLL xformation instead of at codegen
- *
- * state:
- * for(...)
- *   for(...)
- *     cur_body
- *   stmt1
- *
- * stm1 is in-between two loops that are going to be reduced. The
- * solution is to put stmt1 at the end of cur_body but conditionally run
- * in on the last step of the for loop.
- *
- * A CHiLL command that would work better:
- *
- * for(...)
- *   stmt0
- *   for(for i=0; i<n; i++)
- *     cur_body
- *   stmt1
- * =>
- * for(...)
- *   for(for i=0; i<n; i++)
- *     if(i==0) stmt0
- *     cur_body
- *     if(i==n-1) stmt1
- */
-
-std::vector<SgForStatement*> findCommentedFors(const char* index, SgNode* tnl) {
-  std::vector<SgForStatement *> result;
-  bool next_loop_ok = false;
-  
-  if (isSgBasicBlock(tnl)) {
-    
-    SgStatementPtrList& list = isSgBasicBlock(tnl)->get_statements();
-    
-    for (SgStatementPtrList::iterator it = list.begin(); it != list.end();
-         it++) {
-      std::vector<SgForStatement*> t = findCommentedFors(index,
-                                                         isSgNode(*it));
-      std::copy(t.begin(), t.end(), back_inserter(result));
-    }
-  } else if (isSgForStatement(tnl)) {
-    
-    AstTextAttribute* att =
-      (AstTextAttribute*) (isSgNode(tnl)->getAttribute(
-                             "omega_comment"));
-    std::string comment = att->toString();
-    
-    if (comment.find("~cuda~") != std::string::npos
-        && comment.find("preferredIdx: ") != std::string::npos) {
-      std::string idx = comment.substr(
-        comment.find("preferredIdx: ") + 14, std::string::npos);
-      if (idx.find(" ") != std::string::npos)
-        idx = idx.substr(0, idx.find(" "));
-      if (strcmp(idx.c_str(), index) == 0)
-        next_loop_ok = true;
-    }
-    
-    if (next_loop_ok) {
-      //printf("found loop %s\n", static_cast<tree_for *>(tn)->index()->name());
-      result.push_back(isSgForStatement(tnl));
-    } else {
-      //printf("looking down for loop %s\n", static_cast<tree_for *>(tn)->index()->name());
-      std::vector<SgForStatement*> t = findCommentedFors(index,
-                                                         isSgForStatement(tnl)->get_loop_body());
-      std::copy(t.begin(), t.end(), back_inserter(result));
-    }
-    next_loop_ok = false;
-  } else if (isSgIfStmt(tnl)) {
-    //printf("looking down if\n");
-    SgIfStmt *tni = isSgIfStmt(tnl);
-    std::vector<SgForStatement*> t = findCommentedFors(index,
-                                                       tni->get_true_body());
-    std::copy(t.begin(), t.end(), back_inserter(result));
-  }
-  
-  return result;
-}
-
-SgNode* forReduce(SgForStatement* loop, SgVariableSymbol* reduceIndex,
-                  SgScopeStatement* body_syms) {
-  //We did the replacements all at once with recursiveFindPreferedIdxs
-  //replacements r;
-  //r.oldsyms.append(loop->index());
-  //r.newsyms.append(reduceIndex);
-  //tree_for* new_loop = (tree_for*)loop->clone_helper(&r, true);
-  SgForStatement* new_loop = loop;
-  
-  //return body one loops in
-  SgNode* tnl = loop_body_at_level(new_loop, 1);
-  //wrap in conditional if necessary
-  tnl = wrapInIfFromMinBound(tnl, new_loop, body_syms, reduceIndex);
-  return tnl;
-}
-
-void recursiveFindRefs(SgNode* code, std::set<const SgVariableSymbol *>& syms,
-                       SgFunctionDefinition* def) {
-  
-  SgStatement* s = isSgStatement(code);
-  // L = {symbols defined within 's'}, local variables declared within 's'
-  ASTtools::VarSymSet_t L;
-  ASTtools::collectDefdVarSyms(s, L);
-  //dump (L, "L = ");
-  
-  // U = {symbols used within 's'}
-  ASTtools::VarSymSet_t U;
-  ASTtools::collectRefdVarSyms(s, U);
-  //dump (U, "U = ");
-  
-  // U - L = {symbols used within 's' but not defined in 's'}
-  // variable references to non-local-declared variables
-  ASTtools::VarSymSet_t diff_U_L;
-  set_difference(U.begin(), U.end(), L.begin(), L.end(),
-                 inserter(diff_U_L, diff_U_L.begin()));
-  //dump (diff_U_L, "U - L = ");
-  
-  // Q = {symbols defined within the function surrounding 's' that are
-  // visible at 's'}, including function parameters
-  ASTtools::VarSymSet_t Q;
-  ASTtools::collectLocalVisibleVarSyms(def->get_declaration(), s, Q);
-//    dump (Q, "Q = ");
-  
-  // (U - L) \cap Q = {variables that need to be passed as parameters
-  // to the outlined function}
-  // a sub set of variables that are not globally visible (no need to pass at all)
-  // It excludes the variables with a scope between global and the enclosing function
-  set_intersection(diff_U_L.begin(), diff_U_L.end(), Q.begin(), Q.end(),
-                   inserter(syms, syms.begin()));
-  
-  /* std::vector<SgVariableSymbol *> scalars;
-  //SgNode  *tnl = static_cast<const omega::CG_roseRepr *>(repr)->GetCode();
-  SgStatement* stmt;
-  SgExpression* exp;
-  if (tnl != NULL) {
-  if(stmt = isSgStatement(tnl)){
-  if(isSgBasicBlock(stmt)){
-  SgStatementPtrList& stmts = isSgBasicBlock(stmt)->get_statements();
-  for(int i =0; i < stmts.size(); i++){
-  //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgNode(stmts[i]));
-  std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(stmts[i]));
-  //delete r;
-  std::copy(a.begin(), a.end(), back_inserter(scalars));
-  }
-  
-  }
-  else if(isSgForStatement(stmt)){
-  
-  SgForStatement *tnf =  isSgForStatement(stmt);
-  //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgStatement(tnf->get_loop_body()));
-  std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(tnf->get_loop_body()));
-  //delete r;
-  std::copy(a.begin(), a.end(), back_inserter(scalars));
-  }
-  else if(isSgFortranDo(stmt)){
-  SgFortranDo *tfortran =  isSgFortranDo(stmt);
-  omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgStatement(tfortran->get_body()));
-  std::vector<SgVariableSymbol *> a = recursiveFindRefs(r);
-  delete r;
-  std::copy(a.begin(), a.end(), back_inserter(scalars));
-  }
-  
-  else if(isSgIfStmt(stmt) ){
-  SgIfStmt* tni = isSgIfStmt(stmt);
-  //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgNode(tni->get_conditional()));
-  std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(tni->get_conditional()));
-  //delete r;
-  std::copy(a.begin(), a.end(), back_inserter(scalars));
-  //r = new omega::CG_roseRepr(isSgNode(tni->get_true_body()));
-  a = recursiveFindRefs(isSgNode(tni->get_true_body()));
-  //delete r;
-  std::copy(a.begin(), a.end(), back_inserter(scalars));
-  //r = new omega::CG_roseRepr(isSgNode(tni->get_false_body()));
-  a = recursiveFindRefs(isSgNode(tni->get_false_body()));
-  //delete r;
-  std::copy(a.begin(), a.end(), back_inserter(scalars));
-  }
-  else if(isSgExprStatement(stmt)) {
-  //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgExpression(isSgExprStatement(stmt)->get_expression()));
-  std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(isSgExprStatement(stmt)->get_expression()));
-  //delete r;
-  std::copy(a.begin(), a.end(), back_inserter(scalars));
-  
-  }
-  }
-  }
-  else{
-  SgExpression* op = isSgExpression(tnl);
-  if(isSgVarRefExp(op)){
-  
-  scalars.push_back(isSgVarRefExp(op)->get_symbol());
-  
-  }
-  else if( isSgAssignOp(op)){
-  //omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgAssignOp(op)->get_lhs_operand());
-  std::vector<SgVariableSymbol *> a1 = recursiveFindRefs(isSgNode(isSgAssignOp(op)->get_lhs_operand()));
-  //delete r1;
-  std::copy(a1.begin(), a1.end(), back_inserter(scalars));
-  //omega::CG_roseRepr *r2 = new omega::CG_roseRepr(isSgAssignOp(op)->get_rhs_operand());
-  std::vector<SgVariableSymbol *> a2 = recursiveFindRefs(isSgNode(isSgAssignOp(op)->get_rhs_operand()));
-  //delete r2;
-  std::copy(a2.begin(), a2.end(), back_inserter(scalars));
-  
-  }
-  else if(isSgBinaryOp(op)){
-  // omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgBinaryOp(op)->get_lhs_operand());
-  std::vector<SgVariableSymbol *> a1 = recursiveFindRefs(isSgNode(isSgBinaryOp(op)->get_lhs_operand()));
-  //delete r1;
-  std::copy(a1.begin(), a1.end(), back_inserter(scalars));
-  //omega::CG_roseRepr *r2 = new omega::CG_roseRepr(isSgBinaryOp(op)->get_rhs_operand());
-  std::vector<SgVariableSymbol *> a2 = recursiveFindRefs((isSgBinaryOp(op)->get_rhs_operand()));
-  //delete r2;
-  std::copy(a2.begin(), a2.end(), back_inserter(scalars));
-  }
-  else if(isSgUnaryOp(op)){
-  //omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgUnaryOp(op)->get_operand());
-  std::vector<SgVariableSymbol *> a1 = recursiveFindRefs(isSgNode(isSgUnaryOp(op)->get_operand()));
-  //delete r1;
-  std::copy(a1.begin(), a1.end(), back_inserter(scalars));
-  }
-  
-  }
-  return scalars;
-  
-  
-  */
-  
-}
-
-SgNode* recursiveFindReplacePreferedIdxs(SgNode* code, SgSymbolTable* body_syms,
-                                         SgSymbolTable* param_syms, SgScopeStatement* body,
-                                         std::map<std::string, SgVariableSymbol*>& loop_idxs,
-                                         SgGlobal* globalscope, bool sync = false) {
-  //tree_node_list* tnl = new tree_node_list;
-  //tree_node_list_iter tnli(code);
-  SgVariableSymbol* idxSym = 0;
-  std::vector<SgStatement*> r1;
-  std::vector<SgNode*> r2;
-  SgNode* tnli;
-  SgNode* tnli1;
-  SgNode* tnli2;
-  SgBasicBlock * clone;
-  
-  if (isSgForStatement(code)) {
-    AstTextAttribute* att =
-      (AstTextAttribute*) (isSgNode(code)->getAttribute(
-                             "omega_comment"));
-    
-    std::string comment;
-    if (att != NULL)
-      comment = att->toString();
-    
-    if (comment.find("~cuda~") != std::string::npos
-        && comment.find("preferredIdx: ") != std::string::npos) {
-      std::string idx = comment.substr(
-        comment.find("preferredIdx: ") + 14, std::string::npos);
-      if (idx.find(" ") != std::string::npos)
-        idx = idx.substr(0, idx.find(" "));
-      if (loop_idxs.find(idx) != loop_idxs.end())
-        idxSym = loop_idxs.find(idx)->second;
-      //Get the proc variable sybol for this preferred index
-      if (idxSym == 0) {
-        idxSym = body_syms->find_variable(idx.c_str());
-        if (!idxSym)
-          idxSym = param_syms->find_variable(idx.c_str());
-        //printf("idx not found: lookup %p\n", idxSym);
-        if (!idxSym) {
-          SgVariableDeclaration* defn = buildVariableDeclaration(
-            SgName((char*) idx.c_str()), buildIntType());
-          //idxSym = new var_sym(type_s32, (char*)idx.c_str());
-          SgInitializedNamePtrList& variables = defn->get_variables();
-          SgInitializedNamePtrList::const_iterator i =
-            variables.begin();
-          SgInitializedName* initializedName = *i;
-          SgVariableSymbol* vs = new SgVariableSymbol(
-            initializedName);
-          prependStatement(defn, body);
-          vs->set_parent(body_syms);
-          body_syms->insert(SgName((char*) idx.c_str()), vs);
-          idxSym = vs;
-          //printf("idx created and inserted\n");
-        }
-        //Now insert into our map for future
-	if (cudaDebug)
-	    std::cout << idx << "\n\n";
-        loop_idxs.insert(make_pair(idx, idxSym));
-      }
-      //See if we have a sync as well
-      if (comment.find("sync") != std::string::npos) {
-        //printf("Inserting sync after current block\n");
-        sync = true;
-      }
-      
-    }
-    if (idxSym) {
-      SgForInitStatement* list =
-        isSgForStatement(code)->get_for_init_stmt();
-      SgStatementPtrList& initStatements = list->get_init_stmt();
-      SgStatementPtrList::const_iterator j = initStatements.begin();
-      const SgVariableSymbol* index;
-      
-      if (SgExprStatement *expr = isSgExprStatement(*j))
-        if (SgAssignOp* op = isSgAssignOp(expr->get_expression()))
-          if (SgVarRefExp* var_ref = isSgVarRefExp(
-                op->get_lhs_operand()))
-            index = var_ref->get_symbol();
-      
-      std::vector<SgVarRefExp *> array = substitute(code, index, NULL,
-                                                    isSgNode(body_syms));
-      
-      for (int j = 0; j < array.size(); j++)
-        array[j]->set_symbol(idxSym);
-    }
-    
-    SgStatement* body_ = isSgStatement(
-      recursiveFindReplacePreferedIdxs(
-        isSgNode((isSgForStatement(code)->get_loop_body())),
-        body_syms, param_syms, body, loop_idxs, globalscope));
-    
-    omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code);
-    omega::CG_outputRepr* block = tnl->clone();
-    tnli = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-    
-    isSgForStatement(tnli)->set_loop_body(body_);
-    body_->set_parent(tnli);
-    
-    if (idxSym) {
-      SgForInitStatement* list =
-        isSgForStatement(tnli)->get_for_init_stmt();
-      SgStatementPtrList& initStatements = list->get_init_stmt();
-      SgStatementPtrList::const_iterator j = initStatements.begin();
-      const SgVariableSymbol* index;
-      
-      if (SgExprStatement *expr = isSgExprStatement(*j))
-        if (SgAssignOp* op = isSgAssignOp(expr->get_expression()))
-          if (SgVarRefExp* var_ref = isSgVarRefExp(
-                op->get_lhs_operand()))
-            index = var_ref->get_symbol();
-      
-      std::vector<SgVarRefExp *> array = substitute(tnli, index, NULL,
-                                                    isSgNode(body_syms));
-      
-      for (int j = 0; j < array.size(); j++)
-        array[j]->set_symbol(idxSym);
-    }
-    //  std::cout << isSgNode(body_)->unparseToString() << "\n\n";
-    if (att != NULL)
-      tnli->setAttribute("omega_comment", att);
-    
-    if (sync) {
-      SgName name_syncthreads("__syncthreads");
-      SgFunctionSymbol * syncthreads_symbol =
-        globalscope->lookup_function_symbol(name_syncthreads);
-      
-      // Create a call to __syncthreads():
-      SgFunctionCallExp * syncthreads_call = buildFunctionCallExp(
-        syncthreads_symbol, buildExprListExp());
-      
-      SgExprStatement* stmt = buildExprStatement(syncthreads_call);
-      
-      /*    if (SgBasicBlock* bb = isSgBasicBlock(
-            isSgForStatement(code)->get_loop_body()))
-            appendStatement(isSgStatement(stmt), bb);
-            
-            else if (SgStatement* ss = isSgStatement(
-            isSgForStatement(code)->get_loop_body())) {
-            SgBasicBlock* bb2 = buildBasicBlock();
-            
-            isSgNode(ss)->set_parent(bb2);
-            appendStatement(ss, bb2);
-            
-            appendStatement(isSgStatement(stmt), bb2);
-            isSgNode(stmt)->set_parent(bb2);
-            isSgForStatement(code)->set_loop_body(bb2);
-            isSgNode(bb2)->set_parent(code);
-            }
-      */
-      
-      SgBasicBlock* bb2 = buildBasicBlock();
-      
-      bb2->append_statement(isSgStatement(tnli));
-      bb2->append_statement(stmt);
-      /* SgNode* parent = code->get_parent();
-         if(!isSgStatement(parent))
-         throw loop_error("Parent not a statement");
-         
-         if(isSgForStatement(parent)){
-         if(SgStatement *ss = isSgForStatement(isSgForStatement(parent)->get_loop_body())){
-         omega::CG_roseRepr * tnl = new omega::CG_roseRepr(ss);
-         omega::CG_outputRepr* block= tnl->clone();
-         
-         SgNode *new_ss = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-         SgBasicBlock* bb2 = buildBasicBlock();
-         
-         isSgNode(new_ss)->set_parent(bb2);
-         appendStatement(isSgStatement(new_ss), bb2);
-         appendStatement(isSgStatement(stmt), bb2);
-         isSgNode(stmt)->set_parent(bb2);
-         
-         isSgStatement(parent)->replace_statement_from_basicBlock(ss, isSgStatement(bb2));
-         
-         }else if(isSgBasicBlock(isSgForStatement(parent)->get_loop_body()))
-         isSgStatement(isSgForStatement(parent)->get_loop_body())->insert_statement(isSgStatement(code), stmt, false);
-         else
-         throw loop_error("parent statement type undefined!!");
-         
-         }
-         else if(isSgBasicBlock(parent))
-         isSgStatement(parent)->insert_statement(isSgStatement(code), stmt, false);
-         else
-         throw loop_error("parent statement type undefined!!");
-         
-         //tnl->print();
-         *
-         *
-         */
-      sync = true;
-      return isSgNode(bb2);
-      
-    } else
-      return tnli;
-  } else if (isSgIfStmt(code)) {
-    SgStatement* body_ = isSgStatement(
-      recursiveFindReplacePreferedIdxs(
-        isSgNode((isSgIfStmt(code)->get_true_body())),
-        body_syms, param_syms, body, loop_idxs, globalscope));
-    
-    omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code);
-    omega::CG_outputRepr* block = tnl->clone();
-    tnli = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-    
-    isSgIfStmt(tnli)->set_true_body(body_);
-    
-    if ((isSgIfStmt(code)->get_false_body()))
-      isSgIfStmt(tnli)->set_false_body(
-        isSgStatement(
-          recursiveFindReplacePreferedIdxs(
-            isSgNode(
-              (isSgIfStmt(code)->get_false_body())),
-            body_syms, param_syms, body, loop_idxs,
-            globalscope)));
-    
-    return tnli;
-  } else if (isSgStatement(code) && !isSgBasicBlock(code)) {
-    omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code);
-    omega::CG_outputRepr* block = tnl->clone();
-    tnli = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-    
-    return tnli;
-    
-  } else if (isSgBasicBlock(code)) {
-    SgStatementPtrList& tnl = isSgBasicBlock(code)->get_statements();
-    
-    SgStatementPtrList::iterator temp;
-    clone = buildBasicBlock();
-    bool sync_found = false;
-    for (SgStatementPtrList::const_iterator it = tnl.begin();
-         it != tnl.end(); it++) {
-      
-      if (isSgForStatement(*it)) {
-        AstTextAttribute* att =
-          (AstTextAttribute*) (isSgNode(*it)->getAttribute(
-                                 "omega_comment"));
-        
-        std::string comment;
-        if (att != NULL)
-          comment = att->toString();
-        
-        if (comment.find("~cuda~") != std::string::npos
-            && comment.find("preferredIdx: ")
-            != std::string::npos) {
-          std::string idx = comment.substr(
-            comment.find("preferredIdx: ") + 14,
-            std::string::npos);
-          if (idx.find(" ") != std::string::npos)
-            idx = idx.substr(0, idx.find(" "));
-          //printf("sym_tab preferred index: %s\n", idx.c_str());
-          if (loop_idxs.find(idx) != loop_idxs.end())
-            idxSym = loop_idxs.find(idx)->second;
-          //Get the proc variable sybol for this preferred index
-          if (idxSym == 0) {
-            idxSym = body_syms->find_variable(idx.c_str());
-            if (!idxSym)
-              idxSym = param_syms->find_variable(idx.c_str());
-            //printf("idx not found: lookup %p\n", idxSym);
-            if (!idxSym) {
-              SgVariableDeclaration* defn =
-                buildVariableDeclaration(
-                  SgName((char*) idx.c_str()),
-                  buildIntType());
-              //idxSym = new var_sym(type_s32, (char*)idx.c_str());
-              SgInitializedNamePtrList& variables =
-                defn->get_variables();
-              SgInitializedNamePtrList::const_iterator i =
-                variables.begin();
-              SgInitializedName* initializedName = *i;
-              SgVariableSymbol* vs = new SgVariableSymbol(
-                initializedName);
-              prependStatement(defn, body);
-              vs->set_parent(body_syms);
-              body_syms->insert(SgName((char*) idx.c_str()), vs);
-              //printf("idx created and inserted\n");
-              idxSym = vs;
-            }
-            //Now insert into our map for future
-	    if (cudaDebug)
-		std::cout << idx << "\n\n";
-            loop_idxs.insert(make_pair(idx, idxSym));
-            
-          }
-          //See if we have a sync as well
-          if (comment.find("sync") != std::string::npos) {
-            //printf("Inserting sync after current block\n");
-            sync = true;
-          }
-          
-        }
-        if (idxSym) {
-          SgForInitStatement* list =
-            isSgForStatement(*it)->get_for_init_stmt();
-          SgStatementPtrList& initStatements = list->get_init_stmt();
-          SgStatementPtrList::const_iterator j =
-            initStatements.begin();
-          const SgVariableSymbol* index;
-          
-          if (SgExprStatement *expr = isSgExprStatement(*j))
-            if (SgAssignOp* op = isSgAssignOp(
-                  expr->get_expression()))
-              if (SgVarRefExp* var_ref = isSgVarRefExp(
-                    op->get_lhs_operand()))
-                index = var_ref->get_symbol();
-          
-          std::vector<SgVarRefExp *> array = substitute(*it, index,
-                                                        NULL, isSgNode(body_syms));
-          
-          for (int j = 0; j < array.size(); j++)
-            array[j]->set_symbol(idxSym);
-          
-        }
-        
-        SgStatement* body_ =
-          isSgStatement(
-            recursiveFindReplacePreferedIdxs(
-              isSgNode(
-                (isSgForStatement(*it)->get_loop_body())),
-              body_syms, param_syms, body, loop_idxs,
-              globalscope));
-        
-        omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it);
-        omega::CG_outputRepr* block = tnl->clone();
-        tnli =
-          static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-        
-        isSgForStatement(tnli)->set_loop_body(body_);
-        body_->set_parent(tnli);
-        if (idxSym) {
-          SgForInitStatement* list =
-            isSgForStatement(tnli)->get_for_init_stmt();
-          SgStatementPtrList& initStatements = list->get_init_stmt();
-          SgStatementPtrList::const_iterator j =
-            initStatements.begin();
-          const SgVariableSymbol* index;
-          
-          if (SgExprStatement *expr = isSgExprStatement(*j))
-            if (SgAssignOp* op = isSgAssignOp(
-                  expr->get_expression()))
-              if (SgVarRefExp* var_ref = isSgVarRefExp(
-                    op->get_lhs_operand()))
-                index = var_ref->get_symbol();
-          
-          std::vector<SgVarRefExp *> array = substitute(tnli, index,
-                                                        NULL, isSgNode(body_syms));
-          
-          for (int j = 0; j < array.size(); j++)
-            array[j]->set_symbol(idxSym);
-        }
-        idxSym = 0;
-        //  std::cout << isSgNode(body_)->unparseToString() << "\n\n";
-        if (att != NULL)
-          tnli->setAttribute("omega_comment", att);
-        clone->append_statement(isSgStatement(tnli));
-        if (sync) {
-          SgName name_syncthreads("__syncthreads");
-          SgFunctionSymbol * syncthreads_symbol =
-            globalscope->lookup_function_symbol(
-              name_syncthreads);
-          
-          // Create a call to __syncthreads():
-          SgFunctionCallExp * syncthreads_call = buildFunctionCallExp(
-            syncthreads_symbol, buildExprListExp());
-          
-          SgExprStatement* stmt = buildExprStatement(
-            syncthreads_call);
-          
-          /*    if (SgBasicBlock* bb = isSgBasicBlock(
-                isSgForStatement(code)->get_loop_body()))
-                appendStatement(isSgStatement(stmt), bb);
-                
-                else if (SgStatement* ss = isSgStatement(
-                isSgForStatement(code)->get_loop_body())) {
-                SgBasicBlock* bb2 = buildBasicBlock();
-                
-                isSgNode(ss)->set_parent(bb2);
-                appendStatement(ss, bb2);
-                
-                appendStatement(isSgStatement(stmt), bb2);
-                isSgNode(stmt)->set_parent(bb2);
-                isSgForStatement(code)->set_loop_body(bb2);
-                isSgNode(bb2)->set_parent(code);
-                }
-          */
-          
-          //SgBasicBlock* bb2 = buildBasicBlock();
-          clone->append_statement(stmt);
-          /* SgNode* parent = code->get_parent();
-             if(!isSgStatement(parent))
-             throw loop_error("Parent not a statement");
-             
-             if(isSgForStatement(parent)){
-             if(SgStatement *ss = isSgForStatement(isSgForStatement(parent)->get_loop_body())){
-             omega::CG_roseRepr * tnl = new omega::CG_roseRepr(ss);
-             omega::CG_outputRepr* block= tnl->clone();
-             
-             SgNode *new_ss = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-             SgBasicBlock* bb2 = buildBasicBlock();
-             
-             isSgNode(new_ss)->set_parent(bb2);
-             appendStatement(isSgStatement(new_ss), bb2);
-             appendStatement(isSgStatement(stmt), bb2);
-             isSgNode(stmt)->set_parent(bb2);
-             
-             isSgStatement(parent)->replace_statement_from_basicBlock(ss, isSgStatement(bb2));
-             
-             }else if(isSgBasicBlock(isSgForStatement(parent)->get_loop_body()))
-             isSgStatement(isSgForStatement(parent)->get_loop_body())->insert_statement(isSgStatement(code), stmt, false);
-             else
-             throw loop_error("parent statement type undefined!!");
-             
-             }
-             else if(isSgBasicBlock(parent))
-             isSgStatement(parent)->insert_statement(isSgStatement(code), stmt, false);
-             else
-             throw loop_error("parent statement type undefined!!");
-             
-             //tnl->print();
-             *
-             *
-             */
-          sync = true;
-          //    return isSgNode(bb2);
-          
-        }
-        
-        //  return tnli;
-      } else if (isSgIfStmt(*it)) {
-        SgStatement* body_ = isSgStatement(
-          recursiveFindReplacePreferedIdxs(
-            isSgNode((isSgIfStmt(*it)->get_true_body())),
-            body_syms, param_syms, body, loop_idxs,
-            globalscope));
-        
-        omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it);
-        omega::CG_outputRepr* block = tnl->clone();
-        tnli1 =
-          static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-        
-        isSgIfStmt(tnli1)->set_true_body(body_);
-        
-        if ((isSgIfStmt(*it)->get_false_body()))
-          isSgIfStmt(tnli1)->set_false_body(
-            isSgStatement(
-              recursiveFindReplacePreferedIdxs(
-                isSgNode(
-                  (isSgIfStmt(*it)->get_false_body())),
-                body_syms, param_syms, body,
-                loop_idxs, globalscope)));
-        
-        clone->append_statement(isSgStatement(tnli1));
-        //return tnli;
-      } else if (isSgStatement(*it)) {
-        omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it);
-        omega::CG_outputRepr* block = tnl->clone();
-        tnli2 =
-          static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-        
-        clone->append_statement(isSgStatement(tnli2));
-        //return tnli;
-        
-      }
-    }
-    
-    return isSgNode(clone);
-    
-  }
-  
-  /*    if (!isSgBasicBlock(
-        recursiveFindReplacePreferedIdxs(isSgNode(*it), body_syms,
-        param_syms, body, loop_idxs, globalscope))) {
-        SgStatement *to_push = isSgStatement(
-        recursiveFindReplacePreferedIdxs(isSgNode(*it),
-        body_syms, param_syms, body, loop_idxs,
-        globalscope, sync));
-        clone->append_statement(to_push);
-        
-        if ((sync_found) && isSgForStatement(to_push)) {
-        SgName name_syncthreads("__syncthreads");
-        SgFunctionSymbol * syncthreads_symbol =
-        globalscope->lookup_function_symbol(
-        name_syncthreads);
-        
-        // Create a call to __syncthreads():
-        SgFunctionCallExp * syncthreads_call = buildFunctionCallExp(
-        syncthreads_symbol, buildExprListExp());
-        
-        SgExprStatement* stmt = buildExprStatement(
-        syncthreads_call);
-        
-        clone->append_statement(isSgStatement(stmt));
-        }
-        //  std::cout<<isSgNode(*it)->unparseToString()<<"\n\n";
-        } else {
-        
-        SgStatementPtrList& tnl2 = isSgBasicBlock(
-        recursiveFindReplacePreferedIdxs(isSgNode(*it),
-        body_syms, param_syms, body, loop_idxs,
-        globalscope))->get_statements();
-        for (SgStatementPtrList::const_iterator it2 = tnl2.begin();
-        it2 != tnl2.end(); it2++) {
-        clone->append_statement(*it2);
-        
-        sync_found = true;
-        //  std::cout<<isSgNode(*it2)->unparseToString()<<"\n\n";
-        }
-        }
-        
-        }
-        return isSgNode(clone);
-        }
-  */
-//  return tnl;
-}
-
-// loop_vars -> array references
-// loop_idxs -> <idx_name,idx_sym> map for when we encounter a loop with a different preferredIndex
-// dim_vars -> out param, fills with <old,new> var_sym pair for 2D array dimentions (messy stuff)
-SgNode* swapVarReferences(SgNode* code,
-                          std::set<const SgVariableSymbol *>& syms, SgSymbolTable* param,
-                          SgSymbolTable* body, SgScopeStatement* body_stmt) {
-  //Iterate over every expression, looking up each variable and type
-  //reference used and possibly replacing it or adding it to our symbol
-  //table
-  //
-  //We use the built-in cloning helper methods to seriously help us with this!
-  
-  //Need to do a recursive mark
-  
-  std::set<const SgVariableSymbol *>::iterator myIterator;
-  for (myIterator = syms.begin(); myIterator != syms.end(); myIterator++) {
-    SgName var_name = (*myIterator)->get_name();
-    std::string x = var_name.getString();
-    
-    if ((param->find_variable(var_name) == NULL)
-        && (body->find_variable(var_name) == NULL)) {
-      SgInitializedName* decl = (*myIterator)->get_declaration();
-      
-      SgVariableSymbol* dvs = new SgVariableSymbol(decl);
-      SgVariableDeclaration* var_decl = buildVariableDeclaration(
-        dvs->get_name(), dvs->get_type());
-      
-      AstTextAttribute* att = (AstTextAttribute*) (isSgNode(
-                                                     decl->get_declaration())->getAttribute("__shared__"));
-      if (isSgNode(decl->get_declaration())->attributeExists(
-            "__shared__"))
-        var_decl->get_declarationModifier().get_storageModifier().setCudaShared();
-      
-      appendStatement(var_decl, body_stmt);
-      
-      dvs->set_parent(body);
-      body->insert(var_name, dvs);
-    }
-    
-    std::vector<SgVarRefExp *> array = substitute(code, *myIterator, NULL,
-                                                  isSgNode(body));
-    
-    SgVariableSymbol* var = (SgVariableSymbol*) (*myIterator);
-    for (int j = 0; j < array.size(); j++)
-      array[j]->set_symbol(var);
-  }
-  
-  return code;
-}
-
-bool LoopCuda::validIndexes(int stmt, const std::vector<std::string>& idxs) {
-  for (int i = 0; i < idxs.size(); i++) {
-    bool found = false;
-    for (int j = 0; j < idxNames[stmt].size(); j++) {
-      if (strcmp(idxNames[stmt][j].c_str(), idxs[i].c_str()) == 0) {
-        found = true;
-      }
-    }
-    if (!found) {
-      return false;
-    }
-  }
-  return true;
-}
-
-bool LoopCuda::cudaize_v2(std::string kernel_name,
-                          std::map<std::string, int> array_dims,
-                          std::vector<std::string> blockIdxs,
-                          std::vector<std::string> threadIdxs) {
-  CG_outputBuilder *ocg = ir->builder();
-  int stmt_num = 0;
-  if (cudaDebug) {
-    printf("cudaize_v2(%s, {", kernel_name.c_str());
-    //for(
-    printf("}, blocks={");
-    printVs(blockIdxs);
-    printf("}, thread={");
-    printVs(threadIdxs);
-    printf("})\n");
-  }
-  
-  this->array_dims = array_dims;
-  if (!validIndexes(stmt_num, blockIdxs)) {
-    throw std::runtime_error("One of the indexes in the block list was not "
-                             "found in the current set of indexes.");
-  }
-  if (!validIndexes(stmt_num, threadIdxs)) {
-    throw std::runtime_error(
-      "One of the indexes in the thread list was not "
-      "found in the current set of indexes.");
-  }
-  if (blockIdxs.size() == 0)
-    throw std::runtime_error("Cudaize: Need at least one block dimention");
-  int block_level = 0;
-  //Now, we will determine the actual size (if possible, otherwise
-  //complain) for the block dimentions and thread dimentions based on our
-  //indexes and the relations for our stmt;
-  for (int i = 0; i < blockIdxs.size(); i++) {
-    int level = findCurLevel(stmt_num, blockIdxs[i]);
-    int ub, lb;
-    CG_outputRepr* ubrepr = extractCudaUB(stmt_num, level, ub, lb);
-    if (lb != 0) {
-      //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
-      if (cudaDebug)
-        printf(
-          "Cudaize: doing tile at level %d to try and normalize lower bounds\n",
-          level);
-      tile(stmt_num, level, 1, level, CountedTile);
-      idxNames[stmt_num].insert(idxNames[stmt_num].begin() + (level), ""); //TODO: possibly handle this for all sibling stmts
-      ubrepr = extractCudaUB(stmt_num, level, ub, lb);
-    }
-    if (lb != 0) {
-      char buf[1024];
-      sprintf(buf,
-              "Cudaize: Loop at level %d does not have 0 as it's lower bound",
-              level);
-      throw std::runtime_error(buf);
-    }
-    if (ub < 0) {
-      char buf[1024];
-      sprintf(buf,
-              "Cudaize: Loop at level %d does not have a hard upper bound",
-              level);
-      //Anand: Commenting out error indication for lack of constant upper bound
-      //throw std::runtime_error(buf);
-    }
-    if (cudaDebug)
-      printf("block idx %s level %d lb: %d ub %d\n", blockIdxs[i].c_str(),
-             level, lb, ub);
-    if (i == 0) {
-      block_level = level;
-      if (ubrepr == NULL) {
-        cu_bx = ub + 1;
-        cu_bx_repr = NULL;
-      } else {
-        cu_bx = 0;
-        cu_bx_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
-      }
-      idxNames[stmt_num][level - 1] = "bx";
-    } else if (i == 1) {
-      if (ubrepr == NULL) {
-        cu_by = ub + 1;
-        cu_by_repr = NULL;
-      } else {
-        cu_by = 0;
-        cu_by_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
-      }
-      idxNames[stmt_num][level - 1] = "by";
-    }
-  }
-  if (!cu_by && !cu_by_repr)
-    block_level = 0;
-  int thread_level1 = 0;
-  int thread_level2 = 0;
-  for (int i = 0; i < threadIdxs.size(); i++) {
-    int level = findCurLevel(stmt_num, threadIdxs[i]);
-    int ub, lb;
-    CG_outputRepr* ubrepr = extractCudaUB(stmt_num, level, ub, lb);
-    if (lb != 0) {
-      //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
-      if (cudaDebug)
-        printf(
-          "Cudaize: doing tile at level %d to try and normalize lower bounds\n",
-          level);
-      tile(stmt_num, level, 1, level, CountedTile);
-      idxNames[stmt_num].insert(idxNames[stmt_num].begin() + (level), "");
-      ubrepr = extractCudaUB(stmt_num, level, ub, lb);
-    }
-    if (lb != 0) {
-      char buf[1024];
-      sprintf(buf,
-              "Cudaize: Loop at level %d does not have 0 as it's lower bound",
-              level);
-      throw std::runtime_error(buf);
-    }
-    if (ub < 0) {
-      char buf[1024];
-      sprintf(buf,
-              "Cudaize: Loop at level %d does not have a hard upper bound",
-              level);
-      //Anand: Commenting out error indication for lack of constant upper bound
-      //throw std::runtime_error(buf);
-    }
-    
-    if (cudaDebug)
-      printf("thread idx %s level %d lb: %d ub %d\n",
-             threadIdxs[i].c_str(), level, lb, ub);
-    if (i == 0) {
-      thread_level1 = level;
-      if (ubrepr == NULL) {
-        cu_tx = ub + 1;
-        cu_tx_repr = NULL;
-      } else {
-        cu_tx = 0;
-        cu_tx_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
-      }
-      idxNames[stmt_num][level - 1] = "tx";
-    } else if (i == 1) {
-      thread_level2 = level;
-      if (ubrepr == NULL) {
-        cu_ty = ub + 1;
-        cu_ty_repr = NULL;
-      } else {
-        cu_ty = 0;
-        cu_ty_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
-      }
-      idxNames[stmt_num][level - 1] = "ty";
-    } else if (i == 2) {
-      if (ubrepr == NULL) {
-        cu_tz = ub + 1;
-        cu_tz_repr = NULL;
-      } else {
-        cu_tz = 0;
-        cu_tz_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
-      }
-      idxNames[stmt_num][level - 1] = "tz";
-    }
-  }
-  if (!cu_ty && !cu_ty_repr)
-    thread_level1 = 0;
-  if (!cu_tz && !cu_tz_repr)
-    thread_level2 = 0;
-  
-  //Make changes to nonsplitlevels
-  const int m = stmt.size();
-  for (int i = 0; i < m; i++) {
-    if (block_level) {
-      //stmt[i].nonSplitLevels.append((block_level)*2);
-      stmt_nonSplitLevels[i].push_back((block_level) * 2);
-    }
-    if (thread_level1) {
-      //stmt[i].nonSplitLevels.append((thread_level1)*2);
-      stmt_nonSplitLevels[i].push_back((thread_level1) * 2);
-    }
-    if (thread_level2) {
-      //stmt[i].nonSplitLevels.append((thread_level1)*2);
-      stmt_nonSplitLevels[i].push_back((thread_level1) * 2);
-    }
-  }
-  
-  if (cudaDebug) {
-    printf("Codegen: current names: ");
-    printVS(idxNames[stmt_num]);
-  }
-  //Set codegen flag
-  code_gen_flags |= GenCudaizeV2;
-  
-  //Save array dimention sizes
-  this->array_dims = array_dims;
-  cu_kernel_name = kernel_name.c_str();
-  
-}
-
-/*
- * setupConstantVar
- * handles constant variable declaration
- * and adds a global constant variable
- * parameters:
- *   constant - the constant_memory_mapping object for this loop
- *   arr_def  - the VarDefs object for the mapped variable
- *   globals  - Rose Global variables
- *   i        - an index to keep new variable names unique
- *   symtab   - global symbol table
- */
-static void setupConstantVar(constant_memory_mapping* constant, VarDefs* arr_def, SgGlobal* globals, int i, SgSymbolTable* symtab) {
-  char* buf1 = new char[32];
-  snprintf(buf1, 32, "cs%dRef", i+1);
-  arr_def->secondName = buf1;
-  
-  char buf2[64];
-  snprintf(buf2, 64, "__device__ __constant__ float");
-  
-  SgVariableDeclaration* consvar_decl = buildVariableDeclaration(
-                     SgName(std::string(buf1)), buildArrayType(
-                                 buildOpaqueType(SgName(buf2),globals),
-                                 arr_def->size_expr));
-  SgInitializedNamePtrList& variables = consvar_decl->get_variables();
-  SgInitializedNamePtrList::const_iterator j = variables.begin();
-  SgInitializedName* initializedName = *j;
-  SgVariableSymbol* consvar_sym = new SgVariableSymbol(initializedName);
-  prependStatement(consvar_decl, globals);
-
-  consvar_sym->set_parent(symtab);
-  symtab->insert(SgName(std::string(buf1)), consvar_sym);
-  
-  constant->set_mapped_symbol(arr_def->original_name.c_str(), consvar_sym);
-  constant->set_vardef(arr_def->original_name.c_str(), arr_def);
-}
-
-/*
- * cudaBindConstantVar
- * allocs a variable to constant memory
- *   constant  - the constant mapping object
- *   arr_def   - the VarDefs abject
- *   globals   - global symbol table
- *   stmt_list - the GPU functions' statement list
- */
-static void cudaBindConstantVar(constant_memory_mapping* constant, VarDefs* arr_def, SgGlobal* globals, SgStatementPtrList* stmt_list) {
-  SgName cudaMemcpyToSymbol_name("cudaMemcpyToSymbol");
-  SgFunctionDeclaration* cudaMemcpyToSymbol_decl = buildNondefiningFunctionDeclaration(
-    cudaMemcpyToSymbol_name, buildVoidType(), buildFunctionParameterList(), globals);
-  SgExprListExp* args = buildExprListExp();
-  args->append_expression(buildCastExp(constant->get_mapped_symbol_exp(arr_def->original_name.c_str()),
-                                       buildPointerType(buildVoidType())));
-  args->append_expression(buildVarRefExp(arr_def->in_data));
-  args->append_expression(arr_def->size_expr);
-  stmt_list->push_back(buildExprStatement(
-    buildFunctionCallExp(buildFunctionRefExp(cudaMemcpyToSymbol_decl), args)));
-}
-
-static void consmapArrayRefs(constant_memory_mapping* constant, std::vector<IR_ArrayRef*>* refs, SgGlobal* globals, IR_Code* ir, CG_roseBuilder* ocg) {
-  // if constant mapping is not being used, ignore this function
-  if(constant == NULL) return;
-  for(int i = 0; i < refs->size(); i++) {
-    IR_ArrayRef* aref = (*refs)[i];
-    if(constant->is_array_mapped(aref->name().c_str())) {
-      // get array reference dimensions
-      int dims = aref->symbol()->n_dim();
-      if(dims > 2) {
-        printf(" \n CHiLL does not handle constant memory mapping for more than 2D arrays.\n");
-        return;
-      }
-      
-      SgExpression* varexp = constant->get_mapped_symbol_exp(aref->name().c_str());
-      SgExpression* index_exp;
-      // build index expression
-      if(dims == 1) {
-        index_exp = static_cast<omega::CG_roseRepr*>(aref->index(0)->clone())->GetExpression();
-      }
-      if(dims == 2) {
-        VarDefs* arr_def = constant->get_vardef(aref->name().c_str());
-        CG_outputRepr* i0 = aref->index(0)->clone();
-        CG_outputRepr* i1 = aref->index(1)->clone();
-        CG_outputRepr* sz = new CG_roseRepr(buildIntVal(arr_def->size_multi_dim[0]));
-        CG_outputRepr* exp = ocg->CreatePlus(ocg->CreateTimes(sz->clone(), i0), i1);
-        index_exp = static_cast<omega::CG_roseRepr*>(exp->clone())->GetExpression();
-      }
-      ir->ReplaceExpression(aref, new CG_roseRepr(buildPntrArrRefExp(varexp, index_exp)));
-    }
-  }
-}
-
-/*
- * setupTexmappingVar
- * handles texture variable declaration
- * and adds a global texture object 
- * parameters:
- *   texture    - the texture_memory_mapping object
- *   arr_def    - the VarDefs object for the mapped variable
- *   globals    - Rose Global variables
- *   i          - an index to keep the new variable names unique
- *   devptr_sym - the devptr that the original variable is associated with
- *   symtab     - GPU function symbol table
- */
-static void setupTexmappingVar(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals, int i, SgVariableSymbol* devptr_sym, SgSymbolTable* symtab) {  
-  char* buf1 = new char[32];
-  snprintf(buf1, 32, "tex%dRef", i+1);
-  arr_def->secondName = buf1;
-  
-  char buf2[64];
-  // single-dimensional 
-  snprintf(buf2, 64, "texture<float, %d, cudaReadModeElementType>", 1);
-  // multi-dimensional
-  // snprintf(buf2, 64, "texture<float, %d, cudaReadModeElemetType>", (int)(arr_def->size_multi_dim.size())); //*/
-  
-  SgVariableDeclaration* texvar_decl = buildVariableDeclaration(SgName(std::string(buf1)), buildOpaqueType(buf2, globals));
-  
-  SgInitializedNamePtrList& variables = texvar_decl->get_variables();
-  SgInitializedNamePtrList::const_iterator j = variables.begin();
-  SgInitializedName* initializedName = *j;
-  SgVariableSymbol* texvar_sym = new SgVariableSymbol(initializedName);
-  prependStatement(texvar_decl, globals);
-  
-  texvar_sym->set_parent(symtab);
-  symtab->insert(SgName(buf1), texvar_sym);
-  
-  texture->set_mapped_symbol(arr_def->original_name.c_str(), texvar_sym);
-  texture->set_devptr_symbol(arr_def->original_name.c_str(), devptr_sym);
-  texture->set_vardef(arr_def->original_name.c_str(), arr_def);
-}
-
-
-/*
- * One dimensional version of cudaBindTexture
- * see cudaBindTexture for details
- */
-static SgFunctionCallExp* cudaBindTexture1D(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals) {
-  SgName cudaBindTexture_name("cudaBindTexture");
-  SgFunctionDeclaration* cudaBindTexture_decl = buildNondefiningFunctionDeclaration(
-      cudaBindTexture_name, buildVoidType(), buildFunctionParameterList(), globals);
-  
-  SgExprListExp* args = buildExprListExp();
-  args->append_expression(buildIntVal(0));
-  args->append_expression(texture->get_mapped_symbol_exp(arr_def->original_name.c_str()));
-  args->append_expression(texture->get_devptr_symbol_exp(arr_def->original_name.c_str()));
-  args->append_expression(arr_def->size_expr);
-  return buildFunctionCallExp(buildFunctionRefExp(cudaBindTexture_decl), args);
-}
-
-/*
- * Two dimensional version of cudaBindTexture
- * see cudaBindTexture for details
- */
-//static SgFunctionCallExp* cudaBindTexture2D(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals) {
-//  SgName cudaBindTexture_name("cudaBindTexture2D");
-//  SgFunctionDeclaration* cudaBindTexture_decl = buildNondefiningFunctionDeclaration(
-//      cudaBindTexture_name, buildVoidType(), buildFunctionParameterList(), globals);
-//  
-//  SgExprListExp* args = buildExprListExp();
-//  args->append_expression(buildIntVal(0));
-//  args->append_expression(texture->get_tex_mapped_symbol_exp(arr_def->original_name.c_str()));
-//  args->append_expression(texture->get_devptr_symbol_exp(arr_def->original_name.c_str()));
-//  args->append_expression(buildIntVal(texture->get_dim_length(arr_def->original_name.c_str(), 0)));
-//  args->append_expression(buildIntVal(texture->get_dim_length(arr_def->original_name.c_str(), 1)));
-//  args->append_expression(arr_def->size_expr);
-//  return buildFunctionCallExp(buildFunctionRefExp(cudaBindTexture_decl), args);
-//}
-
-/*
- * cudaBindTexture
- * binds a variable to a texture
- * parameters:
- *    texture   - the texture mapping object
- *    arr_def   - the VarDefs object
- *    globals   - global symbol table
- *    stmt_list - the GPU functions' statement list
- * notes:
- *    only supports binding 1D textures, may need to consider cudaBindTexture2D for 2D textures
- */
-static void cudaBindTexture(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals, SgStatementPtrList* stmt_list) {
-  //int dims = (int)(arr_def->size_multi_dim.size());
-  //int dims = texture->get_dims(arr_def->original_name.c_str());
-  //if(dims == 1)
-    stmt_list->push_back(
-      buildExprStatement(cudaBindTexture1D(texture, arr_def, globals)));
-  //if(dims == 2)
-  //  stmt_list->push_back(
-  //    buildExprStatement(cudaBindTexture2D(texture, arr_def, globals)));
-}
-
-/*
- * texmapArrayRefs
- * maps array reference expresions of texture mapped variables to the tex1D function
- * parameters:
- *    texture - the texture mapping object
- *    refs    - a list of all array read operations
- *    globals - global symbol table
- *    ir      - handles IR_Code operations
- *    ocg     - handles CG_roseBuilder operations
-**/
-static void texmapArrayRefs(texture_memory_mapping* texture, std::vector<IR_ArrayRef*>* refs, SgGlobal* globals, IR_Code* ir, CG_roseBuilder *ocg) {
-  // if texture mapping is not being used, ignore this function
-  if(texture == NULL) return;
-  for(int i = 0; i < refs->size(); i++) {
-    IR_ArrayRef* aref = (*refs)[i];
-    if(texture->is_array_mapped(aref->name().c_str())) {
-      
-      // get array dimensions
-      VarDefs* arr_def = texture->get_vardef(aref->name().c_str());
-      int dims = aref->symbol()->n_dim();
-      if(dims > 2) {
-        printf(" \n CHiLL does not handle texture mapping for more than 2D arrays.\n");
-        // TODO throw some sort of error. or handle in texture_copy function
-        return;
-      }
-      
-      // build texture lookup function declaration
-      char texNDfetch_strName[16];
-      sprintf(texNDfetch_strName, "tex%dDfetch", 1); // for now, only support tex1Dfetch
-      //sprintf(texNDfetch_strName, "tex%dDfetch", dims);
-      SgFunctionDeclaration* fetch_decl = buildNondefiningFunctionDeclaration(
-          SgName(texNDfetch_strName), buildFloatType(), buildFunctionParameterList(), globals);
-      
-      // build args
-      SgExprListExp* args = buildExprListExp();
-      args->append_expression(texture->get_mapped_symbol_exp(aref->name().c_str()));
-      
-      // set indexing args
-      //for(int i = 0; i < dims; i++) {
-      //  args->append_expression((static_cast<omega::CG_roseRepr*>(aref->index(i)->clone()))->GetExpression());
-      //}
-      if(dims == 1) {
-        args->append_expression(static_cast<omega::CG_roseRepr*>(aref->index(0)->clone())->GetExpression());
-      }
-      else if(dims == 2) {
-        CG_outputRepr* i0 = aref->index(0)->clone();
-        CG_outputRepr* i1 = aref->index(1)->clone();
-        CG_outputRepr* sz = new CG_roseRepr(buildIntVal(arr_def->size_multi_dim[0]));
-        CG_outputRepr* expr = ocg->CreatePlus(ocg->CreateTimes(sz->clone(), i0), i1);
-        args->append_expression(static_cast<omega::CG_roseRepr*>(expr->clone())->GetExpression());
-      }
-      
-      // build function call and replace original array ref
-      SgFunctionCallExp* fetch_call = buildFunctionCallExp(buildFunctionRefExp(fetch_decl), args);
-      ir->ReplaceExpression(aref, new CG_roseRepr(fetch_call));
-    }
-  }
-}
-
-SgNode* LoopCuda::cudaize_codegen_v2() {
-    if(cudaDebug)
-	printf("cudaize codegen V2\n");
-  CG_roseBuilder *ocg = dynamic_cast<CG_roseBuilder*>(ir->builder());
-  if (!ocg)
-    return false;
-
- //protonu--adding an annote to track texture memory type
-  //ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE);
-  //ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE);
-  int tex_mem_on = 0;
-  int cons_mem_on = 0;
-
-
-  
-  CG_outputRepr* repr;
-  std::vector<VarDefs> arrayVars;
-  std::vector<VarDefs> localScopedVars;
-  
-  std::vector<IR_ArrayRef *> ro_refs;
-  std::vector<IR_ArrayRef *> wo_refs;
-  std::set<std::string> uniqueRefs;
-  std::set<std::string> uniqueWoRefs;
-  std::set<const SgVariableSymbol *> syms;
-  std::set<const SgVariableSymbol *> psyms;
-  std::set<const SgVariableSymbol *> pdSyms;
-  SgStatementPtrList* replacement_list = new SgStatementPtrList;
-  
-  for (int j = 0; j < stmt.size(); j++) {
-    std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[j].code);
-    for (int i = 0; i < refs.size(); i++) {
-      //printf("ref %s wo %d\n", static_cast<const char*>(refs[i]->name()), refs[i]->is_write());
-      SgVariableSymbol* var = body_symtab->find_variable(
-        SgName((char*) refs[i]->name().c_str()));
-      SgVariableSymbol* var2 = parameter_symtab->find_variable(
-        SgName((char*) refs[i]->name().c_str()));
-      
-      //If the array is not a parameter, then it's a local array and we
-      //want to recreate it as a stack variable in the kernel as opposed to
-      //passing it in.
-      if (var != NULL) {
-        //anand-- needs modification, if variable is parameter it wont be part of the
-        // block's symbol table but the functiond definition's symbol table
-        
-        continue;
-      }
-      if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end()) {
-        
-        uniqueRefs.insert(refs[i]->name());
-        if (refs[i]->is_write()) {
-          uniqueWoRefs.insert(refs[i]->name());
-          wo_refs.push_back(refs[i]);
-        } else
-          ro_refs.push_back(refs[i]);
-      }
-      if (refs[i]->is_write()
-          && uniqueWoRefs.find(refs[i]->name())
-          == uniqueWoRefs.end()) {
-        uniqueWoRefs.insert(refs[i]->name());
-        wo_refs.push_back(refs[i]);
-        //printf("adding %s to wo\n", static_cast<const char*>(refs[i]->name()));
-      }
-      pdSyms.insert((const SgVariableSymbol*) var2);
-    }
-  }
-  
-  if (cudaDebug) {
-      printf("reading from array ");
-      for (int i = 0; i < ro_refs.size(); i++)
-	  printf("'%s' ", ro_refs[i]->name().c_str());
-      printf("and writing to array ");
-      for (int i = 0; i < wo_refs.size(); i++)
-	  printf("'%s' ", wo_refs[i]->name().c_str());
-      printf("\n");
-  }  
-  const char* gridName = "dimGrid";
-  const char* blockName = "dimBlock";
-  
-  //TODO: Could allow for array_dims_vars to be a mapping from array
-  //references to to variable names that define their length.
-  SgVariableSymbol* dim1 = 0;
-  SgVariableSymbol* dim2 = 0;
-  
-  for (int i = 0; i < wo_refs.size(); i++) {
-    //TODO: Currently assume all arrays are floats of one or two dimentions
-    SgVariableSymbol* outArray = 0;
-    std::string name = wo_refs[i]->name();
-    outArray = body_symtab->find_variable(SgName((char*) name.c_str()));
-    int size_n_d;
-    if (outArray == NULL)
-      outArray = parameter_symtab->find_variable(
-        SgName((char*) name.c_str()));
-    
-    VarDefs v;
-    v.size_multi_dim = std::vector<int>();
-    char buf[32];
-    snprintf(buf, 32, "devO%dPtr", i + 1);
-    v.name = buf;
-    if (isSgPointerType(outArray->get_type())) {
-      if (isSgArrayType(
-            isSgNode(
-              isSgPointerType(outArray->get_type())->get_base_type()))) {
-        //  v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type();
-        SgType* t =
-          isSgPointerType(outArray->get_type())->get_base_type();
-        /*   SgExprListExp* dimList = t->get_dim_info();
-             SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
-             SgExpression* expr=NULL;
-             for (; j != dimList->get_expressions().end(); j++)
-             expr = *j;
-        */
-        while (isSgArrayType(t))
-          t = isSgArrayType(t)->get_base_type();
-        
-        if (!isSgType(t)) {
-          char buf[1024];
-          sprintf(buf, "CudaizeCodeGen: Array type undetected!");
-          throw std::runtime_error(buf);
-          
-        }
-        
-        v.type = t;
-      } else
-        v.type = isSgPointerType(outArray->get_type())->get_base_type();
-    } else if (isSgArrayType(outArray->get_type())) {
-      if (isSgArrayType(
-            isSgNode(
-              isSgArrayType(outArray->get_type())->get_base_type()))) {
-        //  v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type();
-        SgType* t =
-          isSgArrayType(outArray->get_type())->get_base_type();
-        /*   SgExprListExp* dimList = t->get_dim_info();
-             SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
-             SgExpression* expr=NULL;
-             for (; j != dimList->get_expressions().end(); j++)
-             expr = *j;
-        */
-        while (isSgArrayType(t))
-          t = isSgArrayType(t)->get_base_type();
-        
-        if (!isSgType(t)) {
-          char buf[1024];
-          sprintf(buf, "CudaizeCodeGen: Array type undetected!");
-          throw std::runtime_error(buf);
-          
-        }
-        
-        v.type = t;
-      } else
-        v.type = isSgArrayType(outArray->get_type())->get_base_type();
-    } else
-      v.type = buildFloatType();
-    v.tex_mapped = false;
-    v.cons_mapped = false;
-    v.original_name = wo_refs[i]->name();
-    //Size of the array = dim1 * dim2 * num bytes of our array type
-    
-    //If our input array is 2D (non-linearized), we want the actual
-    //dimentions of the array
-    CG_outputRepr* size;
-    //Lookup in array_dims
-    std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
-    if (isSgPointerType(outArray->get_type())
-        && isSgArrayType(
-          isSgNode(
-            isSgPointerType(outArray->get_type())->get_base_type()))) {
-      SgType* t = isSgPointerType(outArray->get_type())->get_base_type();
-      /*   SgExprListExp* dimList = t->get_dim_info();
-           SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
-           SgExpression* expr=NULL;
-           for (; j != dimList->get_expressions().end(); j++)
-           expr = *j;
-      */
-      if (isSgIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedLongVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgLongLongIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedLongLongIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgAddOp(isSgArrayType(t)->get_index())) {
-        SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-        
-        SgExpression *lhs = op_add->get_lhs_operand();
-        SgExpression *rhs = op_add->get_rhs_operand();
-        
-        if (isSgIntVal(lhs))
-          size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
-        else if (isSgUnsignedIntVal(lhs))
-          size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
-            + (int) isSgUnsignedIntVal(rhs)->get_value();
-        else if (isSgUnsignedLongVal(lhs))
-          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongIntVal(lhs))
-          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongLongIntVal(lhs))
-          size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongIntVal(lhs))
-          size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
-                            + isSgLongIntVal(rhs)->get_value());
-        else if (isSgUnsignedLongLongIntVal(lhs))
-          size_n_d =
-            (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
-                   + isSgUnsignedLongLongIntVal(rhs)->get_value());
-        
-      }
-      t = isSgArrayType(t)->get_base_type();
-      while (isSgArrayType(t)) {
-        int dim;
-        if (isSgIntVal(isSgArrayType(t)->get_index()))
-          dim =
-            (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedLongVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedLongLongIntVal(
-                   isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedLongLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgAddOp(isSgArrayType(t)->get_index())) {
-          SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-          
-          SgExpression *lhs = op_add->get_lhs_operand();
-          SgExpression *rhs = op_add->get_rhs_operand();
-          
-          if (isSgIntVal(lhs))
-            dim = (int) isSgIntVal(lhs)->get_value()
-              + (int) (isSgIntVal(rhs)->get_value());
-          else if (isSgUnsignedIntVal(lhs))
-            dim = (int) isSgUnsignedIntVal(lhs)->get_value()
-              + (int) isSgUnsignedIntVal(rhs)->get_value();
-          else if (isSgUnsignedLongVal(lhs))
-            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongIntVal(lhs))
-            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongLongIntVal(lhs))
-            dim = (int) (isSgLongLongIntVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongIntVal(lhs))
-            dim = (int) (isSgLongIntVal(lhs)->get_value()
-                         + isSgLongIntVal(rhs)->get_value());
-          else if (isSgUnsignedLongLongIntVal(lhs))
-            dim =
-              (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
-                     + isSgUnsignedLongLongIntVal(rhs)->get_value());
-          
-        }
-        size_n_d *= dim;
-        v.size_multi_dim.push_back(dim);
-        t = isSgArrayType(t)->get_base_type();
-      }
-      //v.size_2d = (int) (isSgIntVal(t->get_index())->get_value());
-      
-      if (cudaDebug)
-	  printf("Detected Multi-dimensional array sized of %d for %s\n",
-		 size_n_d, (char*) wo_refs[i]->name().c_str());
-      size = ocg->CreateInt(size_n_d);
-    } else if (isSgArrayType(outArray->get_type())
-               && isSgArrayType(
-                 isSgNode(
-                   isSgArrayType(outArray->get_type())->get_base_type()))) {
-      SgType* t = outArray->get_type();
-      /*   SgExprListExp* dimList = t->get_dim_info();
-           SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
-           SgExpression* expr=NULL;
-           for (; j != dimList->get_expressions().end(); j++)
-           expr = *j;
-      */
-      
-      if (isSgIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedLongVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgLongLongIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedLongLongIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgAddOp(isSgArrayType(t)->get_index())) {
-        SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-        
-        SgExpression *lhs = op_add->get_lhs_operand();
-        SgExpression *rhs = op_add->get_rhs_operand();
-        
-        if (isSgIntVal(lhs))
-          size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
-        else if (isSgUnsignedIntVal(lhs))
-          size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
-            + (int) isSgUnsignedIntVal(rhs)->get_value();
-        else if (isSgUnsignedLongVal(lhs))
-          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongIntVal(lhs))
-          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongLongIntVal(lhs))
-          size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongIntVal(lhs))
-          size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
-                            + isSgLongIntVal(rhs)->get_value());
-        else if (isSgUnsignedLongLongIntVal(lhs))
-          size_n_d =
-            (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
-                   + isSgUnsignedLongLongIntVal(rhs)->get_value());
-        
-      }
-      t = isSgArrayType(t)->get_base_type();
-      while (isSgArrayType(t)) {
-        int dim;
-        if (isSgIntVal(isSgArrayType(t)->get_index()))
-          dim =
-            (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedLongVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedLongLongIntVal(
-                   isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedLongLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgAddOp(isSgArrayType(t)->get_index())) {
-          SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-          
-          SgExpression *lhs = op_add->get_lhs_operand();
-          SgExpression *rhs = op_add->get_rhs_operand();
-          
-          if (isSgIntVal(lhs))
-            dim = (int) isSgIntVal(lhs)->get_value()
-              + (int) (isSgIntVal(rhs)->get_value());
-          else if (isSgUnsignedIntVal(lhs))
-            dim = (int) isSgUnsignedIntVal(lhs)->get_value()
-              + (int) isSgUnsignedIntVal(rhs)->get_value();
-          else if (isSgUnsignedLongVal(lhs))
-            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongIntVal(lhs))
-            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongLongIntVal(lhs))
-            dim = (int) (isSgLongLongIntVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongIntVal(lhs))
-            dim = (int) (isSgLongIntVal(lhs)->get_value()
-                         + isSgLongIntVal(rhs)->get_value());
-          else if (isSgUnsignedLongLongIntVal(lhs))
-            dim =
-              (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
-                     + isSgUnsignedLongLongIntVal(rhs)->get_value());
-          
-        }
-        size_n_d *= dim;
-        v.size_multi_dim.push_back(dim);
-        t = isSgArrayType(t)->get_base_type();
-      }
-      
-      //v.size_2d = (int) (isSgIntVal(t->get_index())->get_value());
-      
-      if (cudaDebug)
-	  printf("Detected Multi-Dimensional array sized of %d for %s\n",
-		 size_n_d, (char*) wo_refs[i]->name().c_str());
-      size = ocg->CreateInt(size_n_d);
-    } else if (it != array_dims.end()) {
-      int ref_size = it->second;
-      //size =
-      //        ocg->CreateInt(
-      //                isSgIntVal(
-      //                        isSgArrayType(outArray->get_type())->get_index())->get_value());
-      //v.size_2d = isSgArrayType(outArray->get_type())->get_rank();
-      //v.var_ref_size = ref_size;
-      size = ocg->CreateInt(ref_size);
-      
-    } else {
-      if (dim1) {
-        size = ocg->CreateTimes(
-          new CG_roseRepr(isSgExpression(buildVarRefExp(dim1))),
-          new CG_roseRepr(isSgExpression(buildVarRefExp(dim2))));
-      } else {
-        char buf[1024];
-        sprintf(buf,
-                "CudaizeCodeGen: Array reference %s does not have a "
-                "detectable size or specififed dimentions",
-                name.c_str());
-        throw std::runtime_error(buf);
-      }
-    }
-    
-    v.size_expr =
-      static_cast<CG_roseRepr*>(ocg->CreateTimes(size,
-                                                 new omega::CG_roseRepr(
-                                                   isSgExpression(buildSizeOfOp(v.type)))))->GetExpression();
-    
-    v.in_data = 0;
-    v.out_data = outArray;
-    //Check for in ro_refs and remove it at this point
-    std::vector<IR_ArrayRef *>::iterator it_;
-    for (it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++) {
-      if ((*it_)->name() == wo_refs[i]->name()) {
-        break;
-      }
-    }
-    if (it_ != ro_refs.end()) {
-      v.in_data = outArray;
-      ro_refs.erase(it_);
-    }
-    
-    arrayVars.push_back(v);
-    
-  }
-  
-  //protonu-- assuming that all texture mapped memories were originally read only mems
-  //there should be safety checks for that, will implement those later
-  
-  for (int i = 0; i < ro_refs.size(); i++) {
-    SgVariableSymbol* inArray = 0;
-    std::string name = ro_refs[i]->name();
-    inArray = body_symtab->find_variable(SgName((char*) name.c_str()));
-    if (inArray == NULL)
-      inArray = parameter_symtab->find_variable(
-        SgName((char*) name.c_str()));
-    
-    VarDefs v;
-    v.size_multi_dim = std::vector<int>();
-    char buf[32];
-    snprintf(buf, 32, "devI%dPtr", i + 1);
-    v.name = buf;
-    int size_n_d;
-    if (isSgPointerType(inArray->get_type())) {
-      if (isSgArrayType(
-            isSgNode(
-              isSgPointerType(inArray->get_type())->get_base_type()))) {
-        
-        SgType* t =
-          isSgPointerType(inArray->get_type())->get_base_type();
-        
-        while (isSgArrayType(t))
-          t = isSgArrayType(t)->get_base_type();
-        
-        if (!isSgType(t)) {
-          char buf[1024];
-          sprintf(buf, "CudaizeCodeGen: Array type undetected!");
-          throw std::runtime_error(buf);
-          
-        }
-        v.type = t;
-      } else
-        v.type = isSgPointerType(inArray->get_type())->get_base_type();
-    } else if (isSgArrayType(inArray->get_type())) {
-      if (isSgArrayType(
-            isSgNode(
-              isSgArrayType(inArray->get_type())->get_base_type()))) {
-        
-        SgType* t = inArray->get_type();
-        while (isSgArrayType(t))
-          t = isSgArrayType(t)->get_base_type();
-        
-        if (!isSgType(t)) {
-          char buf[1024];
-          sprintf(buf, "CudaizeCodeGen: Array type undetected!");
-          throw std::runtime_error(buf);
-          
-        }
-        v.type = t;
-      } else
-        v.type = isSgArrayType(inArray->get_type())->get_base_type();
-    }
-    
-    else
-      v.type = buildFloatType();
-    
-    v.tex_mapped = false;
-    v.cons_mapped = false;
-    v.original_name = ro_refs[i]->name();
-    
-    //derick -- adding texture and constant mapping
-    if ( texture != NULL)
-      v.tex_mapped = (texture->is_array_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
-    if (v.tex_mapped){
-      printf("this variable  %s is mapped to texture memory", name.c_str());
-    }
-    //derick -- this is commented out until constant memory is implemeted
-    if ( constant_mem != NULL)
-      v.cons_mapped = (constant_mem->is_array_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
-    if (v.cons_mapped){
-      printf("this variable  %s is mapped to constant memory", name.c_str());
-    }
-    
-    //Size of the array = dim1 * dim2 * num bytes of our array type
-    //If our input array is 2D (non-linearized), we want the actual
-    //dimentions of the array (as it might be less than cu_n
-    CG_outputRepr* size;
-    //Lookup in array_dims
-    std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
-    if (isSgPointerType(inArray->get_type())
-        && isSgArrayType(
-          isSgPointerType(inArray->get_type())->get_base_type())) {
-      //SgArrayType* t = isSgArrayType(isSgArrayType(inArray->get_type())->get_base_type());
-      //v.size_2d = t->get_rank();
-      SgType* t = isSgPointerType(inArray->get_type())->get_base_type();
-      /*   SgExprListExp* dimList = t->get_dim_info();
-           SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
-           SgExpression* expr=NULL;
-           for (; j != dimList->get_expressions().end(); j++)
-           expr = *j;
-      */
-      //v.size_2d = 1;
-      if (isSgIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedLongVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgLongLongIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedLongLongIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgAddOp(isSgArrayType(t)->get_index())) {
-        SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-        
-        SgExpression *lhs = op_add->get_lhs_operand();
-        SgExpression *rhs = op_add->get_rhs_operand();
-        
-        if (isSgIntVal(lhs))
-          size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
-        else if (isSgUnsignedIntVal(lhs))
-          size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
-            + (int) isSgUnsignedIntVal(rhs)->get_value();
-        else if (isSgUnsignedLongVal(lhs))
-          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongIntVal(lhs))
-          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongLongIntVal(lhs))
-          size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongIntVal(lhs))
-          size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
-                            + isSgLongIntVal(rhs)->get_value());
-        else if (isSgUnsignedLongLongIntVal(lhs))
-          size_n_d =
-            (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
-                   + isSgUnsignedLongLongIntVal(rhs)->get_value());
-        
-      }
-      t = isSgArrayType(t)->get_base_type();
-      while (isSgArrayType(t)) {
-        int dim;
-        if (isSgIntVal(isSgArrayType(t)->get_index()))
-          dim =
-            (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedLongVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedLongLongIntVal(
-                   isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedLongLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgAddOp(isSgArrayType(t)->get_index())) {
-          SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-          
-          SgExpression *lhs = op_add->get_lhs_operand();
-          SgExpression *rhs = op_add->get_rhs_operand();
-          
-          if (isSgIntVal(lhs))
-            dim = (int) isSgIntVal(lhs)->get_value()
-              + (int) (isSgIntVal(rhs)->get_value());
-          else if (isSgUnsignedIntVal(lhs))
-            dim = (int) isSgUnsignedIntVal(lhs)->get_value()
-              + (int) isSgUnsignedIntVal(rhs)->get_value();
-          else if (isSgUnsignedLongVal(lhs))
-            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongIntVal(lhs))
-            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongLongIntVal(lhs))
-            dim = (int) (isSgLongLongIntVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongIntVal(lhs))
-            dim = (int) (isSgLongIntVal(lhs)->get_value()
-                         + isSgLongIntVal(rhs)->get_value());
-          else if (isSgUnsignedLongLongIntVal(lhs))
-            dim =
-              (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
-                     + isSgUnsignedLongLongIntVal(rhs)->get_value());
-          
-        }
-        size_n_d *= dim;
-        v.size_multi_dim.push_back(dim);
-        t = isSgArrayType(t)->get_base_type();
-      }
-      if (cudaDebug)
-	  printf("Detected Multi-dimensional array sized of %d for %s\n",
-		 size_n_d, (char*) ro_refs[i]->name().c_str());
-      size = ocg->CreateInt(size_n_d);
-    } else if (isSgArrayType(inArray->get_type())
-               && isSgArrayType(
-                 isSgArrayType(inArray->get_type())->get_base_type())) {
-      //SgArrayType* t = isSgArrayType(isSgArrayType(inArray->get_type())->get_base_type());
-      //v.size_2d = t->get_rank();
-      SgType* t = inArray->get_type();
-      /*   SgExprListExp* dimList = t->get_dim_info();
-           SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
-           SgExpression* expr=NULL;
-           for (; j != dimList->get_expressions().end(); j++)
-           expr = *j;
-      */
-      
-      if (isSgIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedLongVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgLongLongIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d =
-          (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
-      else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
-        size_n_d = (int) (isSgUnsignedLongLongIntVal(
-                            isSgArrayType(t)->get_index())->get_value());
-      else if (isSgAddOp(isSgArrayType(t)->get_index())) {
-        SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-        
-        SgExpression *lhs = op_add->get_lhs_operand();
-        SgExpression *rhs = op_add->get_rhs_operand();
-        
-        if (isSgIntVal(lhs))
-          size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
-        else if (isSgUnsignedIntVal(lhs))
-          size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
-            + (int) isSgUnsignedIntVal(rhs)->get_value();
-        else if (isSgUnsignedLongVal(lhs))
-          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongIntVal(lhs))
-          size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongLongIntVal(lhs))
-          size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
-                            + isSgUnsignedLongVal(rhs)->get_value());
-        else if (isSgLongIntVal(lhs))
-          size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
-                            + isSgLongIntVal(rhs)->get_value());
-        else if (isSgUnsignedLongLongIntVal(lhs))
-          size_n_d =
-            (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
-                   + isSgUnsignedLongLongIntVal(rhs)->get_value());
-        
-      }
-      t = isSgArrayType(t)->get_base_type();
-      while (isSgArrayType(t)) {
-        int dim;
-        if (isSgIntVal(isSgArrayType(t)->get_index()))
-          dim =
-            (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedLongVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
-          dim = (int) (isSgLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgUnsignedLongLongIntVal(
-                   isSgArrayType(t)->get_index()))
-          dim = (int) (isSgUnsignedLongLongIntVal(
-                         isSgArrayType(t)->get_index())->get_value());
-        else if (isSgAddOp(isSgArrayType(t)->get_index())) {
-          SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-          
-          SgExpression *lhs = op_add->get_lhs_operand();
-          SgExpression *rhs = op_add->get_rhs_operand();
-          
-          if (isSgIntVal(lhs))
-            dim = (int) isSgIntVal(lhs)->get_value()
-              + (int) (isSgIntVal(rhs)->get_value());
-          else if (isSgUnsignedIntVal(lhs))
-            dim = (int) isSgUnsignedIntVal(lhs)->get_value()
-              + (int) isSgUnsignedIntVal(rhs)->get_value();
-          else if (isSgUnsignedLongVal(lhs))
-            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongIntVal(lhs))
-            dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongLongIntVal(lhs))
-            dim = (int) (isSgLongLongIntVal(lhs)->get_value()
-                         + isSgUnsignedLongVal(rhs)->get_value());
-          else if (isSgLongIntVal(lhs))
-            dim = (int) (isSgLongIntVal(lhs)->get_value()
-                         + isSgLongIntVal(rhs)->get_value());
-          else if (isSgUnsignedLongLongIntVal(lhs))
-            dim =
-              (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
-                     + isSgUnsignedLongLongIntVal(rhs)->get_value());
-          
-        }
-        size_n_d *= dim;
-        v.size_multi_dim.push_back(dim);
-        t = isSgArrayType(t)->get_base_type();
-      }
-      if (cudaDebug)
-	  printf("Detected Multi-Dimensional array sized of %d for %s\n",
-		 size_n_d, (char*) ro_refs[i]->name().c_str());
-      size = ocg->CreateInt(size_n_d);
-    }
-    
-    else if (it != array_dims.end()) {
-      int ref_size = it->second;
-      //                v.var_ref_size = ref_size;
-      size = ocg->CreateInt(ref_size);
-    } else {
-      if (dim1) {
-        size = ocg->CreateTimes(
-          new CG_roseRepr(isSgExpression(buildVarRefExp(dim1))),
-          new CG_roseRepr(isSgExpression(buildVarRefExp(dim2))));
-      } else {
-        char buf[1024];
-        sprintf(buf,
-                "CudaizeCodeGen: Array reference %s does not have a "
-                "detectable size or specififed dimentions",
-                name.c_str());
-        throw std::runtime_error(buf);
-      }
-    }
-    v.size_expr =
-      static_cast<CG_roseRepr*>(ocg->CreateTimes(size,
-                                                 new omega::CG_roseRepr(
-                                                   isSgExpression(buildSizeOfOp(v.type)))))->GetExpression();
-    
-    v.in_data = inArray;
-    v.out_data = 0;
-    arrayVars.push_back(v);
-  }
-  
-  if (arrayVars.size() < 2) {
-    fprintf(stderr,
-            "cudaize error: Did not find two arrays being accessed\n");
-    return false;
-  }
-
-  //protonu--debugging tool--the printf statement
-  //tex_mem_on signals use of tex mem
-  /* derick -- texmapping near malloc mcopy
-  for(int i=0; i<arrayVars.size(); i++)
-  {
-	  //printf("var name %s, tex_mem used %s\n", arrayVars[i].name.c_str(), (arrayVars[i].tex_mapped)?"true":"false");
-	  if (arrayVars[i].tex_mapped  ) tex_mem_on ++;
-	  //if (arrayVars[i].cons_mapped  ) cons_mem_on ++;
-  }
-  */
-  
-  //Add our mallocs (and input array memcpys)
-  for (int i = 0; i < arrayVars.size(); i++) {
-    if(arrayVars[i].cons_mapped) {
-      setupConstantVar(constant_mem, &arrayVars[i], globals, i, symtab);
-      SgStatementPtrList *tnl = new SgStatementPtrList;
-      cudaBindConstantVar(constant_mem, &arrayVars[i], globals, tnl);
-      setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
-    } 
-    else {
-      SgVariableDeclaration* defn = buildVariableDeclaration(
-        SgName(arrayVars[i].name.c_str()),
-        buildPointerType(arrayVars[i].type));
-      SgInitializedNamePtrList& variables = defn->get_variables();
-      SgInitializedNamePtrList::const_iterator j = variables.begin();
-      SgInitializedName* initializedName = *j;
-      SgVariableSymbol* dvs = new SgVariableSymbol(initializedName);
-      prependStatement(defn, func_body);
-      
-      dvs->set_parent(body_symtab);
-      body_symtab->insert(SgName(arrayVars[i].name.c_str()), dvs);
-    
-//    SgVariableSymbol* dvs = body_symtab->find_variable(SgName(arrayVars[i].name.c_str()));
-    
-    //  if(dvs == NULL)
-    //      dvs =  parameter_symtab->find_variable(SgName(arrayVars[i].name.c_str()));
-    
-    //cudaMalloc args
-    // SgBasicBlock* block = buildBasicBlock();
-      SgName name_cuda_malloc("cudaMalloc");
-      SgFunctionDeclaration * decl_cuda_malloc =
-        buildNondefiningFunctionDeclaration(name_cuda_malloc,
-                                            buildVoidType(), buildFunctionParameterList(), globals);
-      
-      SgName name_cuda_copy("cudaMemcpy");
-      SgFunctionDeclaration * decl_cuda_copy =
-        buildNondefiningFunctionDeclaration(name_cuda_copy,
-                                            buildVoidType(), buildFunctionParameterList(), globals);
-      
-      SgExprListExp* args = buildExprListExp();
-      args->append_expression(
-        buildCastExp(buildAddressOfOp(buildVarRefExp(dvs)),
-                     buildPointerType(buildPointerType(buildVoidType()))));
-      args->append_expression(arrayVars[i].size_expr);
-    
-//    decl_cuda_malloc->get_parameterList()->append_arg
-      SgFunctionCallExp *the_call = buildFunctionCallExp(
-        buildFunctionRefExp(decl_cuda_malloc), args);
-      
-      SgExprStatement* stmt = buildExprStatement(the_call);
-    
-    //  (*replacement_list).push_back (stmt);
-    
-      SgStatementPtrList* tnl = new SgStatementPtrList;
-      (*tnl).push_back(stmt);
-      setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
-      if (arrayVars[i].in_data) {
-      
-        SgExprListExp * cuda_copy_in_args = buildExprListExp();
-        cuda_copy_in_args->append_expression(
-          isSgExpression(buildVarRefExp(dvs)));
-        cuda_copy_in_args->append_expression(
-          isSgExpression(buildVarRefExp(arrayVars[i].in_data)));
-        CG_roseRepr* size_exp = new CG_roseRepr(arrayVars[i].size_expr);
-        cuda_copy_in_args->append_expression(
-          static_cast<CG_roseRepr*>(size_exp->clone())->GetExpression());
-        cuda_copy_in_args->append_expression(
-          buildOpaqueVarRefExp("cudaMemcpyHostToDevice", globals));
-      
-//                                      cuda_copy_in_args->append_expression(
-//                                              new SgVarRefExp(sourceLocation, )
-//                                      );
-        SgFunctionCallExp * cuda_copy_in_func_call = buildFunctionCallExp(
-          buildFunctionRefExp(decl_cuda_copy), cuda_copy_in_args);
-      
-        SgExprStatement* stmt = buildExprStatement(cuda_copy_in_func_call);
-      
-        SgStatementPtrList *tnl = new SgStatementPtrList;
-        (*tnl).push_back(stmt);
-        setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
-      
-        if(arrayVars[i].tex_mapped) {
-          setupTexmappingVar(texture, &arrayVars[i], globals, i, dvs, symtab);
-          SgStatementPtrList *tnl = new SgStatementPtrList;
-          cudaBindTexture(texture, &arrayVars[i], globals, tnl);
-          setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
-        }
-      }
-    }
-  }
-  
-  //Build dimGrid dim3 variables based on loop dimentions and ti/tj
-  char blockD1[120];
-  char blockD2[120];
-  if (dim1) {
-    snprintf(blockD1, 120, "%s/%d",
-             dim1->get_declaration()->get_name().getString().c_str(), cu_tx);
-    snprintf(blockD2, 120, "%s/%d",
-             dim2->get_declaration()->get_name().getString().c_str(), cu_ty);
-  } else {
-    snprintf(blockD1, 120, "%d", cu_bx);
-    snprintf(blockD2, 120, "%d", cu_by);
-    //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx);
-    //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty);
-  }
-  
-  SgInitializedName* arg1 = buildInitializedName("i", buildIntType());
-  SgInitializedName* arg2 = buildInitializedName("j", buildIntType());
-  SgInitializedName* arg3 = buildInitializedName("k", buildIntType());
-  SgName type_name("dim3");
-  //SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(type_name);
-  
-  //ROSE_ASSERT(type_symbol != NULL);
-  
-  //SgClassDeclaration * dim3classdecl = isSgClassDeclaration(
-  //        type_symbol->get_declaration());
-  
-  SgFunctionDeclaration * funcdecl = buildNondefiningFunctionDeclaration(
-    SgName("dim3"), buildOpaqueType("dim3", globalScope),
-    //isSgType(dim3classdecl->get_type()),
-    buildFunctionParameterList(arg1, arg2, arg3), globalScope);
-  
-  if (cu_bx && cu_by)
-    repr = ocg->CreateDim3((const char*) gridName, ocg->CreateInt(cu_bx),
-                           ocg->CreateInt(cu_by));
-  else if (cu_bx_repr && cu_by_repr)
-    repr = ocg->CreateDim3((const char*) gridName, cu_bx_repr, cu_by_repr);
-  else if (cu_bx_repr)
-    repr = ocg->CreateDim3((const char*) gridName, cu_bx_repr,
-                           ocg->CreateInt(1));
-  setup_code = ocg->StmtListAppend(setup_code, repr);
-  //SgStatementPtrList* dimList = static_cast<CG_roseRepr *>(repr)->GetList();
-  
-  //for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++)
-  //    (*replacement_list).push_back (*it);
-  
-  //  repr = ocg->CreateDim3((const char*)blockName, cu_tx,cu_ty);
-  
-  if (cu_tz > 1 || cu_tz_repr) {
-    
-    if (cu_tx && cu_ty && cu_tz)
-      repr = ocg->CreateDim3((char*) blockName, ocg->CreateInt(cu_tx),
-                             ocg->CreateInt(cu_ty), ocg->CreateInt(cu_tz));
-    else if (cu_tx_repr && cu_ty_repr && cu_tz_repr)
-      repr = ocg->CreateDim3((char*) blockName, cu_tx_repr, cu_ty_repr,
-                             cu_tz_repr);
-    // SgStatementPtrList* dimList = static_cast<CG_roseRepr *>(repr)->GetList();
-    
-    // for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++)
-    //    (*replacement_list).push_back (*it);
-    
-  } else {
-    if (cu_tx && cu_ty)
-      repr = ocg->CreateDim3((char*) blockName, ocg->CreateInt(cu_tx),
-                             ocg->CreateInt(cu_ty));
-    else if (cu_tx_repr && cu_ty_repr)
-      repr = ocg->CreateDim3((char*) blockName, cu_tx_repr, cu_ty_repr);
-    //SgStatementPtrList* dimList = static_cast<CG_roseRepr *>(repr)->GetList();
-    
-    //for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++)
-    //   (*replacement_list).push_back (*it);
-    
-  }
-  
-  setup_code = ocg->StmtListAppend(setup_code, repr);
-  
-  SgCudaKernelExecConfig* config = new SgCudaKernelExecConfig(
-    buildVarRefExp(gridName), buildVarRefExp(blockName), NULL, NULL);
-  //SgCudaKernelExecConfig* config = new SgCudaKernelExecConfig(buildIntVal(cu_bx), , NULL, NULL);
-  SgExprListExp* iml = new SgExprListExp();
-  SgCastExp* dim_s;
-  
-  //Creating Kernel function
-  SgBasicBlock* bb = new SgBasicBlock(TRANSFORMATION_FILE_INFO);
-  SgFunctionDefinition* kernel_defn = new SgFunctionDefinition(
-    TRANSFORMATION_FILE_INFO, bb);
-  SgFunctionDeclaration* kernel_decl_ = new SgFunctionDeclaration(
-    TRANSFORMATION_FILE_INFO, SgName((char*)cu_kernel_name.c_str()),buildFunctionType(buildVoidType(), buildFunctionParameterList()), kernel_defn);
-  SgFunctionDeclaration* kernel_decl = new SgFunctionDeclaration(
-    TRANSFORMATION_FILE_INFO, SgName((char*)cu_kernel_name.c_str()),buildFunctionType(buildVoidType(), buildFunctionParameterList()), kernel_defn);
-  
-  //((kernel_decl->get_declarationModifier()).get_storageModifier()).setStatic();
-  
-  kernel_decl->set_definingDeclaration(kernel_decl);
-  kernel_defn->set_parent(kernel_decl);
-  bb->set_parent(kernel_defn);
-  bb->set_endOfConstruct(TRANSFORMATION_FILE_INFO);
-  bb->get_endOfConstruct()->set_parent(bb);
-  
-  //SgFunctionSymbol* functionSymbol = new SgFunctionSymbol(kernel_decl_);
-  //globals->insert_symbol(SgName((char*) cu_kernel_name.c_str()),
-  //            functionSymbol);
-  SgFunctionSymbol* functionSymbol2 = new SgFunctionSymbol(kernel_decl);
-  
-  globals->insert_symbol(SgName((char*) cu_kernel_name.c_str()),
-                         functionSymbol2);
-  
-  kernel_decl_->set_parent(globals);
-  
-  kernel_decl_->set_scope(globals);
-  
-  kernel_decl_->setForward();
-  
-  globals->prepend_declaration(kernel_decl_);
-  
-  kernel_decl->set_endOfConstruct(TRANSFORMATION_FILE_INFO);
-  kernel_decl->get_endOfConstruct()->set_parent(kernel_decl);
-  
-  kernel_decl->set_parent(globals);
-  kernel_decl->set_scope(globals);
-  
-  kernel_decl->get_definition()->set_endOfConstruct(TRANSFORMATION_FILE_INFO);
-  kernel_decl->get_definition()->get_endOfConstruct()->set_parent(
-    kernel_decl->get_definition());
-  
-  globals->append_statement(kernel_decl);
-  
-  //printf("%s %s\n", static_cast<const char*>(cu_kernel_name), dims);
-  //--derick - kernel function parameters  
-  for (int i = 0; i < arrayVars.size(); i++)
-    //Throw in a type cast if our kernel takes 2D array notation
-    //like (float(*) [1024])
-  {
-    //protonu--throwing in another hack to stop the caller from passing tex mapped
-    //vars to the kernel.
-    if (arrayVars[i].tex_mapped == true || arrayVars[i].cons_mapped)
-      continue;
-    if (!(arrayVars[i].size_multi_dim.empty())) {
-      //snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d,
-      //         const_cast<char*>(arrayVars[i].name.c_str()));
-      
-      SgType* t = arrayVars[i].type;
-      for (int k = arrayVars[i].size_multi_dim.size() - 1; k >= 0; k--) {
-        t = buildArrayType(t,
-                           buildIntVal(arrayVars[i].size_multi_dim[k]));
-      }
-      SgVariableSymbol* temp = body_symtab->find_variable(
-        SgName((char*) arrayVars[i].name.c_str()));
-      if (temp == NULL)
-        temp = parameter_symtab->find_variable(
-          SgName((char*) arrayVars[i].name.c_str()));
-      
-      dim_s = buildCastExp(buildVarRefExp(temp), buildPointerType(t),
-                           SgCastExp::e_C_style_cast);
-      
-      //printf("%d %s\n", i, dims);
-      iml->append_expression(dim_s);
-      
-      SgInitializedName* id = buildInitializedName(
-        (char*) arrayVars[i].original_name.c_str(),
-        buildPointerType(t));
-      kernel_decl->get_parameterList()->append_arg(id);
-      kernel_decl_->get_parameterList()->append_arg(id);
-      id->set_file_info(TRANSFORMATION_FILE_INFO);
-      
-      // DQ (9/8/2007): We now test this, so it has to be set explicitly.
-      id->set_scope(kernel_decl->get_definition());
-      
-      // DQ (9/8/2007): Need to add variable symbol to global scope!
-      //printf ("Fixing up the symbol table in scope = %p = %s for SgInitializedName = %p = %s \n",globalScope,globalScope->class_name().c_str(),var1_init_name,var1_init_name->get_name().str());
-      SgVariableSymbol *var_symbol = new SgVariableSymbol(id);
-      kernel_decl->get_definition()->insert_symbol(id->get_name(),
-                                                   var_symbol);
-      
-      // if(kernel_decl->get_definition()->get_symbol_table()->find((const) id) == NULL)
-      
-    } else {
-      //printf("%d %s\n", i, static_cast<const char*>(arrayVars[i].name));
-      SgVariableSymbol* temp = body_symtab->find_variable(
-        SgName((char*) arrayVars[i].name.c_str()));
-      if (temp == NULL)
-        temp = parameter_symtab->find_variable(
-          SgName((char*) arrayVars[i].name.c_str()));
-      iml->append_expression(buildVarRefExp(temp));
-      SgInitializedName* id = buildInitializedName(
-        (char*) arrayVars[i].original_name.c_str(),
-        buildPointerType(arrayVars[i].type));
-      kernel_decl->get_parameterList()->append_arg(id);
-      kernel_decl_->get_parameterList()->append_arg(id);
-      id->set_file_info(TRANSFORMATION_FILE_INFO);
-      
-      // DQ (9/8/2007): We now test this, so it has to be set explicitly.
-      id->set_scope(kernel_decl->get_definition());
-      
-      // DQ (9/8/2007): Need to add variable symbol to global scope!
-      //printf ("Fixing up the symbol table in scope = %p = %s for SgInitializedName = %p = %s \n"$
-      SgVariableSymbol *var_symbol = new SgVariableSymbol(id);
-      kernel_decl->get_definition()->insert_symbol(id->get_name(),
-                                                   var_symbol);
-      
-    }
-    
-  }
-  if (dim1) {
-    iml->append_expression(buildVarRefExp(dim1));
-    SgInitializedName* id = buildInitializedName(
-      dim1->get_name().getString().c_str(), dim1->get_type());
-    kernel_decl->get_parameterList()->append_arg(id);
-    
-    iml->append_expression(buildVarRefExp(dim2));
-    SgInitializedName* id2 = buildInitializedName(
-      dim2->get_name().getString().c_str(), dim2->get_type());
-    
-    kernel_decl->get_parameterList()->append_arg(id);
-    kernel_decl_->get_parameterList()->append_arg(id);
-  }
-  
-  kernel_decl->get_functionModifier().setCudaKernel();
-  kernel_decl_->get_functionModifier().setCudaKernel();
-  SgCudaKernelCallExp * cuda_call_site = new SgCudaKernelCallExp(
-    TRANSFORMATION_FILE_INFO, buildFunctionRefExp(kernel_decl), iml,config);
-  
-  //  SgStatementPtrList *tnl2 = new SgStatementPtrList;
-  
-  (*replacement_list).push_back(buildExprStatement(cuda_call_site));
-  
-  setup_code = ocg->StmtListAppend(setup_code,
-                                   new CG_roseRepr(replacement_list));
-  
-  //cuda free variables
-  for (int i = 0; i < arrayVars.size(); i++) {
-    if (arrayVars[i].out_data) {
-      
-      SgName name_cuda_copy("cudaMemcpy");
-      SgFunctionDeclaration * decl_cuda_copyout =
-        buildNondefiningFunctionDeclaration(name_cuda_copy,
-                                            buildVoidType(), buildFunctionParameterList(),
-                                            globals);
-      
-      SgExprListExp* args = buildExprListExp();
-      SgExprListExp * cuda_copy_out_args = buildExprListExp();
-      cuda_copy_out_args->append_expression(
-        isSgExpression(buildVarRefExp(arrayVars[i].out_data)));
-      cuda_copy_out_args->append_expression(
-        isSgExpression(buildVarRefExp(arrayVars[i].name)));
-      CG_roseRepr* size_exp = new CG_roseRepr(arrayVars[i].size_expr);
-      cuda_copy_out_args->append_expression(
-        static_cast<CG_roseRepr*>(size_exp->clone())->GetExpression());
-      cuda_copy_out_args->append_expression(
-        buildOpaqueVarRefExp("cudaMemcpyDeviceToHost", globals));
-      
-//                                      cuda_copy_in_args->append_expression(
-//                                              new SgVarRefExp(sourceLocation, )
-//                                      );
-      SgFunctionCallExp * cuda_copy_out_func_call = buildFunctionCallExp(
-        buildFunctionRefExp(decl_cuda_copyout), cuda_copy_out_args);
-      
-      SgFunctionCallExp *the_call = buildFunctionCallExp(
-        buildFunctionRefExp(decl_cuda_copyout), cuda_copy_out_args);
-      
-      SgExprStatement* stmt = buildExprStatement(the_call);
-      
-      SgStatementPtrList* tnl3 = new SgStatementPtrList;
-      
-      (*tnl3).push_back(stmt);
-      
-      //   tree_node_list* tnl = new tree_node_list;
-      //   tnl->append(new tree_instr(the_call));
-      setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl3));
-      
-    }
-    if(!arrayVars[i].cons_mapped) {
-      SgName name_cuda_free("cudaFree");
-      SgFunctionDeclaration * decl_cuda_free =
-        buildNondefiningFunctionDeclaration(name_cuda_free,
-                                            buildVoidType(), buildFunctionParameterList(), globals);
-      
-      SgExprListExp* args3 = buildExprListExp();
-      
-      SgVariableSymbol* tmp = body_symtab->find_variable(
-        SgName(arrayVars[i].name.c_str()));
-      if (tmp == NULL)
-        tmp = parameter_symtab->find_variable(
-          SgName(arrayVars[i].name.c_str()));
-      
-      args3->append_expression(buildVarRefExp(tmp));
-      
-      SgFunctionCallExp *the_call2 = buildFunctionCallExp(
-        buildFunctionRefExp(decl_cuda_free), args3);
-      
-      SgExprStatement* stmt2 = buildExprStatement(the_call2);
-      
-      SgStatementPtrList* tnl4 = new SgStatementPtrList;
-      
-      (*tnl4).push_back(stmt2);
-      //(*replacement_list).push_back (stmt2);
-    
-      setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl4));
-    }
-  }
-  
-  // ---------------
-  // BUILD THE KERNEL
-  // ---------------
-  
-  //Extract out kernel body
-  SgNode* code = getCode();
-  //Create kernel function body
-  //Add Params
-  std::map<std::string, SgVariableSymbol*> loop_vars;
-  //In-Out arrays
-  for (int i = 0; i < arrayVars.size(); i++) {
-    /*   if(arrayVars[i].in_data)
-         fptr = arrayVars[i].in_data->type()->clone();
-         else
-         fptr = arrayVars[i].out_data->type()->clone();
-    */
-    
-    // fptr = new_proc_syms->install_type(fptr);
-    std::string name =
-      arrayVars[i].in_data ?
-      arrayVars[i].in_data->get_declaration()->get_name().getString() :
-      arrayVars[i].out_data->get_declaration()->get_name().getString();
-    //SgVariableSymbol* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name());
-    
-    SgVariableSymbol* sym =
-      kernel_decl->get_definition()->get_symbol_table()->find_variable(
-        (const char*) name.c_str());
-    /* SgVariableDeclaration*  defn = buildVariableDeclaration(SgName(name.c_str()), buildFloatType());
-       SgInitializedNamePtrList& variables = defn->get_variables();
-       SgInitializedNamePtrList::const_iterator i = variables.begin();
-       SgInitializedName* initializedName = *i;
-       SgVariableSymbol* sym = new SgVariableSymbol(initializedName);
-       prependStatement(defn, isSgScopeStatement(root_));
-       
-       vs->set_parent(symtab2_);
-       symtab2_->insert(SgName(_s.c_str()), vs);
-    */
-    
-    if (sym != NULL)
-      loop_vars.insert(
-        std::pair<std::string, SgVariableSymbol*>(std::string(name),
-                                                  sym));
-  }
-  
-  //Figure out which loop variables will be our thread and block dimention variables
-  std::vector<SgVariableSymbol *> loop_syms;
-  //Get our indexes
-  std::vector<const char*> indexes; // = get_loop_indexes(code,cu_num_reduce);
-  int threadsPos = 0;
-  
-  CG_outputRepr *body = NULL;
-  SgFunctionDefinition* func_d = func_definition;
-  //std::vector<SgVariableSymbol *> symbols =  recursiveFindRefs(code);
-  
-  SgName name_sync("__syncthreads");
-  SgFunctionDeclaration * decl_sync = buildNondefiningFunctionDeclaration(
-    name_sync, buildVoidType(), buildFunctionParameterList(),
-    globalScope);
-  
-  recursiveFindRefs(code, syms, func_d);
-  
-  //SgFunctionDeclaration* func = Outliner::generateFunction (code, (char*)cu_kernel_name.c_str(), syms, pdSyms, psyms, NULL, globalScope);
-  
-  if (cu_bx > 1 || cu_bx_repr) {
-    indexes.push_back("bx");
-    SgName type_name("blockIdx.x");
-    SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
-      type_name);
-    SgVariableDeclaration * var_decl = buildVariableDeclaration("bx",
-                                                                buildIntType(), NULL,
-                                                                isSgScopeStatement(kernel_decl->get_definition()->get_body()));
-    SgStatementPtrList *tnl = new SgStatementPtrList;
-    // (*tnl).push_back(isSgStatement(var_decl));
-    appendStatement(var_decl, kernel_decl->get_definition()->get_body());
-    
-    SgVariableSymbol* bx =
-      kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
-        SgName("bx"));
-    SgStatement* assign = isSgStatement(
-      buildAssignStatement(buildVarRefExp(bx),
-                           buildOpaqueVarRefExp("blockIdx.x",
-                                                kernel_decl->get_definition()->get_body())));
-    (*tnl).push_back(assign);
-    // body = ocg->StmtListAppend(body,
-    //                                  new CG_roseRepr(tnl));
-    appendStatement(assign, kernel_decl->get_definition()->get_body());
-    
-  }
-  if (cu_by > 1 || cu_by_repr) {
-    indexes.push_back("by");
-    SgName type_name("blockIdx.y");
-    SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
-      type_name);
-    SgVariableDeclaration * var_decl = buildVariableDeclaration("by",
-                                                                buildIntType(), NULL,
-                                                                isSgScopeStatement(kernel_decl->get_definition()->get_body()));
-    // SgStatementPtrList *tnl = new SgStatementPtrList;
-    // (*tnl).push_back(isSgStatement(var_decl));
-    appendStatement(var_decl, kernel_decl->get_definition()->get_body());
-    
-    SgVariableSymbol* by =
-      kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
-        SgName("by"));
-    SgStatement* assign = isSgStatement(
-      buildAssignStatement(buildVarRefExp(by),
-                           buildOpaqueVarRefExp("blockIdx.y",
-                                                kernel_decl->get_definition()->get_body())));
-    //(*tnl).push_back(assign);
-    // body = ocg->StmtListAppend(body,
-    //                                  new CG_roseRepr(tnl));
-    appendStatement(assign, kernel_decl->get_definition()->get_body());
-    
-  }
-  if (cu_tx_repr || cu_tx > 1) {
-    threadsPos = indexes.size();
-    indexes.push_back("tx");
-    SgName type_name("threadIdx.x");
-    SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
-      type_name);
-    SgVariableDeclaration * var_decl = buildVariableDeclaration("tx",
-                                                                buildIntType(), NULL,
-                                                                isSgScopeStatement(kernel_decl->get_definition()->get_body()));
-    //  SgStatementPtrList *tnl = new SgStatementPtrList;
-    //  (*tnl).push_back(isSgStatement(var_decl));
-    appendStatement(var_decl, kernel_decl->get_definition()->get_body());
-    
-    SgVariableSymbol* tx =
-      kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
-        SgName("tx"));
-    SgStatement* assign = isSgStatement(
-      buildAssignStatement(buildVarRefExp(tx),
-                           buildOpaqueVarRefExp("threadIdx.x",
-                                                kernel_decl->get_definition()->get_body())));
-    //(*tnl).push_back(assign);
-    // body = ocg->StmtListAppend(body,
-    //                                  new CG_roseRepr(tnl));
-    appendStatement(assign, kernel_decl->get_definition()->get_body());
-    
-  }
-  if (cu_ty_repr || cu_ty > 1) {
-    indexes.push_back("ty");
-    SgName type_name("threadIdx.y");
-    SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
-      type_name);
-    SgVariableDeclaration * var_decl = buildVariableDeclaration("ty",
-                                                                buildIntType(), NULL,
-                                                                isSgScopeStatement(kernel_decl->get_definition()->get_body()));
-    appendStatement(var_decl, kernel_decl->get_definition()->get_body());
-    
-    // SgStatementPtrList *tnl = new SgStatementPtrList;
-    // (*tnl).push_back(isSgStatement(var_decl));
-    SgVariableSymbol* ty =
-      kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
-        SgName("ty"));
-    SgStatement* assign = isSgStatement(
-      buildAssignStatement(buildVarRefExp(ty),
-                           buildOpaqueVarRefExp("threadIdx.y",
-                                                kernel_decl->get_definition()->get_body())));
-    // (*tnl).push_back(assign);
-    //  body = ocg->StmtListAppend(body,
-    //                                   new CG_roseRepr(tnl));
-    appendStatement(assign, kernel_decl->get_definition()->get_body());
-    
-  }
-  if (cu_tz_repr || cu_tz > 1) {
-    indexes.push_back("tz");
-    SgName type_name("threadIdx.z");
-    SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
-      type_name);
-    SgVariableDeclaration * var_decl = buildVariableDeclaration("tz",
-                                                                buildIntType(), NULL,
-                                                                isSgScopeStatement(kernel_decl->get_definition()->get_body()));
-    //   SgStatementPtrList *tnl = new SgStatementPtrList;
-    //   (*tnl).push_back(isSgStatement(var_decl));
-    appendStatement(var_decl, kernel_decl->get_definition()->get_body());
-    
-    SgVariableSymbol* tz =
-      kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
-        "tz");
-    SgStatement* assign = isSgStatement(
-      buildAssignStatement(buildVarRefExp(tz),
-                           buildOpaqueVarRefExp("threadIdx.z",
-                                                kernel_decl->get_definition()->get_body())));
-    //    (*tnl).push_back(assign);
-    //     body = ocg->StmtListAppend(body,
-    //                                      new CG_roseRepr(tnl));
-    appendStatement(assign, kernel_decl->get_definition()->get_body());
-    
-  }
-  
-  std::map<std::string, SgVariableSymbol*> loop_idxs; //map from idx names to their new syms
-  
-  SgNode* swapped_ = swapVarReferences(code, syms,
-                                       kernel_decl->get_definition()->get_symbol_table(),
-                                       kernel_decl->get_definition()->get_body()->get_symbol_table(),
-                                       kernel_decl->get_definition()->get_body());
-  
-  //std::cout << swapped_->unparseToString() << std::endl << std::endl;
-  
-  SgNode *swapped = recursiveFindReplacePreferedIdxs(swapped_,
-                                                     kernel_decl->get_definition()->get_body()->get_symbol_table(),
-                                                     kernel_decl->get_definition()->get_symbol_table(),
-                                                     kernel_decl->get_definition()->get_body(), loop_idxs, globalScope); //in-place swapping
-  //swapped->print();
-  
-  if (!isSgBasicBlock(swapped)) {
-    appendStatement(isSgStatement(swapped),
-                    kernel_decl->get_definition()->get_body());
-    swapped->set_parent(
-      isSgNode(kernel_decl->get_definition()->get_body()));
-  } else {
-    
-    for (SgStatementPtrList::iterator it =
-           isSgBasicBlock(swapped)->get_statements().begin();
-         it != isSgBasicBlock(swapped)->get_statements().end(); it++) {
-      appendStatement(*it, kernel_decl->get_definition()->get_body());
-      (*it)->set_parent(
-        isSgNode(kernel_decl->get_definition()->get_body()));
-      
-    }
-    
-  }
-  
-  for (int i = 0; i < indexes.size(); i++) {
-    std::vector<SgForStatement*> tfs = findCommentedFors(indexes[i],
-                                                         swapped);
-    for (int k = 0; k < tfs.size(); k++) {
-      //printf("replacing %p tfs for index %s\n", tfs[k], indexes[i]);
-      SgNode* newBlock = forReduce(tfs[k], loop_idxs[indexes[i]],
-                                   kernel_decl->get_definition());
-      //newBlock->print();
-      swap_node_for_node_list(tfs[k], newBlock);
-      //printf("AFTER SWAP\n");        newBlock->print();
-    }
-  }
-
-  //--derick replace array refs of texture mapped vars here
-  body = new CG_roseRepr(kernel_decl->get_definition()->get_body());
-  std::vector<IR_ArrayRef*> refs = ir->FindArrayRef(body);
-  texmapArrayRefs(texture, &refs, globals, ir, ocg);
-  // do the same for constant mapped vars
-  consmapArrayRefs(constant_mem, &refs, globals, ir, ocg);
-  
-  return swapped;
-}
-
-//Order taking out dummy variables
-std::vector<std::string> cleanOrder(std::vector<std::string> idxNames) {
-  std::vector<std::string> results;
-  for (int j = 0; j < idxNames.size(); j++) {
-    if (idxNames[j].length() != 0)
-      results.push_back(idxNames[j]);
-  }
-  return results;
-}
-
-//First non-dummy level in ascending order
-int LoopCuda::nonDummyLevel(int stmt, int level) {
-  //level comes in 1-basd and should leave 1-based
-  for (int j = level - 1; j < idxNames[stmt].size(); j++) {
-    if (idxNames[stmt][j].length() != 0) {
-      //printf("found non dummy level of %d with idx: %s when searching for %d\n", j+1, (const char*) idxNames[stmt][j], level);
-      return j + 1;
-    }
-  }
-  char buf[128];
-  sprintf(buf, "%d", level);
-  throw std::runtime_error(
-    std::string("Unable to find a non-dummy level starting from ")
-    + std::string(buf));
-}
-
-int LoopCuda::findCurLevel(int stmt, std::string idx) {
-  for (int j = 0; j < idxNames[stmt].size(); j++) {
-    if (strcmp(idxNames[stmt][j].c_str(), idx.c_str()) == 0)
-      return j + 1;
-  }
-  throw std::runtime_error(
-    std::string("Unable to find index ") + idx
-    + std::string(" in current list of indexes"));
-}
-
-void LoopCuda::permute_cuda(int stmt,
-                            const std::vector<std::string>& curOrder) {
-  //printf("curOrder: ");
-  //printVs(curOrder);
-  //printf("idxNames: ");
-  //printVS(idxNames[stmt]);
-  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt]);
-  bool same = true;
-  std::vector<int> pi;
-  for (int i = 0; i < curOrder.size(); i++) {
-    bool found = false;
-    for (int j = 0; j < cIdxNames.size(); j++) {
-      if (strcmp(cIdxNames[j].c_str(), curOrder[i].c_str()) == 0) {
-        pi.push_back(j + 1);
-        found = true;
-        if (j != i)
-          same = false;
-      }
-    }
-    if (!found) {
-      throw std::runtime_error(
-        "One of the indexes in the permute order were not "
-        "found in the current set of indexes.");
-    }
-  }
-  for (int i = curOrder.size(); i < cIdxNames.size(); i++) {
-    pi.push_back(i);
-  }
-  if (same)
-    return;
-  permute(stmt, pi);
-  //Set old indexe names as new
-  for (int i = 0; i < curOrder.size(); i++) {
-    idxNames[stmt][i] = curOrder[i].c_str(); //what about sibling stmts?
-  }
-}
-
-bool LoopCuda::permute(int stmt_num, const std::vector<int> &pi) {
-// check for sanity of parameters
-  if (stmt_num >= stmt.size() || stmt_num < 0)
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  const int n = stmt[stmt_num].xform.n_out();
-  if (pi.size() > (n - 1) / 2)
-    throw std::invalid_argument(
-      "iteration space dimensionality does not match permute dimensionality");
-  int first_level = 0;
-  int last_level = 0;
-  for (int i = 0; i < pi.size(); i++) {
-    if (pi[i] > (n - 1) / 2 || pi[i] <= 0)
-      throw std::invalid_argument(
-        "invalid loop level " + to_string(pi[i])
-        + " in permuation");
-    
-    if (pi[i] != i + 1) {
-      if (first_level == 0)
-        first_level = i + 1;
-      last_level = i + 1;
-    }
-  }
-  if (first_level == 0)
-    return true;
-  
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> active = getStatements(lex, 2 * first_level - 2);
-  Loop::permute(active, pi);
-}
-
-void LoopCuda::tile_cuda(int stmt, int level, int outer_level) {
-  tile_cuda(stmt, level, 1, outer_level, "", "", CountedTile);
-}
-void LoopCuda::tile_cuda(int level, int tile_size, int outer_level,
-                         std::string idxName, std::string ctrlName, TilingMethodType method) {
-  tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method);
-}
-
-void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level,
-                         std::string idxName, std::string ctrlName, TilingMethodType method) {
-  //Do regular tile but then update the index and control loop variable
-  //names as well as the idxName to reflect the current state of things.
-  //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level);
-  //printf("idxNames before: ");
-  //printVS(idxNames[stmt]);
-  
-  tile(stmt, level, tile_size, outer_level, method);
-  
-  if (idxName.size())
-    idxNames[stmt][level - 1] = idxName.c_str();
-  if (tile_size == 1) {
-    //potentially rearrange loops
-    if (outer_level < level) {
-      std::string tmp = idxNames[stmt][level - 1];
-      for (int i = level - 1; i > outer_level - 1; i--) {
-        if (i - 1 >= 0)
-          idxNames[stmt][i] = idxNames[stmt][i - 1];
-      }
-      idxNames[stmt][outer_level - 1] = tmp;
-    }
-    //TODO: even with a tile size of one, you need a insert (of a dummy loop)
-    idxNames[stmt].insert(idxNames[stmt].begin() + (level), "");
-  } else {
-    if (!ctrlName.size())
-      throw std::runtime_error("No ctrl loop name for tile");
-    //insert
-    idxNames[stmt].insert(idxNames[stmt].begin() + (outer_level - 1),
-                          ctrlName.c_str());
-  }
-  
-  //printf("idxNames after: ");
-  //printVS(idxNames[stmt]);
-}
-
-bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level,
-                                        const std::string &array_name,
-                                        const std::vector<int> &privatized_levels, bool allow_extra_read,
-                                        int fastest_changing_dimension, int padding_stride,
-                                        int padding_alignment, bool cuda_shared) {
-  int old_stmts = stmt.size();
-  //  printf("before datacopy_privatized:\n");
-  printIS();
-  //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared);
-  if (cuda_shared)
-    datacopy_privatized(stmt_num, level, array_name, privatized_levels,
-                        allow_extra_read, fastest_changing_dimension, padding_stride,
-                        padding_alignment, 1);
-  else
-    datacopy_privatized(stmt_num, level, array_name, privatized_levels,
-                        allow_extra_read, fastest_changing_dimension, padding_stride,
-                        padding_alignment, 0);
-  //  printf("after datacopy_privatized:\n");
-  printIS();
-  
-  //Adjust idxNames to reflect updated state
-  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
-  int new_stmts = stmt.size();
-  for (int i = old_stmts; i < new_stmts; i++) {
-    //printf("fixing up statement %d\n", i);
-    std::vector<std::string> idxs;
-    
-    //protonu-making sure the vector of nonSplitLevels grows along with
-    //the statement structure
-    stmt_nonSplitLevels.push_back(std::vector<int>());
-    
-    //Indexes up to level will be the same
-    for (int j = 0; j < level - 1; j++)
-      idxs.push_back(cIdxNames[j]);
-    
-    //Expect privatized_levels to match
-    for (int j = 0; j < privatized_levels.size(); j++)
-      idxs.push_back(cIdxNames[privatized_levels[j] - 1]);//level is one-based
-    
-    //all further levels should match order they are in originally
-    if (privatized_levels.size()) {
-      int last_privatized = privatized_levels.back();
-      int top_level = last_privatized
-        + (stmt[i].IS.n_set() - idxs.size());
-      //printf("last privatized_levels: %d top_level: %d\n", last_privatized, top_level);
-      for (int j = last_privatized; j < top_level; j++) {
-        idxs.push_back(cIdxNames[j]);
-        //printf("pushing back: %s\n", (const char*)cIdxNames[j]);
-      }
-    }
-    idxNames.push_back(idxs);
-  }
-}
-
-bool LoopCuda::datacopy_cuda(int stmt_num, int level,
-                             const std::string &array_name, 
-                             const std::vector<std::string> new_idxs,
-                             bool allow_extra_read, int fastest_changing_dimension,
-                             int padding_stride, int padding_alignment, bool cuda_shared) {
-  
-  int old_stmts = stmt.size();
-  //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared);
-  //  printf("before datacopy:\n");
-  //  printIS();
-  if (cuda_shared)
-    datacopy(stmt_num, level, array_name, allow_extra_read,
-             fastest_changing_dimension, padding_stride, padding_alignment,
-             1);
-  else
-    datacopy(stmt_num, level, array_name, allow_extra_read,
-             fastest_changing_dimension, padding_stride, padding_alignment,
-             0);
-  //  printf("after datacopy:\n");
-  printIS();
-
-  //Adjust idxNames to reflect updated state
-  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
-  int new_stmts = stmt.size();
-  for (int i = old_stmts; i < new_stmts; i++) {
-    //printf("fixing up statement %d\n", i);
-    std::vector<std::string> idxs;
-    
-    //protonu-making sure the vector of nonSplitLevels grows along with
-    //the statement structure
-    stmt_nonSplitLevels.push_back(std::vector<int>());
-    
-    //Indexes up to level will be the same
-    for (int j = 0; j < level - 1; j++)
-      idxs.push_back(cIdxNames[j]);
-    
-    //all further levels should get names from new_idxs
-    int top_level = stmt[i].IS.n_set();
-    //printf("top_level: %d level: %d\n", top_level, level);
-    if (new_idxs.size() < top_level - level + 1)
-      throw std::runtime_error(
-        "Need more new index names for new datacopy loop levels");
-    
-    for (int j = level - 1; j < top_level; j++) {
-      idxs.push_back(new_idxs[j - level + 1].c_str());
-      //printf("pushing back: %s\n", new_idxs[j-level+1].c_str());
-    }
-    idxNames.push_back(idxs);
-  }
-}
-
-bool LoopCuda::unroll_cuda(int stmt_num, int level, int unroll_amount) {
-  int old_stmts = stmt.size();
-  //bool b= unroll(stmt_num, , unroll_amount);
-  
-  int dim = 2 * level - 1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_loop = getStatements(lex, dim - 1);
-  
-  level = nonDummyLevel(stmt_num, level);
-  //printf("unrolling %d at level %d\n", stmt_num,level);
-  
-  //protonu--using the new version of unroll, which returns
-  //a set of ints instead of a bool. To keep Gabe's logic
-  //I'll check the size of the set, if it's 0 return true
-  //bool b= unroll(stmt_num, level, unroll_amount);
-  std::set<int> b_set = unroll(stmt_num, level, unroll_amount, idxNames);
-  bool b = false;
-  if (b_set.size() == 0)
-    b = true;
-  //end--protonu
-  
-  //Adjust idxNames to reflect updated state
-  std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
-  std::vector<std::string> origSource = idxNames[stmt_num];
-  ;
-  //Drop index names at level
-  if (unroll_amount == 0) {
-    //For all statements that were in this unroll together, drop index name for unrolled level
-    idxNames[stmt_num][level - 1] = "";
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++) {
-      //printf("in same loop as %d is %d\n", stmt_num, (*i));
-      //idxNames[(*i)][level-1] = "";
-      idxNames[(*i)] = idxNames[stmt_num];
-    }
-  }
-  
-  lex = getLexicalOrder(stmt_num);
-  same_loop = getStatements(lex, dim - 1);
-  
-  bool same_as_source = false;
-  int new_stmts = stmt.size();
-  for (int i = old_stmts; i < new_stmts; i++) {
-    //Check whether we had a sync for the statement we are unrolling, if
-    //so, propogate that to newly created statements so that if they are
-    //in a different loop structure, they will also get a syncthreads
-    int size = syncs.size();
-    for (int j = 0; j < size; j++) {
-      if (syncs[j].first == stmt_num)
-        syncs.push_back(make_pair(i, syncs[j].second));
-    }
-    
-    //protonu-making sure the vector of nonSplitLevels grows along with
-    //the statement structure
-    stmt_nonSplitLevels.push_back(std::vector<int>());
-    
-    //We expect that new statements have a constant for the variable in
-    //stmt[i].IS at level (as seen with print_with_subs), otherwise there
-    //will be a for loop at level and idxNames should match stmt's
-    //idxNames pre-unrolled
-    Relation IS = stmt[i].IS;
-    //Ok, if you know how the hell to get anything out of a Relation, you
-    //should probably be able to do this more elegantly. But for now, I'm
-    //hacking it.
-    std::string s = IS.print_with_subs_to_string();
-    //s looks looks like
-    //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128}
-    //where level == 5, you see a integer in the input set
-    
-    //If that's not an integer and this is the first new statement, then
-    //we think codegen will have a loop at that level. It's not perfect,
-    //not sure if it can be determined without round-tripping to codegen.
-    int sIdx = 0;
-    int eIdx = 0;
-    for (int j = 0; j < level - 1; j++) {
-      sIdx = s.find(",", sIdx + 1);
-      if (sIdx < 0)
-        break;
-    }
-    if (sIdx > 0) {
-      eIdx = s.find("]");
-      int tmp = s.find(",", sIdx + 1);
-      if (tmp > 0 && tmp < eIdx)
-        eIdx = tmp; //", before ]"
-      if (eIdx > 0) {
-        sIdx++;
-        std::string var = s.substr(sIdx, eIdx - sIdx);
-        //printf("%s\n", s.c_str());
-        //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str());
-        if (atoi(var.c_str()) == 0 && i == old_stmts) {
-          //TODO:Maybe do see if this new statement would be in the same
-          //group as the original and if it would, don't say
-          //same_as_source
-          if (same_loop.find(i) == same_loop.end()) {
-            printf(
-              "stmt %d level %d, newly created unroll statement should have same level indexes as source\n",
-              i, level);
-            same_as_source = true;
-          }
-        }
-      }
-    }
-    
-    //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1);
-    if (same_as_source)
-      idxNames.push_back(origSource);
-    else
-      idxNames.push_back(idxNames[stmt_num]);
-  }
-  
-  return b;
-}
-
-void LoopCuda::copy_to_texture(const char *array_name) {
-  //protonu--placeholder for now
-  //set the bool for using cuda memory as true
-  //in a vector of strings, put the names of arrays to tex mapped
-  if (!texture)
-    texture = new texture_memory_mapping(true, array_name);
-  else
-    texture->add(array_name);
-  
-}
-
-//void LoopCuda::copy_to_texture_2d(const char *array_name, int width, int height) {
-//  if (!texture)
-//    texture = new texture_memory_mapping(true, array_name, width, height);
-//  else
-//    texture->add(array_name, width, height);
-//}
-
-void LoopCuda::copy_to_constant(const char *array_name) {
-  if(!constant_mem)
-    constant_mem = new constant_memory_mapping(true, array_name);
-  else
-    constant_mem->add(array_name);
-}
-
-//protonu--moving this from Loop
-SgNode* LoopCuda::codegen() {
-  if (code_gen_flags & GenCudaizeV2)
-    return cudaize_codegen_v2();
-  //Do other flagged codegen methods, return plain vanilla generated code
-  return getCode();
-}
-
-//These three are in Omega code_gen.cc and are used as a massive hack to
-//get out some info from MMGenerateCode. Yea for nasty side-effects.
-namespace omega {
-  extern int checkLoopLevel;
-  extern int stmtForLoopCheck;
-  extern int upperBoundForLevel;
-  extern int lowerBoundForLevel;
-}
-
-CG_outputRepr* LoopCuda::extractCudaUB(int stmt_num, int level,
-                                       int &outUpperBound, int &outLowerBound) {
-  // check for sanity of parameters
-  const int m = stmt.size();
-  if (stmt_num >= m || stmt_num < 0)
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  const int n = stmt[stmt_num].xform.n_out();
-  if (level > (n - 1) / 2 || level <= 0)
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  
-  int dim = 2 * level - 1;
-  
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_loop = getStatements(lex, dim - 1);
-  
-  // extract the intersection of the iteration space to be considered
-  Relation hull;
-  {
-    hull = Relation::True(n);
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++) {
-      Relation r = getNewIS(*i);
-      for (int j = dim + 2; j <= r.n_set(); j++)
-        r = Project(r, r.set_var(j));
-      hull = Intersection(hull, r);
-      hull.simplify(2, 4);
-    }
-    
-    for (int i = 2; i <= dim + 1; i += 2) {
-      //std::string name = std::string("_t") + to_string(t_counter++);
-      std::string name = std::string("_t")
-        + to_string(tmp_loop_var_name_counter++);
-      hull.name_set_var(i, name);
-    }
-    hull.setup_names();
-  }
-  
-  // extract the exact loop bound of the dimension to be unrolled
-  if (is_single_iteration(hull, dim)) {
-    throw std::runtime_error(
-      "No loop availabe at level to extract upper bound.");
-  }
-  Relation bound = get_loop_bound(hull, dim);
-  if (!bound.has_single_conjunct() || !bound.is_satisfiable()
-      || bound.is_tautology())
-    throw loop_error(
-      "loop error: unable to extract loop bound for cudaize");
-  
-  // extract the loop stride
-  EQ_Handle stride_eq;
-  /*int stride = 1;
-    {
-    bool simple_stride = true;
-    int strides = countStrides(bound.query_DNF()->single_conjunct(),
-    bound.set_var(dim + 1), stride_eq, simple_stride);
-    if (strides > 1)
-    throw loop_error("loop error: too many strides");
-    else if (strides == 1) {
-    int sign = stride_eq.get_coef(bound.set_var(dim + 1));
-    //      assert(sign == 1 || sign == -1);
-    Constr_Vars_Iter it(stride_eq, true);
-    stride = abs((*it).coef / sign);
-    }
-    }
-  */
-  int stride = 1;
-  {
-    
-    coef_t stride;
-    std::pair<EQ_Handle, Variable_ID> result = find_simplest_stride(bound,
-                                                                    bound.set_var(dim + 1));
-    if (result.second == NULL)
-      stride = 1;
-    else
-      stride = abs(result.first.get_coef(result.second))
-        / gcd(abs(result.first.get_coef(result.second)),
-              abs(result.first.get_coef(bound.set_var(dim + 1))));
-    
-    if (stride > 1)
-      throw loop_error("loop error: too many strides");
-    /*else if (stride == 1) {
-      int sign = result.first.get_coef(bound.set_var(dim+1));
-      assert(sign == 1 || sign == -1);
-      } */
-  }
-  
-  if (stride != 1) {
-    char buf[1024];
-    sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d",
-            level, stride);
-    throw std::runtime_error(buf);
-  }
-  
-  //Use code generation system to build tell us our bound information. We
-  //need a hard upper bound a 0 lower bound.
-  
-  checkLoopLevel = level * 2;
-  stmtForLoopCheck = stmt_num;
-  upperBoundForLevel = -1;
-  lowerBoundForLevel = -1;
-  printCode(1, false);
-  checkLoopLevel = 0;
-  
-  outUpperBound = upperBoundForLevel;
-  outLowerBound = lowerBoundForLevel;
-  
-  if (outUpperBound == -1) {
-    
-    CG_result* temp = last_compute_cgr_;
-    
-    while (temp) {
-      CG_loop * loop;
-      if (loop = dynamic_cast<CG_loop*>(temp)) {
-        if (loop->level_ == 2 * level) {
-          Relation bound = copy(loop->bounds_);
-          Variable_ID v = bound.set_var(2 * level);
-          for (GEQ_Iterator e(
-                 const_cast<Relation &>(bound).single_conjunct()->GEQs());
-               e; e++) {
-            if ((*e).get_coef(v) < 0
-                && (*e).is_const_except_for_global(v))
-              return output_upper_bound_repr(ir->builder(), *e, v,
-                                             bound,
-                                             std::vector<std::pair<CG_outputRepr *, int> >(
-                                               bound.n_set(),
-                                               std::make_pair(
-                                                 static_cast<CG_outputRepr *>(NULL),
-                                                 0)));
-          }
-        }
-        if (loop->level_ > 2 * level)
-          break;
-        else
-          temp = loop->body_;
-      } else
-        break;
-    }
-  }
-  
-  return NULL;
-}
-
-void LoopCuda::printCode(int effort, bool actuallyPrint) const {
-  const int m = stmt.size();
-  if (m == 0)
-    return;
-  const int n = stmt[0].xform.n_out();
-  
-  /*or (int i = 0; i < m; i++) {
-    IS[i + 1] = stmt[i].IS;
-    xform[i + 1] = stmt[i].xform;
-    
-    //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
-    }
-  */
-  
-  // invalidate saved codegen computation
-  if (last_compute_cgr_ != NULL) {
-    delete last_compute_cgr_;
-    last_compute_cgr_ = NULL;
-  }
-  
-  if (last_compute_cg_ != NULL) {
-    delete last_compute_cg_;
-    last_compute_cg_ = NULL;
-  }
-  
-  //Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-  /*CG_stringBuilder *ocg = new CG_stringBuilder();
-    Tuple<CG_outputRepr *> nameInfo;
-    for (int i = 1; i <= m; i++)
-    nameInfo.append(new CG_stringRepr("s" + to_string(i)));
-  */
-  
-  // -- replacing MMGenerateCode
-  // -- formally CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort);
-  // -- in the future, these if statements need to be cleaned up.
-  // -- something like check_lastComputeCG might be a decent protected member function
-  // -- and/or something that returns a std::vector<CG_outputRepr*> that also checks last_compute_cg_
-  //if (last_compute_cg_ == NULL) {
-  std::vector<Relation> IS(m);
-  std::vector<Relation> xforms(m);
-  std::vector<std::vector<int> > nonSplitLevels(m);
-  
-  /*    std::vector < std::vector <std::string> > idxTupleNames;
-        if (useIdxNames) {
-        for (int i = 0; i < idxNames.size(); i++) {
-        Tuple<std::string> idxs;
-        for (int j = 0; j < idxNames[i].size(); j++)
-        idxs.append(idxNames[i][j]);
-        idxTupleNames.append(idxs);
-        }
-        }
-  */
-  for (int i = 0; i < m; i++) {
-    IS[i] = stmt[i].IS;
-    xforms[i] = stmt[i].xform;
-    nonSplitLevels[i] = stmt_nonSplitLevels[i];
-  }
-  Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-  
-  last_compute_cg_ = new CodeGen(xforms, IS, known, nonSplitLevels, idxNames,
-                                 syncs);
-
-  delete last_compute_cgr_;  // this was just done  above? 
-  last_compute_cgr_ = NULL;
-  //}
-  
-  if (last_compute_cgr_ == NULL || last_compute_effort_ != effort) {
-    delete last_compute_cgr_;
-    last_compute_cgr_ = last_compute_cg_->buildAST(effort);
-    last_compute_effort_ = effort;
-  }
-  
-  //std::vector<CG_outputRepr *> stmts(m);
-  //for (int i = 0; i < m; i++)
-  //    stmts[i] = stmt[i].code;
-  //CG_outputRepr* repr = last_compute_cgr_->printRepr(ocg, stmts);
-  // -- end replacing MMGenerateCode
-  std::string repr = last_compute_cgr_->printString();
-  
-  if (actuallyPrint)
-    std::cout << repr << std::endl;
-  //std::cout << static_cast<CG_stringRepr*>(repr)->GetString();
-  /*
-    for (int i = 1; i <= m; i++)
-    delete nameInfo[i];
-  */
-  
-  //delete ocg;
-}
-
-void LoopCuda::printRuntimeInfo() const {
-  for (int i = 0; i < stmt.size(); i++) {
-    Relation IS = stmt[i].IS;
-    Relation xform = stmt[i].xform;
-    printf("stmt[%d]\n", i);
-    printf("IS\n");
-    IS.print_with_subs();
-    
-    printf("xform[%d]\n", i);
-    xform.print_with_subs();
-    
-  }
-}
-
-void LoopCuda::printIndexes() const {
-  for (int i = 0; i < stmt.size(); i++) {
-    printf("stmt %d nset %d ", i, stmt[i].IS.n_set());
-    
-    for (int j = 0; j < idxNames[i].size(); j++) {
-      if (j > 0)
-        printf(",");
-      printf("%s", idxNames[i][j].c_str());
-    }
-    printf("\n");
-  }
-}
-
-SgNode* LoopCuda::getCode(int effort) const {
-  const int m = stmt.size();
-  if (m == 0)
-    return new SgNode;
-  const int n = stmt[0].xform.n_out();
-  /*
-    Tuple<CG_outputRepr *> ni(m);
-    Tuple < Relation > IS(m);
-    Tuple < Relation > xform(m);
-    vector < vector <int> > nonSplitLevels(m);
-    for (int i = 0; i < m; i++) {
-    ni[i + 1] = stmt[i].code;
-    IS[i + 1] = stmt[i].IS;
-    xform[i + 1] = stmt[i].xform;
-    nonSplitLevels[i + 1] = stmt_nonSplitLevels[i];
-    
-    //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
-    }
-  */
-  //Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-//#ifdef DEBUG
-//#endif
-  //std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, ni, known,
-  //                nonSplitLevels, syncs, idxTupleNames, effort));
-  if (last_compute_cgr_ != NULL) {
-    delete last_compute_cgr_;
-    last_compute_cgr_ = NULL;
-  }
-  
-  if (last_compute_cg_ != NULL) {
-    delete last_compute_cg_;
-    last_compute_cg_ = NULL;
-  }
-  
-  CG_outputBuilder *ocg = ir->builder();
-  // -- replacing MMGenerateCode
-  // -- formally CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort);
-  // -- in the future, these if statements need to be cleaned up.
-  // -- something like check_lastComputeCG might be a decent protected member function
-  // -- and/or something that returns a std::vector<CG_outputRepr*> that also checks last_compute_cg_
-  //if (last_compute_cg_ == NULL) {
-  std::vector<Relation> IS(m);
-  std::vector<Relation> xforms(m);
-  std::vector<std::vector<int> > nonSplitLevels(m);
-  for (int i = 0; i < m; i++) {
-    IS[i] = stmt[i].IS;
-    xforms[i] = stmt[i].xform;
-    nonSplitLevels[i] = stmt_nonSplitLevels[i];
-  }
-  
-  /*std::vector < std::vector<std::string> > idxTupleNames;
-    if (useIdxNames) {
-    for (int i = 0; i < idxNames.size(); i++) {
-    std::vector<std::string> idxs;
-    for (int j = 0; j < idxNames[i].size(); j++)
-    idxs.push_back(idxNames[i][j]);
-    idxTupleNames.push_back(idxs);
-    }
-    }
-  */
-  Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-  
-  last_compute_cg_ = new CodeGen(xforms, IS, known, nonSplitLevels, idxNames,
-                                 syncs);
-  delete last_compute_cgr_;
-  last_compute_cgr_ = NULL;
-  //}
-  
-  if (last_compute_cgr_ == NULL || last_compute_effort_ != effort) {
-    delete last_compute_cgr_;
-    last_compute_cgr_ = last_compute_cg_->buildAST(effort);
-    last_compute_effort_ = effort;
-  }
-  
-  std::vector<CG_outputRepr *> stmts(m);
-  for (int i = 0; i < m; i++)
-    stmts[i] = stmt[i].code;
-  CG_outputRepr* repr = last_compute_cgr_->printRepr(ocg, stmts);
-  // -- end replacing MMGenerateCode
-  
-  //CG_outputRepr *overflow_initialization = ocg->CreateStmtList();
-  CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL);
-  for (std::map<int, std::vector<Free_Var_Decl *> >::const_iterator i =
-         overflow.begin(); i != overflow.end(); i++)
-    for (std::vector<Free_Var_Decl *>::const_iterator j = i->second.begin();
-         j != i->second.end(); j++)
-      //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0))));
-      overflow_initialization = ocg->StmtListAppend(
-        overflow_initialization,
-        ocg->StmtListAppend(
-          ocg->CreateAssignment(0,
-                                ocg->CreateIdent((*j)->base_name()),
-                                ocg->CreateInt(0)), NULL));
-  
-  repr = ocg->StmtListAppend(overflow_initialization, repr);
-  SgNode *tnl = static_cast<CG_roseRepr *>(repr)->GetCode();
-  SgStatementPtrList *list = static_cast<CG_roseRepr *>(repr)->GetList();
-  
-  if (tnl != NULL)
-    return tnl;
-  else if (tnl == NULL && list != NULL) {
-    SgBasicBlock* bb2 = buildBasicBlock();
-    
-    for (SgStatementPtrList::iterator it = (*list).begin();
-         it != (*list).end(); it++)
-      bb2->append_statement(*it);
-    
-    tnl = isSgNode(bb2);
-  } else
-    throw loop_error("codegen failed");
-  
-  delete repr;
-  /*
-    for (int i = 1; i <= m; i++)
-    delete ni[i];
-  */
-  return tnl;
-  
-}
-
-//protonu--adding constructors for the new derived class
-LoopCuda::LoopCuda() :
-  Loop(), code_gen_flags(GenInit) {
-}
-
-LoopCuda::LoopCuda(IR_Control *irc, int loop_num) :
-  Loop(irc) {
-  setup_code = NULL;
-  teardown_code = NULL;
-  code_gen_flags = 0;
-  cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1;
-  cu_bx_repr = NULL;
-  cu_tx_repr = NULL;
-  cu_by_repr = NULL;
-  cu_ty_repr = NULL;
-  cu_tz_repr = NULL;
-  
-  cu_num_reduce = 0;
-  cu_mode = GlobalMem;
-  texture = NULL;
-  constant_mem = NULL;
-  
-  int m = stmt.size();
-  //printf("\n the size of stmt(initially) is: %d\n", stmt.size());
-  for (int i = 0; i < m; i++)
-    stmt_nonSplitLevels.push_back(std::vector<int>());
-  
-  globals = ((IR_cudaroseCode *) ir)->gsym_;
-  globalScope = ((IR_cudaroseCode *) ir)->first_scope;
-  parameter_symtab = ((IR_cudaroseCode *) ir)->parameter;
-  body_symtab = ((IR_cudaroseCode *) ir)->body;
-  func_body = ((IR_cudaroseCode *) ir)->defn;
-  func_definition = ((IR_cudaroseCode *) ir)->func_defn;
-  std::vector<SgForStatement *> tf = ((IR_cudaroseCode *) ir)->get_loops();
-  
-  symtab = tf[loop_num]->get_symbol_table();
-  
-  std::vector<SgForStatement *> deepest = find_deepest_loops(
-    isSgNode(tf[loop_num]));
-  
-  for (int i = 0; i < deepest.size(); i++) {
-    SgVariableSymbol* vs;
-    SgForInitStatement* list = deepest[i]->get_for_init_stmt();
-    SgStatementPtrList& initStatements = list->get_init_stmt();
-    SgStatementPtrList::const_iterator j = initStatements.begin();
-    if (SgExprStatement *expr = isSgExprStatement(*j))
-      if (SgAssignOp* op = isSgAssignOp(expr->get_expression()))
-        if (SgVarRefExp* var_ref = isSgVarRefExp(op->get_lhs_operand()))
-          vs = var_ref->get_symbol();
-    
-    index.push_back(vs->get_name().getString().c_str()); //reflects original code index names
-  }
-  
-  for (int i = 0; i < stmt.size(); i++)
-    idxNames.push_back(index); //refects prefered index names (used as handles in cudaize v2)
-  useIdxNames = false;
-  
-}
-
-void LoopCuda::printIS() {
- if (!cudaDebug) return;
-  int k = stmt.size();
-  for (int i = 0; i < k; i++) {
-    printf(" printing statement:%d\n", i);
-    stmt[i].IS.print();
-  }
-}
-
diff --git a/loop_modified.cc b/loop_modified.cc
deleted file mode 100644
index 9686f6d..0000000
--- a/loop_modified.cc
+++ /dev/null
@@ -1,4234 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2008 University of Southern California
- Copyright (C) 2009-2010 University of Utah
- All Rights Reserved.
-
- Purpose:
- Core loop transformation functionality.
-
- Notes:
- "level" (starting from 1) means loop level and it corresponds to "dim"
- (starting from 0) in transformed iteration space [c_1,l_1,c_2,l_2,....,
- c_n,l_n,c_(n+1)], e.g., l_2 is loop level 2 in generated code, dim 3
- in transformed iteration space, and variable 4 in Omega relation.
- All c's are constant numbers only and they will not show up as actual loops.
- Formula:
- dim = 2*level - 1
- var = dim + 1
-
- History:
- 10/2005 Created by Chun Chen.
- 09/2009 Expand tile functionality, -chun
- 10/2009 Initialize unfusible loop nest without bailing out, -chun
-*****************************************************************************/
-
-#include <limits.h>
-#include <math.h>
-#include <code_gen/code_gen.h>
-#include <code_gen/CG_outputBuilder.h>
-#include <code_gen/output_repr.h>
-#include <iostream>
-#include <map>
-#include "loop.hh"
-#include "omegatools.hh"
-#include "irtools.hh"
-#include "chill_error.hh"
-#include <string.h>
-using namespace omega;
-
-const std::string Loop::tmp_loop_var_name_prefix = std::string("_t");
-const std::string Loop::overflow_var_name_prefix = std::string("over");
-
-//-----------------------------------------------------------------------------
-// Class Loop
-//-----------------------------------------------------------------------------
-
-bool Loop::init_loop(std::vector<ir_tree_node *> &ir_tree,
-                     std::vector<ir_tree_node *> &ir_stmt) {
-  ir_stmt = extract_ir_stmts(ir_tree);
-  stmt_nesting_level_.resize(ir_stmt.size());
-  std::vector<int> stmt_nesting_level(ir_stmt.size());
-  for (int i = 0; i < ir_stmt.size(); i++) {
-    ir_stmt[i]->payload = i;
-    int t = 0;
-    ir_tree_node *itn = ir_stmt[i];
-    while (itn->parent != NULL) {
-      itn = itn->parent;
-      if (itn->content->type() == IR_CONTROL_LOOP)
-        t++;
-    }
-    stmt_nesting_level_[i] = t;
-    stmt_nesting_level[i] = t;
-  }
-  
-  stmt = std::vector<Statement>(ir_stmt.size());
-  int n_dim = -1;
-  int max_loc;
-  //std::vector<std::string> index;
-  for (int i = 0; i < ir_stmt.size(); i++) {
-    int max_nesting_level = -1;
-    int loc;
-    for (int j = 0; j < ir_stmt.size(); j++)
-      if (stmt_nesting_level[j] > max_nesting_level) {
-        max_nesting_level = stmt_nesting_level[j];
-        loc = j;
-      }
-    
-    // most deeply nested statement acting as a reference point
-    if (n_dim == -1) {
-      n_dim = max_nesting_level;
-      max_loc = loc;
-      
-      index = std::vector<std::string>(n_dim);
-      
-      ir_tree_node *itn = ir_stmt[loc];
-      int cur_dim = n_dim - 1;
-      while (itn->parent != NULL) {
-        itn = itn->parent;
-        if (itn->content->type() == IR_CONTROL_LOOP) {
-          index[cur_dim] =
-            static_cast<IR_Loop *>(itn->content)->index()->name();
-          itn->payload = cur_dim--;
-        }
-      }
-    }
-    
-    // align loops by names, temporary solution
-    ir_tree_node *itn = ir_stmt[loc];
-    int depth = stmt_nesting_level_[loc] - 1;
-    /*   while (itn->parent != NULL) {
-         itn = itn->parent;
-         if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == -1) {
-         std::string name = static_cast<IR_Loop *>(itn->content)->index()->name();
-         for (int j = 0; j < n_dim; j++)
-         if (index[j] == name) {
-         itn->payload = j;
-         break;
-         }
-         if (itn->payload == -1)
-         throw loop_error("no complex alignment yet");
-         }
-         }
-    */
-    for (int t = depth; t >= 0; t--) {
-      int y = t;
-      ir_tree_node *itn = ir_stmt[loc];
-      
-      while ((itn->parent != NULL) && (y >= 0)) {
-        itn = itn->parent;
-        if (itn->content->type() == IR_CONTROL_LOOP)
-          y--;
-      }
-      
-      if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == -1) {
-        CG_outputBuilder *ocg = ir_->builder();
-        
-        itn->payload = depth - t;
-        
-        CG_outputRepr *code =
-          static_cast<IR_Block *>(ir_stmt[loc]->content)->extract();
-        
-        Tuple<CG_outputRepr *> index_expr;
-        Tuple<std::string> old_index;
-        CG_outputRepr *repl = ocg->CreateIdent(index[itn->payload]);
-        index_expr.append(repl);
-        old_index.append(
-          static_cast<IR_Loop *>(itn->content)->index()->name());
-        
-        code = ocg->CreatePlaceHolder(0, code, index_expr, old_index);
-        replace.insert(std::pair<int, CG_outputRepr*>(loc, code));
-        //stmt[loc].code = code;
-        
-      }
-    }
-    
-    // set relation variable names
-    Relation r(n_dim);
-    F_And *f_root = r.add_and();
-    itn = ir_stmt[loc];
-    int temp_depth = depth;
-    while (itn->parent != NULL) {
-      
-      itn = itn->parent;
-      if (itn->content->type() == IR_CONTROL_LOOP) {
-        r.name_set_var(itn->payload + 1, index[temp_depth]);
-        
-        temp_depth--;
-      }
-      //static_cast<IR_Loop *>(itn->content)->index()->name());
-    }
-    
-    /*while (itn->parent != NULL) {
-      itn = itn->parent;
-      if (itn->content->type() == IR_CONTROL_LOOP)
-      r.name_set_var(itn->payload+1, static_cast<IR_Loop *>(itn->content)->index()->name());
-      }*/
-    
-    // extract information from loop/if structures
-    std::vector<bool> processed(n_dim, false);
-    Tuple<std::string> vars_to_be_reversed;
-    itn = ir_stmt[loc];
-    while (itn->parent != NULL) {
-      itn = itn->parent;
-      
-      switch (itn->content->type()) {
-      case IR_CONTROL_LOOP: {
-        IR_Loop *lp = static_cast<IR_Loop *>(itn->content);
-        Variable_ID v = r.set_var(itn->payload + 1);
-        int c;
-        
-        try {
-          c = lp->step_size();
-          if (c > 0) {
-            CG_outputRepr *lb = lp->lower_bound();
-            exp2formula(ir, r, f_root, freevar, lb, v, 's',
-                        IR_COND_GE, true);
-            CG_outputRepr *ub = lp->upper_bound();
-            IR_CONDITION_TYPE cond = lp->stop_cond();
-            if (cond == IR_COND_LT || cond == IR_COND_LE)
-              exp2formula(ir, r, f_root, freevar, ub, v, 's',
-                          cond, true);
-            else
-              throw ir_error("loop condition not supported");
-            
-          } else if (c < 0) {
-            CG_outputBuilder *ocg = ir->builder();
-            CG_outputRepr *lb = lp->lower_bound();
-            lb = ocg->CreateMinus(NULL, lb);
-            exp2formula(ir, r, f_root, freevar, lb, v, 's',
-                        IR_COND_GE, true);
-            CG_outputRepr *ub = lp->upper_bound();
-            ub = ocg->CreateMinus(NULL, ub);
-            IR_CONDITION_TYPE cond = lp->stop_cond();
-            if (cond == IR_COND_GE)
-              exp2formula(ir, r, f_root, freevar, ub, v, 's',
-                          IR_COND_LE, true);
-            else if (cond == IR_COND_GT)
-              exp2formula(ir, r, f_root, freevar, ub, v, 's',
-                          IR_COND_LT, true);
-            else
-              throw ir_error("loop condition not supported");
-            
-            vars_to_be_reversed.append(lp->index()->name());
-          } else
-            throw ir_error("loop step size zero");
-        } catch (const ir_error &e) {
-          for (int i = 0; i < itn->children.size(); i++)
-            delete itn->children[i];
-          itn->children = std::vector<ir_tree_node *>();
-          itn->content = itn->content->convert();
-          return false;
-        }
-        
-        if (abs(c) != 1) {
-          F_Exists *f_exists = f_root->add_exists();
-          Variable_ID e = f_exists->declare();
-          F_And *f_and = f_exists->add_and();
-          Stride_Handle h = f_and->add_stride(abs(c));
-          if (c > 0)
-            h.update_coef(e, 1);
-          else
-            h.update_coef(e, -1);
-          h.update_coef(v, -1);
-          CG_outputRepr *lb = lp->lower_bound();
-          exp2formula(ir, r, f_and, freevar, lb, e, 's', IR_COND_EQ,
-                      true);
-        }
-        
-        processed[itn->payload] = true;
-        break;
-      }
-      case IR_CONTROL_IF: {
-        CG_outputRepr *cond =
-          static_cast<IR_If *>(itn->content)->condition();
-        try {
-          if (itn->payload % 2 == 1)
-            exp2constraint(ir, r, f_root, freevar, cond, true);
-          else {
-            F_Not *f_not = f_root->add_not();
-            F_And *f_and = f_not->add_and();
-            exp2constraint(ir, r, f_and, freevar, cond, true);
-          }
-        } catch (const ir_error &e) {
-          std::vector<ir_tree_node *> *t;
-          if (itn->parent == NULL)
-            t = &ir_tree;
-          else
-            t = &(itn->parent->children);
-          int id = itn->payload;
-          int i = t->size() - 1;
-          while (i >= 0) {
-            if ((*t)[i] == itn) {
-              for (int j = 0; j < itn->children.size(); j++)
-                delete itn->children[j];
-              itn->children = std::vector<ir_tree_node *>();
-              itn->content = itn->content->convert();
-            } else if ((*t)[i]->payload >> 1 == id >> 1) {
-              delete (*t)[i];
-              t->erase(t->begin() + i);
-            }
-            i--;
-          }
-          return false;
-        }
-        
-        break;
-      }
-      default:
-        for (int i = 0; i < itn->children.size(); i++)
-          delete itn->children[i];
-        itn->children = std::vector<ir_tree_node *>();
-        itn->content = itn->content->convert();
-        return false;
-      }
-    }
-    
-    // add information for missing loops
-    for (int j = 0; j < n_dim; j++)
-      if (!processed[j]) {
-        ir_tree_node *itn = ir_stmt[max_loc];
-        while (itn->parent != NULL) {
-          itn = itn->parent;
-          if (itn->content->type() == IR_CONTROL_LOOP
-              && itn->payload == j)
-            break;
-        }
-        
-        Variable_ID v = r.set_var(j + 1);
-        if (loc < max_loc) {
-          CG_outputRepr *lb =
-            static_cast<IR_Loop *>(itn->content)->lower_bound();
-          exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_EQ,
-                      true);
-        } else { // loc > max_loc
-          CG_outputRepr *ub =
-            static_cast<IR_Loop *>(itn->content)->upper_bound();
-          exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_EQ,
-                      true);
-        }
-      }
-    
-    r.setup_names();
-    r.simplify();
-    
-    // insert the statement
-    CG_outputBuilder *ocg = ir->builder();
-    Tuple<CG_outputRepr *> reverse_expr;
-    for (int j = 1; j <= vars_to_be_reversed.size(); j++) {
-      CG_outputRepr *repl = ocg->CreateIdent(vars_to_be_reversed[j]);
-      repl = ocg->CreateMinus(NULL, repl);
-      reverse_expr.append(repl);
-    }
-    CG_outputRepr *code =
-      static_cast<IR_Block *>(ir_stmt[loc]->content)->original();
-    code = ocg->CreatePlaceHolder(0, code, reverse_expr,
-                                  vars_to_be_reversed);
-    stmt[loc].code = code;
-    stmt[loc].IS = r;
-    stmt[loc].loop_level = std::vector<LoopLevel>(n_dim);
-    for (int i = 0; i < n_dim; i++) {
-      stmt[loc].loop_level[i].type = LoopLevelOriginal;
-      stmt[loc].loop_level[i].payload = i;
-      stmt[loc].loop_level[i].parallel_level = 0;
-    }
-    
-    stmt_nesting_level[loc] = -1;
-  }
-  
-  return true;
-}
-
-Loop::Loop(const IR_Control *control) {
-  ir = const_cast<IR_Code *>(control->ir_);
-  init_code = NULL;
-  cleanup_code = NULL;
-  tmp_loop_var_name_counter = 1;
-  overflow_var_name_counter = 1;
-  known = Relation::True(0);
-  
-  std::vector<ir_tree_node *> ir_tree = build_ir_tree(control->clone(), NULL);
-  std::vector<ir_tree_node *> ir_stmt;
-  
-  while (!init_loop(ir_tree, ir_stmt)) {
-  }
-  
-  // init the dependence graph
-  for (int i = 0; i < stmt.size(); i++)
-    dep.insert();
-  
-  for (int i = 0; i < stmt.size(); i++)
-    for (int j = i; j < stmt.size(); j++) {
-      std::pair<std::vector<DependenceVector>,
-        std::vector<DependenceVector> > dv = test_data_dependences(
-          ir_, stmt[i].code, stmt[i].IS, stmt[j].code, stmt[j].IS,
-          freevar, index, stmt_nesting_level_[i],
-          stmt_nesting_level[j]);
-      
-      for (int k = 0; k < dv.first.size(); k++) {
-        if (is_dependence_valid(ir_stmt[i], ir_stmt[j], dv.first[k],
-                                true))
-          dep.connect(i, j, dv.first[k]);
-        else {
-          dep.connect(j, i, dv.first[k].reverse());
-        }
-        
-      }
-      for (int k = 0; k < dv.second.size(); k++)
-        if (is_dependence_valid(ir_stmt[j], ir_stmt[i], dv.second[k],
-                                false))
-          dep.connect(j, i, dv.second[k]);
-        else {
-          dep.connect(i, j, dv.second[k].reverse());
-        }
-      // std::pair<std::vector<DependenceVector>,
-      //                std::vector<DependenceVector> > dv_ = test_data_dependences(
-      
-    }
-  
-  for (int i = 0; i < stmt.size(); i++) {
-    std::map<int, CG_outputRepr*>::iterator it = replace.find(i);
-    
-    if (it != replace.end())
-      stmt[i].code = (it->second)->clone();
-    else
-      stmt[i].code = stmt[i].code->clone();
-  }
-  
-  // cleanup the IR tree
-  for (int i = 0; i < ir_tree.size(); i++)
-    delete ir_tree[i];
-  
-  // init dumb transformation relations e.g. [i, j] -> [ 0, i, 0, j, 0]
-  for (int i = 0; i < stmt.size(); i++) {
-    int n = stmt[i].IS.n_set();
-    stmt[i].xform = Relation(n, 2 * n + 1);
-    F_And *f_root = stmt[i].xform.add_and();
-    
-    for (int j = 1; j <= n; j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(stmt[i].xform.output_var(2 * j), 1);
-      h.update_coef(stmt[i].xform.input_var(j), -1);
-    }
-    
-    for (int j = 1; j <= 2 * n + 1; j += 2) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(stmt[i].xform.output_var(j), 1);
-    }
-    stmt[i].xform.simplify();
-  }
-  
-  if (stmt.size() != 0)
-    num_dep_dim = stmt[0].IS.n_set();
-  else
-    num_dep_dim = 0;
-}
-
-Loop::~Loop() {
-  for (int i = 0; i < stmt.size(); i++)
-    if (stmt[i].code != NULL) {
-      stmt[i].code->clear();
-      delete stmt[i].code;
-    }
-  if (init_code != NULL) {
-    init_code->clear();
-    delete init_code;
-  }
-  if (cleanup_code != NULL) {
-    cleanup_code->clear();
-    delete cleanup_code;
-  }
-}
-
-int Loop::get_dep_dim_of(int stmt_num, int level) const {
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invaid statement " + to_string(stmt_num));
-  
-  if (level < 1 || level > stmt[stmt_num].loop_level.size())
-    return -1;
-  
-  int trip_count = 0;
-  while (true) {
-    switch (stmt[stmt_num].loop_level[level - 1].type) {
-    case LoopLevelOriginal:
-      return stmt[stmt_num].loop_level[level - 1].payload;
-    case LoopLevelTile:
-      level = stmt[stmt_num].loop_level[level - 1].payload;
-      if (level < 1)
-        return -1;
-      if (level > stmt[stmt_num].loop_level.size())
-        throw loop_error(
-          "incorrect loop level information for statement "
-          + to_string(stmt_num));
-      break;
-    default:
-      throw loop_error(
-        "unknown loop level information for statement "
-        + to_string(stmt_num));
-    }
-    trip_count++;
-    if (trip_count >= stmt[stmt_num].loop_level.size())
-      throw loop_error(
-        "incorrect loop level information for statement "
-        + to_string(stmt_num));
-  }
-}
-
-int Loop::get_last_dep_dim_before(int stmt_num, int level) const {
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invaid statement " + to_string(stmt_num));
-  
-  if (level < 1)
-    return -1;
-  if (level > stmt[stmt_num].loop_level.size())
-    level = stmt[stmt_num].loop_level.size() + 1;
-  
-  for (int i = level - 1; i >= 1; i--)
-    if (stmt[stmt_num].loop_level[i - 1].type == LoopLevelOriginal)
-      return stmt[stmt_num].loop_level[i - 1].payload;
-  
-  return -1;
-}
-
-void Loop::print_internal_loop_structure() const {
-  for (int i = 0; i < stmt.size(); i++) {
-    std::vector<int> lex = getLexicalOrder(i);
-    std::cout << "s" << i + 1 << ": ";
-    for (int j = 0; j < stmt[i].loop_level.size(); j++) {
-      if (2 * j < lex.size())
-        std::cout << lex[2 * j];
-      switch (stmt[i].loop_level[j].type) {
-      case LoopLevelOriginal:
-        std::cout << "(dim:" << stmt[i].loop_level[j].payload << ")";
-        break;
-      case LoopLevelTile:
-        std::cout << "(tile:" << stmt[i].loop_level[j].payload << ")";
-        break;
-      default:
-        std::cout << "(unknown)";
-      }
-      std::cout << ' ';
-    }
-    for (int j = 2 * stmt[i].loop_level.size(); j < lex.size(); j += 2) {
-      std::cout << lex[j];
-      if (j != lex.size() - 1)
-        std::cout << ' ';
-    }
-    std::cout << std::endl;
-  }
-}
-
-CG_outputRepr *Loop::getCode(int effort) const {
-  const int m = stmt.size();
-  if (m == 0)
-    return NULL;
-  const int n = stmt[0].xform.n_out();
-  
-  Tuple<CG_outputRepr *> ni(m);
-  Tuple < Relation > IS(m);
-  Tuple < Relation > xform(m);
-  for (int i = 0; i < m; i++) {
-    ni[i + 1] = stmt[i].code;
-    IS[i + 1] = stmt[i].IS;
-    xform[i + 1] = stmt[i].xform;
-  }
-  
-  Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-  CG_outputBuilder *ocg = ir->builder();
-  CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, effort);
-  
-  if (init_code != NULL)
-    repr = ocg->StmtListAppend(init_code->clone(), repr);
-  if (cleanup_code != NULL)
-    repr = ocg->StmtListAppend(repr, cleanup_code->clone());
-  
-  return repr;
-}
-
-void Loop::printCode(int effort) const {
-  const int m = stmt.size();
-  if (m == 0)
-    return;
-  const int n = stmt[0].xform.n_out();
-  
-  Tuple < Relation > IS(m);
-  Tuple < Relation > xform(m);
-  for (int i = 0; i < m; i++) {
-    IS[i + 1] = stmt[i].IS;
-    xform[i + 1] = stmt[i].xform;
-  }
-  
-  Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-  std::cout << MMGenerateCode(xform, IS, known, effort);
-}
-
-Relation Loop::getNewIS(int stmt_num) const {
-  Relation result;
-  
-  if (stmt[stmt_num].xform.is_null()) {
-    Relation known = Extend_Set(copy(this->known),
-                                stmt[stmt_num].IS.n_set() - this->known.n_set());
-    result = Intersection(copy(stmt[stmt_num].IS), known);
-  } else {
-    Relation known = Extend_Set(copy(this->known),
-                                stmt[stmt_num].xform.n_out() - this->known.n_set());
-    result = Intersection(
-      Range(
-        Restrict_Domain(copy(stmt[stmt_num].xform),
-                        copy(stmt[stmt_num].IS))), known);
-  }
-  
-  result.simplify(2, 4);
-  
-  return result;
-}
-
-std::vector<Relation> Loop::getNewIS() const {
-  const int m = stmt.size();
-  
-  std::vector<Relation> new_IS(m);
-  for (int i = 0; i < m; i++)
-    new_IS[i] = getNewIS(i);
-  
-  return new_IS;
-}
-
-void Loop::permute(const std::vector<int> &pi) {
-  std::set<int> active;
-  for (int i = 0; i < stmt.size(); i++)
-    active.insert(i);
-  
-  permute(active, pi);
-}
-
-void Loop::original() {
-  std::set<int> active;
-  for (int i = 0; i < stmt.size(); i++)
-    active.insert(i);
-  setLexicalOrder(0, active);
-}
-
-void Loop::permute(const std::set<int> &active, const std::vector<int> &pi) {
-  if (active.size() == 0 || pi.size() == 0)
-    return;
-  
-  // check for sanity of parameters
-  int level = pi[0];
-  for (int i = 1; i < pi.size(); i++)
-    if (pi[i] < level)
-      level = pi[i];
-  if (level < 1)
-    throw std::invalid_argument("invalid permuation");
-  std::vector<int> reverse_pi(pi.size(), 0);
-  for (int i = 0; i < pi.size(); i++)
-    if (pi[i] >= level + pi.size())
-      throw std::invalid_argument("invalid permutation");
-    else
-      reverse_pi[pi[i] - level] = i + level;
-  for (int i = 0; i < reverse_pi.size(); i++)
-    if (reverse_pi[i] == 0)
-      throw std::invalid_argument("invalid permuation");
-  int ref_stmt_num;
-  std::vector<int> lex;
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    if (*i < 0 || *i >= stmt.size())
-      throw std::invalid_argument("invalid statement " + to_string(*i));
-    if (i == active.begin()) {
-      ref_stmt_num = *i;
-      lex = getLexicalOrder(*i);
-    } else {
-      if (level + pi.size() - 1 > stmt[*i].loop_level.size())
-        throw std::invalid_argument("invalid permuation");
-      std::vector<int> lex2 = getLexicalOrder(*i);
-      for (int j = 0; j < 2 * level - 3; j += 2)
-        if (lex[j] != lex2[j])
-          throw std::invalid_argument(
-            "statements to permute must be in the same subloop");
-      for (int j = 0; j < pi.size(); j++)
-        if (!(stmt[*i].loop_level[level + j - 1].type
-              == stmt[ref_stmt_num].loop_level[level + j - 1].type
-              && stmt[*i].loop_level[level + j - 1].payload
-              == stmt[ref_stmt_num].loop_level[level + j - 1].payload))
-          throw std::invalid_argument(
-            "permuted loops must have the same loop level types");
-    }
-  }
-  
-  // Update transformation relations
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    int n = stmt[*i].xform.n_out();
-    Relation mapping(n, n);
-    F_And *f_root = mapping.add_and();
-    for (int j = 1; j <= n; j += 2) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(j), 1);
-      h.update_coef(mapping.input_var(j), -1);
-    }
-    for (int j = 0; j < pi.size(); j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(2 * (level + j)), 1);
-      h.update_coef(mapping.input_var(2 * pi[j]), -1);
-    }
-    for (int j = 1; j < level; j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(2 * j), 1);
-      h.update_coef(mapping.input_var(2 * j), -1);
-    }
-    for (int j = level + pi.size(); j <= stmt[*i].loop_level.size(); j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(2 * j), 1);
-      h.update_coef(mapping.input_var(2 * j), -1);
-    }
-    
-    stmt[*i].xform = Composition(mapping, stmt[*i].xform);
-    stmt[*i].xform.simplify();
-  }
-  
-  // get the permuation for dependence vectors
-  std::vector<int> t;
-  for (int i = 0; i < pi.size(); i++)
-    if (stmt[ref_stmt_num].loop_level[pi[i] - 1].type == LoopLevelOriginal)
-      t.push_back(stmt[ref_stmt_num].loop_level[pi[i] - 1].payload);
-  int max_dep_dim = -1;
-  int min_dep_dim = num_dep_dim;
-  for (int i = 0; i < t.size(); i++) {
-    if (t[i] > max_dep_dim)
-      max_dep_dim = t[i];
-    if (t[i] < min_dep_dim)
-      min_dep_dim = t[i];
-  }
-  if (min_dep_dim > max_dep_dim)
-    return;
-  if (max_dep_dim - min_dep_dim + 1 != t.size())
-    throw loop_error("cannot update the dependence graph after permuation");
-  std::vector<int> dep_pi(num_dep_dim);
-  for (int i = 0; i < min_dep_dim; i++)
-    dep_pi[i] = i;
-  for (int i = min_dep_dim; i <= max_dep_dim; i++)
-    dep_pi[i] = t[i - min_dep_dim];
-  for (int i = max_dep_dim + 1; i < num_dep_dim; i++)
-    dep_pi[i] = i;
-  
-  // update the dependence graph
-  DependenceGraph g;
-  for (int i = 0; i < dep.vertex.size(); i++)
-    g.insert();
-  for (int i = 0; i < dep.vertex.size(); i++)
-    for (DependenceGraph::EdgeList::iterator j =
-           dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();
-         j++) {
-      if ((active.find(i) != active.end()
-           && active.find(j->first) != active.end())) {
-        std::vector<DependenceVector> dv = j->second;
-        for (int k = 0; k < dv.size(); k++) {
-          switch (dv[k].type) {
-          case DEP_W2R:
-          case DEP_R2W:
-          case DEP_W2W:
-          case DEP_R2R: {
-            std::vector<coef_t> lbounds(num_dep_dim);
-            std::vector<coef_t> ubounds(num_dep_dim);
-            for (int d = 0; d < num_dep_dim; d++) {
-              lbounds[d] = dv[k].lbounds[dep_pi[d]];
-              ubounds[d] = dv[k].ubounds[dep_pi[d]];
-            }
-            dv[k].lbounds = lbounds;
-            dv[k].ubounds = ubounds;
-            break;
-          }
-          case DEP_CONTROL: {
-            break;
-          }
-          default:
-            throw loop_error("unknown dependence type");
-          }
-        }
-        g.connect(i, j->first, dv);
-      } else if (active.find(i) == active.end()
-                 && active.find(j->first) == active.end()) {
-        std::vector<DependenceVector> dv = j->second;
-        g.connect(i, j->first, dv);
-      } else {
-        std::vector<DependenceVector> dv = j->second;
-        for (int k = 0; k < dv.size(); k++)
-          switch (dv[k].type) {
-          case DEP_W2R:
-          case DEP_R2W:
-          case DEP_W2W:
-          case DEP_R2R: {
-            for (int d = 0; d < num_dep_dim; d++)
-              if (dep_pi[d] != d) {
-                dv[k].lbounds[d] = -posInfinity;
-                dv[k].ubounds[d] = posInfinity;
-              }
-            break;
-          }
-          case DEP_CONTROL:
-            break;
-          default:
-            throw loop_error("unknown dependence type");
-          }
-        g.connect(i, j->first, dv);
-      }
-    }
-  dep = g;
-  
-  // update loop level information
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    int cur_dep_dim = min_dep_dim;
-    std::vector<LoopLevel> new_loop_level(stmt[*i].loop_level.size());
-    for (int j = 1; j <= stmt[*i].loop_level.size(); j++)
-      if (j >= level && j < level + pi.size()) {
-        switch (stmt[*i].loop_level[reverse_pi[j - level] - 1].type) {
-        case LoopLevelOriginal:
-          new_loop_level[j - 1].type = LoopLevelOriginal;
-          new_loop_level[j - 1].payload = cur_dep_dim++;
-          new_loop_level[j - 1].parallel_level =
-            stmt[*i].loop_level[reverse_pi[j - level] - 1].parallel_level;
-          break;
-        case LoopLevelTile: {
-          new_loop_level[j - 1].type = LoopLevelTile;
-          int ref_level = stmt[*i].loop_level[reverse_pi[j - level]
-                                              - 1].payload;
-          if (ref_level >= level && ref_level < level + pi.size())
-            new_loop_level[j - 1].payload = reverse_pi[ref_level
-                                                       - level];
-          else
-            new_loop_level[j - 1].payload = ref_level;
-          new_loop_level[j - 1].parallel_level =
-            stmt[*i].loop_level[reverse_pi[j - level] - 1].parallel_level;
-          break;
-        }
-        default:
-          throw loop_error(
-            "unknown loop level information for statement "
-            + to_string(*i));
-        }
-      } else {
-        switch (stmt[*i].loop_level[j - 1].type) {
-        case LoopLevelOriginal:
-          new_loop_level[j - 1].type = LoopLevelOriginal;
-          new_loop_level[j - 1].payload =
-            stmt[*i].loop_level[j - 1].payload;
-          new_loop_level[j - 1].parallel_level = stmt[*i].loop_level[j
-                                                                     - 1].parallel_level;
-          break;
-        case LoopLevelTile: {
-          new_loop_level[j - 1].type = LoopLevelTile;
-          int ref_level = stmt[*i].loop_level[j - 1].payload;
-          if (ref_level >= level && ref_level < level + pi.size())
-            new_loop_level[j - 1].payload = reverse_pi[ref_level
-                                                       - level];
-          else
-            new_loop_level[j - 1].payload = ref_level;
-          new_loop_level[j - 1].parallel_level = stmt[*i].loop_level[j
-                                                                     - 1].parallel_level;
-          break;
-        }
-        default:
-          throw loop_error(
-            "unknown loop level information for statement "
-            + to_string(*i));
-        }
-      }
-    stmt[*i].loop_level = new_loop_level;
-  }
-  
-  setLexicalOrder(2 * level - 2, active);
-}
-
-std::set<int> Loop::split(int stmt_num, int level, const Relation &cond) {
-  // check for sanity of parameters
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  
-  std::set<int> result;
-  int dim = 2 * level - 1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_loop = getStatements(lex, dim - 1);
-  
-  Relation cond2 = copy(cond);
-  cond2.simplify();
-  cond2 = EQs_to_GEQs(cond2);
-  Conjunct *c = cond2.single_conjunct();
-  int cur_lex = lex[dim - 1];
-  for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-    int max_level = (*gi).max_tuple_pos();
-    Relation single_cond(max_level);
-    single_cond.and_with_GEQ(*gi);
-    
-    // TODO: should decide where to place newly created statements with
-    // complementary split condition from dependence graph.
-    bool place_after;
-    if (max_level == 0)
-      place_after = true;
-    else if ((*gi).get_coef(cond2.set_var(max_level)) < 0)
-      place_after = true;
-    else
-      place_after = false;
-    
-    // original statements with split condition,
-    // new statements with complement of split condition
-    int old_num_stmt = stmt.size();
-    std::map<int, int> what_stmt_num;
-    apply_xform(same_loop);
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++) {
-      int n = stmt[*i].IS.n_set();
-      Relation part1, part2;
-      if (max_level > n) {
-        part1 = copy(stmt[*i].IS);
-        part2 = Relation::False(0);
-      } else {
-        part1 = Intersection(copy(stmt[*i].IS),
-                             Extend_Set(copy(single_cond), n - max_level));
-        part2 = Intersection(copy(stmt[*i].IS),
-                             Extend_Set(Complement(copy(single_cond)),
-                                        n - max_level));
-      }
-      
-      //split dependence check
-      
-      if (max_level > level) {
-        
-        DNF_Iterator di1(stmt[*i].IS.query_DNF());
-        DNF_Iterator di2(part1.query_DNF());
-        for (; di1 && di2; di1++, di2++) {
-          //printf("In next conjunct,\n");
-          EQ_Iterator ei1 = (*di1)->EQs();
-          EQ_Iterator ei2 = (*di2)->EQs();
-          for (; ei1 && ei2; ei1++, ei2++) {
-            //printf(" In next equality constraint,\n");
-            Constr_Vars_Iter cvi1(*ei1);
-            Constr_Vars_Iter cvi2(*ei2);
-            int dimension = (*cvi1).var->get_position();
-            int same = 0;
-            bool identical = false;
-            if (identical = !strcmp((*cvi1).var->char_name(),
-                                    (*cvi2).var->char_name())) {
-              
-              for (; cvi1 && cvi2; cvi1++, cvi2++) {
-                
-                if (((*cvi1).coef != (*cvi2).coef
-                     || (*ei1).get_const()
-                     != (*ei2).get_const())
-                    || (strcmp((*cvi1).var->char_name(),
-                               (*cvi2).var->char_name()))) {
-                  
-                  same++;
-                }
-              }
-            }
-            if ((same != 0) || !identical) {
-              
-              dimension = dimension - 1;
-              
-              while (stmt[*i].loop_level[dimension].type
-                     == LoopLevelTile)
-                dimension = xform_index[dimension].first;
-              
-              dimension = stmt[*i].loop_level[dimension].payload;
-              
-              for (int i = 0; i < stmt.size(); i++) {
-                std::vector<std::pair<int, DependenceVector> > D;
-                for (DependenceGraph::EdgeList::iterator j =
-                       dep.vertex[i].second.begin();
-                     j != dep.vertex[i].second.end(); j++) {
-                  for (int k = 0; k < j->second.size(); k++) {
-                    DependenceVector dv = j->second[k];
-                    if ((dv.hasNegative(dimension)
-                         && !dv.quasi)
-                        || (dv.hasPositive(dimension)
-                            && dv.quasi))
-                      
-                      throw loop_error(
-                        "loop error: Split is illegal, dependence violation!");
-                    
-                  }
-                }
-              }
-              
-            }
-            
-            GEQ_Iterator gi1 = (*di1)->GEQs();
-            GEQ_Iterator gi2 = (*di2)->GEQs();
-            
-            for (; gi1 && gi2; gi++, gi2++) {
-              
-              Constr_Vars_Iter cvi1(*gi1);
-              Constr_Vars_Iter cvi2(*gi2);
-              int dimension = (*cvi1).var->get_position();
-              int same = 0;
-              bool identical = false;
-              if (identical = !strcmp((*cvi1).var->char_name(),
-                                      (*cvi2).var->char_name())) {
-                
-                for (; cvi1 && cvi2; cvi1++, cvi2++) {
-                  
-                  if (((*cvi1).coef != (*cvi2).coef
-                       || (*gi1).get_const()
-                       != (*gi2).get_const())
-                      || (strcmp((*cvi1).var->char_name(),
-                                 (*cvi2).var->char_name()))) {
-                    
-                    same++;
-                  }
-                }
-              }
-              if ((same != 0) || !identical) {
-                dimension = dimension - 1;
-                
-                while (stmt[*i].loop_level[dimension].type
-                       == LoopLevelTile)
-                  dimension = xform_index[dimension].first;
-                
-                dimension =
-                  stmt[*i].loop_level[dimension].payload;
-                
-                for (int i = 0; i < stmt.size(); i++) {
-                  std::vector<std::pair<int, DependenceVector> > D;
-                  for (DependenceGraph::EdgeList::iterator j =
-                         dep.vertex[i].second.begin();
-                       j != dep.vertex[i].second.end();
-                       j++) {
-                    for (int k = 0; k < j->second.size();
-                         k++) {
-                      DependenceVector dv = j->second[k];
-                      if ((dv.hasNegative(dimension)
-                           && !dv.quasi)
-                          || (dv.hasPositive(
-                                dimension)
-                              && dv.quasi))
-                        
-                        throw loop_error(
-                          "loop error: Split is illegal, dependence violation!");
-                      
-                    }
-                  }
-                }
-                
-              }
-              
-            }
-            
-          }
-          
-        }
-        
-        DNF_Iterator di3(stmt[*i].IS.query_DNF());
-        DNF_Iterator di4(part2.query_DNF());
-        for (; di3 && di4; di3++, di4++) {
-          EQ_Iterator ei1 = (*di3)->EQs();
-          EQ_Iterator ei2 = (*di4)->EQs();
-          for (; ei1 && ei2; ei1++, ei2++) {
-            Constr_Vars_Iter cvi1(*ei1);
-            Constr_Vars_Iter cvi2(*ei2);
-            int dimension = (*cvi1).var->get_position();
-            int same = 0;
-            bool identical = false;
-            if (identical = !strcmp((*cvi1).var->char_name(),
-                                    (*cvi2).var->char_name())) {
-              
-              for (; cvi1 && cvi2; cvi1++, cvi2++) {
-                
-                if (((*cvi1).coef != (*cvi2).coef
-                     || (*ei1).get_const()
-                     != (*ei2).get_const())
-                    || (strcmp((*cvi1).var->char_name(),
-                               (*cvi2).var->char_name()))) {
-                  
-                  same++;
-                }
-              }
-            }
-            if ((same != 0) || !identical) {
-              dimension = dimension - 1;
-              
-              while (stmt[*i].loop_level[dimension].type
-                     == LoopLevelTile)
-                dimension = xform_index[dimension].first;
-              
-              dimension = stmt[*i].loop_level[dimension].payload;
-              
-              for (int i = 0; i < stmt.size(); i++) {
-                std::vector<std::pair<int, DependenceVector> > D;
-                for (DependenceGraph::EdgeList::iterator j =
-                       dep.vertex[i].second.begin();
-                     j != dep.vertex[i].second.end(); j++) {
-                  for (int k = 0; k < j->second.size(); k++) {
-                    DependenceVector dv = j->second[k];
-                    if ((dv.hasNegative(dimension)
-                         && !dv.quasi)
-                        || (dv.hasPositive(dimension)
-                            && dv.quasi))
-                      
-                      throw loop_error(
-                        "loop error: Split is illegal, dependence violation!");
-                    
-                  }
-                }
-              }
-              
-            }
-            
-          }
-          GEQ_Iterator gi1 = (*di3)->GEQs();
-          GEQ_Iterator gi2 = (*di4)->GEQs();
-          
-          for (; gi1 && gi2; gi++, gi2++) {
-            Constr_Vars_Iter cvi1(*gi1);
-            Constr_Vars_Iter cvi2(*gi2);
-            int dimension = (*cvi1).var->get_position();
-            int same = 0;
-            bool identical = false;
-            if (identical = !strcmp((*cvi1).var->char_name(),
-                                    (*cvi2).var->char_name())) {
-              
-              for (; cvi1 && cvi2; cvi1++, cvi2++) {
-                
-                if (((*cvi1).coef != (*cvi2).coef
-                     || (*gi1).get_const()
-                     != (*gi2).get_const())
-                    || (strcmp((*cvi1).var->char_name(),
-                               (*cvi2).var->char_name()))) {
-                  
-                  same++;
-                }
-              }
-            }
-            if ((same != 0) || !identical) {
-              dimension = dimension - 1;
-              
-              while (stmt[*i].loop_level[dimension].type
-                     == LoopLevelTile)
-                dimension = xform_index[dimension].first;
-              
-              dimension = stmt[*i].loop_level[dimension].payload;
-              
-              for (int i = 0; i < stmt.size(); i++) {
-                std::vector<std::pair<int, DependenceVector> > D;
-                for (DependenceGraph::EdgeList::iterator j =
-                       dep.vertex[i].second.begin();
-                     j != dep.vertex[i].second.end(); j++) {
-                  for (int k = 0; k < j->second.size(); k++) {
-                    DependenceVector dv = j->second[k];
-                    if ((dv.hasNegative(dimension)
-                         && !dv.quasi)
-                        || (dv.hasPositive(dimension)
-                            && dv.quasi))
-                      
-                      throw loop_error(
-                        "loop error: Split is illegal, dependence violation!");
-                    
-                  }
-                }
-              }
-              
-            }
-            
-          }
-          
-        }
-        
-      }
-      
-      stmt[*i].IS = part1;
-      
-      if (Intersection(copy(part2),
-                       Extend_Set(copy(this->known), n - this->known.n_set())).is_upper_bound_satisfiable()) {
-        Statement new_stmt;
-        new_stmt.code = stmt[*i].code->clone();
-        new_stmt.IS = part2;
-        new_stmt.xform = copy(stmt[*i].xform);
-        
-        new_stmt.loop_level = stmt[*i].loop_level;
-        stmt.push_back(new_stmt);
-        dep.insert();
-        what_stmt_num[*i] = stmt.size() - 1;
-        if (*i == stmt_num)
-          result.insert(stmt.size() - 1);
-        
-        stmt_nesting_level_.push_back(stmt_nesting_level[*i]);
-        std::pair<std::vector<DependenceVector>,
-          std::vector<DependenceVector> > dv =
-          test_data_dependences(ir_, stmt[*i].code, part1,
-                                stmt[*i].code, part2, freevar, index,
-                                stmt_nesting_level[*i],
-                                stmt_nesting_level[stmt.size() - 1]);
-        
-        int part1_to_part2 = 0;
-        int part2_to_part1 = 0;
-        
-        for (int k = 0; k < dv.first.size(); k++)
-          if (is_dependence_valid_based_on_lex_order(*i,
-                                                     what_stmt_num[*i], dv.first[k], true))
-            part1_to_part2++;
-          else
-            part2_to_part1++;
-        
-        if (part1_to_part2 > 0 && part2_to_part1 > 0)
-          throw loop_error(
-            "loop error: Aborting, split resulted in impossible dependence cycle!");
-        
-        for (int k = 0; k < dv.second.size(); k++)
-          if (is_dependence_valid_based_on_lex_order(
-                what_stmt_num[*i], *i, dv.second[k], false))
-            part2_to_part1++;
-        
-          else
-            part1_to_part2++;
-        
-        if (part1_to_part2 > 0 && part2_to_part1 > 0)
-          throw loop_error(
-            "loop error: Aborting, split resulted in impossible dependence cycle!");
-        bool temp_place_after;
-        if (part2_to_part1 > 0)
-          temp_place_after = false;
-        else
-          temp_place_after = true;
-        
-        if (i == same_loop.begin())
-          place_after = temp_place_after;
-        else {
-          if (temp_place_after != place_after)
-            throw loop_error(
-              "loop error: Aborting, split resulted in impossible dependence cycle!");
-          
-        }
-        
-        if (place_after)
-          assign_const(new_stmt.xform, dim - 1, cur_lex + 1);
-        else
-          assign_const(new_stmt.xform, dim - 1, cur_lex - 1);
-        
-      }
-      
-    }
-    // make adjacent lexical number available for new statements
-    if (place_after) {
-      lex[dim - 1] = cur_lex + 1;
-      shiftLexicalOrder(lex, dim - 1, 1);
-    } else {
-      lex[dim - 1] = cur_lex - 1;
-      shiftLexicalOrder(lex, dim - 1, -1);
-    }
-    // update dependence graph
-    int dep_dim = get_dep_dim_of(stmt_num, level);
-    for (int i = 0; i < old_num_stmt; i++) {
-      std::vector<std::pair<int, std::vector<DependenceVector> > > D;
-      
-      for (DependenceGraph::EdgeList::iterator j =
-             dep.vertex[i].second.begin();
-           j != dep.vertex[i].second.end(); j++) {
-        if (same_loop.find(i) != same_loop.end()) {
-          if (same_loop.find(j->first) != same_loop.end()) {
-            if (what_stmt_num.find(i) != what_stmt_num.end()
-                && what_stmt_num.find(j->first)
-                != what_stmt_num.end())
-              dep.connect(what_stmt_num[i],
-                          what_stmt_num[j->first], j->second);
-            if (place_after
-                && what_stmt_num.find(j->first)
-                != what_stmt_num.end()) {
-              std::vector<DependenceVector> dvs;
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.is_data_dependence() && dep_dim != -1) {
-                  dv.lbounds[dep_dim] = -posInfinity;
-                  dv.ubounds[dep_dim] = posInfinity;
-                }
-                dvs.push_back(dv);
-              }
-              if (dvs.size() > 0)
-                D.push_back(
-                  std::make_pair(what_stmt_num[j->first],
-                                 dvs));
-            } else if (!place_after
-                       && what_stmt_num.find(i)
-                       != what_stmt_num.end()) {
-              std::vector<DependenceVector> dvs;
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.is_data_dependence() && dep_dim != -1) {
-                  dv.lbounds[dep_dim] = -posInfinity;
-                  dv.ubounds[dep_dim] = posInfinity;
-                }
-                dvs.push_back(dv);
-              }
-              if (dvs.size() > 0)
-                dep.connect(what_stmt_num[i], j->first, dvs);
-              
-            }
-          } else {
-            if (what_stmt_num.find(i) != what_stmt_num.end())
-              dep.connect(what_stmt_num[i], j->first, j->second);
-          }
-        } else if (same_loop.find(j->first) != same_loop.end()) {
-          if (what_stmt_num.find(j->first) != what_stmt_num.end())
-            D.push_back(
-              std::make_pair(what_stmt_num[j->first],
-                             j->second));
-        }
-      }
-      
-      for (int j = 0; j < D.size(); j++)
-        dep.connect(i, D[j].first, D[j].second);
-    }
-    
-  }
-  
-  return result;
-}
-
-void Loop::tile(int stmt_num, int level, int tile_size, int outer_level,
-                TilingMethodType method, int alignment_offset, int alignment_multiple) {
-  // check for sanity of parameters
-  if (tile_size < 0)
-    throw std::invalid_argument("invalid tile size");
-  if (alignment_multiple < 1 || alignment_offset < 0)
-    throw std::invalid_argument("invalid alignment for tile");
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  if (level <= 0)
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  if (level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument(
-      "there is no loop level " + to_string(level) + " for statement "
-      + to_string(stmt_num));
-  if (outer_level <= 0 || outer_level > level)
-    throw std::invalid_argument(
-      "invalid tile controlling loop level "
-      + to_string(outer_level));
-  
-  int dim = 2 * level - 1;
-  int outer_dim = 2 * outer_level - 1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_tiled_loop = getStatements(lex, dim - 1);
-  std::set<int> same_tile_controlling_loop = getStatements(lex,
-                                                           outer_dim - 1);
-  
-  for (int i = 0; i < stmt.size(); i++) {
-    std::vector<std::pair<int, DependenceVector> > D;
-    for (DependenceGraph::EdgeList::iterator j =
-           dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();
-         j++) {
-      for (int k = 0; k < j->second.size(); k++) {
-        DependenceVector dv = j->second[k];
-        int dim2 = level - 1;
-        if ((dv.type != DEP_CONTROL) && (dv.type != DEP_UNKNOWN)) {
-          while (stmt[i].loop_level[dim2].type == LoopLevelTile) {
-            dim2 = stmt[i].loop_level[dim2].payload;
-          }
-          dim2 = stmt[i].loop_level[dim2].payload;
-          
-          if ((dv.hasNegative(dim2) && (!dv.quasi))
-              || (dv.quasi && dv.hasPositive(dim2))) {
-            for (int l = outer_level; l < level; l++)
-              if (stmt[i].loop_level[l - 1].type
-                  != LoopLevelTile) {
-                if (dv.isCarried(
-                      stmt[i].loop_level[l - 1].payload))
-                  throw loop_error(
-                    "loop error: Tiling is illegal, dependence violation!");
-              } else {
-                
-                int dim3 = l - 1;
-                while (stmt[i].loop_level[l - 1].type
-                       != LoopLevelTile) {
-                  dim3 = stmt[i].loop_level[l - 1].payload;
-                  
-                }
-                
-                dim3 = stmt[i].loop_level[l - 1].payload;
-                if (dim3 < level - 1)
-                  if (dv.isCarried(dim3))
-                    throw loop_error(
-                      "loop error: Tiling is illegal, dependence violation!");
-              }
-          }
-        }
-      }
-    }
-  }
-  // special case for no tiling
-  if (tile_size == 0) {
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-         i != same_tile_controlling_loop.end(); i++) {
-      Relation r(stmt[*i].xform.n_out(), stmt[*i].xform.n_out() + 2);
-      F_And *f_root = r.add_and();
-      for (int j = 1; j <= 2 * outer_level - 1; j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.input_var(j), 1);
-        h.update_coef(r.output_var(j), -1);
-      }
-      EQ_Handle h1 = f_root->add_EQ();
-      h1.update_coef(r.output_var(2 * outer_level), 1);
-      EQ_Handle h2 = f_root->add_EQ();
-      h2.update_coef(r.output_var(2 * outer_level + 1), 1);
-      for (int j = 2 * outer_level; j <= stmt[*i].xform.n_out(); j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.input_var(j), 1);
-        h.update_coef(r.output_var(j + 2), -1);
-      }
-      
-      stmt[*i].xform = Composition(copy(r), stmt[*i].xform);
-    }
-  }
-  // normal tiling
-  else {
-    std::set<int> private_stmt;
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-         i != same_tile_controlling_loop.end(); i++) {
-//     if (same_tiled_loop.find(*i) == same_tiled_loop.end() && !is_single_iteration(getNewIS(*i), dim))
-//       same_tiled_loop.insert(*i);
-      
-      // should test dim's value directly but it is ok for now
-//    if (same_tiled_loop.find(*i) == same_tiled_loop.end() && get_const(stmt[*i].xform, dim+1, Output_Var) == posInfinity)
-      if (same_tiled_loop.find(*i) == same_tiled_loop.end()
-          && overflow.find(*i) != overflow.end())
-        private_stmt.insert(*i);
-    }
-    
-    // extract the union of the iteration space to be considered
-    Relation hull;
-    {
-      Tuple < Relation > r_list;
-      Tuple<int> r_mask;
-      
-      for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-           i != same_tile_controlling_loop.end(); i++)
-        if (private_stmt.find(*i) == private_stmt.end()) {
-          Relation r = project_onto_levels(getNewIS(*i), dim + 1,
-                                           true);
-          for (int j = outer_dim; j < dim; j++)
-            r = Project(r, j + 1, Set_Var);
-          for (int j = 0; j < outer_dim; j += 2)
-            r = Project(r, j + 1, Set_Var);
-          r_list.append(r);
-          r_mask.append(1);
-        }
-      
-      hull = Hull(r_list, r_mask, 1, true);
-    }
-    
-    // extract the bound of the dimension to be tiled
-    Relation bound = get_loop_bound(hull, dim);
-    if (!bound.has_single_conjunct()) {
-      // further simplify the bound
-      hull = Approximate(hull);
-      bound = get_loop_bound(hull, dim);
-      
-      int i = outer_dim - 2;
-      while (!bound.has_single_conjunct() && i >= 0) {
-        hull = Project(hull, i + 1, Set_Var);
-        bound = get_loop_bound(hull, dim);
-        i -= 2;
-      }
-      
-      if (!bound.has_single_conjunct())
-        throw loop_error("cannot handle tile bounds");
-    }
-    
-    // separate lower and upper bounds
-    std::vector<GEQ_Handle> lb_list, ub_list;
-    {
-      Conjunct *c = bound.query_DNF()->single_conjunct();
-      for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-        int coef = (*gi).get_coef(bound.set_var(dim + 1));
-        if (coef < 0)
-          ub_list.push_back(*gi);
-        else if (coef > 0)
-          lb_list.push_back(*gi);
-      }
-    }
-    if (lb_list.size() == 0)
-      throw loop_error(
-        "unable to calculate tile controlling loop lower bound");
-    if (ub_list.size() == 0)
-      throw loop_error(
-        "unable to calculate tile controlling loop upper bound");
-    
-    // find the simplest lower bound for StridedTile or simplest iteration count for CountedTile
-    int simplest_lb = 0, simplest_ub = 0;
-    if (method == StridedTile) {
-      int best_cost = INT_MAX;
-      for (int i = 0; i < lb_list.size(); i++) {
-        int cost = 0;
-        for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
-          switch ((*ci).var->kind()) {
-          case Input_Var: {
-            cost += 5;
-            break;
-          }
-          case Global_Var: {
-            cost += 2;
-            break;
-          }
-          default:
-            cost += 15;
-            break;
-          }
-        }
-        
-        if (cost < best_cost) {
-          best_cost = cost;
-          simplest_lb = i;
-        }
-      }
-    } else if (method == CountedTile) {
-      std::map<Variable_ID, coef_t> s1, s2, s3;
-      int best_cost = INT_MAX;
-      for (int i = 0; i < lb_list.size(); i++)
-        for (int j = 0; j < ub_list.size(); j++) {
-          int cost = 0;
-          
-          for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              s1[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Global_Var: {
-              s2[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Exists_Var:
-            case Wildcard_Var: {
-              s3[(*ci).var] += (*ci).coef;
-              break;
-            }
-            default:
-              cost = INT_MAX - 2;
-              break;
-            }
-          }
-          
-          for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              s1[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Global_Var: {
-              s2[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Exists_Var:
-            case Wildcard_Var: {
-              s3[(*ci).var] += (*ci).coef;
-              break;
-            }
-            default:
-              if (cost == INT_MAX - 2)
-                cost = INT_MAX - 1;
-              else
-                cost = INT_MAX - 3;
-              break;
-            }
-          }
-          
-          if (cost == 0) {
-            for (std::map<Variable_ID, coef_t>::iterator k =
-                   s1.begin(); k != s1.end(); k++)
-              if ((*k).second != 0)
-                cost += 5;
-            for (std::map<Variable_ID, coef_t>::iterator k =
-                   s2.begin(); k != s2.end(); k++)
-              if ((*k).second != 0)
-                cost += 2;
-            for (std::map<Variable_ID, coef_t>::iterator k =
-                   s3.begin(); k != s3.end(); k++)
-              if ((*k).second != 0)
-                cost += 15;
-          }
-          
-          if (cost < best_cost) {
-            best_cost = cost;
-            simplest_lb = i;
-            simplest_ub = j;
-          }
-        }
-    }
-    
-    // prepare the new transformation relations
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-         i != same_tile_controlling_loop.end(); i++) {
-      Relation r(stmt[*i].xform.n_out(), stmt[*i].xform.n_out() + 2);
-      F_And *f_root = r.add_and();
-      for (int j = 0; j < outer_dim - 1; j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.output_var(j + 1), 1);
-        h.update_coef(r.input_var(j + 1), -1);
-      }
-      
-      for (int j = outer_dim - 1; j < stmt[*i].xform.n_out(); j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.output_var(j + 3), 1);
-        h.update_coef(r.input_var(j + 1), -1);
-      }
-      
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(r.output_var(outer_dim), 1);
-      h.update_const(-lex[outer_dim - 1]);
-      
-      stmt[*i].xform = Composition(r, stmt[*i].xform);
-    }
-    
-    // add tiling constraints.
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-         i != same_tile_controlling_loop.end(); i++) {
-      F_And *f_super_root = stmt[*i].xform.and_with_and();
-      F_Exists *f_exists = f_super_root->add_exists();
-      F_And *f_root = f_exists->add_and();
-      
-      // create a lower bound variable for easy formula creation later
-      Variable_ID aligned_lb;
-      {
-        Variable_ID lb = f_exists->declare();
-        coef_t coef = lb_list[simplest_lb].get_coef(
-          bound.set_var(dim + 1));
-        if (coef == 1) { // e.g. if i >= m+5, then LB = m+5
-          EQ_Handle h = f_root->add_EQ();
-          h.update_coef(lb, 1);
-          for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos != dim + 1)
-                h.update_coef(stmt[*i].xform.output_var(pos),
-                              (*ci).coef);
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g,
-                                             (*ci).var->function_of());
-              h.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h.update_const(lb_list[simplest_lb].get_const());
-        } else { // e.g. if 2i >= m+5, then m+5 <= 2*LB < m+5+2
-          GEQ_Handle h1 = f_root->add_GEQ();
-          GEQ_Handle h2 = f_root->add_GEQ();
-          for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos == dim + 1) {
-                h1.update_coef(lb, (*ci).coef);
-                h2.update_coef(lb, -(*ci).coef);
-              } else {
-                h1.update_coef(stmt[*i].xform.output_var(pos),
-                               (*ci).coef);
-                h2.update_coef(stmt[*i].xform.output_var(pos),
-                               -(*ci).coef);
-              }
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g,
-                                             (*ci).var->function_of());
-              h1.update_coef(v, (*ci).coef);
-              h2.update_coef(v, -(*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h1.update_const(lb_list[simplest_lb].get_const());
-          h2.update_const(-lb_list[simplest_lb].get_const());
-          h2.update_const(coef - 1);
-        }
-        
-        Variable_ID offset_lb;
-        if (alignment_offset == 0)
-          offset_lb = lb;
-        else {
-          EQ_Handle h = f_root->add_EQ();
-          offset_lb = f_exists->declare();
-          h.update_coef(offset_lb, 1);
-          h.update_coef(lb, -1);
-          h.update_const(alignment_offset);
-        }
-        
-        if (alignment_multiple == 1) { // trivial
-          aligned_lb = offset_lb;
-        } else { // e.g. to align at 4, aligned_lb = 4*alpha && LB-4 < 4*alpha <= LB
-          aligned_lb = f_exists->declare();
-          Variable_ID e = f_exists->declare();
-          
-          EQ_Handle h = f_root->add_EQ();
-          h.update_coef(aligned_lb, 1);
-          h.update_coef(e, -alignment_multiple);
-          
-          GEQ_Handle h1 = f_root->add_GEQ();
-          GEQ_Handle h2 = f_root->add_GEQ();
-          h1.update_coef(e, alignment_multiple);
-          h2.update_coef(e, -alignment_multiple);
-          h1.update_coef(offset_lb, -1);
-          h2.update_coef(offset_lb, 1);
-          h1.update_const(alignment_multiple - 1);
-        }
-      }
-      
-      // create an upper bound variable for easy formula creation later
-      Variable_ID ub = f_exists->declare();
-      {
-        coef_t coef = -ub_list[simplest_ub].get_coef(
-          bound.set_var(dim + 1));
-        if (coef == 1) { // e.g. if i <= m+5, then UB = m+5
-          EQ_Handle h = f_root->add_EQ();
-          h.update_coef(ub, -1);
-          for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos != dim + 1)
-                h.update_coef(stmt[*i].xform.output_var(pos),
-                              (*ci).coef);
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g,
-                                             (*ci).var->function_of());
-              h.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h.update_const(ub_list[simplest_ub].get_const());
-        } else { // e.g. if 2i <= m+5, then m+5-2 < 2*UB <= m+5
-          GEQ_Handle h1 = f_root->add_GEQ();
-          GEQ_Handle h2 = f_root->add_GEQ();
-          for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos == dim + 1) {
-                h1.update_coef(ub, -(*ci).coef);
-                h2.update_coef(ub, (*ci).coef);
-              } else {
-                h1.update_coef(stmt[*i].xform.output_var(pos),
-                               -(*ci).coef);
-                h2.update_coef(stmt[*i].xform.output_var(pos),
-                               (*ci).coef);
-              }
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g,
-                                             (*ci).var->function_of());
-              h1.update_coef(v, -(*ci).coef);
-              h2.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h1.update_const(-ub_list[simplest_ub].get_const());
-          h2.update_const(ub_list[simplest_ub].get_const());
-          h1.update_const(coef - 1);
-        }
-      }
-      
-      // insert tile controlling loop constraints
-      if (method == StridedTile) { // e.g. ii = LB + 32 * alpha && alpha >= 0
-        Variable_ID e = f_exists->declare();
-        GEQ_Handle h1 = f_root->add_GEQ();
-        h1.update_coef(e, 1);
-        
-        EQ_Handle h2 = f_root->add_EQ();
-        h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1), 1);
-        h2.update_coef(e, -tile_size);
-        h2.update_coef(aligned_lb, -1);
-      } else if (method == CountedTile) { // e.g. 0 <= ii < ceiling((UB-LB+1)/32)
-        GEQ_Handle h1 = f_root->add_GEQ();
-        h1.update_coef(stmt[*i].xform.output_var(outer_dim + 1), 1);
-        
-        GEQ_Handle h2 = f_root->add_GEQ();
-        h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1),
-                       -tile_size);
-        h2.update_coef(aligned_lb, -1);
-        h2.update_coef(ub, 1);
-      }
-      
-      // special care for private statements like overflow assignment
-      if (private_stmt.find(*i) != private_stmt.end()) { // e.g. ii <= UB
-        GEQ_Handle h = f_root->add_GEQ();
-        h.update_coef(stmt[*i].xform.output_var(outer_dim + 1), -1);
-        h.update_coef(ub, 1);
-      }
-      // if (private_stmt.find(*i) != private_stmt.end()) {
-      //   if (stmt[*i].xform.n_out() > dim+3) { // e.g. ii <= UB && i = ii
-      //     GEQ_Handle h = f_root->add_GEQ();
-      //     h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
-      //     h.update_coef(ub, 1);
-      
-      //     stmt[*i].xform = Project(stmt[*i].xform, dim+3, Output_Var);
-      //     f_root = stmt[*i].xform.and_with_and();
-      //     EQ_Handle h1 = f_root->add_EQ();
-      //     h1.update_coef(stmt[*i].xform.output_var(dim+3), 1);
-      //     h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
-      //   }
-      //   else if (method == StridedTile) { // e.g. ii <= UB since i does not exist
-      //     GEQ_Handle h = f_root->add_GEQ();
-      //     h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
-      //     h.update_coef(ub, 1);
-      //   }
-      // }
-      
-      // restrict original loop index inside the tile
-      else {
-        if (method == StridedTile) { // e.g. ii <= i < ii + tile_size
-          GEQ_Handle h1 = f_root->add_GEQ();
-          h1.update_coef(stmt[*i].xform.output_var(dim + 3), 1);
-          h1.update_coef(stmt[*i].xform.output_var(outer_dim + 1),
-                         -1);
-          
-          GEQ_Handle h2 = f_root->add_GEQ();
-          h2.update_coef(stmt[*i].xform.output_var(dim + 3), -1);
-          h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1), 1);
-          h2.update_const(tile_size - 1);
-        } else if (method == CountedTile) { // e.g. LB+32*ii <= i < LB+32*ii+tile_size
-          GEQ_Handle h1 = f_root->add_GEQ();
-          h1.update_coef(stmt[*i].xform.output_var(outer_dim + 1),
-                         -tile_size);
-          h1.update_coef(stmt[*i].xform.output_var(dim + 3), 1);
-          h1.update_coef(aligned_lb, -1);
-          
-          GEQ_Handle h2 = f_root->add_GEQ();
-          h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1),
-                         tile_size);
-          h2.update_coef(stmt[*i].xform.output_var(dim + 3), -1);
-          h2.update_const(tile_size - 1);
-          h2.update_coef(aligned_lb, 1);
-        }
-      }
-    }
-  }
-  
-  // update loop level information
-  for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-       i != same_tile_controlling_loop.end(); i++) {
-    for (int j = 1; j <= stmt[*i].loop_level.size(); j++)
-      switch (stmt[*i].loop_level[j - 1].type) {
-      case LoopLevelOriginal:
-        break;
-      case LoopLevelTile:
-        if (stmt[*i].loop_level[j - 1].payload >= outer_level)
-          stmt[*i].loop_level[j - 1].payload++;
-        break;
-      default:
-        throw loop_error(
-          "unknown loop level type for statement "
-          + to_string(*i));
-      }
-    
-    LoopLevel ll;
-    ll.type = LoopLevelTile;
-    ll.payload = level + 1;
-    ll.parallel_level = 0;
-    stmt[*i].loop_level.insert(
-      stmt[*i].loop_level.begin() + (outer_level - 1), ll);
-  }
-}
-
-std::set<int> Loop::unroll(int stmt_num, int level, int unroll_amount) {
-  // check for sanity of parameters
-  if (unroll_amount < 0)
-    throw std::invalid_argument(
-      "invalid unroll amount " + to_string(unroll_amount));
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  
-  int dim = 2 * level - 1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_loop = getStatements(lex, dim - 1);
-  
-  // nothing to do
-  if (unroll_amount == 1)
-    return std::set<int>();
-  
-  for (int i = 0; i < stmt.size(); i++) {
-    std::vector<std::pair<int, DependenceVector> > D;
-    for (DependenceGraph::EdgeList::iterator j =
-           dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();
-         j++) {
-      for (int k = 0; k < j->second.size(); k++) {
-        DependenceVector dv = j->second[k];
-        int dim2 = level - 1;
-        if ((dv.type != DEP_CONTROL) && (dv.type != DEP_UNKNOWN)) {
-          
-          while (stmt[i].loop_level[dim2].type == LoopLevelTile) {
-            dim2 = xform_index[dim2].first;
-          }
-          dim2 = stmt[i].loop_level[dim2].payload;
-          
-          if (dv.isCarried(dim2)
-              && (dv.hasNegative(dim2) && !dv.quasi))
-            throw loop_error(
-              "loop error: Unrolling is illegal, dependence violation!");
-          
-          if (dv.isCarried(dim2)
-              && (dv.hasPositive(dim2) && dv.quasi))
-            throw loop_error(
-              "loop error: Unrolling is illegal, dependence violation!");
-          bool safe = false;
-          
-          if (dv.isCarried(dim2)) {
-            
-            if (!dv.quasi) {
-              if (dv.lbounds[dim2] != posInfinity) {
-                if (dv.lbounds[dim2] != negInfinity)
-                  if (dv.lbounds[dim2] > unroll_amount)
-                    safe = true;
-              } else
-                safe = true;
-            } else {
-              if (dv.ubounds[dim2] != negInfinity) {
-                if (dv.ubounds[dim2] != posInfinity)
-                  if ((-(dv.ubounds[dim2])) > unroll_amount)
-                    safe = true;
-              } else
-                safe = true;
-            }
-            
-            if (!safe) {
-              for (int l = level; l <= (n - 1) / 2; l++) {
-                int dim3 = l - 1;
-                
-                if (stmt[i].loop_level[dim3].type
-                    != LoopLevelTile)
-                  dim3 = stmt[i].loop_level[dim3].payload;
-                else {
-                  while (stmt[i].loop_level[dim2].type
-                         == LoopLevelTile) {
-                    dim3 = stmt[i].loop_level[dim3].payload;
-                  }
-                  dim3 = stmt[i].loop_level[dim3].payload;
-                }
-                
-                if (dim3 > dim2) {
-                  if ((dv.hasPositive(dim3) && !dv.quasi)
-                      || (dv.hasNegative(dim3) && dv.quasi))
-                    break;
-                  else if ((dv.hasNegative(dim3) && !dv.quasi)
-                           || (dv.hasPositive(dim3) && dv.quasi))
-                    throw loop_error(
-                      "loop error: Unrolling is illegal, dependence violation!");
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  
-  // extract the intersection of the iteration space to be considered
-  Relation hull = Relation::True(level);
-  apply_xform(same_loop);
-  for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end();
-       i++) {
-    if (stmt[*i].IS.is_upper_bound_satisfiable()) {
-      Relation mapping(stmt[*i].IS.n_set(), level);
-      F_And *f_root = mapping.add_and();
-      for (int j = 1; j <= level; j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(mapping.input_var(j), 1);
-        h.update_coef(mapping.output_var(j), -1);
-      }
-      hull = Intersection(hull,
-                          Range(Restrict_Domain(mapping, copy(stmt[*i].IS))));
-      hull.simplify(2, 4);
-    }
-  }
-  for (int i = 1; i <= level; i++) {
-    std::string name = tmp_loop_var_name_prefix + to_string(i);
-    hull.name_set_var(i, name);
-  }
-  hull.setup_names();
-  
-  // extract the exact loop bound of the dimension to be unrolled
-  if (is_single_loop_iteration(hull, level, this->known))
-    return std::set<int>();
-  Relation bound = get_loop_bound(hull, level, this->known);
-  if (!bound.has_single_conjunct() || !bound.is_satisfiable()
-      || bound.is_tautology())
-    throw loop_error("unable to extract loop bound for unrolling");
-  
-  // extract the loop stride
-  EQ_Handle stride_eq;
-  int stride = 1;
-  {
-    bool simple_stride = true;
-    int strides = countStrides(bound.query_DNF()->single_conjunct(),
-                               bound.set_var(level), stride_eq, simple_stride);
-    if (strides > 1)
-      throw loop_error("too many strides");
-    else if (strides == 1) {
-      int sign = stride_eq.get_coef(bound.set_var(level));
-      Constr_Vars_Iter it(stride_eq, true);
-      stride = abs((*it).coef / sign);
-    }
-  }
-  
-  // separate lower and upper bounds
-  std::vector<GEQ_Handle> lb_list, ub_list;
-  {
-    Conjunct *c = bound.query_DNF()->single_conjunct();
-    for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-      int coef = (*gi).get_coef(bound.set_var(level));
-      if (coef < 0)
-        ub_list.push_back(*gi);
-      else if (coef > 0)
-        lb_list.push_back(*gi);
-    }
-  }
-  
-  // simplify overflow expression for each pair of upper and lower bounds
-  std::vector<std::vector<std::map<Variable_ID, int> > > overflow_table(
-    lb_list.size(),
-    std::vector<std::map<Variable_ID, int> >(ub_list.size(),
-                                             std::map<Variable_ID, int>()));
-  bool is_overflow_simplifiable = true;
-  for (int i = 0; i < lb_list.size(); i++) {
-    if (!is_overflow_simplifiable)
-      break;
-    
-    for (int j = 0; j < ub_list.size(); j++) {
-      // lower bound or upper bound has non-unit coefficient, can't simplify
-      if (ub_list[j].get_coef(bound.set_var(level)) != -1
-          || lb_list[i].get_coef(bound.set_var(level)) != 1) {
-        is_overflow_simplifiable = false;
-        break;
-      }
-      
-      for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) {
-        switch ((*ci).var->kind()) {
-        case Input_Var: {
-          if ((*ci).var != bound.set_var(level))
-            overflow_table[i][j][(*ci).var] += (*ci).coef;
-          
-          break;
-        }
-        case Global_Var: {
-          Global_Var_ID g = (*ci).var->get_global_var();
-          Variable_ID v;
-          if (g->arity() == 0)
-            v = bound.get_local(g);
-          else
-            v = bound.get_local(g, (*ci).var->function_of());
-          overflow_table[i][j][(*ci).var] += (*ci).coef;
-          break;
-        }
-        default:
-          throw loop_error("failed to calculate overflow amount");
-        }
-      }
-      overflow_table[i][j][NULL] += ub_list[j].get_const();
-      
-      for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
-        switch ((*ci).var->kind()) {
-        case Input_Var: {
-          if ((*ci).var != bound.set_var(level)) {
-            overflow_table[i][j][(*ci).var] += (*ci).coef;
-            if (overflow_table[i][j][(*ci).var] == 0)
-              overflow_table[i][j].erase(
-                overflow_table[i][j].find((*ci).var));
-          }
-          break;
-        }
-        case Global_Var: {
-          Global_Var_ID g = (*ci).var->get_global_var();
-          Variable_ID v;
-          if (g->arity() == 0)
-            v = bound.get_local(g);
-          else
-            v = bound.get_local(g, (*ci).var->function_of());
-          overflow_table[i][j][(*ci).var] += (*ci).coef;
-          if (overflow_table[i][j][(*ci).var] == 0)
-            overflow_table[i][j].erase(
-              overflow_table[i][j].find((*ci).var));
-          break;
-        }
-        default:
-          throw loop_error("failed to calculate overflow amount");
-        }
-      }
-      overflow_table[i][j][NULL] += lb_list[i].get_const();
-      
-      overflow_table[i][j][NULL] += stride;
-      if (unroll_amount == 0
-          || (overflow_table[i][j].size() == 1
-              && overflow_table[i][j][NULL] / stride
-              < unroll_amount))
-        unroll_amount = overflow_table[i][j][NULL] / stride;
-    }
-  }
-  
-  // loop iteration count can't be determined, bail out gracefully
-  if (unroll_amount == 0)
-    return std::set<int>();
-  
-  // further simply overflow calculation using coefficients' modular
-  if (is_overflow_simplifiable) {
-    for (int i = 0; i < lb_list.size(); i++)
-      for (int j = 0; j < ub_list.size(); j++)
-        if (stride == 1) {
-          for (std::map<Variable_ID, int>::iterator k =
-                 overflow_table[i][j].begin();
-               k != overflow_table[i][j].end();)
-            if ((*k).first != NULL) {
-              int t = int_mod_hat((*k).second, unroll_amount);
-              if (t == 0) {
-                overflow_table[i][j].erase(k++);
-              } else {
-                int t2 = hull.query_variable_mod((*k).first,
-                                                 unroll_amount);
-                if (t2 != INT_MAX) {
-                  overflow_table[i][j][NULL] += t * t2;
-                  overflow_table[i][j].erase(k++);
-                } else {
-                  (*k).second = t;
-                  k++;
-                }
-              }
-            } else
-              k++;
-          
-          overflow_table[i][j][NULL] = int_mod_hat(
-            overflow_table[i][j][NULL], unroll_amount);
-          
-          // Since we don't have MODULO instruction in SUIF yet (only MOD), make all coef positive in the final formula
-          for (std::map<Variable_ID, int>::iterator k =
-                 overflow_table[i][j].begin();
-               k != overflow_table[i][j].end(); k++)
-            if ((*k).second < 0)
-              (*k).second += unroll_amount;
-        }
-  }
-  
-  // build overflow statement
-  CG_outputBuilder *ocg = ir->builder();
-  CG_outputRepr *overflow_code = NULL;
-  Relation cond_upper(level), cond_lower(level);
-  Relation overflow_constraint(0);
-  F_And *overflow_constraint_root = overflow_constraint.add_and();
-  std::vector<Free_Var_Decl *> over_var_list;
-  if (is_overflow_simplifiable && lb_list.size() == 1) {
-    for (int i = 0; i < ub_list.size(); i++) {
-      if (overflow_table[0][i].size() == 1) {
-        // upper splitting condition
-        GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-        h.update_const(
-          ((overflow_table[0][i][NULL] / stride) % unroll_amount)
-          * -stride);
-      } else {
-        // upper splitting condition
-        std::string over_name = overflow_var_name_prefix
-          + to_string(overflow_var_name_counter++);
-        Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-        over_var_list.push_back(over_free_var);
-        GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-        h.update_coef(cond_upper.get_local(over_free_var), -stride);
-        
-        // insert constraint 0 <= overflow < unroll_amount
-        Variable_ID v = overflow_constraint.get_local(over_free_var);
-        GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-        h1.update_coef(v, 1);
-        GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-        h2.update_coef(v, -1);
-        h2.update_const(unroll_amount - 1);
-        
-        // create overflow assignment
-        bound.setup_names();
-        CG_outputRepr *rhs = NULL;
-        for (std::map<Variable_ID, int>::iterator j =
-               overflow_table[0][i].begin();
-             j != overflow_table[0][i].end(); j++)
-          if ((*j).first != NULL) {
-            CG_outputRepr *t = ocg->CreateIdent((*j).first->name());
-            if ((*j).second != 1)
-              t = ocg->CreateTimes(ocg->CreateInt((*j).second),
-                                   t);
-            rhs = ocg->CreatePlus(rhs, t);
-          } else if ((*j).second != 0)
-            rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second));
-        
-        if (stride != 1)
-          rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride));
-        rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-        
-        CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-        init_code = ocg->StmtListAppend(init_code,
-                                        ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-        lhs = ocg->CreateIdent(over_name);
-        overflow_code = ocg->StmtListAppend(overflow_code,
-                                            ocg->CreateAssignment(0, lhs, rhs));
-      }
-    }
-    
-    // lower splitting condition
-    GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[0]);
-  } else if (is_overflow_simplifiable && ub_list.size() == 1) {
-    for (int i = 0; i < lb_list.size(); i++) {
-      
-      if (overflow_table[i][0].size() == 1) {
-        // lower splitting condition
-        GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]);
-        h.update_const(overflow_table[i][0][NULL] * -stride);
-      } else {
-        // lower splitting condition
-        std::string over_name = overflow_var_name_prefix
-          + to_string(overflow_var_name_counter++);
-        Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-        over_var_list.push_back(over_free_var);
-        GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]);
-        h.update_coef(cond_lower.get_local(over_free_var), -stride);
-        
-        // insert constraint 0 <= overflow < unroll_amount
-        Variable_ID v = overflow_constraint.get_local(over_free_var);
-        GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-        h1.update_coef(v, 1);
-        GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-        h2.update_coef(v, -1);
-        h2.update_const(unroll_amount - 1);
-        
-        // create overflow assignment
-        bound.setup_names();
-        CG_outputRepr *rhs = NULL;
-        for (std::map<Variable_ID, int>::iterator j =
-               overflow_table[0][i].begin();
-             j != overflow_table[0][i].end(); j++)
-          if ((*j).first != NULL) {
-            CG_outputRepr *t = ocg->CreateIdent((*j).first->name());
-            if ((*j).second != 1)
-              t = ocg->CreateTimes(ocg->CreateInt((*j).second),
-                                   t);
-            rhs = ocg->CreatePlus(rhs, t);
-          } else if ((*j).second != 0)
-            rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second));
-        
-        if (stride != 1)
-          rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride));
-        rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-        
-        CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-        init_code = ocg->StmtListAppend(init_code,
-                                        ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-        lhs = ocg->CreateIdent(over_name);
-        overflow_code = ocg->StmtListAppend(overflow_code,
-                                            ocg->CreateAssignment(0, lhs, rhs));
-      }
-    }
-    
-    // upper splitting condition
-    GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[0]);
-  } else {
-    std::string over_name = overflow_var_name_prefix
-      + to_string(overflow_var_name_counter++);
-    Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-    over_var_list.push_back(over_free_var);
-    
-    Tuple<CG_outputRepr *> lb_repr_list, ub_repr_list;
-    for (int i = 0; i < lb_list.size(); i++) {
-      //lb_repr_list.append(outputLBasRepr(ocg, lb_list[i], bound, bound.set_var(dim+1), stride, stride_eq, Relation::True(bound.n_set()), std::vector<CG_outputRepr *>(bound.n_set(), NULL)));
-      lb_repr_list.append(
-        outputLBasRepr(ocg, lb_list[i], bound,
-                       bound.set_var(dim + 1), stride, stride_eq,
-                       Relation::True(bound.n_set()),
-                       std::vector<CG_outputRepr *>(bound.n_set())));
-      GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]);
-    }
-    for (int i = 0; i < ub_list.size(); i++) {
-      //ub_repr_list.append(outputUBasRepr(ocg, ub_list[i], bound, bound.set_var(dim+1), stride, stride_eq, std::vector<CG_outputRepr *>(bound.n_set(), NULL)));
-      ub_repr_list.append(
-        outputUBasRepr(ocg, ub_list[i], bound,
-                       bound.set_var(dim + 1), stride, stride_eq,
-                       std::vector<CG_outputRepr *>(bound.n_set())));
-      GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-      h.update_coef(cond_upper.get_local(over_free_var), -stride);
-    }
-    
-    CG_outputRepr *lbRepr, *ubRepr;
-    if (lb_repr_list.size() > 1)
-      lbRepr = ocg->CreateInvoke("max", lb_repr_list);
-    else if (lb_repr_list.size() == 1)
-      lbRepr = lb_repr_list[1];
-    
-    if (ub_repr_list.size() > 1)
-      ubRepr = ocg->CreateInvoke("min", ub_repr_list);
-    else if (ub_repr_list.size() == 1)
-      ubRepr = ub_repr_list[1];
-    
-    // create overflow assignment
-    bound.setup_names();
-    CG_outputRepr *rhs = ocg->CreatePlus(ocg->CreateMinus(ubRepr, lbRepr),
-                                         ocg->CreateInt(1));
-    if (stride != 1)
-      rhs = ocg->CreateIntegerDivide(rhs, ocg->CreateInt(stride));
-    rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-    CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-    init_code = ocg->StmtListAppend(init_code,
-                                    ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-    lhs = ocg->CreateIdent(over_name);
-    overflow_code = ocg->CreateAssignment(0, lhs, rhs);
-    
-    // insert constraint 0 <= overflow < unroll_amount
-    Variable_ID v = overflow_constraint.get_local(over_free_var);
-    GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-    h1.update_coef(v, 1);
-    GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-    h2.update_coef(v, -1);
-    h2.update_const(unroll_amount - 1);
-  }
-  
-  // insert overflow statement
-  int overflow_stmt_num = -1;
-  if (overflow_code != NULL) {
-    // build iteration space for overflow statement
-    Relation mapping(level, level - 1);
-    F_And *f_root = mapping.add_and();
-    for (int i = 1; i < level; i++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(i), 1);
-      h.update_coef(mapping.input_var(i), -1);
-    }
-    Relation overflow_IS = Range(Restrict_Domain(mapping, copy(hull)));
-    for (int i = 1; i < level; i++)
-      overflow_IS.name_set_var(i, hull.set_var(i)->name());
-    overflow_IS.setup_names();
-    
-    // build dumb transformation relation for overflow statement
-    Relation overflow_xform(level - 1, 2 * (level - 1) + 1);
-    f_root = overflow_xform.add_and();
-    for (int i = 1; i <= level - 1; i++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(overflow_xform.output_var(2 * i), 1);
-      h.update_coef(overflow_xform.input_var(i), -1);
-      
-      h = f_root->add_EQ();
-      h.update_coef(overflow_xform.output_var(2 * i - 1), 1);
-      h.update_const(-lex[2 * i - 2]);
-    }
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(overflow_xform.output_var(2 * (level - 1) + 1), 1);
-    h.update_const(-lex[2 * (level - 1)]);
-    
-    shiftLexicalOrder(lex, dim - 1, 1);
-    Statement overflow_stmt;
-    overflow_stmt.code = overflow_code;
-    overflow_stmt.IS = overflow_IS;
-    overflow_stmt.xform = overflow_xform;
-    overflow_stmt.loop_level = std::vector<LoopLevel>(level - 1);
-    for (int i = 0; i < level - 1; i++) {
-      overflow_stmt.loop_level[i].type =
-        stmt[stmt_num].loop_level[i].type;
-      if (stmt[stmt_num].loop_level[i].type == LoopLevelTile
-          && stmt[stmt_num].loop_level[i].payload >= level)
-        overflow_stmt.loop_level[i].payload = -1;
-      else
-        overflow_stmt.loop_level[i].payload =
-          stmt[stmt_num].loop_level[i].payload;
-      overflow_stmt.loop_level[i].parallel_level =
-        stmt[stmt_num].loop_level[i].parallel_level;
-    }
-    stmt.push_back(overflow_stmt);
-    dep.insert();
-    overflow_stmt_num = stmt.size() - 1;
-    overflow[overflow_stmt_num] = over_var_list;
-    
-    // update the global known information on overflow variable
-    this->known = Intersection(this->known,
-                               Extend_Set(copy(overflow_constraint),
-                                          this->known.n_set() - overflow_constraint.n_set()));
-    
-    // update dependence graph
-    DependenceVector dv;
-    dv.type = DEP_CONTROL;
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++)
-      dep.connect(overflow_stmt_num, *i, dv);
-    dv.type = DEP_W2W;
-    {
-      IR_ScalarSymbol *overflow_sym = NULL;
-      std::vector<IR_ScalarRef *> scalars = ir->FindScalarRef(
-        overflow_code);
-      for (int i = scalars.size() - 1; i >= 0; i--)
-        if (scalars[i]->is_write()) {
-          overflow_sym = scalars[i]->symbol();
-          break;
-        }
-      for (int i = scalars.size() - 1; i >= 0; i--)
-        delete scalars[i];
-      dv.sym = overflow_sym;
-    }
-    dv.lbounds = std::vector<coef_t>(num_dep_dim, 0);
-    dv.ubounds = std::vector<coef_t>(num_dep_dim, 0);
-    int dep_dim = get_last_dep_dim_before(stmt_num, level);
-    for (int i = dep_dim + 1; i < num_dep_dim; i++) {
-      dv.lbounds[i] = -posInfinity;
-      dv.ubounds[i] = posInfinity;
-    }
-    for (int i = 0; i <= dep_dim; i++) {
-      if (i != 0) {
-        dv.lbounds[i - 1] = 0;
-        dv.ubounds[i - 1] = 0;
-      }
-      dv.lbounds[i] = 1;
-      dv.ubounds[i] = posInfinity;
-      dep.connect(overflow_stmt_num, overflow_stmt_num, dv);
-    }
-  }
-  
-  // split the loop so it can be fully unrolled
-  std::set<int> result = split(stmt_num, level, cond_upper);
-  std::set<int> result2 = split(stmt_num, level, cond_lower);
-  for (std::set<int>::iterator i = result2.begin(); i != result2.end(); i++)
-    result.insert(*i);
-  
-  // check if unrolled statements can be trivially lumped together as one statement
-  bool can_be_lumped = true;
-  if (can_be_lumped) {
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++)
-      if (*i != stmt_num) {
-        if (stmt[*i].loop_level.size()
-            != stmt[stmt_num].loop_level.size()) {
-          can_be_lumped = false;
-          break;
-        }
-        for (int j = 0; j < stmt[stmt_num].loop_level.size(); j++)
-          if (!(stmt[*i].loop_level[j].type
-                == stmt[stmt_num].loop_level[j].type
-                && stmt[*i].loop_level[j].payload
-                == stmt[stmt_num].loop_level[j].payload)) {
-            can_be_lumped = false;
-            break;
-          }
-        if (!can_be_lumped)
-          break;
-        std::vector<int> lex2 = getLexicalOrder(*i);
-        for (int j = 2 * level; j < lex.size() - 1; j += 2)
-          if (lex[j] != lex2[j]) {
-            can_be_lumped = false;
-            break;
-          }
-        if (!can_be_lumped)
-          break;
-      }
-  }
-  if (can_be_lumped) {
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++)
-      if (is_inner_loop_depend_on_level(stmt[*i].IS, level, known)) {
-        can_be_lumped = false;
-        break;
-      }
-  }
-  if (can_be_lumped) {
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++)
-      if (*i != stmt_num) {
-        if (!(Must_Be_Subset(copy(stmt[*i].IS), copy(stmt[stmt_num].IS))
-              && Must_Be_Subset(copy(stmt[stmt_num].IS),
-                                copy(stmt[*i].IS)))) {
-          can_be_lumped = false;
-          break;
-        }
-      }
-  }
-  if (can_be_lumped) {
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++) {
-      for (DependenceGraph::EdgeList::iterator j =
-             dep.vertex[*i].second.begin();
-           j != dep.vertex[*i].second.end(); j++)
-        if (same_loop.find(j->first) != same_loop.end()) {
-          for (int k = 0; k < j->second.size(); k++)
-            if (j->second[k].type == DEP_CONTROL
-                || j->second[k].type == DEP_UNKNOWN) {
-              can_be_lumped = false;
-              break;
-            }
-          if (!can_be_lumped)
-            break;
-        }
-      if (!can_be_lumped)
-        break;
-    }
-  }
-  
-  // add strides to original statements
-  // for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-  //   add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride);
-  
-  // std::vector<Free_Var_Decl *> depending_overflow_var;
-  // for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-  //   add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride);
-  //   if (overflow.find(*i) != overflow.end()) {
-  //     // TO DO: It should check whether overflow vaiable depends on
-  //     // this loop index and by how much.  This step is important if
-  //     // you want to unroll loops in arbitrary order.
-  //     depending_overflow_var.insert(depending_overflow_var.end(), overflow[*i].begin(), overflow[*i].end());
-  
-  //     continue;
-  //   }
-  // }
-  
-//   std::map<int, std::vector<Statement> > pending;
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-//     add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride);
-  
-//     if (overflow.find(*i) != overflow.end()) {
-//       // TO DO: It should check whether overflow vaiable depends on
-//       // this loop index and by how much.  This step is important if
-//       // you want to unroll loops in arbitrary order.
-//       depending_overflow_var.insert(depending_overflow_var.end(), overflow[*i].begin(), overflow[*i].end());
-  
-//       continue;
-//     }
-  
-//     // create copy for each unroll amount
-//     for (int j = 1; j < unroll_amount; j++) {
-//       Tuple<CG_outputRepr *> funcList;
-//       Tuple<std::string> loop_vars;
-//       loop_vars.append(stmt[*i].IS.set_var((dim+1)/2)->name());
-//       funcList.append(ocg->CreatePlus(ocg->CreateIdent(stmt[*i].IS.set_var(level)->name()), ocg->CreateInt(j*stride)));
-//       CG_outputRepr *code = ocg->CreatePlaceHolder(0, stmt[*i].code->clone(), funcList, loop_vars);
-  
-//       // prepare the new statment to insert
-//       Statement unrolled_stmt;
-//       unrolled_stmt.IS = copy(stmt[*i].IS);
-// //      adjust_loop_bound(unrolled_stmt.IS, (dim-1)/2, j);
-//       unrolled_stmt.xform = copy(stmt[*i].xform);
-//       unrolled_stmt.code = code;
-//       unrolled_stmt.loop_level = stmt[*i].loop_level;
-//       pending[*i].push_back(unrolled_stmt);
-//     }
-//   }
-  
-//   // adjust iteration space due to loop bounds depending on this loop
-//   // index and affected overflow variables
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-//     for (int j = 0; j < pending[*i].size(); j++) {
-//       adjust_loop_bound(pending[*i][j].IS, (dim-1)/2, j+1, depending_overflow_var);
-//       //pending[*i][j].IS = Intersection(pending[*i][j].IS, Extend_Set(copy(this->known), pending[*i][j].IS.n_set() - this->known.n_set()));
-//     }
-//   }
-  
-  // insert unrolled statements
-  int old_num_stmt = stmt.size();
-  if (!can_be_lumped) {
-    std::map<int, std::vector<int> > what_stmt_num;
-    
-    for (int j = 1; j < unroll_amount; j++) {
-      for (std::set<int>::iterator i = same_loop.begin();
-           i != same_loop.end(); i++) {
-        Statement new_stmt;
-        
-        Tuple<CG_outputRepr *> funcList;
-        Tuple<std::string> loop_vars;
-        loop_vars.append(stmt[*i].IS.set_var(level)->name());
-        funcList.append(
-          ocg->CreatePlus(
-            ocg->CreateIdent(
-              stmt[*i].IS.set_var(level)->name()),
-            ocg->CreateInt(j * stride)));
-        new_stmt.code = ocg->CreatePlaceHolder(0,
-                                               stmt[*i].code->clone(), funcList, loop_vars);
-        
-        new_stmt.IS = adjust_loop_bound(stmt[*i].IS, level, j * stride);
-        add_loop_stride(new_stmt.IS, bound, level - 1,
-                        unroll_amount * stride);
-        
-        new_stmt.xform = copy(stmt[*i].xform);
-        new_stmt.loop_level = stmt[*i].loop_level;
-        stmt.push_back(new_stmt);
-        dep.insert();
-        what_stmt_num[*i].push_back(stmt.size() - 1);
-      }
-    }
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++)
-      add_loop_stride(stmt[*i].IS, bound, level - 1,
-                      unroll_amount * stride);
-    
-    // update dependence graph
-    if (stmt[stmt_num].loop_level[level - 1].type == LoopLevelOriginal) {
-      int dep_dim = stmt[stmt_num].loop_level[level - 1].payload;
-      int new_stride = unroll_amount * stride;
-      for (int i = 0; i < old_num_stmt; i++) {
-        std::vector<std::pair<int, DependenceVector> > D;
-        
-        for (DependenceGraph::EdgeList::iterator j =
-               dep.vertex[i].second.begin();
-             j != dep.vertex[i].second.end();) {
-          if (same_loop.find(i) != same_loop.end()) {
-            if (same_loop.find(j->first) != same_loop.end()) {
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.type == DEP_CONTROL
-                    || dv.type == DEP_UNKNOWN) {
-                  D.push_back(std::make_pair(j->first, dv));
-                  for (int kk = 0; kk < unroll_amount - 1;
-                       kk++)
-                    if (what_stmt_num[i][kk] != -1
-                        && what_stmt_num[j->first][kk]
-                        != -1)
-                      dep.connect(what_stmt_num[i][kk],
-                                  what_stmt_num[j->first][kk],
-                                  dv);
-                } else {
-                  coef_t lb = dv.lbounds[dep_dim];
-                  coef_t ub = dv.ubounds[dep_dim];
-                  if (ub == lb
-                      && int_mod(lb,
-                                 static_cast<coef_t>(new_stride))
-                      == 0) {
-                    D.push_back(
-                      std::make_pair(j->first, dv));
-                    for (int kk = 0; kk < unroll_amount - 1;
-                         kk++)
-                      if (what_stmt_num[i][kk] != -1
-                          && what_stmt_num[j->first][kk]
-                          != -1)
-                        dep.connect(
-                          what_stmt_num[i][kk],
-                          what_stmt_num[j->first][kk],
-                          dv);
-                  } else if (lb == -posInfinity
-                             && ub == posInfinity) {
-                    D.push_back(
-                      std::make_pair(j->first, dv));
-                    for (int kk = 0; kk < unroll_amount;
-                         kk++)
-                      if (kk == 0)
-                        D.push_back(
-                          std::make_pair(j->first,
-                                         dv));
-                      else if (what_stmt_num[j->first][kk
-                                                       - 1] != -1)
-                        D.push_back(
-                          std::make_pair(
-                            what_stmt_num[j->first][kk
-                                                    - 1],
-                            dv));
-                    for (int t = 0; t < unroll_amount - 1;
-                         t++)
-                      if (what_stmt_num[i][t] != -1)
-                        for (int kk = 0;
-                             kk < unroll_amount;
-                             kk++)
-                          if (kk == 0)
-                            dep.connect(
-                              what_stmt_num[i][t],
-                              j->first, dv);
-                          else if (what_stmt_num[j->first][kk
-                                                           - 1] != -1)
-                            dep.connect(
-                              what_stmt_num[i][t],
-                              what_stmt_num[j->first][kk
-                                                      - 1],
-                              dv);
-                  } else {
-                    for (int kk = 0; kk < unroll_amount;
-                         kk++) {
-                      if (lb != -posInfinity) {
-                        if (kk * stride
-                            < int_mod(lb,
-                                      static_cast<coef_t>(new_stride)))
-                          dv.lbounds[dep_dim] =
-                            floor(
-                              static_cast<double>(lb)
-                              / new_stride)
-                            * new_stride
-                            + new_stride;
-                        else
-                          dv.lbounds[dep_dim] =
-                            floor(
-                              static_cast<double>(lb)
-                              / new_stride)
-                            * new_stride;
-                      }
-                      if (ub != posInfinity) {
-                        if (kk * stride
-                            > int_mod(ub,
-                                      static_cast<coef_t>(new_stride)))
-                          dv.ubounds[dep_dim] =
-                            floor(
-                              static_cast<double>(ub)
-                              / new_stride)
-                            * new_stride
-                            - new_stride;
-                        else
-                          dv.ubounds[dep_dim] =
-                            floor(
-                              static_cast<double>(ub)
-                              / new_stride)
-                            * new_stride;
-                      }
-                      if (dv.ubounds[dep_dim]
-                          >= dv.lbounds[dep_dim]) {
-                        if (kk == 0)
-                          D.push_back(
-                            std::make_pair(
-                              j->first,
-                              dv));
-                        else if (what_stmt_num[j->first][kk
-                                                         - 1] != -1)
-                          D.push_back(
-                            std::make_pair(
-                              what_stmt_num[j->first][kk
-                                                      - 1],
-                              dv));
-                      }
-                    }
-                    for (int t = 0; t < unroll_amount - 1;
-                         t++)
-                      if (what_stmt_num[i][t] != -1)
-                        for (int kk = 0;
-                             kk < unroll_amount;
-                             kk++) {
-                          if (lb != -posInfinity) {
-                            if (kk * stride
-                                < int_mod(
-                                  lb + t
-                                  + 1,
-                                  static_cast<coef_t>(new_stride)))
-                              dv.lbounds[dep_dim] =
-                                floor(
-                                  static_cast<double>(lb
-                                                      + (t
-                                                         + 1)
-                                                      * stride)
-                                  / new_stride)
-                                * new_stride
-                                + new_stride;
-                            else
-                              dv.lbounds[dep_dim] =
-                                floor(
-                                  static_cast<double>(lb
-                                                      + (t
-                                                         + 1)
-                                                      * stride)
-                                  / new_stride)
-                                * new_stride;
-                          }
-                          if (ub != posInfinity) {
-                            if (kk * stride
-                                > int_mod(
-                                  ub + t
-                                  + 1,
-                                  static_cast<coef_t>(new_stride)))
-                              dv.ubounds[dep_dim] =
-                                floor(
-                                  static_cast<double>(ub
-                                                      + (t
-                                                         + 1)
-                                                      * stride)
-                                  / new_stride)
-                                * new_stride
-                                - new_stride;
-                            else
-                              dv.ubounds[dep_dim] =
-                                floor(
-                                  static_cast<double>(ub
-                                                      + (t
-                                                         + 1)
-                                                      * stride)
-                                  / new_stride)
-                                * new_stride;
-                          }
-                          if (dv.ubounds[dep_dim]
-                              >= dv.lbounds[dep_dim]) {
-                            if (kk == 0)
-                              dep.connect(
-                                what_stmt_num[i][t],
-                                j->first,
-                                dv);
-                            else if (what_stmt_num[j->first][kk
-                                                             - 1] != -1)
-                              dep.connect(
-                                what_stmt_num[i][t],
-                                what_stmt_num[j->first][kk
-                                                        - 1],
-                                dv);
-                          }
-                        }
-                  }
-                }
-              }
-              
-              dep.vertex[i].second.erase(j++);
-            } else {
-              for (int kk = 0; kk < unroll_amount - 1; kk++)
-                if (what_stmt_num[i][kk] != -1)
-                  dep.connect(what_stmt_num[i][kk], j->first,
-                              j->second);
-              
-              j++;
-            }
-          } else {
-            if (same_loop.find(j->first) != same_loop.end())
-              for (int k = 0; k < j->second.size(); k++)
-                for (int kk = 0; kk < unroll_amount - 1; kk++)
-                  if (what_stmt_num[j->first][kk] != -1)
-                    D.push_back(
-                      std::make_pair(
-                        what_stmt_num[j->first][kk],
-                        j->second[k]));
-            j++;
-          }
-        }
-        
-        for (int j = 0; j < D.size(); j++)
-          dep.connect(i, D[j].first, D[j].second);
-      }
-    }
-    
-    // reset lexical order for the unrolled loop body
-    std::set<int> new_same_loop;
-    for (std::map<int, std::vector<int> >::iterator i =
-           what_stmt_num.begin(); i != what_stmt_num.end(); i++) {
-      new_same_loop.insert(i->first);
-      for (int j = 0; j < i->second.size(); j++)
-        new_same_loop.insert(i->second[j]);
-    }
-    setLexicalOrder(dim + 1, new_same_loop);
-  } else {
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++)
-      add_loop_stride(stmt[*i].IS, bound, level - 1,
-                      unroll_amount * stride);
-    
-    int max_level = stmt[stmt_num].loop_level.size();
-    std::vector<std::pair<int, int> > stmt_order;
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++)
-      stmt_order.push_back(
-        std::make_pair(
-          get_const(stmt[*i].xform, 2 * max_level,
-                    Output_Var), *i));
-    sort(stmt_order.begin(), stmt_order.end());
-    
-    Statement new_stmt;
-    new_stmt.code = NULL;
-    for (int j = 1; j < unroll_amount; j++)
-      for (int i = 0; i < stmt_order.size(); i++) {
-        Tuple<CG_outputRepr *> funcList;
-        Tuple<std::string> loop_vars;
-        loop_vars.append(
-          stmt[stmt_order[i].second].IS.set_var(level)->name());
-        funcList.append(
-          ocg->CreatePlus(
-            ocg->CreateIdent(
-              stmt[stmt_order[i].second].IS.set_var(
-                level)->name()),
-            ocg->CreateInt(j * stride)));
-        CG_outputRepr *code = ocg->CreatePlaceHolder(0,
-                                                     stmt[stmt_order[i].second].code->clone(), funcList,
-                                                     loop_vars);
-        new_stmt.code = ocg->StmtListAppend(new_stmt.code, code);
-      }
-    
-    new_stmt.IS = copy(stmt[stmt_num].IS);
-    new_stmt.xform = copy(stmt[stmt_num].xform);
-    assign_const(new_stmt.xform, 2 * max_level,
-                 stmt_order[stmt_order.size() - 1].first + 1);
-    new_stmt.loop_level = stmt[stmt_num].loop_level;
-    stmt.push_back(new_stmt);
-    dep.insert();
-    
-    // update dependence graph
-    if (stmt[stmt_num].loop_level[level - 1].type == LoopLevelOriginal) {
-      int dep_dim = stmt[stmt_num].loop_level[level - 1].payload;
-      int new_stride = unroll_amount * stride;
-      for (int i = 0; i < old_num_stmt; i++) {
-        std::vector<std::pair<int, std::vector<DependenceVector> > > D;
-        
-        for (DependenceGraph::EdgeList::iterator j =
-               dep.vertex[i].second.begin();
-             j != dep.vertex[i].second.end();) {
-          if (same_loop.find(i) != same_loop.end()) {
-            if (same_loop.find(j->first) != same_loop.end()) {
-              std::vector<DependenceVector> dvs11, dvs12, dvs22,
-                dvs21;
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.type == DEP_CONTROL
-                    || dv.type == DEP_UNKNOWN) {
-                  if (i == j->first) {
-                    dvs11.push_back(dv);
-                    dvs22.push_back(dv);
-                  } else
-                    throw loop_error(
-                      "unrolled statements lumped together illegally");
-                } else {
-                  coef_t lb = dv.lbounds[dep_dim];
-                  coef_t ub = dv.ubounds[dep_dim];
-                  if (ub == lb
-                      && int_mod(lb,
-                                 static_cast<coef_t>(new_stride))
-                      == 0) {
-                    dvs11.push_back(dv);
-                    dvs22.push_back(dv);
-                  } else {
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = ceil(
-                        static_cast<double>(lb)
-                        / new_stride)
-                        * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = floor(
-                        static_cast<double>(ub)
-                        / new_stride)
-                        * new_stride;
-                    if (dv.ubounds[dep_dim]
-                        >= dv.lbounds[dep_dim])
-                      dvs11.push_back(dv);
-                    
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = ceil(
-                        static_cast<double>(lb)
-                        / new_stride)
-                        * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = ceil(
-                        static_cast<double>(ub)
-                        / new_stride)
-                        * new_stride;
-                    if (dv.ubounds[dep_dim]
-                        >= dv.lbounds[dep_dim])
-                      dvs21.push_back(dv);
-                    
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = floor(
-                        static_cast<double>(lb)
-                        / new_stride)
-                        * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = floor(
-                        static_cast<double>(ub
-                                            - stride)
-                        / new_stride)
-                        * new_stride;
-                    if (dv.ubounds[dep_dim]
-                        >= dv.lbounds[dep_dim])
-                      dvs12.push_back(dv);
-                    
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = floor(
-                        static_cast<double>(lb)
-                        / new_stride)
-                        * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = ceil(
-                        static_cast<double>(ub
-                                            - stride)
-                        / new_stride)
-                        * new_stride;
-                    if (dv.ubounds[dep_dim]
-                        >= dv.lbounds[dep_dim])
-                      dvs22.push_back(dv);
-                  }
-                }
-              }
-              if (dvs11.size() > 0)
-                D.push_back(std::make_pair(i, dvs11));
-              if (dvs22.size() > 0)
-                dep.connect(old_num_stmt, old_num_stmt, dvs22);
-              if (dvs12.size() > 0)
-                D.push_back(
-                  std::make_pair(old_num_stmt, dvs12));
-              if (dvs21.size() > 0)
-                dep.connect(old_num_stmt, i, dvs21);
-              
-              dep.vertex[i].second.erase(j++);
-            } else {
-              dep.connect(old_num_stmt, j->first, j->second);
-              j++;
-            }
-          } else {
-            if (same_loop.find(j->first) != same_loop.end())
-              D.push_back(
-                std::make_pair(old_num_stmt, j->second));
-            j++;
-          }
-        }
-        
-        for (int j = 0; j < D.size(); j++)
-          dep.connect(i, D[j].first, D[j].second);
-      }
-    }
-  }
-  
-  return result;
-}
-
-// Returns the lexical-order vector of statement `stmt_num`.
-//
-// The result has one slot per output dimension of the statement's xform.
-// Even slots (0, 2, 4, ...) receive the constant that get_const() reads
-// from the xform at that output position; odd slots stay 0.
-std::vector<int> Loop::getLexicalOrder(int stmt_num) const {
-  assert(stmt_num < stmt.size());
-
-  const int num_out = stmt[stmt_num].xform.n_out();
-  std::vector<int> order(num_out, 0);
-
-  int pos = 0;
-  while (pos < num_out) {
-    order[pos] = get_const(stmt[stmt_num].xform, pos, Output_Var);
-    pos += 2;
-  }
-
-  return order;
-}
-
-// Collects the statements whose lexical order agrees with `lex` at every
-// even position 0, 2, ... up to and including `dim`.
-// A negative `dim` selects every statement.
-std::set<int> Loop::getStatements(const std::vector<int> &lex, int dim) const {
-  std::set<int> matching;
-  const int num_stmt = stmt.size();
-
-  for (int s = 0; s < num_stmt; s++) {
-    if (dim < 0) {
-      matching.insert(s);
-      continue;
-    }
-
-    const std::vector<int> cur_lex = getLexicalOrder(s);
-
-    // walk the even positions until the first disagreement
-    bool same_prefix = true;
-    for (int pos = 0; pos <= dim && same_prefix; pos += 2)
-      same_prefix = (lex[pos] == cur_lex[pos]);
-
-    if (same_prefix)
-      matching.insert(s);
-  }
-
-  return matching;
-}
-
-// Adds `amount` to the lexical-order constant at position `dim` of every
-// statement that matches `lex` on all positions before `dim`.
-//
-// For a positive amount only statements whose value at `dim` is >= lex[dim]
-// are shifted; for a negative amount only those whose value is <= lex[dim].
-// An amount of 0 is a no-op.
-void Loop::shiftLexicalOrder(const std::vector<int> &lex, int dim, int amount) {
-  if (amount == 0)
-    return;
-
-  const int num_stmt = stmt.size();
-  for (int s = 0; s < num_stmt; s++) {
-    const std::vector<int> cur = getLexicalOrder(s);
-
-    // the statement must share the whole prefix [0, dim) with lex
-    int pos = 0;
-    while (pos < dim && cur[pos] == lex[pos])
-      pos++;
-    if (pos < dim)
-      continue;
-
-    // statements on the far side of lex[dim] keep their position
-    const bool ahead_of_ref = amount > 0 && cur[dim] < lex[dim];
-    const bool behind_ref = amount < 0 && cur[dim] > lex[dim];
-    if (ahead_of_ref || behind_ref)
-      continue;
-
-    assign_const(stmt[s].xform, dim, cur[dim] + amount);
-  }
-}
-
-// Assigns the constant at xform output position `dim` (the "lexical order")
-// for every statement in `active`, numbering from `starting_order`.
-//
-// Parameters:
-//   dim            - even, non-negative xform output position to assign;
-//                    it corresponds to loop level (dim + 2) / 2.
-//   active         - statements to order; all must agree on the constants at
-//                    even positions below `dim`.
-//   starting_order - first order value handed out.
-//
-// Behaviour:
-//   * Returns immediately when `active` is empty.
-//   * Statements are grouped by the (type, payload) of their loop level at
-//     this depth; statements with fewer loop levels form their own set.
-//     Groups are further split by whether a statement is control dependent
-//     on one of those shallower statements.
-//   * If that leaves more than one group, groups are ordered by a
-//     topological sort of the dependences between them, and groups holding
-//     several statements are handled by a recursive call on the same `dim`.
-//   * Otherwise statements are classified by is_single_iteration() on their
-//     iteration space, packed into groups by dependences, ordered, and each
-//     multi-statement group is ordered recursively at `dim + 2`.
-//
-// Throws std::invalid_argument for a bad `dim`, a bad statement number, or
-// statements that are not in the same sub loop nest, and loop_error when
-// groups with different loop types cannot be separated.
-void Loop::setLexicalOrder(int dim, const std::set<int> &active,
-                           int starting_order) {
-  if (active.size() == 0)
-    return;
-
-  // check for sanity of parameters
-  if (dim < 0 || dim % 2 != 0)
-    throw std::invalid_argument(
-      "invalid constant loop level to set lexicographical order");
-  std::vector<int> lex;
-  int ref_stmt_num;  // set on the first iteration below (active is non-empty)
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    if ((*i) < 0 || (*i) >= stmt.size())
-      throw std::invalid_argument(
-        "invalid statement number " + to_string(*i));
-    if (dim >= stmt[*i].xform.n_out())
-      throw std::invalid_argument(
-        "invalid constant loop level to set lexicographical order");
-    if (i == active.begin()) {
-      lex = getLexicalOrder(*i);
-      ref_stmt_num = *i;
-    } else {
-      std::vector<int> lex2 = getLexicalOrder(*i);
-      for (int j = 0; j < dim; j += 2)
-        if (lex[j] != lex2[j])
-          throw std::invalid_argument(
-            "statements are not in the same sub loop nest");
-    }
-  }
-
-  // separate statements by current loop level types
-  int level = (dim + 2) / 2;
-  std::map<std::pair<LoopLevelType, int>, std::set<int> > active_by_level_type;
-  std::set<int> active_by_no_level;
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    if (level > stmt[*i].loop_level.size())
-      active_by_no_level.insert(*i);
-    else
-      active_by_level_type[std::make_pair(
-          stmt[*i].loop_level[level - 1].type,
-          stmt[*i].loop_level[level - 1].payload)].insert(*i);
-  }
-
-  // further separate statements due to control dependences
-  std::vector<std::set<int> > active_by_level_type_splitted;
-  for (std::map<std::pair<LoopLevelType, int>, std::set<int> >::iterator i =
-         active_by_level_type.begin(); i != active_by_level_type.end(); i++)
-    active_by_level_type_splitted.push_back(i->second);
-  for (std::set<int>::iterator i = active_by_no_level.begin();
-       i != active_by_no_level.end(); i++)
-    // iterate backwards: a split erases entry j and appends two new entries
-    for (int j = active_by_level_type_splitted.size() - 1; j >= 0; j--) {
-      std::set<int> controlled, not_controlled;
-      for (std::set<int>::iterator k =
-             active_by_level_type_splitted[j].begin();
-           k != active_by_level_type_splitted[j].end(); k++) {
-        std::vector<DependenceVector> dvs = dep.getEdge(*i, *k);
-        bool is_controlled = false;
-        for (int kk = 0; kk < dvs.size(); kk++)
-          // compare (not assign): only a real control dependence counts
-          if (dvs[kk].type == DEP_CONTROL) {
-            is_controlled = true;
-            break;
-          }
-        if (is_controlled)
-          controlled.insert(*k);
-        else
-          not_controlled.insert(*k);
-      }
-      if (controlled.size() != 0 && not_controlled.size() != 0) {
-        active_by_level_type_splitted.erase(
-          active_by_level_type_splitted.begin() + j);
-        active_by_level_type_splitted.push_back(controlled);
-        active_by_level_type_splitted.push_back(not_controlled);
-      }
-    }
-
-  // set lexical order separating loops with different loop types first
-  if (active_by_level_type_splitted.size() + active_by_no_level.size() > 1) {
-    int dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1;
-
-    // one graph vertex per group; each shallow statement is its own group
-    Graph<std::set<int>, Empty> g;
-    for (std::vector<std::set<int> >::iterator i =
-           active_by_level_type_splitted.begin();
-         i != active_by_level_type_splitted.end(); i++)
-      g.insert(*i);
-    for (std::set<int>::iterator i = active_by_no_level.begin();
-         i != active_by_no_level.end(); i++) {
-      std::set<int> t;
-      t.insert(*i);
-      g.insert(t);
-    }
-    for (int i = 0; i < g.vertex.size(); i++)
-      for (int j = i + 1; j < g.vertex.size(); j++) {
-        // edge i -> j if any statement pair has a qualifying dependence
-        bool connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin();
-             ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin();
-               jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*ii,
-                                                            *jj);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence()
-                  || (dvs[k].is_data_dependence()
-                      && !dvs[k].has_been_carried_before(
-                        dep_dim))) {
-                g.connect(i, j);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-        // same test in the opposite direction: edge j -> i
-        connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin();
-             ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin();
-               jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*jj,
-                                                            *ii);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence()
-                  || (dvs[k].is_data_dependence()
-                      && !dvs[k].has_been_carried_before(
-                        dep_dim))) {
-                g.connect(j, i);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-      }
-
-    std::vector<std::set<int> > s = g.topoSort();
-    if (s.size() != g.vertex.size())
-      throw loop_error(
-        "cannot separate statements with different loop types at loop level "
-        + to_string(level));
-
-    // assign lexical order
-    int order = starting_order;
-    for (int i = 0; i < s.size(); i++) {
-      std::set<int> &cur_scc = g.vertex[*(s[i].begin())].first;
-      int sz = cur_scc.size();
-      if (sz == 1) {
-        int cur_stmt = *(cur_scc.begin());
-        assign_const(stmt[cur_stmt].xform, dim, order);
-        // a lone statement gets 0 at every deeper constant position
-        for (int j = dim + 2; j < stmt[cur_stmt].xform.n_out(); j += 2)
-          assign_const(stmt[cur_stmt].xform, j, 0);
-        order++;
-      } else {
-        setLexicalOrder(dim, cur_scc, order);
-        order += sz;
-      }
-    }
-  }
-  // set lexical order separating single iteration statements and loops
-  else {
-    std::set<int> true_singles;
-    std::set<int> nonsingles;
-    std::map<coef_t, std::set<int> > fake_singles;
-
-    // sort out statements that do not require loops
-    for (std::set<int>::iterator i = active.begin(); i != active.end();
-         i++) {
-      Relation cur_IS = getNewIS(*i);
-      if (is_single_iteration(cur_IS, dim + 1)) {
-        bool is_all_single = true;
-        for (int j = dim + 3; j < stmt[*i].xform.n_out(); j += 2)
-          if (!is_single_iteration(cur_IS, j)) {
-            is_all_single = false;
-            break;
-          }
-        if (is_all_single)
-          true_singles.insert(*i);
-        else {
-          // single at this level only: bucket by its constant value here,
-          // or under posInfinity when get_const() throws
-          try {
-            fake_singles[get_const(cur_IS, dim + 1, Set_Var)].insert(
-              *i);
-          } catch (const std::exception &e) {
-            fake_singles[posInfinity].insert(*i);
-          }
-        }
-      } else
-        nonsingles.insert(*i);
-    }
-
-    // split nonsingles forcibly according to negative dependences present (loop unfusible)
-    int dep_dim = get_dep_dim_of(ref_stmt_num, level);
-    Graph<int, Empty> g2;
-    for (std::set<int>::iterator i = nonsingles.begin();
-         i != nonsingles.end(); i++)
-      g2.insert(*i);
-    for (int i = 0; i < g2.vertex.size(); i++)
-      for (int j = i + 1; j < g2.vertex.size(); j++) {
-        std::vector<DependenceVector> dvs = dep.getEdge(
-          g2.vertex[i].first, g2.vertex[j].first);
-        for (int k = 0; k < dvs.size(); k++)
-          if (dvs[k].is_control_dependence()
-              || (dvs[k].is_data_dependence()
-                  && dvs[k].has_negative_been_carried_at(
-                    dep_dim))) {
-            g2.connect(i, j);
-            break;
-          }
-        dvs = dep.getEdge(g2.vertex[j].first, g2.vertex[i].first);
-        for (int k = 0; k < dvs.size(); k++)
-          if (dvs[k].is_control_dependence()
-              || (dvs[k].is_data_dependence()
-                  && dvs[k].has_negative_been_carried_at(
-                    dep_dim))) {
-            g2.connect(j, i);
-            break;
-          }
-      }
-
-    std::vector<std::set<int> > s2 = g2.packed_topoSort();
-
-    // translate graph vertex indices back into statement numbers
-    std::vector<std::set<int> > splitted_nonsingles;
-    for (int i = 0; i < s2.size(); i++) {
-      std::set<int> cur_scc;
-      for (std::set<int>::iterator j = s2[i].begin(); j != s2[i].end();
-           j++)
-        cur_scc.insert(g2.vertex[*j].first);
-      splitted_nonsingles.push_back(cur_scc);
-    }
-
-    // convert to dependence graph for grouped statements
-    dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1;
-    Graph<std::set<int>, Empty> g;
-    for (std::set<int>::iterator i = true_singles.begin();
-         i != true_singles.end(); i++) {
-      std::set<int> t;
-      t.insert(*i);
-      g.insert(t);
-    }
-    for (int i = 0; i < splitted_nonsingles.size(); i++) {
-      g.insert(splitted_nonsingles[i]);
-    }
-    for (std::map<coef_t, std::set<int> >::iterator i =
-           fake_singles.begin(); i != fake_singles.end(); i++)
-      g.insert((*i).second);
-
-    for (int i = 0; i < g.vertex.size(); i++)
-      for (int j = i + 1; j < g.vertex.size(); j++) {
-        // edge i -> j if any statement pair has a qualifying dependence
-        bool connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin();
-             ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin();
-               jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*ii,
-                                                            *jj);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence()
-                  || (dvs[k].is_data_dependence()
-                      && !dvs[k].has_been_carried_before(
-                        dep_dim))) {
-                g.connect(i, j);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-        // same test in the opposite direction: edge j -> i
-        connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin();
-             ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin();
-               jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*jj,
-                                                            *ii);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence()
-                  || (dvs[k].is_data_dependence()
-                      && !dvs[k].has_been_carried_before(
-                        dep_dim))) {
-                g.connect(j, i);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-      }
-
-    // topological sort according to chun's permute algorithm
-    std::vector<std::set<int> > s = g.topoSort();
-
-    // assign lexical order
-    int order = starting_order;
-    for (int i = 0; i < s.size(); i++) {
-      // translate each SCC into original statements
-      std::set<int> cur_scc;
-      for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-        copy(g.vertex[*j].first.begin(), g.vertex[*j].first.end(),
-             inserter(cur_scc, cur_scc.begin()));
-
-      // now assign the constant
-      for (std::set<int>::iterator j = cur_scc.begin();
-           j != cur_scc.end(); j++)
-        assign_const(stmt[*j].xform, dim, order);
-
-      // several statements share this slot: order them one level deeper
-      if (cur_scc.size() > 1)
-        setLexicalOrder(dim + 2, cur_scc);
-      else if (cur_scc.size() == 1) {
-        int cur_stmt = *(cur_scc.begin());
-        for (int j = dim + 2; j < stmt[cur_stmt].xform.n_out(); j += 2)
-          assign_const(stmt[cur_stmt].xform, j, 0);
-      }
-
-      if (cur_scc.size() > 0)
-        order++;
-    }
-  }
-}
-
-// Applies the pending transformation of every statement in the loop by
-// delegating to apply_xform(std::set<int> &) with all statement numbers.
-void Loop::apply_xform() {
-  std::set<int> every_stmt;
-  int s = 0;
-  while (s < stmt.size()) {
-    every_stmt.insert(s);
-    ++s;
-  }
-  apply_xform(every_stmt);
-}
-
-// Applies the pending transformation of the single statement `stmt_num` by
-// delegating to apply_xform(std::set<int> &) with a one-element set.
-void Loop::apply_xform(int stmt_num) {
-  std::set<int> only_this(&stmt_num, &stmt_num + 1);
-  apply_xform(only_this);
-}
-
-// For every statement in `active`: rewrites its code and iteration space
-// through its current xform, then replaces the xform with a 1-1 mapping that
-// keeps only the statement's lexical-order constants.
-//
-// Afterwards tmp_loop_var_name_counter is advanced by the largest number of
-// loop levels seen among the processed statements.
-void Loop::apply_xform(std::set<int> &active) {
-  CG_outputBuilder *ocg = ir->builder();
-  int deepest = 0;
-
-  for (std::set<int>::iterator it = active.begin(); it != active.end(); it++) {
-    const int depth = stmt[*it].loop_level.size();
-    if (deepest < depth)
-      deepest = depth;
-
-    // remember the lexical constants before the xform is replaced
-    std::vector<int> lex = getLexicalOrder(*it);
-
-    // relation from 2*depth+1 inputs to depth outputs:
-    // output k equals input 2*k
-    Relation mapping(2 * depth + 1, depth);
-    F_And *conj = mapping.add_and();
-    for (int k = 1; k <= depth; k++) {
-      EQ_Handle eq = conj->add_EQ();
-      eq.update_coef(mapping.output_var(k), 1);
-      eq.update_coef(mapping.input_var(2 * k), -1);
-    }
-    mapping = Composition(mapping, stmt[*it].xform);
-    mapping.simplify();
-
-    // match omega input/output variables to variable names in the code
-    const int num_set_vars = stmt[*it].IS.n_set();
-    for (int k = 1; k <= num_set_vars; k++)
-      mapping.name_input_var(k, stmt[*it].IS.set_var(k)->name());
-    for (int k = 1; k <= depth; k++) {
-      const int suffix = tmp_loop_var_name_counter + k - 1;
-      mapping.name_output_var(k,
-                              tmp_loop_var_name_prefix + to_string(suffix));
-    }
-    mapping.setup_names();
-
-    // widen a copy of the known conditions to the arity of the mapping output
-    Relation extended_known = Extend_Set(
-      copy(this->known), mapping.n_out() - this->known.n_set());
-    stmt[*it].code = outputStatement(
-      ocg, stmt[*it].code, 0, mapping, extended_known,
-      std::vector<CG_outputRepr *>(mapping.n_out()));
-    stmt[*it].IS = Range(Restrict_Domain(mapping, stmt[*it].IS));
-    stmt[*it].IS.simplify();
-
-    // replace original transformation relation with straight 1-1 mapping:
-    // output 2*k equals input k ...
-    mapping = Relation(depth, 2 * depth + 1);
-    conj = mapping.add_and();
-    for (int k = 1; k <= depth; k++) {
-      EQ_Handle eq = conj->add_EQ();
-      eq.update_coef(mapping.output_var(2 * k), 1);
-      eq.update_coef(mapping.input_var(k), -1);
-    }
-    // ... and every odd output is pinned to the saved lexical constant
-    for (int k = 0; k <= depth; k++) {
-      EQ_Handle eq = conj->add_EQ();
-      eq.update_coef(mapping.output_var(2 * k + 1), 1);
-      eq.update_const(-lex[2 * k]);
-    }
-    stmt[*it].xform = mapping;
-  }
-
-  tmp_loop_var_name_counter += deepest;
-}
-
-// Intersects the loop's `known` relation with `cond`.
-// Whichever of the two has fewer set variables is first extended with
-// Extend_Set() so both have the same arity.
-void Loop::addKnown(const Relation &cond) {
-  const int known_arity = this->known.n_set();
-
-  Relation extra = copy(cond);
-  const int gap = extra.n_set() - known_arity;
-
-  if (gap > 0)
-    this->known = Extend_Set(this->known, gap);
-  else if (gap < 0)
-    extra = Extend_Set(extra, -gap);
-
-  this->known = Intersection(this->known, extra);
-}
-
-// Applies the transformation matrix `T` to the loop nest.
-//
-// `T` must have num_dep_dim rows; each row has either num_dep_dim entries
-// (coefficients only) or num_dep_dim + 1 entries (coefficients followed by a
-// constant term).  Every statement must have exactly num_dep_dim loop levels,
-// all of type LoopLevelOriginal.
-//
-// The matrix is turned into a relation that is composed onto every
-// statement's xform, the bounds of the data dependence vectors are
-// recomputed from T, and the lexical order is reset from position 0.
-//
-// Returns true (also when there are no statements, in which case nothing is
-// done).  Throws std::invalid_argument when the loop nest or the matrix does
-// not satisfy the requirements above.
-bool Loop::nonsingular(const std::vector<std::vector<int> > &T) {
-  if (stmt.size() == 0)
-    return true;
-
-  // check for sanity of parameters
-  for (int i = 0; i < stmt.size(); i++) {
-    if (stmt[i].loop_level.size() != num_dep_dim)
-      throw std::invalid_argument(
-        "nonsingular loop transformations must be applied to original perfect loop nest");
-    for (int j = 0; j < stmt[i].loop_level.size(); j++)
-      if (stmt[i].loop_level[j].type != LoopLevelOriginal)
-        throw std::invalid_argument(
-          "nonsingular loop transformations must be applied to original perfect loop nest");
-  }
-  if (T.size() != num_dep_dim)
-    throw std::invalid_argument("invalid transformation matrix");
-  // validate every row of T; the row count is num_dep_dim and is unrelated
-  // to the number of statements
-  for (int i = 0; i < T.size(); i++)
-    if (T[i].size() != num_dep_dim + 1 && T[i].size() != num_dep_dim)
-      throw std::invalid_argument("invalid transformation matrix");
-
-  // build relation from matrix
-  Relation mapping(2 * num_dep_dim + 1, 2 * num_dep_dim + 1);
-  F_And *f_root = mapping.add_and();
-  for (int i = 0; i < num_dep_dim; i++) {
-    // even output 2*(i+1) = sum_j T[i][j] * input 2*(j+1) [+ constant]
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(mapping.output_var(2 * (i + 1)), -1);
-    for (int j = 0; j < num_dep_dim; j++)
-      if (T[i][j] != 0)
-        h.update_coef(mapping.input_var(2 * (j + 1)), T[i][j]);
-    if (T[i].size() == num_dep_dim + 1)
-      h.update_const(T[i][num_dep_dim]);
-  }
-  for (int i = 1; i <= 2 * num_dep_dim + 1; i += 2) {
-    // odd positions pass through unchanged
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(mapping.output_var(i), -1);
-    h.update_coef(mapping.input_var(i), 1);
-  }
-
-  // update transformation relations
-  for (int i = 0; i < stmt.size(); i++)
-    stmt[i].xform = Composition(copy(mapping), stmt[i].xform);
-
-  // update dependence graph
-  for (int i = 0; i < dep.vertex.size(); i++)
-    for (DependenceGraph::EdgeList::iterator j =
-           dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();
-         j++) {
-      // update the edge's vectors in place
-      std::vector<DependenceVector> &dvs = j->second;
-      for (int k = 0; k < dvs.size(); k++) {
-        DependenceVector &dv = dvs[k];
-        switch (dv.type) {
-        case DEP_W2R:
-        case DEP_R2W:
-        case DEP_W2W:
-        case DEP_R2R: {
-          std::vector<coef_t> lbounds(num_dep_dim), ubounds(
-            num_dep_dim);
-          for (int p = 0; p < num_dep_dim; p++) {
-            coef_t lb = 0;
-            coef_t ub = 0;
-            for (int q = 0; q < num_dep_dim; q++) {
-              if (T[p][q] > 0) {
-                // positive coefficient: lower feeds lower, upper feeds upper
-                if (lb == -posInfinity
-                    || dv.lbounds[q] == -posInfinity)
-                  lb = -posInfinity;
-                else
-                  lb += T[p][q] * dv.lbounds[q];
-                if (ub == posInfinity
-                    || dv.ubounds[q] == posInfinity)
-                  ub = posInfinity;
-                else
-                  ub += T[p][q] * dv.ubounds[q];
-              } else if (T[p][q] < 0) {
-                // negative coefficient: the bounds swap roles
-                if (lb == -posInfinity
-                    || dv.ubounds[q] == posInfinity)
-                  lb = -posInfinity;
-                else
-                  lb += T[p][q] * dv.ubounds[q];
-                if (ub == posInfinity
-                    || dv.lbounds[q] == -posInfinity)
-                  ub = posInfinity;
-                else
-                  ub += T[p][q] * dv.lbounds[q];
-              }
-            }
-            // NOTE(review): the constant column is added to both bounds;
-            // confirm this is intended for dependence distances.
-            if (T[p].size() == num_dep_dim + 1) {
-              if (lb != -posInfinity)
-                lb += T[p][num_dep_dim];
-              if (ub != posInfinity)
-                ub += T[p][num_dep_dim];
-            }
-            lbounds[p] = lb;
-            ubounds[p] = ub;
-          }
-          dv.lbounds = lbounds;
-          dv.ubounds = ubounds;
-
-          break;
-        }
-        default:
-          ;
-        }
-      }
-    }
-
-  // set constant loop values
-  std::set<int> active;
-  for (int i = 0; i < stmt.size(); i++)
-    active.insert(i);
-  setLexicalOrder(0, active);
-
-  return true;
-}
-
-void Loop::skew(const std::set<int> &stmt_nums, int level,
-                const std::vector<int> &skew_amount) {
-  if (stmt_nums.size() == 0)
-    return;
-  
-  // check for sanity of parameters
-  int ref_stmt_num = *(stmt_nums.begin());
-  std::vector<std::set<int> > array_of_deps;
-  for (std::set<int>::const_iterator i = stmt_nums.begin();
-       i != stmt_nums.end(); i++) {
-    if (*i < 0 || *i >= stmt.size())
-      throw std::invalid_argument(
-        "invalid statement number " + to_string(*i));
-    if (level < 1 || level > stmt[*i].loop_level.size())
-      throw std::invalid_argument(
-        "invalid loop level " + to_string(level));
-    for (int j = stmt[*i].loop_level.size(); j < skew_amount.size(); j++)
-      if (skew_amount[j] != 0)
-        throw std::invalid_argument("invalid skewing formula");
-  }
-  
-  // set trasformation relations
-  for (std::set<int>::const_iterator i = stmt_nums.begin();
-       i != stmt_nums.end(); i++) {
-    int n = stmt[*i].xform.n_out();
-    Relation r(n, n);
-    F_And *f_root = r.add_and();
-    for (int j = 1; j <= n; j++)
-      if (j != 2 * level) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.input_var(j), 1);
-        h.update_coef(r.output_var(j), -1);
-      }
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(r.output_var(2 * level), -1);
-    for (int j = 0; j < skew_amount.size(); j++)
-      if (skew_amount[j] != 0)
-        h.update_coef(r.input_var(2 * (j + 1)), skew_amount[j]);
-    
-    stmt[*i].xform = Composition(r, stmt[*i].xform);
-    stmt[*i].xform.simplify();
-    applyXform(*i);
-    std::set<int> dont_consider;
-    //}
-    
-    // update dependence graph
-    if (stmt[ref_stmt_num].loop_level[level - 1].type
-        == LoopLevelOriginal) {
-      int dep_dim = stmt[ref_stmt_num].loop_level[level - 1].payload;
-      for (std::set<int>::const_iterator i = stmt_nums.begin();
-           i != stmt_nums.end(); i++)
-        for (DependenceGraph::EdgeList::iterator j =
-               dep.vertex[*i].second.begin();
-             j != dep.vertex[*i].second.end(); j++)
-          if (stmt_nums.find(j->first) != stmt_nums.end()) {
-            // dependence between skewed statements
-            std::vector<DependenceVector> dvs = j->second;
-            for (int k = 0; k < dvs.size(); k++) {
-              DependenceVector &dv = dvs[k];
-              if (dv.is_data_dependence()) {
-                coef_t lb = 0;
-                coef_t ub = 0;
-                for (int kk = 0; kk < skew_amount.size();
-                     kk++) {
-                  int cur_dep_dim = get_dep_dim_of(*i,
-                                                   kk + 1);
-                  if (skew_amount[kk] > 0) {
-                    if (lb != -posInfinity
-                        && stmt[*i].loop_level[kk].type
-                        == LoopLevelOriginal
-                        && dv.lbounds[cur_dep_dim]
-                        != -posInfinity)
-                      lb += skew_amount[kk]
-                        * dv.lbounds[cur_dep_dim];
-                    else {
-                      if (cur_dep_dim != -1
-                          && !(dv.lbounds[cur_dep_dim]
-                               == 0
-                               && dv.ubounds[cur_dep_dim]
-                               == 0))
-                        lb = -posInfinity;
-                    }
-                    if (ub != posInfinity
-                        && stmt[*i].loop_level[kk].type
-                        == LoopLevelOriginal
-                        && dv.ubounds[cur_dep_dim]
-                        != posInfinity)
-                      ub += skew_amount[kk]
-                        * dv.ubounds[cur_dep_dim];
-                    else {
-                      if (cur_dep_dim != -1
-                          && !(dv.lbounds[cur_dep_dim]
-                               == 0
-                               && dv.ubounds[cur_dep_dim]
-                               == 0))
-                        ub = posInfinity;
-                    }
-                  } else if (skew_amount[kk] < 0) {
-                    if (lb != -posInfinity
-                        && stmt[*i].loop_level[kk].type
-                        == LoopLevelOriginal
-                        && dv.ubounds[cur_dep_dim]
-                        != posInfinity)
-                      lb += skew_amount[kk]
-                        * dv.ubounds[cur_dep_dim];
-                    else {
-                      if (cur_dep_dim != -1
-                          && !(dv.lbounds[cur_dep_dim]
-                               == 0
-                               && dv.ubounds[cur_dep_dim]
-                               == 0))
-                        lb = -posInfinity;
-                    }
-                    if (ub != posInfinity
-                        && stmt[*i].loop_level[kk].type
-                        == LoopLevelOriginal
-                        && dv.lbounds[cur_dep_dim]
-                        != -posInfinity)
-                      ub += skew_amount[kk]
-                        * dv.lbounds[cur_dep_dim];
-                    else {
-                      if (cur_dep_dim != -1
-                          && !(dv.lbounds[cur_dep_dim]
-                               == 0
-                               && dv.ubounds[cur_dep_dim]
-                               == 0))
-                        ub = posInfinity;
-                    }
-                  }
-                }
-                if ((dv.isCarried(dep_dim)
-                     && dv.hasPositive(dep_dim)) && dv.quasi)
-                  dv.quasi = false;
-                
-                if ((dv.isCarried(dep_dim)
-                     && dv.hasNegative(dep_dim))
-                    && !dv.quasi)
-                  throw loop_error(
-                    "loop error: Skewing is illegal, dependence violation!");
-                dv.lbounds[dep_dim] = lb;
-                dv.ubounds[dep_dim] = ub;
-                if ((dv.isCarried(dep_dim)
-                     && dv.hasPositive(dep_dim)) && dv.quasi)
-                  dv.quasi = false;
-                
-                if ((dv.isCarried(dep_dim)
-                     && dv.hasNegative(dep_dim))
-                    && !dv.quasi)
-                  throw loop_error(
-                    "loop error: Skewing is illegal, dependence violation!");
-              }
-            }
-            
-            j->second = dvs;
-          }
-    } else {
-      // dependence from skewed statement to unskewed statement becomes jumbled,
-      // put distance value at skewed dimension to unknown
-      /*std::vector<DependenceVector> dvs = j->second;
-        for (int k = 0; k < dvs.size(); k++) {
-        DependenceVector &dv = dvs[k];
-        if (dv.is_data_dependence()) {
-        dv.lbounds[dep_dim] = -posInfinity;
-        dv.ubounds[dep_dim] = posInfinity;
-        }
-        }
-        j->second = dvs;
-      */
-      dont_consider.insert(j->first);
-    }
-    for (int l = 0; l < dep.vertex.size(); l++)
-      if (stmt_nums.find(l) == stmt_nums.end())
-        if (dont_consider.find(l) == stmt_nums.end()
-            && (dep.vertex[l].second.find(*i)
-                != dep.vertex[l].second.end()))
-          dont_consider.insert(l);
-    array_of_deps.push_back(dont_consider);
-  }
-  /*for (int i = 0; i < dep.vertex.size(); i++)
-    if (stmt_nums.find(i) == stmt_nums.end())
-    for (DependenceGraph::EdgeList::iterator j =
-    dep.vertex[i].second.begin();
-    j != dep.vertex[i].second.end(); j++)
-    if (stmt_nums.find(j->first) != stmt_nums.end()) {
-    // dependence from unskewed statement to skewed statement becomes jumbled,
-    // put distance value at skewed dimension to unknown
-    std::vector<DependenceVector> dvs = j->second;
-    for (int k = 0; k < dvs.size(); k++) {
-    DependenceVector &dv = dvs[k];
-    if (dv.is_data_dependence()) {
-    dv.lbounds[dep_dim] = -posInfinity;
-    dv.ubounds[dep_dim] = posInfinity;
-    }
-    }
-    j->second = dvs;
-    }
-    }*/
-  std::set<int>::const_iterator w = stmt_nums.begin();
-  for (int i = 0; i < array_of_deps.size() && w != stmt_nums.end(); i++)
-    for (std::set<int>::const_iterator j = array_of_deps[i].begin();
-         j != array_of_deps[i].end(); j++) {
-      if (dep.vertex[*w].second.find(*j) != dep.vertex[*w].second.end())
-        dep.disconnect(*w, *j);
-      if (dep.vertex[*j].second.find(*w) != dep.vertex[*j].second.end())
-        dep.disconnect(*j, *w);
-      int x, y;
-      std::pair<std::vector<DependenceVector>,
-        std::vector<DependenceVector> > dv_s;
-      if ((*w) <= (*j)) {
-        x = *w;
-        y = *j;
-        
-        dv_s = test_data_dependences(ir_, stmt[x].code, stmt[x].IS,
-                                     stmt[y].code, stmt[y].IS, freevar, index, x, y);
-      } else {
-        x = *j;
-        y = *w;
-        dv_s = test_data_dependences(ir_, stmt[y].code, stmt[y].IS,
-                                     stmt[x].code, stmt[x].IS, freevar, index, x, y);
-      }
-      for (int k = 0; k < dv_s.first.size(); k++) {
-        if (is_dependence_valid_based_on_lex_order(x, y, dv_s.first[k],
-                                                   true))
-          dep.connect(x, y, dv_s.first[k]);
-        else
-          dep.connect(y, x, dv_s.first[k].reverse());
-      }
-      for (int k = 0; k < dv_s.second.size(); k++) {
-        if (is_dependence_valid_based_on_lex_order(x, y, dv_s.second[k],
-                                                   false))
-          dep.connect(y, x, dv_s.second[k]);
-        else
-          dep.connect(x, y, dv_s.second[k].reverse());
-      }
-      w++;
-    }
-}
-
-void Loop::shift(const std::set<int> &stmt_nums, int level, int shift_amount) {
-  if (stmt_nums.size() == 0)
-    return;
-  
-  // check for sanity of parameters
-  int ref_stmt_num = *(stmt_nums.begin());
-  for (std::set<int>::const_iterator i = stmt_nums.begin();
-       i != stmt_nums.end(); i++) {
-    if (*i < 0 || *i >= stmt.size())
-      throw std::invalid_argument(
-        "invalid statement number " + to_string(*i));
-    if (level < 1 || level > stmt[*i].loop_level.size())
-      throw std::invalid_argument(
-        "invalid loop level " + to_string(level));
-  }
-  
-  // do nothing
-  if (shift_amount == 0)
-    return;
-  
-  // set trasformation relations
-  for (std::set<int>::const_iterator i = stmt_nums.begin();
-       i != stmt_nums.end(); i++) {
-    int n = stmt[*i].xform.n_out();
-    
-    Relation r(n, n);
-    F_And *f_root = r.add_and();
-    for (int j = 1; j <= n; j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(r.input_var(j), 1);
-      h.update_coef(r.output_var(j), -1);
-      if (j == 2 * level)
-        h.update_const(shift_amount);
-    }
-    
-    stmt[*i].xform = Composition(r, stmt[*i].xform);
-    stmt[*i].xform.simplify();
-  }
-  
-  // update dependence graph
-  if (stmt[ref_stmt_num].loop_level[level - 1].type == LoopLevelOriginal) {
-    int dep_dim = stmt[ref_stmt_num].loop_level[level - 1].payload;
-    for (std::set<int>::const_iterator i = stmt_nums.begin();
-         i != stmt_nums.end(); i++)
-      for (DependenceGraph::EdgeList::iterator j =
-             dep.vertex[*i].second.begin();
-           j != dep.vertex[*i].second.end(); j++)
-        if (stmt_nums.find(j->first) == stmt_nums.end()) {
-          // dependence from shifted statement to unshifted statement
-          std::vector<DependenceVector> dvs = j->second;
-          for (int k = 0; k < dvs.size(); k++) {
-            DependenceVector &dv = dvs[k];
-            if (dv.is_data_dependence()) {
-              if (dv.lbounds[dep_dim] != -posInfinity)
-                dv.lbounds[dep_dim] -= shift_amount;
-              if (dv.ubounds[dep_dim] != posInfinity)
-                dv.ubounds[dep_dim] -= shift_amount;
-            }
-          }
-          j->second = dvs;
-        }
-    for (int i = 0; i < dep.vertex.size(); i++)
-      if (stmt_nums.find(i) == stmt_nums.end())
-        for (DependenceGraph::EdgeList::iterator j =
-               dep.vertex[i].second.begin();
-             j != dep.vertex[i].second.end(); j++)
-          if (stmt_nums.find(j->first) != stmt_nums.end()) {
-            // dependence from unshifted statement to shifted statement
-            std::vector<DependenceVector> dvs = j->second;
-            for (int k = 0; k < dvs.size(); k++) {
-              DependenceVector &dv = dvs[k];
-              if (dv.is_data_dependence()) {
-                if (dv.lbounds[dep_dim] != -posInfinity)
-                  dv.lbounds[dep_dim] += shift_amount;
-                if (dv.ubounds[dep_dim] != posInfinity)
-                  dv.ubounds[dep_dim] += shift_amount;
-              }
-            }
-            j->second = dvs;
-          }
-  }
-}
-
-// bool Loop::fuse(const std::set<int> &stmt_nums, int level) {
-//   if (stmt_nums.size() == 0 || stmt_nums.size() == 1)
-//     return true;
-//   int dim = 2*level-1;
-
-//   // check for sanity of parameters
-//   std::vector<int> ref_lex;
-//   for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-//     if (*i < 0 || *i >= stmt.size())
-//       throw std::invalid_argument("invalid statement number " + to_string(*i));
-//     if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2)
-//       throw std::invalid_argument("invalid loop level " + to_string(level));
-//     if (ref_lex.size() == 0)
-//       ref_lex = getLexicalOrder(*i);
-//     else {
-//       std::vector<int> lex = getLexicalOrder(*i);
-//       for (int j = 0; j < dim-1; j+=2)
-//         if (lex[j] != ref_lex[j])
-//           throw std::invalid_argument("statements for fusion must be in the same level-" + to_string(level-1) + " subloop");
-//     }
-//   }
-
-//   // collect lexicographical order values from to-be-fused statements
-//   std::set<int> lex_values;
-//   for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-//     std::vector<int> lex = getLexicalOrder(*i);
-//     lex_values.insert(lex[dim-1]);
-//   }
-//   if (lex_values.size() == 1)
-//     return true;
-
-//   // negative dependence would prevent fusion
-//   int dep_dim = xform_index[dim].first;
-//   for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++) {
-//     ref_lex[dim-1] = *i;
-//     std::set<int> a = getStatements(ref_lex, dim-1);
-//     std::set<int>::iterator j = i;
-//     j++;
-//     for (; j != lex_values.end(); j++) {
-//       ref_lex[dim-1] = *j;
-//       std::set<int> b = getStatements(ref_lex, dim-1);
-//       for (std::set<int>::iterator ii = a.begin(); ii != a.end(); ii++)
-//         for (std::set<int>::iterator jj = b.begin(); jj != b.end(); jj++) {
-//           std::vector<DependenceVector> dvs;
-//           dvs = dep.getEdge(*ii, *jj);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim))
-//               throw loop_error("loop error: statements " + to_string(*ii) + " and " + to_string(*jj) + " cannot be fused together due to negative dependence");
-//           dvs = dep.getEdge(*jj, *ii);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim))
-//               throw loop_error("loop error: statements " + to_string(*jj) + " and " + to_string(*ii) + " cannot be fused together due to negative dependence");
-//         }
-//     }
-//   }
-
-//   // collect all other lexicographical order values from the subloop
-//   // enclosing these to-be-fused loops
-//   std::set<int> same_loop = getStatements(ref_lex, dim-3);
-//   std::set<int> other_lex_values;
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-//     std::vector<int> lex = getLexicalOrder(*i);
-//     if (lex_values.find(lex[dim-1]) == lex_values.end())
-//       other_lex_values.insert(lex[dim-1]);
-//   }
-
-//   // update to-be-fused loops due to dependence cycle
-//   Graph<std::set<int>, Empty> g;
-//   {
-//     std::set<int> t;
-//     for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++) {
-//       ref_lex[dim-1] = *i;
-//       std::set<int> t2 = getStatements(ref_lex, dim-1);
-//       std::set_union(t.begin(), t.end(), t2.begin(), t2.end(), inserter(t, t.begin()));
-//     }
-//     g.insert(t);
-//   }
-//   for (std::set<int>::iterator i = other_lex_values.begin(); i != other_lex_values.end(); i++) {
-//     ref_lex[dim-1] = *i;
-//     std::set<int> t = getStatements(ref_lex, dim-1);
-//     g.insert(t);
-//   }
-//   for (int i = 0; i < g.vertex.size(); i++)
-//     for (int j = i+1; j < g.vertex.size(); j++)
-//       for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++)
-//         for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
-//           std::vector<DependenceVector> dvs;
-//           dvs = dep.getEdge(*ii, *jj);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim)) {
-//               g.connect(i, j);
-//               break;
-//             }
-//           dvs = dep.getEdge(*jj, *ii);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim)) {
-//               g.connect(j, i);
-//               break;
-//             }
-//         }
-//   std::vector<std::set<int> > s = g.topoSort();
-//   int fused_lex_value = 0;
-//   for (int i = 0; i < s.size(); i++)
-//     if (s[i].find(0) != s[i].end()) {
-//       // now add additional lexicographical order values
-//       for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-//         if (*j != 0) {
-//           int stmt = *(g.vertex[*j].first.begin());
-//           std::vector<int> lex = getLexicalOrder(stmt);
-//           lex_values.insert(lex[dim-1]);
-//         }
-
-//       if (s.size() > 1) {
-//         if (i == 0) {
-//           int min_lex_value;
-//           for (std::set<int>::iterator j = s[i+1].begin(); j != s[i+1].end(); j++) {
-//             int stmt = *(g.vertex[*j].first.begin());
-//             std::vector<int> lex = getLexicalOrder(stmt);
-//             if (j == s[i+1].begin())
-//               min_lex_value = lex[dim-1];
-//             else if (lex[dim-1] < min_lex_value)
-//               min_lex_value = lex[dim-1];
-//           }
-//           fused_lex_value = min_lex_value - 1;
-//         }
-//         else {
-//           int max_lex_value;
-//           for (std::set<int>::iterator j = s[i-1].begin(); j != s[i-1].end(); j++) {
-//             int stmt = *(g.vertex[*j].first.begin());
-//             std::vector<int> lex = getLexicalOrder(stmt);
-//             if (j == s[i-1].begin())
-//               max_lex_value = lex[dim-1];
-//             else if (lex[dim-1] > max_lex_value)
-//               max_lex_value = lex[dim-1];
-//           }
-//           fused_lex_value = max_lex_value + 1;
-//         }
-//       }
-
-//       break;
-//     }
-
-//   // sort the newly updated to-be-fused lexicographical order values
-//   std::vector<int> ordered_lex_values;
-//   for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++)
-//     ordered_lex_values.push_back(*i);
-//   std::sort(ordered_lex_values.begin(), ordered_lex_values.end());
-
-//   // make sure internal loops inside to-be-fused loops have the same
-//   // lexicographical order before and after fusion
-//   std::vector<std::pair<int, int> > inside_lex_range(ordered_lex_values.size());
-//   for (int i = 0; i < ordered_lex_values.size(); i++) {
-//     ref_lex[dim-1] = ordered_lex_values[i];
-//     std::set<int> the_stmts = getStatements(ref_lex, dim-1);
-//     std::set<int>::iterator j = the_stmts.begin();
-//     std::vector<int> lex = getLexicalOrder(*j);
-//     int min_inside_lex_value = lex[dim+1];
-//     int max_inside_lex_value = lex[dim+1];
-//     j++;
-//     for (; j != the_stmts.end(); j++) {
-//       std::vector<int> lex = getLexicalOrder(*j);
-//       if (lex[dim+1] < min_inside_lex_value)
-//         min_inside_lex_value = lex[dim+1];
-//       if (lex[dim+1] > max_inside_lex_value)
-//         max_inside_lex_value = lex[dim+1];
-//     }
-//     inside_lex_range[i].first = min_inside_lex_value;
-//     inside_lex_range[i].second = max_inside_lex_value;
-//   }
-//   for (int i = 1; i < ordered_lex_values.size(); i++)
-//     if (inside_lex_range[i].first <= inside_lex_range[i-1].second) {
-//       int shift_lex_value = inside_lex_range[i-1].second - inside_lex_range[i].first + 1;
-//       ref_lex[dim-1] = ordered_lex_values[i];
-//       ref_lex[dim+1] = inside_lex_range[i].first;
-//       shiftLexicalOrder(ref_lex, dim+1, shift_lex_value);
-//       inside_lex_range[i].first += shift_lex_value;
-//       inside_lex_range[i].second += shift_lex_value;
-//     }
-
-//   // set lexicographical order for fused loops
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-//     std::vector<int> lex = getLexicalOrder(*i);
-//     if (lex_values.find(lex[dim-1]) != lex_values.end())
-//       assign_const(stmt[*i].xform, dim-1, fused_lex_value);      
-//   }
-
-//   // no need to update dependence graph
-//   ;
-
-//   return true;
-// }
-
-// bool Loop::distribute(const std::set<int> &stmt_nums, int level) {
-//   if (stmt_nums.size() == 0 || stmt_nums.size() == 1)
-//     return true;
-//   int dim = 2*level-1;
-
-//   // check for sanity of parameters
-//   std::vector<int> ref_lex;
-//   for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-//     if (*i < 0 || *i >= stmt.size())
-//       throw std::invalid_argument("invalid statement number " + to_string(*i));
-//     if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2)
-//       throw std::invalid_argument("invalid loop level " + to_string(level));
-//     if (ref_lex.size() == 0)
-//       ref_lex = getLexicalOrder(*i);
-//     else {
-//       std::vector<int> lex = getLexicalOrder(*i);
-//       for (int j = 0; j <= dim-1; j+=2)
-//         if (lex[j] != ref_lex[j])
-//           throw std::invalid_argument("statements for distribution must be in the same level-" + to_string(level) + " subloop");
-//     }
-//   }
-
-//   // find SCC in the to-be-distributed loop
-//   int dep_dim = xform_index[dim].first;
-//   std::set<int> same_loop = getStatements(ref_lex, dim-1);
-//   Graph<int, Empty> g;
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-//     g.insert(*i);
-//   for (int i = 0; i < g.vertex.size(); i++)
-//     for (int j = i+1; j < g.vertex.size(); j++) {
-//       std::vector<DependenceVector> dvs;
-//       dvs = dep.getEdge(g.vertex[i].first, g.vertex[j].first);
-//       for (int k = 0; k < dvs.size(); k++)
-//         if (dvs[k].isCarried(dep_dim)) {
-//           g.connect(i, j);
-//           break;
-//         }
-//       dvs = dep.getEdge(g.vertex[j].first, g.vertex[i].first);
-//       for (int k = 0; k < dvs.size(); k++)
-//         if (dvs[k].isCarried(dep_dim)) {
-//           g.connect(j, i);
-//           break;
-//         }
-//     }
-//   std::vector<std::set<int> > s = g.topoSort();
-
-//   // find statements that cannot be distributed due to dependence cycle
-//   Graph<std::set<int>, Empty> g2;
-//   for (int i = 0; i < s.size(); i++) {
-//     std::set<int> t;
-//     for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-//       if (stmt_nums.find(g.vertex[*j].first) != stmt_nums.end())
-//         t.insert(g.vertex[*j].first);
-//     if (!t.empty())
-//       g2.insert(t);
-//   }
-//   for (int i = 0; i < g2.vertex.size(); i++)
-//     for (int j = i+1; j < g2.vertex.size(); j++)
-//       for (std::set<int>::iterator ii = g2.vertex[i].first.begin(); ii != g2.vertex[i].first.end(); ii++)
-//         for (std::set<int>::iterator jj = g2.vertex[j].first.begin(); jj != g2.vertex[j].first.end(); jj++) {
-//           std::vector<DependenceVector> dvs;
-//           dvs = dep.getEdge(*ii, *jj);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim)) {
-//               g2.connect(i, j);
-//               break;
-//             }
-//           dvs = dep.getEdge(*jj, *ii);
-//           for (int k = 0; k < dvs.size(); k++)
-//             if (dvs[k].isCarried(dep_dim)) {
-//               g2.connect(j, i);
-//               break;
-//             }
-//         }
-//   std::vector<std::set<int> > s2 = g2.topoSort();
-
-//   // nothing to distribute
-//   if (s2.size() == 1)
-//     throw loop_error("loop error: no statement can be distributed due to dependence cycle");
-
-//   std::vector<std::set<int> > s3;
-//   for (int i = 0; i < s2.size(); i++) {
-//     std::set<int> t;
-//     for (std::set<int>::iterator j = s2[i].begin(); j != s2[i].end(); j++)
-//       std::set_union(t.begin(), t.end(), g2.vertex[*j].first.begin(), g2.vertex[*j].first.end(), inserter(t, t.begin()));
-//     s3.push_back(t);
-//   }
-
-//   // associate other affected statements with the right distributed statements
-//   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-//     if (stmt_nums.find(*i) == stmt_nums.end()) {
-//       bool is_inserted = false;
-//       int potential_insertion_point = 0;
-//       for (int j = 0; j < s3.size(); j++) {
-//         for (std::set<int>::iterator k = s3[j].begin(); k != s3[j].end(); k++) {
-//           std::vector<DependenceVector> dvs;
-//           dvs = dep.getEdge(*i, *k);
-//           for (int kk = 0; kk < dvs.size(); kk++)
-//             if (dvs[kk].isCarried(dep_dim)) {
-//               s3[j].insert(*i);
-//               is_inserted = true;
-//               break;
-//             }
-//           dvs = dep.getEdge(*k, *i);
-//           for (int kk = 0; kk < dvs.size(); kk++)
-//             if (dvs[kk].isCarried(dep_dim))
-//               potential_insertion_point = j;
-//         }
-//         if (is_inserted)
-//           break;
-//       }
-
-//       if (!is_inserted)
-//         s3[potential_insertion_point].insert(*i);
-//     }
-
-//   // set lexicographical order after distribution
-//   int order = ref_lex[dim-1];
-//   shiftLexicalOrder(ref_lex, dim-1, s3.size()-1);
-//   for (std::vector<std::set<int> >::iterator i = s3.begin(); i != s3.end(); i++) {
-//     for (std::set<int>::iterator j = (*i).begin(); j != (*i).end(); j++)
-//       assign_const(stmt[*j].xform, dim-1, order);
-//     order++;
-//   }
-
-//   // no need to update dependence graph
-//   ;
-
-//   return true;
-// }
-
diff --git a/mem_mapping_utils.cc b/mem_mapping_utils.cc
deleted file mode 100644
index 645fe59..0000000
--- a/mem_mapping_utils.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-#include <vector>
-#include <string.h>
-#include <map>
-#include "rose.h"
-#include "mem_mapping_utils.hh"
-
-using namespace SageBuilder;
-using namespace SageInterface;
-
-memory_mapping::memory_mapping (bool used, const char * array_name){
-  this->mem_used = used;
-  this->add(array_name);
-}
-
-texture_memory_mapping::texture_memory_mapping(bool used, const char* array_name) : memory_mapping(used, array_name) { }
-constant_memory_mapping::constant_memory_mapping(bool used, const char* array_name) : memory_mapping(used, array_name) { }
-//texture_memory_mapping::texture_memory_mapping (bool used, const char* array_name, int width, int height) {
-//  tex_mem_used = used;
-//  this->add(array_name, width, height);
-//}
-
-void memory_mapping::add(const char * array_name) {
-  this->mapped_array_name.push_back(std::string(array_name));
-  //std::vector<int> ivec = std::vector<int>();
-  //dims[std::string(array_name)] = ivec;
-}
-//void texture_memory_mapping::add(const char* array_name, int width, int height) {
-//  tex_mapped_array_name.push_back(std::string(array_name));
-//  std::vector<int> ivec = std::vector<int>();
-//  ivec.push_back(width);
-//  ivec.push_back(height);
-//  dims[std::string(array_name)] = ivec;
-//}
-
-bool memory_mapping::is_mem_used(){
-  return this->mem_used;
-}
-bool memory_mapping::is_array_mapped(const char * array_name){
-  
-  for( int i=0; i<mapped_array_name.size(); i++){
-    if(!(strcmp(array_name, mapped_array_name[i].c_str())))
-      return true;
-  }
-  return false;
-}
-void memory_mapping::set_mapped_symbol(const char * array_name, SgVariableSymbol* sym) {
-  this->mapped_symbol[std::string(array_name)] = sym;
-}
-void texture_memory_mapping::set_devptr_symbol(const char * array_name, SgVariableSymbol* sym) {
-  devptr_symbol[std::string(array_name)] = sym;
-}
-void memory_mapping::set_vardef(const char* array_name, VarDefs* vardef) {
-  this->vardefs[std::string(array_name)] = vardef;
-}
-SgVarRefExp* memory_mapping::get_mapped_symbol_exp(const char * array_name) {
-  return buildVarRefExp(this->mapped_symbol[std::string(array_name)]);
-}
-SgVarRefExp* texture_memory_mapping::get_devptr_symbol_exp(const char * array_name) {
-  return buildVarRefExp(devptr_symbol[std::string(array_name)]);
-}
-VarDefs* memory_mapping::get_vardef(const char* vardef_name) {
-  return this->vardefs[std::string(vardef_name)];
-}
-//int texture_memory_mapping::get_dims(const char* array_name) {
-//  return (int)(dims[std::string(array_name)].size());
-//}
-//int texture_memory_mapping::get_dim_length(const char* array_name, int dim) {
-//  return dims[std::string(array_name)][dim];
-//}
-memory_mapping::memory_mapping() {
-  mem_used = false;
-}
-texture_memory_mapping::texture_memory_mapping() : memory_mapping() { }
-constant_memory_mapping::constant_memory_mapping() : memory_mapping() { }
-
-
diff --git a/mem_mapping_utils.hh b/mem_mapping_utils.hh
deleted file mode 100644
index 8ff0545..0000000
--- a/mem_mapping_utils.hh
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef MEM_MAPPING_UTILS_HH
-#define MEM_MAPPING_UTILS_HH
-
-#include <vector>
-#include <string.h>
-#include <map>
-#include "rose.h"
-
-using namespace SageInterface;
-using namespace SageBuilder;
-
-struct VarDefs;
-
-class memory_mapping {
-private:
-  bool mem_used;
-  std::vector< std::string > mapped_array_name;
-  std::map<std::string, SgVariableSymbol*> mapped_symbol;
-  std::map<std::string, VarDefs*> vardefs;
-public:
-  memory_mapping();
-  memory_mapping(bool used, const char* array_name);
-  void add(const char* array_name);
-  bool is_mem_used();
-  bool is_array_mapped(const char* array_name);
-  void set_mapped_symbol(const char* array_name, SgVariableSymbol* sym);
-  void set_vardef(const char* array_name, VarDefs* vardef);
-  SgVarRefExp* get_mapped_symbol_exp(const char* array_name);
-  VarDefs* get_vardef(const char* vardef_name);
-};
-
-//protonu --class introduced to hold texture memory information in one single place
-//this might help me get over the weird memory issues I am having with the Loop class
-//where someone/something corrupts my memory
-
-class texture_memory_mapping : public memory_mapping {
-private:
-  std::map<std::string, SgVariableSymbol*> devptr_symbol;
-  // a hack for multi-dimensional texture mapping
-  //std::map<std::string, std::vector<int> > dims;
-public:
-  texture_memory_mapping ( bool used, const char * array_name);
-  //texture_memory_mapping (bool used, const char* array_name, int width, int height);
-  // this function is a hack to get arround a bug
-  // void add(const char* array_name, int width, int height);
-  void set_devptr_symbol(const char * array_name, SgVariableSymbol* sym);
-  SgVarRefExp* get_devptr_symbol_exp(const char * array_name);
-  //int get_dim_length(const char* array_name, int dim);
-  //int get_dims(const char* array_name);
-  texture_memory_mapping();
-};
-
-class constant_memory_mapping : public memory_mapping {
-public:
-  constant_memory_mapping();
-  constant_memory_mapping(bool used, const char* array_name);
-};
-
-#endif
diff --git a/omega/INSTALL b/omega/INSTALL
deleted file mode 100644
index f3c3558..0000000
--- a/omega/INSTALL
+++ /dev/null
@@ -1,34 +0,0 @@
-BUILD
-=====
-
-0. Install Rose using the rose installation instructions given.
-
-1. Edit Makefile.config. Change BUILD_CODEGEN to false if you don't want
-   CodeGen+ library to be built.
-
-2. Do "make depend".
-
-3. Optionally, do "make clean" to remove object files or "make veryclean"
-   to additionally remove target files.
-
-4. Do "make".
-
-
-INSTALLATION
-============
-
-You can use Omega+ and CodeGen+ in source directory since all links
-are already created in bin/, lib/ and include/ subdirectories.
-
-omega/            source directory root
-  bin/            command line interface "oc"
-  lib/            libraries "libomega.a" and "libcode_gen.a"
-  include/
-    omega.h       main Omega+ header file
-    omega/        Omega+ header files
-    basic/        basic utility header files
-    code_gen/     CodeGen+ header files
-
-You can also do "make install" to copy necessary files into
-/usr/local for root account, or use home directory for other accounts.
-
diff --git a/omega/README b/omega/README
deleted file mode 100644
index 378f4bd..0000000
--- a/omega/README
+++ /dev/null
@@ -1,96 +0,0 @@
-Omega+ and CodeGen+ 2.2 open source release
-See LICENSE file for copyright information.
-
-Omega+ is a mathematical library for manipulating integer linear
-constraints over integer variables in first order logic, and
-operations on integer sets and their mappings. CodeGen+ is a code
-generation library by scanning the points in a union of polytopes.
-A command-line interface to libraries is also included.
-
-
-What is new?
-============
-
-version 2.2:
-  * Redesigned polyhedra scanning which generates higher quality code
-    than before especially for complex set of polyhedra.
-  * New SimpleHull for hull approximation (deprecate Hull).
-  * Command line editing and history support in calculator.
-
-version 2.1:
-  * Updated "effort" parameter's meaning in MMGenerateCode: value n
-    (n >= 0, default to 1) means that control overheads are removed
-    from all n-depth innermost loops.
-  * Enhanced stride handling in the code generation.
-  * Support code generation for a set of iteration spaces with different
-    dimensionality.
-  * New ConvexRepresentation that reduces the number of conjuncts in a union
-    (deprecate CheckForConvexPairs and CheckForConvexRepresentation).
-  * Handle floor/ceiling defined variables cleanly in output code.
-  * Use namespace omega for the library.
-  * New closure functions contributed by Klimek Tomasz (R^+ and R^@).
-
-version 2.0:
-  * Improved internal code generation interface so that it generates both
-    string and rose ouput now, and more easily extendable for new compiler
-    intermediate representations.
-  * Improved gist function so that integer modular constraints are handled
-    more gracefully.
-  * Merge duplicate if-conditions in generated code, which might still miss
-    a few opportunities due to the way AST is constructed.
-  * Correct output/input variable substitution for non-unimodular
-    mapping relations.
-  * Deprecate Omega's assert/Exit interface.
-  * Some fixing in calculator's parsing and interactive interface.
-
-version 1.2 (Omega Project):
-  * Support for code generation with memory mappings, as described in
-    Tina Shen's MASPLAS '98 paper. This is available in oc via the
-    tcodegen function; see examples/calc/mm* for examples.
-  * Use of the compile-time flags -DSTILL_CHECK_MULT=1 -DNDEBUG turns off 
-    all assertions and chechk _except_ some checks for integer overflow
-    during variable elimination in the omega core. Unless you know a priori
-    that overflow cannot occur, you should use this instead of just plain
-    -DNDEBUG when optimizing.
-  * You can now use "assertUnsatisfiable relation" to cause oc to quit if
-    "relation" could be satisfiable. This is mainly useful when running oc
-    in a script.
-
-version 1.1 (Omega Project):
-  * An exact convex hull computation.
-  * An improved system for handling inexact relations, including taking
-    upper and lower bounds, checking for subsets, and checking tautologies.
-  * Better handling of existentially quantified variables: we can now
-    negate and generate code for sets like:
-    {[i]: 1 <= i <= n && exists (alpha: i <= 10*alpha <= i+k)}.
-  * An Example operator, that gives a sample solution to set or relation.
-
-version 0.90 (Omega Project):
-  * Initial release.
-
-
-DIRECTORIES 
-===========
-
-omega/
-  omega_lib/     source files for the Omega+ library
-  code_gen/      source files for the CodeGen+ library
-  omega_calc/    source files for the calculator
-  examples/      script examples using calculator 
-    c_code/      code examples for using libraries
-  bin/           links to executables: oc
-  lib/           links to libraries: libomega.a, libcode_gen.a
-  include/       links to header files
-
-
-DOCUMENTATION AND QUESTIONS
-===========================
-
-There are only old documents from the Omega Project under doc/ subdirectory
-for now.
-
-Software website:
-  http://www.chunchen.info/omega
-
-For questions, bug reports or suggestions, please contact:
-  mailto:riverofdreams@gmail.com
diff --git a/omega/ROSE_INSTALL.txt b/omega/ROSE_INSTALL.txt
deleted file mode 100644
index 79e0c43..0000000
--- a/omega/ROSE_INSTALL.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-INSTALLATION STEPS:
-
-1) Please install Boost library version <= 1.45.0 using these instruccions
-
-1. Download BOOST.
-Download BOOST at www.boost.org/users/download.
-
-2. Untar BOOST.
-Type tar -zxf BOOST-[VersionNumber].tar.gz to untar the BOOST distribution.
-
-3. Create a separate install tree.
-Type mkdir installTree to create a location for the install.
-
-4. Run the bootstrap.sh script.
-Type ./bootstrap.sh --prefix=[installTree] 
-
-5. Run bjam.
-Type ./bjam install --prefix=[installTree] 
-
-
-6) set your BOOSTHOME environment variable to where you've installed BOOST.
-
-7) Download the latest version of rose from the website.
-   https://outreach.scidac.gov/frs/?group_id=24
-
-8) set the JAVA_HOME environment variable in your ${HOME}/.bashrc
-   eg. export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk
-
-9)  add this to the LD_LIBRARY_PATH environment variable
-
-    LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/i386/server:$LD_LIBRARY_PATH
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${BOOSTHOME}/lib 
-
-10) make a new empty directory separate from the downloaded source directory($ROSE_SRC) for rose. 
-   eg. mkdir ${HOME}/compileTree    
-
-11) set your ROSEHOME environment variable in ${HOME}/.bashrc to ${HOME}/compileTree 
-
-12) run the following command from this ${ROSEHOME}
-    ${ROSE_SRC}/configure --prefix=${ROSEHOME} --with-boost=${BOOSTHOME} --with-boost-libdir=${BOOSTHOME}/lib -with-haskell=no
-
-13) run the following command to compile:
-    make install-core 
-
-
-14) Install lua version <= 5.1 (usually not necessary to set the LUAHOME environment variable unless 
-    you installed it in a local directory, in which case set LUAHOME to that directory). Lua is only required for
-    cuda-chill and not plain chill.
-
-15) If you are installing for CUDA-CHILL set the CUDACHILL environment variable to true
-    else false
-
-
-16) Install omega by doing the following commands
-   i)  make clean
-   ii) make veryclean
-   iii)make depend
-   iv) make
-
-17) Set your OMEGAHOME environment variable to the appropriate directory in ${HOME}/.bashrc
-
-18)  Install cuda-chill by doing the following commands
-   i)  make clean
-   ii) make veryclean
-   iii)make depend-cuda-chill
-   iv) make cuda-chill
-
-   else if you are installing just plain chill
-   export CUDACHILL=false; (remember to rebuild plain omega as well)
-   i)  make clean
-   ii) make veryclean
-   iii)make depend
-   iv) make
-
-19) Go to examples/cuda-chill and run ../../cuda-chill mm.lua
-
-20) If running plain Chill go to examples/chill and run ../../chill gemm.script
diff --git a/omega/bin/oc b/omega/bin/oc
deleted file mode 120000
index be58273..0000000
--- a/omega/bin/oc
+++ /dev/null
@@ -1 +0,0 @@
-../omega_calc/obj/oc
\ No newline at end of file
diff --git a/orig_loop_datacopy.cc b/orig_loop_datacopy.cc
deleted file mode 100644
index 04741bc..0000000
--- a/orig_loop_datacopy.cc
+++ /dev/null
@@ -1,1175 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2008 University of Southern California
- Copyright (C) 2009-2010 University of Utah
- All Rights Reserved.
-
- Purpose:
-   Various data copy schemes.
-
- Notes:
-
- History:
-   02/20/09 Created by Chun Chen by splitting original datacopy from loop.cc
-*****************************************************************************/
-
-#include <code_gen/code_gen.h>
-#include <code_gen/output_repr.h>
-#include "loop.hh"
-#include "omegatools.hh"
-#include "ir_code.hh"
-#include "chill_error.hh"
-
-using namespace omega;
-
-//
-// data copy function by referring arrays by numbers.
-// e.g. A[i] = A[i-1] + B[i]
-//      parameter array_ref_num=[0,2] means to copy data touched by A[i-1] and A[i]
-//
-bool Loop::datacopy(const std::vector<std::pair<int, std::vector<int> > > &array_ref_nums, int level,
-                    bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) {
-  // check for sanity of parameters
-  std::set<int> same_loop;
-  for (int i = 0; i < array_ref_nums.size(); i++) {
-    int stmt_num = array_ref_nums[i].first;
-    if (stmt_num < 0 || stmt_num >= stmt.size())
-      throw std::invalid_argument("invalid statement number " + to_string(stmt_num));
-    if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-      throw std::invalid_argument("invalid loop level " + to_string(level));
-    if (i == 0) {
-      std::vector<int> lex = getLexicalOrder(stmt_num);
-      same_loop = getStatements(lex, 2*level-2);
-    }
-    else if (same_loop.find(stmt_num) == same_loop.end())
-      throw std::invalid_argument("array references for data copy must be located in the same subloop");
-  }
-  
-  // convert array reference numbering scheme to actual array references
-  std::vector<std::pair<int, std::vector<IR_ArrayRef *> > > selected_refs;
-  for (int i = 0; i < array_ref_nums.size(); i++) {
-    if (array_ref_nums[i].second.size() == 0)
-      continue;
-    
-    int stmt_num = array_ref_nums[i].first;
-    selected_refs.push_back(std::make_pair(stmt_num, std::vector<IR_ArrayRef *>()));
-    std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[stmt_num].code);
-    std::vector<bool> selected(refs.size(), false);
-    for (int j = 0; j < array_ref_nums[i].second.size(); j++) {
-      int ref_num = array_ref_nums[i].second[j];
-      if (ref_num < 0 || ref_num >= refs.size()) {
-        for (int k = 0; k < refs.size(); k++)
-          delete refs[k];
-        throw std::invalid_argument("invalid array reference number " + to_string(ref_num) + " in statement " + to_string(stmt_num));
-      }
-      selected_refs[selected_refs.size()-1].second.push_back(refs[ref_num]);
-      selected[ref_num] = true;
-    }
-    for (int j = 0; j < refs.size(); j++)
-      if (!selected[j])
-        delete refs[j];
-  }
-  if (selected_refs.size() == 0)
-    throw std::invalid_argument("found no array references to copy");
-  
-  // do the copy
-  return datacopy_privatized(selected_refs, level, std::vector<int>(), allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type);
-}
-
-//
-// data copy function by referring arrays by name.
-// e.g. A[i] = A[i-1] + B[i]
-//      parameter array_name=A means to copy data touched by A[i-1] and A[i]
-//
-bool Loop::datacopy(int stmt_num, int level, const std::string &array_name,
-                    bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) {
-  // check for sanity of parameters
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement number " + to_string(stmt_num));
-  if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  
-  // collect array references by name
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  int dim = 2*level - 1;
-  std::set<int> same_loop = getStatements(lex, dim-1);
-  
-  std::vector<std::pair<int, std::vector<IR_ArrayRef *> > > selected_refs;
-  for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-    std::vector<IR_ArrayRef *> t;
-    std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[*i].code);  
-    for (int j = 0; j < refs.size(); j++)
-      if (refs[j]->name() == array_name)
-        t.push_back(refs[j]);
-      else
-        delete refs[j];
-    if (t.size() != 0)
-      selected_refs.push_back(std::make_pair(*i, t)); 
-  }
-  if (selected_refs.size() == 0)
-    throw std::invalid_argument("found no array references with name " + to_string(array_name) + " to copy");
-  
-  // do the copy
-  return datacopy_privatized(selected_refs, level, std::vector<int>(), allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type);
-}
-
-
-bool Loop::datacopy_privatized(int stmt_num, int level, const std::string &array_name, const std::vector<int> &privatized_levels,
-                               bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) {
-  // check for sanity of parameters
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement number " + to_string(stmt_num));
-  if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  
-  // collect array references by name
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  int dim = 2*level - 1;
-  std::set<int> same_loop = getStatements(lex, dim-1);
-  
-  std::vector<std::pair<int, std::vector<IR_ArrayRef *> > > selected_refs;
-  for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-    selected_refs.push_back(std::make_pair(*i, std::vector<IR_ArrayRef *>()));
-    
-    std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[*i].code);  
-    for (int j = 0; j < refs.size(); j++)
-      if (refs[j]->name() == array_name)
-        selected_refs[selected_refs.size()-1].second.push_back(refs[j]);
-      else
-        delete refs[j];
-  }
-  if (selected_refs.size() == 0)
-    throw std::invalid_argument("found no array references with name " + to_string(array_name) + " to copy");
-  
-  // do the copy
-  return datacopy_privatized(selected_refs, level, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type);
-}
-
-
-bool Loop::datacopy_privatized(const std::vector<std::pair<int, std::vector<int> > > &array_ref_nums, int level, const std::vector<int> &privatized_levels, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) {
-  // check for sanity of parameters
-  std::set<int> same_loop;
-  for (int i = 0; i < array_ref_nums.size(); i++) {
-    int stmt_num = array_ref_nums[i].first;
-    if (stmt_num < 0 || stmt_num >= stmt.size())
-      throw std::invalid_argument("invalid statement number " + to_string(stmt_num));
-    if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-      throw std::invalid_argument("invalid loop level " + to_string(level));
-    if (i == 0) {
-      std::vector<int> lex = getLexicalOrder(stmt_num);
-      same_loop = getStatements(lex, 2*level-2);
-    }
-    else if (same_loop.find(stmt_num) == same_loop.end())
-      throw std::invalid_argument("array references for data copy must be located in the same subloop");
-  }
-  
-  // convert array reference numbering scheme to actual array references
-  std::vector<std::pair<int, std::vector<IR_ArrayRef *> > > selected_refs;
-  for (int i = 0; i < array_ref_nums.size(); i++) {
-    if (array_ref_nums[i].second.size() == 0)
-      continue;
-    
-    int stmt_num = array_ref_nums[i].first;
-    selected_refs.push_back(std::make_pair(stmt_num, std::vector<IR_ArrayRef *>()));
-    std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[stmt_num].code);
-    std::vector<bool> selected(refs.size(), false);
-    for (int j = 0; j < array_ref_nums[i].second.size(); j++) {
-      int ref_num = array_ref_nums[i].second[j];
-      if (ref_num < 0 || ref_num >= refs.size()) {
-        for (int k = 0; k < refs.size(); k++)
-          delete refs[k];
-        throw std::invalid_argument("invalid array reference number " + to_string(ref_num) + " in statement " + to_string(stmt_num));
-      }
-      selected_refs[selected_refs.size()-1].second.push_back(refs[ref_num]);
-      selected[ref_num] = true;
-    }
-    for (int j = 0; j < refs.size(); j++)
-      if (!selected[j])
-        delete refs[j];
-  }
-  if (selected_refs.size() == 0)
-    throw std::invalid_argument("found no array references to copy");
-  
-  // do the copy
-  return datacopy_privatized(selected_refs, level, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type);
-}
-
-
-//
-// Implement low level datacopy function with lots of options.
-//
-bool Loop::datacopy_privatized(const std::vector<std::pair<int, std::vector<IR_ArrayRef *> > > &stmt_refs, int level,
-                               const std::vector<int> &privatized_levels,
-                               bool allow_extra_read, int fastest_changing_dimension,
-                               int padding_stride, int padding_alignment, int memory_type) {
-  if (stmt_refs.size() == 0)
-    return true;
-  
-  // check for sanity of parameters
-  IR_ArraySymbol *sym = NULL;
-  std::vector<int> lex;
-  std::set<int> active;
-  if (level <= 0)
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  for (int i = 0; i < privatized_levels.size(); i++) {
-    if (i == 0) {
-      if (privatized_levels[i] < level)
-        throw std::invalid_argument("privatized loop levels must be no less than level " + to_string(level));
-    }
-    else if (privatized_levels[i] <= privatized_levels[i-1])
-      throw std::invalid_argument("privatized loop levels must be in ascending order");
-  }
-  for (int i = 0; i < stmt_refs.size(); i++) {
-    int stmt_num = stmt_refs[i].first;
-    active.insert(stmt_num);
-    if (stmt_num < 0 || stmt_num >= stmt.size())
-      throw std::invalid_argument("invalid statement number " + to_string(stmt_num));
-    if (privatized_levels.size() != 0) {
-      if (privatized_levels[privatized_levels.size()-1] > stmt[stmt_num].loop_level.size())
-        throw std::invalid_argument("invalid loop level " + to_string(privatized_levels[privatized_levels.size()-1]) + " for statement " + to_string(stmt_num));
-    }
-    else {
-      if (level > stmt[stmt_num].loop_level.size())
-        throw std::invalid_argument("invalid loop level " + to_string(level) + " for statement " + to_string(stmt_num));
-    }
-    for (int j = 0; j < stmt_refs[i].second.size(); j++) {
-      if (sym == NULL) {
-        sym = stmt_refs[i].second[j]->symbol();
-        lex = getLexicalOrder(stmt_num);
-      }
-      else {
-        IR_ArraySymbol *t = stmt_refs[i].second[j]->symbol();
-        if (t->name() != sym->name()) {
-          delete t;
-          delete sym;
-          throw std::invalid_argument("try to copy data from different arrays");
-        }
-        delete t;
-      }
-    }
-  }
-  if (!(fastest_changing_dimension >= -1 && fastest_changing_dimension < sym->n_dim()))
-    throw std::invalid_argument("invalid fastest changing dimension for the array to be copied");
-  if (padding_stride < 0)
-    throw std::invalid_argument("invalid temporary array stride requirement");
-  if (padding_alignment == -1 || padding_alignment == 0)
-    throw std::invalid_argument("invalid temporary array alignment requirement");
-  
-  int dim = 2*level - 1;
-  int n_dim = sym->n_dim();
-  
-  if (fastest_changing_dimension == -1)
-    switch (sym->layout_type()) {
-    case IR_ARRAY_LAYOUT_ROW_MAJOR:
-      fastest_changing_dimension = n_dim - 1;
-      break;
-    case IR_ARRAY_LAYOUT_COLUMN_MAJOR:
-      fastest_changing_dimension = 0;
-      break;
-    default:
-      throw loop_error("unsupported array layout");
-    }
-  
-  
-  // build iteration spaces for all reads and for all writes separately
-  apply_xform(active);
-  bool has_write_refs = false;
-  bool has_read_refs = false;
-  Relation wo_copy_is = Relation::False(level-1+privatized_levels.size()+n_dim);
-  Relation ro_copy_is = Relation::False(level-1+privatized_levels.size()+n_dim);
-  for (int i = 0; i < stmt_refs.size(); i++) {
-    int stmt_num = stmt_refs[i].first;
-    
-    for (int j = 0; j < stmt_refs[i].second.size(); j++) {
-      Relation mapping(stmt[stmt_num].IS.n_set(), level-1+privatized_levels.size()+n_dim);
-      for (int k = 1; k <= mapping.n_inp(); k++)
-        mapping.name_input_var(k, stmt[stmt_num].IS.set_var(k)->name());
-      mapping.setup_names();
-      F_And *f_root = mapping.add_and();
-      for (int k = 1; k <= level-1; k++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(mapping.input_var(k), 1);
-        h.update_coef(mapping.output_var(k), -1);
-      }
-      for (int k = 0; k < privatized_levels.size(); k++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(mapping.input_var(privatized_levels[k]), 1);
-        h.update_coef(mapping.output_var(level+k), -1);
-      }
-      for (int k = 0; k < n_dim; k++) {
-        CG_outputRepr *repr = stmt_refs[i].second[j]->index(k);
-        exp2formula(ir, mapping, f_root, freevar, repr, mapping.output_var(level-1+privatized_levels.size()+k+1), 'w', IR_COND_EQ, false);
-        repr->clear();
-        delete repr;
-      }
-      Relation r = Range(Restrict_Domain(mapping, Intersection(copy(stmt[stmt_num].IS), Extend_Set(copy(this->known), stmt[stmt_num].IS.n_set() - this->known.n_set()))));
-      if (stmt_refs[i].second[j]->is_write()) {
-        has_write_refs = true;
-        wo_copy_is = Union(wo_copy_is, r);
-        wo_copy_is.simplify(2, 4);
-      }
-      else {
-        has_read_refs = true;
-        //protonu--removing the next line for now
-        ro_copy_is = Union(ro_copy_is, r);
-        ro_copy_is.simplify(2, 4);
-        //ro_copy_is = ConvexRepresentation(Union(ro_copy_is, r));
-        
-      }
-    }
-  }
-  
-  if (allow_extra_read) {
-    Relation t = DecoupledConvexHull(copy(ro_copy_is));
-    if (t.number_of_conjuncts() > 1)
-      ro_copy_is = RectHull(ro_copy_is);
-    else
-      ro_copy_is = t;
-  }
-  else {
-    Relation t = ConvexRepresentation(copy(ro_copy_is));
-    if (t.number_of_conjuncts() > 1)
-      ro_copy_is = RectHull(ro_copy_is);
-    else
-      ro_copy_is = t;
-  }
-  wo_copy_is = ConvexRepresentation(wo_copy_is);
-  
-  if (allow_extra_read) {
-    Tuple<Relation> Rs;
-    Tuple<int> active;
-    for (DNF_Iterator di(ro_copy_is.query_DNF()); di; di++) {
-      Rs.append(Relation(ro_copy_is, di.curr()));
-      active.append(1);
-    }
-    Relation the_gcs = Relation::True(ro_copy_is.n_set());
-    for (int i = level-1+privatized_levels.size()+1; i <= level-1+privatized_levels.size()+n_dim; i++) {
-      Relation r = greatest_common_step(Rs, active, i, Relation::Null());
-      the_gcs = Intersection(the_gcs, r);
-    }
-    
-    ro_copy_is = Approximate(ro_copy_is);
-    ro_copy_is = ConvexRepresentation(ro_copy_is);
-    ro_copy_is = Intersection(ro_copy_is, the_gcs);
-    ro_copy_is.simplify();
-  }
-  for (int i = 1; i <= level-1+privatized_levels.size()+n_dim; i++) {
-    wo_copy_is.name_set_var(i, tmp_loop_var_name_prefix+to_string(i));
-    ro_copy_is.name_set_var(i, tmp_loop_var_name_prefix+to_string(i));
-  } 
-  wo_copy_is.setup_names();
-  ro_copy_is.setup_names();
-  
-  // build merged iteration space for calculating temporary array size
-  bool already_use_recthull = false;
-  Relation untampered_copy_is = ConvexRepresentation(Union(copy(wo_copy_is), copy(ro_copy_is)));
-  Relation copy_is = untampered_copy_is;
-  if (copy_is.number_of_conjuncts() > 1) {
-    try {
-      copy_is = ConvexHull(copy(untampered_copy_is));
-    }
-    catch (const std::overflow_error &e) {
-      copy_is = RectHull(copy(untampered_copy_is));
-      already_use_recthull = true;
-    }
-  }
-  
-  
-Retry_copy_is:
-  // extract temporary array information
-  CG_outputBuilder *ocg = ir->builder();
-  std::vector<CG_outputRepr *> index_lb(n_dim); // initialized to NULL
-  std::vector<coef_t> index_stride(n_dim, 1);
-  std::vector<bool> is_index_eq(n_dim, false);
-  std::vector<std::pair<int, CG_outputRepr *> > index_sz(0);  
-  Relation reduced_copy_is = copy(copy_is);
-  
-  for (int i = 0; i < n_dim; i++) {
-    if (i != 0)
-      reduced_copy_is = Project(reduced_copy_is, level-1+privatized_levels.size()+i, Set_Var);
-    Relation bound = get_loop_bound(reduced_copy_is, level-1+privatized_levels.size()+i);
-    
-    // extract stride
-    EQ_Handle stride_eq;
-    {
-      bool simple_stride = true;
-      int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(level-1+privatized_levels.size()+i+1), stride_eq, simple_stride);
-      if (strides > 1) {
-        throw loop_error("too many strides");
-      }
-      else if (strides == 1) {
-        int sign = stride_eq.get_coef(bound.set_var(level-1+privatized_levels.size()+i+1));
-        Constr_Vars_Iter it(stride_eq, true);
-        index_stride[i] = abs((*it).coef/sign);
-      }
-    }
-    
-    // check if this arary index requires loop
-    Conjunct *c = bound.query_DNF()->single_conjunct();
-    for (EQ_Iterator ei(c->EQs()); ei; ei++) {
-      if ((*ei).has_wildcards())
-        continue;
-      
-      int coef = (*ei).get_coef(bound.set_var(level-1+privatized_levels.size()+i+1));
-      if (coef != 0) {
-        int sign = 1;
-        if (coef < 0) {
-          coef = -coef;
-          sign = -1;
-        }
-        
-        CG_outputRepr *op = NULL;
-        for (Constr_Vars_Iter ci(*ei); ci; ci++) {
-          switch ((*ci).var->kind()) {
-          case Input_Var:
-          {
-            if ((*ci).var != bound.set_var(level-1+privatized_levels.size()+i+1))
-              if ((*ci).coef*sign == 1)
-                op = ocg->CreateMinus(op, ocg->CreateIdent((*ci).var->name()));
-              else if ((*ci).coef*sign == -1)
-                op = ocg->CreatePlus(op, ocg->CreateIdent((*ci).var->name()));
-              else if ((*ci).coef*sign > 1)
-                op = ocg->CreateMinus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent((*ci).var->name())));
-              else // (*ci).coef*sign < -1
-                op = ocg->CreatePlus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent((*ci).var->name())));
-            break;
-          }
-          case Global_Var:
-          {
-            Global_Var_ID g = (*ci).var->get_global_var();
-            if ((*ci).coef*sign == 1)
-              op = ocg->CreateMinus(op, ocg->CreateIdent(g->base_name()));
-            else if ((*ci).coef*sign == -1)
-              op = ocg->CreatePlus(op, ocg->CreateIdent(g->base_name()));
-            else if ((*ci).coef*sign > 1)
-              op = ocg->CreateMinus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent(g->base_name())));
-            else // (*ci).coef*sign < -1
-              op = ocg->CreatePlus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent(g->base_name())));
-            break;
-          }
-          default:
-            throw loop_error("unsupported array index expression");
-          }
-        }
-        if ((*ei).get_const() != 0)
-          op = ocg->CreatePlus(op, ocg->CreateInt(-sign*((*ei).get_const())));
-        if (coef != 1)
-          op = ocg->CreateIntegerDivide(op, ocg->CreateInt(coef));
-        
-        index_lb[i] = op;
-        is_index_eq[i] = true;
-        break;
-      }
-    }
-    if (is_index_eq[i])
-      continue;
-    
-    // seperate lower and upper bounds
-    std::vector<GEQ_Handle> lb_list, ub_list;
-    for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-      int coef = (*gi).get_coef(bound.set_var(level-1+privatized_levels.size()+i+1));
-      if (coef != 0 && (*gi).has_wildcards()) {
-        bool clean_bound = true;
-        GEQ_Handle h;
-        for (Constr_Vars_Iter cvi(*gi, true); gi; gi++)
-          if (!findFloorInequality(bound, (*cvi).var, h, bound.set_var(level-1+privatized_levels.size()+i+1))) {
-            clean_bound = false;
-            break;
-          }
-        if (!clean_bound)
-          continue;
-      }
-      
-      if (coef > 0)
-        lb_list.push_back(*gi);
-      else if (coef < 0)
-        ub_list.push_back(*gi);
-    }
-    if (lb_list.size() == 0 || ub_list.size() == 0)
-      if (already_use_recthull)
-        throw loop_error("failed to calcuate array footprint size");
-      else {
-        copy_is = RectHull(copy(untampered_copy_is));
-        already_use_recthull = true;
-        goto Retry_copy_is;
-      }
-    
-    // build lower bound representation
-    Tuple<CG_outputRepr *> lb_repr_list;
-    for (int j = 0; j < lb_list.size(); j++)
-      lb_repr_list.append(outputLBasRepr(ocg, lb_list[j], bound,
-                                         bound.set_var(level-1+privatized_levels.size()+i+1), 
-                                         index_stride[i], stride_eq, Relation::True(bound.n_set()),
-                                         std::vector<CG_outputRepr *>(bound.n_set())));
-    
-    if (lb_repr_list.size() > 1)
-      index_lb[i] = ocg->CreateInvoke("max", lb_repr_list);
-    else if (lb_repr_list.size() == 1)
-      index_lb[i] = lb_repr_list[1];
-    
-    // build temporary array size representation
-    {
-      Relation cal(copy_is.n_set(), 1);
-      F_And *f_root = cal.add_and();
-      for (int j = 0; j < ub_list.size(); j++)
-        for (int k = 0; k < lb_list.size(); k++) {
-          GEQ_Handle h = f_root->add_GEQ();
-          
-          for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var:
-            {
-              int pos = (*ci).var->get_position();
-              h.update_coef(cal.input_var(pos), (*ci).coef);
-              break;
-            }
-            case Global_Var:
-            {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = cal.get_local(g);
-              else
-                v = cal.get_local(g, (*ci).var->function_of());
-              h.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot calculate temporay array size statically");
-            }
-          }
-          h.update_const(ub_list[j].get_const());
-          
-          for (Constr_Vars_Iter ci(lb_list[k]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var:
-            {
-              int pos = (*ci).var->get_position();
-              h.update_coef(cal.input_var(pos), (*ci).coef);
-              break;
-            }
-            case Global_Var:
-            {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = cal.get_local(g);
-              else
-                v = cal.get_local(g, (*ci).var->function_of());
-              h.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot calculate temporay array size statically");
-            }
-          }
-          h.update_const(lb_list[k].get_const());
-          
-          h.update_const(1);
-          h.update_coef(cal.output_var(1), -1);
-        }
-      
-      cal = Restrict_Domain(cal, copy(copy_is));
-      for (int j = 1; j <= cal.n_inp(); j++)
-        cal = Project(cal, j, Input_Var);
-      cal.simplify();
-      
-      // pad temporary array size
-      // TODO: for variable array size, create padding formula
-      Conjunct *c = cal.query_DNF()->single_conjunct();
-      bool is_index_bound_const = false;
-      for (GEQ_Iterator gi(c->GEQs()); gi && !is_index_bound_const; gi++)
-        if ((*gi).is_const(cal.output_var(1))) {
-          coef_t size = (*gi).get_const() / (-(*gi).get_coef(cal.output_var(1)));
-          if (padding_stride != 0) {
-            size = (size + index_stride[i] - 1) / index_stride[i];
-            if (i == fastest_changing_dimension)
-              size = size * padding_stride;
-          }
-          if (i == fastest_changing_dimension) {
-            if (padding_alignment > 1) { // align to boundary for data packing
-              int residue = size % padding_alignment;
-              if (residue)
-                size = size+padding_alignment-residue;
-            }
-            else if (padding_alignment < -1) {  // un-alignment for memory bank conflicts
-              while (gcd(size, static_cast<coef_t>(-padding_alignment)) != 1)
-                size++;
-            }
-          }
-          index_sz.push_back(std::make_pair(i, ocg->CreateInt(size)));
-          is_index_bound_const = true;
-        }
-      
-      if (!is_index_bound_const) {
-        for (GEQ_Iterator gi(c->GEQs()); gi && !is_index_bound_const; gi++) {
-          int coef = (*gi).get_coef(cal.output_var(1));
-          if (coef < 0) {
-            CG_outputRepr *op = NULL;
-            for (Constr_Vars_Iter ci(*gi); ci; ci++) {
-              if ((*ci).var != cal.output_var(1)) {
-                switch((*ci).var->kind()) {
-                case Global_Var:
-                {
-                  Global_Var_ID g = (*ci).var->get_global_var();
-                  if ((*ci).coef == 1)
-                    op = ocg->CreatePlus(op, ocg->CreateIdent(g->base_name()));
-                  else if ((*ci).coef == -1)
-                    op = ocg->CreateMinus(op, ocg->CreateIdent(g->base_name()));
-                  else if ((*ci).coef > 1)
-                    op = ocg->CreatePlus(op, ocg->CreateTimes(ocg->CreateInt((*ci).coef), ocg->CreateIdent(g->base_name())));
-                  else // (*ci).coef < -1
-                    op = ocg->CreateMinus(op, ocg->CreateTimes(ocg->CreateInt(-(*ci).coef), ocg->CreateIdent(g->base_name())));
-                  break;
-                }
-                default:
-                  throw loop_error("failed to generate array index bound code");
-                }
-              }
-            }
-            int c = (*gi).get_const();
-            if (c > 0)
-              op = ocg->CreatePlus(op, ocg->CreateInt(c));
-            else if (c < 0)
-              op = ocg->CreateMinus(op, ocg->CreateInt(-c));
-            if (padding_stride != 0) {
-              if (i == fastest_changing_dimension) {
-                coef_t g = gcd(index_stride[i], static_cast<coef_t>(padding_stride));
-                coef_t t1 = index_stride[i] / g;
-                if (t1 != 1)
-                  op = ocg->CreateIntegerDivide(ocg->CreatePlus(op, ocg->CreateInt(t1-1)), ocg->CreateInt(t1));
-                coef_t t2 = padding_stride / g;
-                if (t2 != 1)
-                  op = ocg->CreateTimes(op, ocg->CreateInt(t2));
-              }
-              else if (index_stride[i] != 1) {
-                op = ocg->CreateIntegerDivide(ocg->CreatePlus(op, ocg->CreateInt(index_stride[i]-1)), ocg->CreateInt(index_stride[i]));
-              }
-            }
-            
-            index_sz.push_back(std::make_pair(i, op));
-            break;
-          }
-        }
-      }
-    }
-  }
-  
-  // change the temporary array index order
-  for (int i = 0; i < index_sz.size(); i++)
-    if (index_sz[i].first == fastest_changing_dimension)
-      switch (sym->layout_type()) {
-      case IR_ARRAY_LAYOUT_ROW_MAJOR:
-        std::swap(index_sz[index_sz.size()-1], index_sz[i]);
-        break;
-      case IR_ARRAY_LAYOUT_COLUMN_MAJOR:
-        std::swap(index_sz[0], index_sz[i]);
-        break;
-      default:
-        throw loop_error("unsupported array layout");
-      }
-  
-  // declare temporary array or scalar
-  IR_Symbol *tmp_sym;
-  if (index_sz.size() == 0) {
-    tmp_sym = ir->CreateScalarSymbol(sym, memory_type);
-  }
-  else {
-    std::vector<CG_outputRepr *> tmp_array_size(index_sz.size());
-    for (int i = 0; i < index_sz.size(); i++)
-      tmp_array_size[i] = index_sz[i].second->clone();
-    tmp_sym = ir->CreateArraySymbol(sym, tmp_array_size, memory_type);
-  }
-  
-  // create temporary array read initialization code
-  CG_outputRepr *copy_code_read;
-  if (has_read_refs)
-    if (index_sz.size() == 0) {
-      IR_ScalarRef *tmp_scalar_ref = ir->CreateScalarRef(static_cast<IR_ScalarSymbol *>(tmp_sym));
-      
-      std::vector<CG_outputRepr *> rhs_index(n_dim);
-      for (int i = 0; i < index_lb.size(); i++)
-        if (is_index_eq[i])
-          rhs_index[i] = index_lb[i]->clone();
-        else
-          rhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name());
-      IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, rhs_index);
-      
-      copy_code_read = ir->builder()->CreateAssignment(0, tmp_scalar_ref->convert(), copied_array_ref->convert());
-    }
-    else {
-      std::vector<CG_outputRepr *> lhs_index(index_sz.size());
-      for (int i = 0; i < index_sz.size(); i++) {
-        int cur_index_num = index_sz[i].first;
-        CG_outputRepr *cur_index_repr = ocg->CreateMinus(ocg->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+cur_index_num+1)->name()), index_lb[cur_index_num]->clone());
-        if (padding_stride != 0) {
-          if (i == n_dim-1) {
-            coef_t g = gcd(index_stride[cur_index_num], static_cast<coef_t>(padding_stride));
-            coef_t t1 = index_stride[cur_index_num] / g;
-            if (t1 != 1)
-              cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(t1));
-            coef_t t2 = padding_stride / g;
-            if (t2 != 1)
-              cur_index_repr = ocg->CreateTimes(cur_index_repr, ocg->CreateInt(t2));
-          }
-          else if (index_stride[cur_index_num] != 1) {
-            cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(index_stride[cur_index_num]));
-          }
-        }
-        
-        if (ir->ArrayIndexStartAt() != 0)
-          cur_index_repr = ocg->CreatePlus(cur_index_repr, ocg->CreateInt(ir->ArrayIndexStartAt()));
-        lhs_index[i] = cur_index_repr;
-      }
-      
-      IR_ArrayRef *tmp_array_ref = ir->CreateArrayRef(static_cast<IR_ArraySymbol *>(tmp_sym), lhs_index);
-      
-      std::vector<CG_outputRepr *> rhs_index(n_dim);
-      for (int i = 0; i < index_lb.size(); i++)
-        if (is_index_eq[i])
-          rhs_index[i] = index_lb[i]->clone();
-        else
-          rhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name());
-      IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, rhs_index);
-      
-      copy_code_read = ir->builder()->CreateAssignment(0, tmp_array_ref->convert(), copied_array_ref->convert());
-    }
-  
-  // create temporary array write back code
-  CG_outputRepr *copy_code_write;
-  if (has_write_refs)
-    if (index_sz.size() == 0) {
-      IR_ScalarRef *tmp_scalar_ref = ir->CreateScalarRef(static_cast<IR_ScalarSymbol *>(tmp_sym));
-      
-      std::vector<CG_outputRepr *> rhs_index(n_dim);
-      for (int i = 0; i < index_lb.size(); i++)
-        if (is_index_eq[i])
-          rhs_index[i] = index_lb[i]->clone();
-        else
-          rhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name());
-      IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, rhs_index);
-      
-      copy_code_write = ir->builder()->CreateAssignment(0, copied_array_ref->convert(), tmp_scalar_ref->convert());
-    }
-    else {
-      std::vector<CG_outputRepr *> lhs_index(n_dim);
-      for (int i = 0; i < index_lb.size(); i++)
-        if (is_index_eq[i])
-          lhs_index[i] = index_lb[i]->clone();
-        else
-          lhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name());
-      IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, lhs_index);
-      
-      std::vector<CG_outputRepr *> rhs_index(index_sz.size());
-      for (int i = 0; i < index_sz.size(); i++) {
-        int cur_index_num = index_sz[i].first;
-        CG_outputRepr *cur_index_repr = ocg->CreateMinus(ocg->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+cur_index_num+1)->name()), index_lb[cur_index_num]->clone());
-        if (padding_stride != 0) {
-          if (i == n_dim-1) {
-            coef_t g = gcd(index_stride[cur_index_num], static_cast<coef_t>(padding_stride));
-            coef_t t1 = index_stride[cur_index_num] / g;
-            if (t1 != 1)
-              cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(t1));
-            coef_t t2 = padding_stride / g;
-            if (t2 != 1)
-              cur_index_repr = ocg->CreateTimes(cur_index_repr, ocg->CreateInt(t2));
-          }
-          else if (index_stride[cur_index_num] != 1) {
-            cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(index_stride[cur_index_num]));
-          }
-        }
-        
-        if (ir->ArrayIndexStartAt() != 0)
-          cur_index_repr = ocg->CreatePlus(cur_index_repr, ocg->CreateInt(ir->ArrayIndexStartAt()));
-        rhs_index[i] = cur_index_repr;
-      }
-      IR_ArrayRef *tmp_array_ref = ir->CreateArrayRef(static_cast<IR_ArraySymbol *>(tmp_sym), rhs_index);
-      
-      copy_code_write = ir->builder()->CreateAssignment(0, copied_array_ref->convert(), tmp_array_ref->convert());
-    }
-  
-  // now we can remove those loops for array indexes that are
-  // dependent on others
-  if (!(index_sz.size() == n_dim && (sym->layout_type() == IR_ARRAY_LAYOUT_ROW_MAJOR || n_dim <= 1))) {
-    Relation mapping(level-1+privatized_levels.size()+n_dim, level-1+privatized_levels.size()+index_sz.size());
-    F_And *f_root = mapping.add_and();
-    for (int i = 1; i <= level-1+privatized_levels.size(); i++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.input_var(i), 1);
-      h.update_coef(mapping.output_var(i), -1);
-    }
-    
-    int cur_index = 0;
-    std::vector<int> mapped_index(index_sz.size());
-    for (int i = 0; i < n_dim; i++)
-      if (!is_index_eq[i]) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(mapping.input_var(level-1+privatized_levels.size()+i+1), 1);
-        switch (sym->layout_type()) {
-        case IR_ARRAY_LAYOUT_COLUMN_MAJOR: {
-          h.update_coef(mapping.output_var(level-1+privatized_levels.size()+index_sz.size()-cur_index), -1);
-          mapped_index[index_sz.size()-cur_index-1] = i;
-          break;
-        }
-        case IR_ARRAY_LAYOUT_ROW_MAJOR: {
-          h.update_coef(mapping.output_var(level-1+privatized_levels.size()+cur_index+1), -1);
-          mapped_index[cur_index] = i;
-          break;
-        }
-        default:
-          throw loop_error("unsupported array layout");
-        }
-        cur_index++;
-      }
-    
-    wo_copy_is = Range(Restrict_Domain(copy(mapping), wo_copy_is));
-    ro_copy_is = Range(Restrict_Domain(copy(mapping), ro_copy_is));
-    for (int i = 1; i <= level-1+privatized_levels.size(); i++) {
-      wo_copy_is.name_set_var(i, copy_is.set_var(i)->name());
-      ro_copy_is.name_set_var(i, copy_is.set_var(i)->name());
-    }
-    for (int i = 0; i < index_sz.size(); i++) {
-      wo_copy_is.name_set_var(level-1+privatized_levels.size()+i+1, copy_is.set_var(level-1+privatized_levels.size()+mapped_index[i]+1)->name());
-      ro_copy_is.name_set_var(level-1+privatized_levels.size()+i+1, copy_is.set_var(level-1+privatized_levels.size()+mapped_index[i]+1)->name());
-    }      
-    wo_copy_is.setup_names();
-    ro_copy_is.setup_names();
-  }
-  
-  // insert read copy statement
-  int old_num_stmt = stmt.size();
-  int ro_copy_stmt_num = -1;
-  if (has_read_refs) {
-    Relation copy_xform(ro_copy_is.n_set(), 2*ro_copy_is.n_set()+1);
-    {
-      F_And *f_root = copy_xform.add_and();
-      for (int i = 1; i <= ro_copy_is.n_set(); i++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(copy_xform.input_var(i), 1);
-        h.update_coef(copy_xform.output_var(2*i), -1);
-      }
-      for (int i = 1; i <= dim; i+=2) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(copy_xform.output_var(i), -1);
-        h.update_const(lex[i-1]);
-      }
-      for (int i = dim+2; i <= copy_xform.n_out(); i+=2) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(copy_xform.output_var(i), 1);
-      }
-    }
-    
-    Statement copy_stmt_read;
-    copy_stmt_read.IS = ro_copy_is;
-    copy_stmt_read.xform = copy_xform;
-    copy_stmt_read.code = copy_code_read;
-    copy_stmt_read.loop_level = std::vector<LoopLevel>(ro_copy_is.n_set());
-    for (int i = 0; i < level-1; i++) {
-      copy_stmt_read.loop_level[i].type = stmt[*(active.begin())].loop_level[i].type;
-      if (stmt[*(active.begin())].loop_level[i].type == LoopLevelTile &&
-          stmt[*(active.begin())].loop_level[i].payload >= level) {
-        int j;
-        for (j = 0; j < privatized_levels.size(); j++)
-          if (privatized_levels[j] == stmt[*(active.begin())].loop_level[i].payload)
-            break;
-        if (j == privatized_levels.size())
-          copy_stmt_read.loop_level[i].payload = -1;
-        else
-          copy_stmt_read.loop_level[i].payload = level + j;
-      }
-      else
-        copy_stmt_read.loop_level[i].payload = stmt[*(active.begin())].loop_level[i].payload;
-      copy_stmt_read.loop_level[i].parallel_level = stmt[*(active.begin())].loop_level[i].parallel_level;
-    }
-    for (int i = 0; i < privatized_levels.size(); i++) {
-      copy_stmt_read.loop_level[level-1+i].type = stmt[*(active.begin())].loop_level[privatized_levels[i]].type;
-      copy_stmt_read.loop_level[level-1+i].payload = stmt[*(active.begin())].loop_level[privatized_levels[i]].payload;
-      copy_stmt_read.loop_level[level-1+i].parallel_level = stmt[*(active.begin())].loop_level[privatized_levels[i]].parallel_level;
-    }
-    int left_num_dim = num_dep_dim - (get_last_dep_dim_before(*(active.begin()), level) + 1);
-    for (int i = 0; i < min(left_num_dim, static_cast<int>(index_sz.size())); i++) {
-      copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelOriginal;
-      copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].payload = num_dep_dim-left_num_dim+i;
-      copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0;
-    }
-    for (int i = min(left_num_dim, static_cast<int>(index_sz.size())); i < index_sz.size(); i++) {
-      copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelUnknown;
-      copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].payload = -1;
-      copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0;
-    }
-    
-    shiftLexicalOrder(lex, dim-1, 1);
-    stmt.push_back(copy_stmt_read);
-    ro_copy_stmt_num = stmt.size() - 1;
-    dep.insert();
-  }
-  
-  // insert write copy statement
-  int wo_copy_stmt_num = -1;
-  if (has_write_refs) {
-    Relation copy_xform(wo_copy_is.n_set(), 2*wo_copy_is.n_set()+1);
-    {
-      F_And *f_root = copy_xform.add_and();
-      for (int i = 1; i <= wo_copy_is.n_set(); i++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(copy_xform.input_var(i), 1);
-        h.update_coef(copy_xform.output_var(2*i), -1);
-      }
-      for (int i = 1; i <= dim; i+=2) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(copy_xform.output_var(i), -1);
-        h.update_const(lex[i-1]);
-      }
-      for (int i = dim+2; i <= copy_xform.n_out(); i+=2) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(copy_xform.output_var(i), 1);
-      }
-    }
-    
-    Statement copy_stmt_write;
-    copy_stmt_write.IS = wo_copy_is;
-    copy_stmt_write.xform = copy_xform;
-    copy_stmt_write.code = copy_code_write;
-    copy_stmt_write.loop_level = std::vector<LoopLevel>(wo_copy_is.n_set());
-    for (int i = 0; i < level-1; i++) {
-      copy_stmt_write.loop_level[i].type = stmt[*(active.begin())].loop_level[i].type;
-      if (stmt[*(active.begin())].loop_level[i].type == LoopLevelTile &&
-          stmt[*(active.begin())].loop_level[i].payload >= level) {
-        int j;
-        for (j = 0; j < privatized_levels.size(); j++)
-          if (privatized_levels[j] == stmt[*(active.begin())].loop_level[i].payload)
-            break;
-        if (j == privatized_levels.size())
-          copy_stmt_write.loop_level[i].payload = -1;
-        else
-          copy_stmt_write.loop_level[i].payload = level + j;
-      }
-      else
-        copy_stmt_write.loop_level[i].payload = stmt[*(active.begin())].loop_level[i].payload;
-      copy_stmt_write.loop_level[i].parallel_level = stmt[*(active.begin())].loop_level[i].parallel_level;
-    }
-    for (int i = 0; i < privatized_levels.size(); i++) {
-      copy_stmt_write.loop_level[level-1+i].type = stmt[*(active.begin())].loop_level[privatized_levels[i]].type;
-      copy_stmt_write.loop_level[level-1+i].payload = stmt[*(active.begin())].loop_level[privatized_levels[i]].payload;
-      copy_stmt_write.loop_level[level-1+i].parallel_level = stmt[*(active.begin())].loop_level[privatized_levels[i]].parallel_level;
-    }
-    int left_num_dim = num_dep_dim - (get_last_dep_dim_before(*(active.begin()), level) + 1);
-    for (int i = 0; i < min(left_num_dim, static_cast<int>(index_sz.size())); i++) {
-      copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelOriginal;
-      copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].payload = num_dep_dim-left_num_dim+i;
-      copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0;
-    }
-    for (int i = min(left_num_dim, static_cast<int>(index_sz.size())); i < index_sz.size(); i++) {
-      copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelUnknown;
-      copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].payload = -1;
-      copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0;
-    }
-    
-    lex[dim-1]++;
-    shiftLexicalOrder(lex, dim-1, -2);
-    stmt.push_back(copy_stmt_write);
-    wo_copy_stmt_num = stmt.size() - 1;
-    dep.insert();
-  } 
-  
-  // replace original array accesses with temporary array accesses
-  for (int i =0; i < stmt_refs.size(); i++)
-    for (int j = 0; j < stmt_refs[i].second.size(); j++) {
-      if (index_sz.size() == 0) {
-        IR_ScalarRef *tmp_scalar_ref = ir->CreateScalarRef(static_cast<IR_ScalarSymbol *>(tmp_sym));
-        ir->ReplaceExpression(stmt_refs[i].second[j], tmp_scalar_ref->convert());
-      }
-      else {
-        std::vector<CG_outputRepr *> index_repr(index_sz.size());
-        for (int k = 0; k < index_sz.size(); k++) {
-          int cur_index_num = index_sz[k].first;
-          
-          CG_outputRepr *cur_index_repr = ocg->CreateMinus(stmt_refs[i].second[j]->index(cur_index_num), index_lb[cur_index_num]->clone());
-          if (padding_stride != 0) {
-            if (k == n_dim-1) {
-              coef_t g = gcd(index_stride[cur_index_num], static_cast<coef_t>(padding_stride));
-              coef_t t1 = index_stride[cur_index_num] / g;
-              if (t1 != 1)
-                cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(t1));
-              coef_t t2 = padding_stride / g;
-              if (t2 != 1)
-                cur_index_repr = ocg->CreateTimes(cur_index_repr, ocg->CreateInt(t2));
-            }
-            else if (index_stride[cur_index_num] != 1) {
-              cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(index_stride[cur_index_num]));
-            }
-          }
-          
-          if (ir->ArrayIndexStartAt() != 0)
-            cur_index_repr = ocg->CreatePlus(cur_index_repr, ocg->CreateInt(ir->ArrayIndexStartAt()));
-          index_repr[k] = cur_index_repr;
-        }
-        
-        IR_ArrayRef *tmp_array_ref = ir->CreateArrayRef(static_cast<IR_ArraySymbol *>(tmp_sym), index_repr);
-        ir->ReplaceExpression(stmt_refs[i].second[j], tmp_array_ref->convert());
-      }
-    }
-  
-  // update dependence graph
-  int dep_dim = get_last_dep_dim_before(*(active.begin()), level) + 1;
-  if (ro_copy_stmt_num != -1) {
-    for (int i = 0; i < old_num_stmt; i++) {
-      std::vector<std::vector<DependenceVector> > D;
-      
-      for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();) {
-        if (active.find(i) != active.end() && active.find(j->first) == active.end()) {
-          std::vector<DependenceVector> dvs1, dvs2;
-          for (int k = 0; k < j->second.size(); k++) {
-            DependenceVector dv = j->second[k];
-            if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_R2R || dv.type == DEP_R2W))
-              dvs1.push_back(dv);
-            else
-              dvs2.push_back(dv);
-          }
-          j->second = dvs2;
-          if (dvs1.size() > 0)
-            dep.connect(ro_copy_stmt_num, j->first, dvs1);
-        }
-        else if (active.find(i) == active.end() && active.find(j->first) != active.end()) {
-          std::vector<DependenceVector> dvs1, dvs2;
-          for (int k = 0; k < j->second.size(); k++) {
-            DependenceVector dv = j->second[k];
-            if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_R2R || dv.type == DEP_W2R))
-              dvs1.push_back(dv);
-            else
-              dvs2.push_back(dv);
-          }
-          j->second = dvs2;
-          if (dvs1.size() > 0)
-            D.push_back(dvs1);
-        }
-        
-        if (j->second.size() == 0)
-          dep.vertex[i].second.erase(j++);
-        else
-          j++;
-      }
-      
-      for (int j = 0; j < D.size(); j++)
-        dep.connect(i, ro_copy_stmt_num, D[j]);
-    }
-    
-    // insert dependences from copy statement loop to copied statements
-    DependenceVector dv;
-    dv.type = DEP_W2R;
-    dv.sym = tmp_sym->clone();
-    dv.lbounds = std::vector<coef_t>(num_dep_dim, 0);
-    dv.ubounds = std::vector<coef_t>(num_dep_dim, 0);
-    for (int i = dep_dim; i < num_dep_dim; i++) {
-      dv.lbounds[i] = -posInfinity;
-      dv.ubounds[i] = posInfinity;
-    } 
-    for (std::set<int>::iterator i = active.begin(); i != active.end(); i++)
-      dep.connect(ro_copy_stmt_num, *i, dv);
-  }
-  
-  if (wo_copy_stmt_num != -1) {
-    for (int i = 0; i < old_num_stmt; i++) {
-      std::vector<std::vector<DependenceVector> > D;
-      
-      for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();) {
-        if (active.find(i) != active.end() && active.find(j->first) == active.end()) {
-          std::vector<DependenceVector> dvs1, dvs2;
-          for (int k = 0; k < j->second.size(); k++) {
-            DependenceVector dv = j->second[k];
-            if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_W2R || dv.type == DEP_W2W))
-              dvs1.push_back(dv);
-            else
-              dvs2.push_back(dv);
-          }
-          j->second = dvs2;
-          if (dvs1.size() > 0)
-            dep.connect(wo_copy_stmt_num, j->first, dvs1);
-        }
-        else if (active.find(i) == active.end() && active.find(j->first) != active.end()) {
-          std::vector<DependenceVector> dvs1, dvs2;
-          for (int k = 0; k < j->second.size(); k++) {
-            DependenceVector dv = j->second[k];
-            if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_R2W || dv.type == DEP_W2W))
-              dvs1.push_back(dv);
-            else
-              dvs2.push_back(dv);
-          }
-          j->second = dvs2;
-          if (dvs1.size() > 0)
-            D.push_back(dvs1);
-        }
-        
-        if (j->second.size() == 0)
-          dep.vertex[i].second.erase(j++);
-        else
-          j++;
-      }
-      
-      for (int j = 0; j < D.size(); j++)
-        dep.connect(i, wo_copy_stmt_num, D[j]);
-    }
-    
-    // insert dependences from copied statements to write statements
-    DependenceVector dv;
-    dv.type = DEP_W2R;
-    dv.sym = tmp_sym->clone();
-    dv.lbounds = std::vector<coef_t>(num_dep_dim, 0);
-    dv.ubounds = std::vector<coef_t>(num_dep_dim, 0);
-    for (int i = dep_dim; i < num_dep_dim; i++) {
-      dv.lbounds[i] = -posInfinity;
-      dv.ubounds[i] = posInfinity;
-    } 
-    for (std::set<int>::iterator i = active.begin(); i != active.end(); i++)
-      dep.connect(*i, wo_copy_stmt_num, dv);
-    
-  }
-  
-  // update variable name for dependences among copied statements
-  for (int i = 0; i < old_num_stmt; i++) {
-    if (active.find(i) != active.end())
-      for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++)
-        if (active.find(j->first) != active.end())
-          for (int k = 0; k < j->second.size(); k++) {
-            IR_Symbol *s = tmp_sym->clone();
-            j->second[k].sym = s;
-          }
-  }
-  
-  // insert anti-dependence from write statement to read statement
-  if (ro_copy_stmt_num != -1 && wo_copy_stmt_num != -1)
-    if (dep_dim >= 0) {
-      DependenceVector dv;
-      dv.type = DEP_R2W;
-      dv.sym = tmp_sym->clone();
-      dv.lbounds = std::vector<coef_t>(num_dep_dim, 0);
-      dv.ubounds = std::vector<coef_t>(num_dep_dim, 0);
-      for (int k = dep_dim; k < num_dep_dim; k++) {
-        dv.lbounds[k] = -posInfinity;
-        dv.ubounds[k] = posInfinity;
-      }
-      for (int k = 0; k < dep_dim; k++) {
-        if (k != 0) {
-          dv.lbounds[k-1] = 0;
-          dv.ubounds[k-1] = 0;
-        }
-        dv.lbounds[k] = 1;
-        dv.ubounds[k] = posInfinity;
-        dep.connect(wo_copy_stmt_num, ro_copy_stmt_num, dv);
-      }
-    }
-  
-  
-  // cleanup
-  delete sym;
-  delete tmp_sym;
-  for (int i = 0; i < index_lb.size(); i++) {
-    index_lb[i]->clear();
-    delete index_lb[i];
-  }
-  for (int i = 0; i < index_sz.size(); i++) {
-    index_sz[i].second->clear();
-    delete index_sz[i].second;
-  }
-  
-  return true;
-}
-- 
cgit v1.2.3-70-g09d2