Initial commit

author: dhuth <derickhuth@gmail.com> 2014-08-27 09:52:06 -0600
committer: dhuth <derickhuth@gmail.com> 2014-08-27 09:52:06 -0600
commit: bff810cc371a38f493d688c54f71013f5a7d53bf (patch)
tree: fbe86954bb3c01deb21da9e41ebff5baa2889a45 /examples
download: chill-bff810cc371a38f493d688c54f71013f5a7d53bf.tar.gz
chill-bff810cc371a38f493d688c54f71013f5a7d53bf.tar.bz2
chill-bff810cc371a38f493d688c54f71013f5a7d53bf.zip
39 files changed, 2308 insertions, 0 deletions
diff --git a/examples/chill/gemm.c b/examples/chill/gemm.c
new file mode 100644
index 0000000..355bafe
--- /dev/null
+++ b/examples/chill/gemm.c
@@ -0,0 +1,15 @@
+int main() {
+	/* Matrix-multiply loop nest, c[i][j] += a[i][k] * b[k][j], written in j-k-i order over 512x512 float arrays. */
+	float a[512][512], b[512][512], c[512][512];
+	/* NOTE(review): n is never assigned, so the loop bounds read an indeterminate value if this file is compiled and run as-is; presumably n is meant to stay a free (symbolic) bound for the transformation script - confirm before initializing it. */
+	int i, j, k;
+	int n;
+	for (j = 0; j < n; j++)
+		for (k = 0; k < n; k++)
+			for (i = 0; i < n; i++) {
+				c[i][j] = c[i][j] + a[i][k] * b[k][j];
+			}
+	/* a, b and c are never initialized or read back; only the shape of the loop nest matters here. */
+	return 0;
+}
+
diff --git a/examples/chill/gemm.script b/examples/chill/gemm.script
new file mode 100644
index 0000000..ed91567
--- /dev/null
+++ b/examples/chill/gemm.script
@@ -0,0 +1,31 @@
+# matrix multiply with a large array size, for an Intel machine
+source: gemm.c
+procedure: main
+format: rose
+loop: 0
+
+TI = 128
+TJ = 8
+TK = 512
+UI = 2
+UJ = 2
+
+permute([3,1,2])
+tile(0,2,TJ)
+#print space
+tile(0,2,TI)
+#print space
+tile(0,5,TK)
+#print space
+
+datacopy(0,3,a,false,1)
+#print space
+
+datacopy(0,4,b)
+print
+unroll(0,4,UI)#print space
+print 
+unroll(0,5,UJ)
+#print space
+print
+
diff --git a/examples/chill/gemv.c b/examples/chill/gemv.c
new file mode 100644
index 0000000..610d4cb
--- /dev/null
+++ b/examples/chill/gemv.c
@@ -0,0 +1,15 @@
+#define N 10
+
+int main() {
+	// int n;
+	float a[N];
+	float b[N];
+	float c[N][N];
+	// Matrix-vector product accumulated into a: a[i] += c[i][j] * b[j].
+	int i, j;
+	// Both loops start at 1, so row 0 and column 0 of c and element 0 of a and b are never touched.
+	for (i = 1; i < N; i++)
+		for (j = 1; j < N; j++)
+			a[i] = a[i] + c[i][j] * b[j];
+	// No explicit return: since C99, reaching the end of main() is equivalent to return 0.
+}
diff --git a/examples/chill/gemv.script b/examples/chill/gemv.script
new file mode 100644
index 0000000..f1d5f89
--- /dev/null
+++ b/examples/chill/gemv.script
@@ -0,0 +1,9 @@
+source: gemv.c # matrix-vector multiply
+procedure: main
+format : rose
+loop: 0
+
+
+
+original()
+print
diff --git a/examples/chill/jacobi1.c b/examples/chill/jacobi1.c
new file mode 100644
index 0000000..0fcaee4
--- /dev/null
+++ b/examples/chill/jacobi1.c
@@ -0,0 +1,13 @@
+#define N 512
+
+int main() {
+	int i, t;
+	/* Time-stepped 3-point stencil: row t of a is computed from row t - 1. */
+	float a[N][N + 1];
+	/* The extra column keeps a[t - 1][i + 1] inside its row when i == N - 1; the loop bounds are deliberately left unchanged. */
+	for (t = 2; t <= 100; t++)
+		for (i = 2; i <= N - 1; i++)
+			a[t][i] = a[t - 1][i - 1] + a[t - 1][i] + a[t - 1][i + 1];
+
+	return 0;
+}
diff --git a/examples/chill/jacobi1.script b/examples/chill/jacobi1.script
new file mode 100644
index 0000000..c0dec8d
--- /dev/null
+++ b/examples/chill/jacobi1.script
@@ -0,0 +1,18 @@
+#
+# tiling perfect jacobi loop nest with time step, use
+# unimodular transformation first (only applicable to the
+# perfect loop nest) to make tiling legal.
+#
+
+source: jacobi1.c
+procedure: main
+format : rose
+loop: 0
+
+print dep
+
+nonsingular([[1,0],[1,1]])  # unimodular matrix, determinant is one
+tile(0,2,64)
+
+print dep
+print
diff --git a/examples/chill/jacobi2.c b/examples/chill/jacobi2.c
new file mode 100644
index 0000000..b8d8d7b
--- /dev/null
+++ b/examples/chill/jacobi2.c
@@ -0,0 +1,15 @@
+#define N 512
+
+int main() {
+	double a[N + 1]; /* +1: the stencil below reads a[i + 1] with i as large as N - 1 */
+	double b[N];
+	int t, i;
+	for (t = 1; t <= 100; t++) { /* 100 sweeps of a 3-point weighted average */
+		for (i = 2; i <= N - 1; i++)
+			b[i] = (double) 0.25 * (a[i - 1] + a[i + 1]) + (double) 0.5 * a[i];
+		/* copy the new values back so that the next sweep reads them from a */
+		for (i = 2; i <= N - 1; i++)
+			a[i] = b[i];
+	}
+	return 0;
+}
diff --git a/examples/chill/jacobi2.script b/examples/chill/jacobi2.script
new file mode 100644
index 0000000..afe14c6
--- /dev/null
+++ b/examples/chill/jacobi2.script
@@ -0,0 +1,21 @@
+#
+# tiling imperfect jacobi loop nest, more details in the paper
+# "Automatic Tiling of Iterative Stencil Loops" by Zhiyuan Li and
+# Yonghong Song, TOPLAS, 2004.
+#
+
+source: jacobi2.c
+procedure: main
+format: rose
+loop: 0
+
+print dep
+
+original()
+shift([1], 2, 1)
+fuse([0,1], 2)  # optional
+skew([0,1], 2, [2,1])
+tile(0, 2, 32, 1)
+
+print dep
+print
diff --git a/examples/chill/unroll.c b/examples/chill/unroll.c
new file mode 100644
index 0000000..68f4633
--- /dev/null
+++ b/examples/chill/unroll.c
@@ -0,0 +1,31 @@
+#define N 14
+void foo(int n, float* x, float* y, float* z, float* f3, float* f1, float* w) {
+	int dt;
+	/* NOTE(review): dt is read below without ever being assigned, and the parameter n is unused. */
+	int i, j;
+	/* Unit-stride loop with literal bounds: writes x[1] .. x[14]. */
+	for (i = 1; i <= 14; i++)
+		x[i] = 1.0;
+	/* Stride-3 loop: writes y[1], y[4], ..., y[13]. */
+	for (i = 1; i <= 14; i += 3)
+		y[i] = 1.0;
+	/* Stride-3 loop with bounds offset by N: writes z[N + 1], z[N + 4], ..., z[N + 19]. */
+	for (i = N + 1; i <= N + 20; i += 3)
+		z[i] = 1.0;
+	/* Inner bounds depend on i: reads f1[0 .. 2 * N] and w[0 .. N], updates f3[0 .. N]. */
+	for (i = 0; i <= N; i++) {
+		for (j = i; j <= i + N; j++)
+			f3[i] = f3[i] + f1[j] * w[j - i];
+		f3[i] = f3[i] * dt;
+	}
+	/* foo returns void, so the return statement must not carry a value. */
+	return;
+}
+
+int main() {
+	/* foo() indexes as far as z[N + 19] and f1[2 * N], so N elements are too few: size every buffer for indices up to N + 20 and zero it, since f3, f1 and w are read. */
+	float x[N + 21] = {0}, y[N + 21] = {0}, z[N + 21] = {0}, f3[N + 21] = {0}, f1[N + 21] = {0}, w[N + 21] = {0};
+	foo(N, x, y, z, f3, f1, w);
+	return 0;
+}
+
diff --git a/examples/chill/unroll.script b/examples/chill/unroll.script
new file mode 100644
index 0000000..e64acb6
--- /dev/null
+++ b/examples/chill/unroll.script
@@ -0,0 +1,35 @@
+#
+# Test unroll-and-jam. The last loop is adapted from the simple
+# convolution example on p. 463 of "Optimizing Compilers for
+# Modern Architectures", by Randy Allen and Ken Kennedy.
+#
+
+source: unroll.c
+procedure: foo
+format: rose
+# fully unroll a loop with known iteration count
+loop: 0
+original()
+unroll(0,1,3)
+print
+print space
+
+
+# a strided loop
+loop: 1
+original()
+unroll(0,1,2)
+print
+print space
+
+# lower and upper bounds are not constant
+loop: 2
+original()
+unroll(0,1,20)
+print
+
+# parallelogram iteration space
+loop: 3
+original()
+unroll(0,1,2)
+print
diff --git a/examples/cuda-chill/cp.c b/examples/cuda-chill/cp.c
new file mode 100644
index 0000000..837d7a6
--- /dev/null
+++ b/examples/cuda-chill/cp.c
@@ -0,0 +1,29 @@
+#define N 1
+
+#define VOLSIZEY 512
+#define VOLSIZEX 512
+#define VOLSIZEZ 1
+#define ATOMCOUNT 4000
+#define GRIDSPACING 0.1
+#define zDim 0
+
+extern float sqrtf(float);
+
+void cenergy_cpu(float atoms[ATOMCOUNT*4],float *energy,float z)
+{
+int i,j,n;float dx,dy,dz; 
+    /* For every (i, j) grid point, accumulate atoms[n+3] / distance into energy, reading atoms as 4-float records: offsets n, n+1, n+2 are subtracted from the point's x, y, z and n+3 is the numerator. */
+    for (j=0; j<VOLSIZEY; j++) {
+        for (i=0; i<VOLSIZEX; i++) {
+            	  for (n=0;n<ATOMCOUNT;n+=4) {
+				dx = (GRIDSPACING * i) - atoms[n];
+				dy = (GRIDSPACING * j) - atoms[n+1];
+				dz = z - atoms[n+2];
+        		        energy[(j*VOLSIZEX + i)+VOLSIZEX*VOLSIZEY*zDim] += atoms[n+3]/sqrtf( (dx*dx) + (dy*dy)+ (dz*dz) ) ;
+            }
+            /* NOTE(review): n stops at ATOMCOUNT although atoms is declared with ATOMCOUNT*4 floats, so only the first quarter of the array is read - confirm this is intended. */
+            /* There is no guard for a zero distance (dx == dy == dz == 0); the division is performed as-is. */
+        }
+    }
+}
+
diff --git a/examples/cuda-chill/cp.lua b/examples/cuda-chill/cp.lua
new file mode 100644
index 0000000..1ef2264
--- /dev/null
+++ b/examples/cuda-chill/cp.lua
@@ -0,0 +1,46 @@
+--CUDA-CHiLL recipe for the cenergy_cpu loop nest in cp.c
+
+--This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is the procedure (given here by name) and third is loop #
+init("cp.c", "cenergy_cpu", 0) 
+
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+                     --copy_to_shared methods
+V=512
+N=4000
+N=1
+
+Tj=32
+Ti=16
+Tii=16
+Tjj=16
+
+--normalize_index("j")
+--normalize_index("i")
+print_code()
+normalize_index("n")
+-- TILE COMMANDS ZEROOOOOOOOOOO:3
+--permute(0,{"i","j","n"})
+--tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","n"})--CU=-1
+tile_by_index({"j","i"},{Tj,Ti},{l1_control="jj",l2_control="ii"},{"jj","ii","j","i","n"})--CU=-1
+--tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1
+
+--tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3
+--tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","i","iii","j","jjj","n"})--CU=3
+--tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"})
+--tile_by_index({"i"}, {Tii}, {l1_control="iii",l1_tile="i"}, {"ii", "jj", "iii","i","j","n"})
+print_code()
+cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"jj","ii"}, thread={"j","i"}})--CU=3
+--cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"ii","jj"}, thread={"i","j"}})--CU=3
+print_code()
+copy_to_shared("tx","atoms",-16)
+copy_to_registers("tx","energy")
+--copy_to_texture("atoms")
+--unroll_to_depth(1)
+--unroll(0,9,0)
+--unroll(0,5,0)
+
+--unroll(0,8,256)
+print_code()
diff --git a/examples/cuda-chill/cudaize.lua b/examples/cuda-chill/cudaize.lua
new file mode 100644
index 0000000..7359cca
--- /dev/null
+++ b/examples/cuda-chill/cudaize.lua
@@ -0,0 +1,1004 @@
+
+-- THIS IS CUDAIZE.LUA
+
+function table.contains_key(table, key)
+   -- Return true when `key` is one of the keys of `table`, false otherwise.
+   -- (The first parameter shadows the standard `table` library, but only
+   -- inside this function.)
+   -- A key is present exactly when its value is non-nil, so a single lookup
+   -- replaces the earlier scan over every key.
+   return table[key] ~= nil
+end
+
+function valid_indices(stmt, indices)
+   -- True when every index NAME in the list `indices` is among the current index names of
+   -- statement `stmt`. (Comparing list positions, as was done before, accepted any names.)
+   local present = {}
+   for _, name in ipairs(cur_indices(stmt)) do
+      present[name] = true
+   end
+   for _, name in ipairs(indices) do
+      if not present[name] then return false end
+   end
+   return true
+end
+
+function next_clean_level(cur_idxs,level)
+   -- Scan the index names that follow position `level` and report the first
+   -- position holding a non-empty name.
+   -- cur_idxs: list of index names; an empty string marks a dummy level
+   -- level:    position after which the scan starts
+   -- Returns that position, or -1 when no non-dummy index is left.
+   local pos = level + 1
+   while pos <= #cur_idxs do
+      if #cur_idxs[pos] > 0 then
+         return pos
+      end
+      pos = pos + 1
+   end
+   return -1
+end
+
+function build_order(final_order, tile_idx_names, ctrl_idx_names, tile_idx_map, cur_level)
+   -- Derive the loop order to permute to at tiling step `cur_level` from the
+   -- order that is wanted once every tile has been applied.
+   --   final_order    : list of index names in their final order
+   --   tile_idx_names : tile index names, one per tiled loop
+   --   ctrl_idx_names : control index names, one per tiled loop
+   --   tile_idx_map   : map from an entry of final_order to its substitute name
+   --   cur_level      : zero-based tiling step; -1 means no tile applied yet
+   -- Returns a new list; none of the arguments is modified.
+   -- All working variables are local. They used to be assigned as globals, so
+   -- every call overwrote the global `cur` that other helpers in this file
+   -- fill with cur_indices().
+   local order = {}
+   local first_pending = cur_level + 2 --first entry belonging to a later step
+
+   for _, name in ipairs(final_order) do
+      --control loops below our current level should not be in the current order
+      local skip = false
+      for j = first_pending, #ctrl_idx_names do
+         if ctrl_idx_names[j] == name then
+            skip = true
+         end
+      end
+
+      if not skip then
+         --substitute the mapped name while its tile is still pending
+         local chosen = name
+         local substitute = tile_idx_map[name]
+         if substitute ~= nil then
+            for j = first_pending, #tile_idx_names do
+               if tile_idx_names[j] == substitute then
+                  chosen = substitute
+               end
+            end
+         end
+         table.insert(order, chosen)
+      end
+   end
+   return order
+end
+
+function list_to_string(str_list)
+   --Helpful debug output: join the entries of str_list into one string,
+   --separated by ", ". An empty list gives "".
+   --
+   --The join is delegated to table.concat. The earlier hand-written loop kept
+   --its accumulator in the global `l`, which normalize_index() also assigns, so
+   --every call overwrote that global.
+   --
+   --str_list: list of strings (numbers are accepted as well)
+   --returns:  the joined string
+   return table.concat(str_list, ", ")
+end
+
+
+function find_cur_level(stmt,idx)
+   --Return the 1-based position of index name idx among the current indices
+   --of statement stmt; raise an error when idx is not one of them.
+   --The fetched list is left in the global `cur`.
+   cur = cur_indices(stmt)
+   local level = 1
+   while cur[level] ~= nil do
+      if cur[level] == idx then return level end
+      level = level + 1
+   end
+   error("Unable to find "..idx.." in current list of indices")
+end
+
+
+function chk_cur_level(stmt,idx)
+   --Non-raising lookup: the 1-based position of idx among the current
+   --indices of stmt (stored in the global `cur`), or -1 when absent.
+   cur = cur_indices(stmt)
+   local found = -1
+   for level, name in ipairs(cur) do
+      if found < 0 and name == idx then found = level end
+   end
+   return found
+end
+
+
+function find_offset(cur_order, tile, control)
+   --Compute the offset between a tile loop and its control loop in cur_order.
+   --  cur_order : list of index names; tile, control : the two names to locate
+   --With t and c the (last) positions of tile and control, returns c - t + 1
+   --when control comes before tile and c - t otherwise.
+   --Raises an error when either name is missing. The names go through
+   --tostring() so that a nil name (e.g. no control index was given) still
+   --produces this message instead of a concatenation error.
+   --Positions are locals; they no longer leak as globals idx1 and idx2.
+   local tile_pos, control_pos
+   for i, name in ipairs(cur_order) do
+      if name == tile then tile_pos = i end
+      if name == control then control_pos = i end
+   end
+   if not tile_pos then
+      error("Unable to find tile " .. tostring(tile) .. " in current list of indices")
+   end
+   if not control_pos then
+      error("Unable to find control " .. tostring(control) .. " in current list of indices")
+   end
+   if control_pos < tile_pos then
+      return control_pos - tile_pos + 1
+   end
+   return control_pos - tile_pos
+end
+
+function tile_by_index(tile_indices, sizes, index_names, final_order, tile_method)
+   --Tile each loop named in tile_indices (statement 0) by the matching entry of sizes, permuting after every tile.
+   --final_order is the index order wanted once all tiles are applied; tile_method defaults to the global counted.
+   stmt = 0 --assume stmt 0
+   cur = cur_indices(stmt)
+   --NOTE(review): every working variable here (stmt, cur, i, valid, level, offset, ...) is assigned as a global.
+   if not valid_indices(stmt,tile_indices) then
+      error('One of the indices in the first parameter were not '..
+            'found in the current set of indices.')
+   end
+   if not tile_method then tile_method = counted end
+   tile_idx_names = {}
+   for i,s in ipairs(tile_indices) do tile_idx_names[i]=s end --shallow copy
+   --index_names maps keys of the form l<N>_control / l<N>_tile (1 <= N <= #tile_indices) to new index names:
+   
+   --  l<N>_control : name passed to tile() as the control index of the N-th entry of tile_indices
+   --  l<N>_tile    : new name for the N-th tile index; tile_idx_map maps that name back to the original one
+   
+   --Any other key raises the "not a proper key" error below.
+   
+   ctrl_idx_names = {}
+   tile_idx_map = {}
+   for k,v in pairs(index_names) do
+      valid = false
+      if(string.sub(k,1,1) == "l") then
+         if string.sub(k,-8) == "_control" then
+            i = tonumber(string.sub(k,2,-9))
+            if i and i >= 1 and i <= (# tile_indices) then
+               ctrl_idx_names[i] = v
+               --print(string.format("Handling control %s for loop level %d",v,i))
+               --print("control "..k.."   name  "..v.." ")
+               valid = true
+            end
+         elseif string.sub(k,-5) == "_tile" then
+            i = tonumber(string.sub(k,2,-6))
+            if i and i >= 1 and i <= (# tile_indices) then
+               --print(string.format("tile %s -> %s",tile_indices[i], v))
+               tile_idx_names[i] = v
+               tile_idx_map[v] = tile_indices[i]
+               --print(string.format("tile %s -> %s",tile_indices[i], v))
+               valid = true
+            end
+         end
+      end
+      if not valid then error(string.format("%s is not a proper key for specifying "..
+                                            "tile or control loop indices\n", k)) end
+   end
+   
+   --filter out control indices (and do name substitution of unprocessed tile indices) for a given level
+   cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, -1)
+   permute(stmt, cur_order)
+   
+   for i,cur_idx in ipairs(tile_indices) do
+      --print(string.format("i %d  cur_idx %s calling build order ********", i-1, cur_idx))
+      cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
+      --Find the offset between tile loop and control loop
+      -- 0   = control loop one level above tile loop
+      -- -1  = control loop two levels above tile loop
+      -- > 0 = tile loop above control loop
+      -- In the last case, we do two extra tile commands to get the control
+      -- above the tile and then rely on the final permute to handle the
+      -- rest
+      level = find_cur_level(stmt,cur_idx)
+      offset = find_offset(cur_order, tile_idx_names[i], ctrl_idx_names[i])
+      --print(string.format("offset %d", offset))
+      
+      if (offset <= 0) then
+         --print(string.format("[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %s)",stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method)) 
+         tile(stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method)
+      else
+         --print(string.format("2tile(%d, %d, %d, %d, %s, %s, %s)", stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method))
+         tile(stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method);--regular level
+         --flip tile and control loop
+         --print(string.format("3tile(%d, %d, %d)",stmt, level+1, level+1))
+         tile(stmt, level+1, level+1);
+         --print(string.format("4tile(%d, %d, %d)",stmt, level+1, level))
+         tile(stmt, level+1, level);
+         --print(string.format("\n[offset>0]tile(%d, %d, %d, %d,%s,%s,%s)",stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method)) 
+	 --print_code()
+         
+      end
+      
+      --Do permutation based on cur_order
+      --print "permute based on build order calling build_order()"
+      --print "cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)"
+      cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
+      --print "permute(stmt, cur_order);"
+      permute(stmt, cur_order);
+      --print "\nafter permute(), code is:"
+      --print_code()
+   end
+   --print "ENDING TILE BY INDEX"
+   --print_code()
+end
+
+function normalize_index(index)
+   stmt = 0 --assume stmt 0 (find_cur_level below fetches cur_indices(stmt) itself)
+   --Look up the current level of the loop named index and call tile(stmt, l, l) on it
+   l = find_cur_level(stmt, index)
+   tile(stmt, l, l)
+   --NOTE(review): stmt and l are assigned as globals, not locals - confirm nothing relies on that before changing it
+end
+
+function is_in_indices(stmt, idx)
+   --Return true when idx is one of the current index names of stmt (the fetched list is kept in the global `cur`).
+   --The scan starts at 1: Lua lists have no element 0, so starting at 0
+   --compared idx with nil and reported a nil idx as present.
+   cur = cur_indices(stmt)
+   for i = 1, #cur do
+      if cur[i] == idx then return true end
+   end
+   return false
+end
+
+
+function copy_to_registers(start_loop, array_name)
+   --Privatized data copy of array_name at loop start_loop (statement 0), performed by datacopy_privatized().
+   --Before the copy, loops are re-tiled around the tx/ty levels; block and thread indices whose level is >= start_loop's are passed as held constant.
+   io.flush()
+
+   stmt = 0 --assume stmt 0
+   
+   -- [Malik] first we make sure that tx and ty are consecutive loops in the 2D thread setup, otherwise all levels for subsequent operations are messed up. Start logic.
+   cur = cur_indices(stmt)
+   table_Size = table.getn(cur)
+   
+   --print(string.format("Cur indices %s,",list_to_string(cur)))
+   --print(string.format("The table size is %d", table_Size))
+   --table.foreach(cur, print)
+   --print_code()
+   
+   level_tx = -1
+   level_ty = -1
+   if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end
+   if is_in_indices(stmt,"ty") then level_ty = find_cur_level(stmt,"ty") end
+   --print(string.format("level_tx %d  level_ty %d", level_tx, level_ty))
+   
+   ty_lookup_idx = "" 
+   org_level_ty = level_ty
+   
+   --if(cur[level_tx+1]~=nil and cur[level_tx+1]~="") then ty_lookup = ty_lookup+1 end
+   if(cur[level_ty+1]~=nil and cur[level_ty+1]~="") then 
+      --print(string.format("IF  cur[%d] = %s", level_ty+1, cur[level_ty+1]))
+      ty_lookup_idx = cur[level_ty+1] 
+   else
+      --if cur[level_ty]  ~= nil then print(string.format("ELSE ty_lookup_idx = cur[%d] = %s", level_ty, cur[level_ty])) --   TODO 
+      --else print "ELSE (dangerous)" end
+      ty_lookup_idx = cur[level_ty]  -- may assign nil !?
+   end
+   --if ty_lookup_idx ~= nil then print(string.format("ty_lookup_idx '%s'", ty_lookup_idx))  --  TODO 
+   --else print "ty_lookup_idx is NIL"
+   --end
+   
+   if level_ty > 0 then
+      --print(string.format("\ntile3(%d,%d,%d)",stmt,level_ty,level_tx+1))
+      tile(stmt,level_ty,level_tx+1) 
+   end
+   --print_code()
+   
+   --print("\ntylookup is %d",ty_lookup)
+   --exit(0)
+   --
+   cur = cur_indices(stmt)
+   table_Size = table.getn(cur)
+   --print(string.format("Cur indices %s,",list_to_string(cur)))
+   --print("The table size is "..table.getn(cur))
+   --table.foreach(cur, print)
+   
+   if is_in_indices(stmt,"tx") then   level_tx = find_cur_level(stmt,"tx") end
+   if ty_lookup_idx then
+      if is_in_indices(stmt,ty_lookup_idx) then level_ty = find_cur_level(stmt,ty_lookup_idx) end
+   end
+   
+   ty_lookup = 1
+   idx_flag = -1
+   -- find the level of the next valid index after ty+1
+   --print(string.format("\nlevel_ty %d", level_ty))
+   if level_ty > 0 then
+      --print(string.format("table_Size %d", table_Size))
+      for num= level_ty+ty_lookup,table_Size do
+         --print(string.format("num=%d   cur[num] = '%s'",num, cur[num]))
+         if(cur[num] ~= "") then
+            idx_flag = find_cur_level(stmt,cur[num])
+            --print (string.format("idx_flag = %d", idx_flag))
+            break
+         end
+      end
+   end
+   
+   --print(string.format("\n(first) I am checking all indexes after ty+1 %s",idx_flag))
+   --print_code()
+   --print ""
+   
+   how_many_levels = 1
+   startat = idx_flag + 1
+   if startat == 0 then startat = 1 end  -- avoid attempt to examine an illegal array offset
+   --print(string.format("idx_flag = %d   I will check levels starting with %d", idx_flag, idx_flag+1))
+   
+   for ch_lev = startat,table_Size,1 do    -- was for ch_lev = idx_flag+1,table_Size,1 do
+      --print(string.format("ch_lev %d", ch_lev))
+      if(cur[ch_lev] ~= nil and cur[ch_lev] ~= "") then
+         --print(string.format("cur[%d] = '%s'", ch_lev, cur[ch_lev])) 
+         how_many_levels = how_many_levels+1
+      end
+   end
+   --print("\nHow Many Levels",how_many_levels)
+   
+   -- change this all to reflect the real logic which is to normalize all loops inside the thread loops. 
+   if(how_many_levels <2) then
+      while( idx_flag >= 0) do
+         for num = level_ty+ty_lookup,(table_Size) do
+            --print(string.format("at top of loop, num is %d", num))
+            --print(string.format("num %d", num))
+            --print(string.format("cur[num] = '%s'", cur[num]))
+            if(cur[num] ~= "") then
+               idx=cur[num]
+               --print(string.format("idx '%s'", idx))
+               
+               curlev = find_cur_level(stmt,idx)
+               --print(string.format("curlev %d", curlev))
+               
+               --print_code()
+               --print(string.format("\n[COPYTOREG]tile(%d,%d,%d)",stmt,find_cur_level(stmt,idx),level_tx))
+               tile(stmt,find_cur_level(stmt,idx),find_cur_level(stmt,idx))
+               curlev = find_cur_level(stmt,idx)
+               --print(string.format("curlev %d", curlev))
+               tile(stmt,find_cur_level(stmt,idx),level_tx)
+               --print(string.format("hehe '%s'",cur[num]))
+               
+               cur = cur_indices(stmt)
+               --print("Cur indices INSIDE"..list_to_string(cur))
+               table_Size = table.getn(cur)
+               --print(string.format("Table Size is: %d",table_Size))
+               level_tx = find_cur_level(stmt,"tx")
+               --print(string.format("\n level TX is: %d",level_tx))
+               level_ty = find_cur_level(stmt,ty_lookup_idx)
+               --print(string.format("\n level TY is: %d",level_ty))
+               idx_flag = -1
+               --print "idx_flag = -1"
+               
+               -- find the level of the next valid index after ty+1
+               
+               -- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?)
+               for num= level_ty+ty_lookup,table_Size do
+                  --print(string.format("num mucking num = %d", num))
+                  if(cur[num] ~= nil and cur[num] ~= "") then
+                     idx_flag = find_cur_level(stmt,cur[num])
+                     --print("\n(second) I am checking all indexes after ty+1 %s",cur[num])
+                     break
+                  end
+               end
+               --print(string.format("num mucked to %d     idx_flag = %d", num, idx_flag))
+               
+            end
+            --print(string.format("at bottom of loop, num is %d", num))
+         end
+      end
+   end
+   --print "done with levels"
+   
+   
+   
+   
+   --print "ARE WE SYNCED HERE?"
+   --print_code()
+   --print("\ntile(%d,%d,%d)",stmt,level_k,level_k)
+   --tile(stmt,level_k,level_k)
+   
+   -- [Malik] end logic
+   --print_code()
+   start_level = find_cur_level(stmt, start_loop)
+   --We should hold constant any block or tile loop
+   block_idxs = block_indices()
+   thread_idxs = thread_indices()
+   --print("\nblock indices are")
+   --table.foreach(block_idxs, print)
+   --print("\nthread indices are")
+   --table.foreach(thread_idxs, print)
+   --print(string.format("\nStart Level: %d",start_level))
+   
+   hold_constant = {}
+   --print("\n Now in Blocks")
+   for i,idx in ipairs(block_idxs) do
+      --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx)))
+      if find_cur_level(stmt,idx) >= start_level then
+         table.insert(hold_constant, idx)
+         --print(string.format("\nJust inserted block %s in hold_constant",idx))
+      end
+   end
+   
+   
+   --print("\n Now in Threads")
+   for i,idx in ipairs(thread_idxs) do
+      --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx)))
+      if find_cur_level(stmt,idx) >= start_level then
+         table.insert(hold_constant, idx)
+         --print(string.format("\nJust inserted thread %s in hold_constant",idx))
+      end
+   end
+   
+   --print "\nhold constant table is: "
+   --table.foreach(hold_constant, print)
+   
+   --print("\nbefore datacopy pvt")
+   old_num_stmts = num_statements()
+   --print_code()
+   --print(string.format("\n[DataCopy]datacopy_privatized(%d, %s, %s, vector having privatized levels)",stmt, start_loop, array_name)) 
+   --table.foreach(hold_constant, print)
+   datacopy_privatized(stmt, start_loop, array_name, hold_constant)
+   
+   --print(hold_constant)
+   new_num_stmts = num_statements()
+   --print("\nthe num of statements:%d\n",new_num_stmt)
+   --print_code()
+   --exit(0)
+   -- [Malik] normalize the copy loops created.
+   cur = cur_indices(old_num_stmts)
+   --print("Cur indices "..list_to_string(cur))
+   for cidx,i in ipairs(cur) do
+      if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
+         --tile(old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i))
+         --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i))
+      end
+   end
+   --print_code()
+   --print("\nthe num of statements OLD+1 :",(old_num_stmts+1))  
+
+
+--[[ 
+   is this commented out? why yes, yes it is   block comment 
+   if( (old_num_stmts+1) <= new_num_stmts) then
+      cur = cur_indices(old_num_stmts+1)
+      --print("Cur indices+1 "..list_to_string(cur))
+      for cidx,i in ipairs(cur) do
+         if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
+            tile(old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
+	    --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
+         end
+      end
+   end
+--]]
+
+
+   --Unroll to the last thread level
+   --for stmt=old_num_stmts,new_num_stmts-1 do
+   -- level = find_cur_level(stmt,thread_idxs[#thread_idxs])--get last thread level
+   --if level < #cur_indices(stmt) then
+   -- unroll(stmt,level+1,0)
+   --print(string.format("\n[Unroll]unroll(%d, %d, 0)",stmt, level+1)) 
+   ----print_code()
+   --end
+   --end
+   io.flush()
+   --print("****** ending copy to registers\n\n")
+   --io.flush()
+end
+
+function copy_to_shared(start_loop, array_name, alignment)
+   --print(string.format("\nstarting copy to shared(%s, %s, %d )",start_loop,array_name,alignment))
+   stmt = 0 --assume stmt 0
+   cur = cur_indices(stmt)
+   --print("Cur indices "..list_to_string(cur))
+   
+   start_level = find_cur_level(stmt, start_loop)
+   --print(string.format("start_level %d", start_level))
+   
+   old_num_stmts = num_statements()
+   --print(string.format("old_num_statements %d", old_num_stmts))
+   
+   --Now, we give it indices for up to two dimensions for the copy loop
+   copy_loop_idxs = {"tmp1","tmp2"}
+   --print(string.format("\n[DataCopy]datacopy(%d, %d, %s, {\"tmp1\",\"tmp2\"},false,0,1,%d,true)",stmt, start_level, array_name, alignment)) 
+   datacopy(stmt, start_level, array_name, copy_loop_idxs, false, 0, 1, alignment,true)
+   
+   add_sync(stmt,start_loop)
+   new_num_stmts = num_statements()
+   
+   --This is fairly CUBLAS2 specific, not sure how well it generalizes,
+   --but for a 2D copy, what we want to do is "normalize" the first loop
+   --"tmp1" then get its hard upper bound. We then want to tile it to
+   --make the control loop of that tile "ty". We then tile "tmp2" with a
+   --size of 1 and make it "tx".
+   --print(string.format("fairly CUBLAS2 specific, OLD %d  NEW %d",  old_num_stmts, new_num_stmts ))
+   
+   for stmt=old_num_stmts,new_num_stmts-1 do
+      --print(string.format("for stmt = %d", stmt))
+      was_no_error, level = pcall(find_cur_level, stmt, "tmp2")
+      
+      if was_no_error then 
+         --print_code() 
+         --print("\nCopy to shared: [If was no error]\n")
+         find_cur_level(stmt,"tmp2")
+         tile(stmt, level, level)
+         
+         lower,upper = hard_loop_bounds(stmt, level)
+         upper = upper + 1
+         --print(string.format("lower %d  upper %d", lower, upper))
+         
+         tx,ty = thread_dims()
+         --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx)
+         
+         level = find_cur_level(stmt,"tmp1")
+         --print(string.format("level %d", level))
+         
+         if tx == upper and ty == 1 then
+            --print(string.format("tx = %d    upper = %d     ty = %d", tx, upper, ty))
+            --print "Don't need"
+            
+            --Don't need an extra tile level, just move this loop up
+            second_level = find_cur_level(stmt,"tmp2")
+            --print(string.format("\n[Tile0]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx")) 
+            tile(stmt, second_level, 1, level, "tx", "tx", counted)
+         else
+            --print "DO need?"
+            --print_code()
+            if(ty == 1) then new_ctrl = "tmp3" else new_ctrl = "ty" end
+
+
+--[[ Commenting out a block of Gabe's code in this control flow
+               -- level = find_cur_level(stmt,"tmp1")
+               tile(stmt, level, level)
+
+               lower,upper = hard_loop_bounds(stmt, level)
+               upper = upper + 1
+               --print_code()
+               --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx..", level: "..level)
+               if(math.ceil(upper/ty) > 1)then
+                  tile(stmt, level, math.ceil(upper/ty), level, "tmp", new_ctrl, counted)
+                  --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level,  math.ceil(upper/ty),upper,ty, level, "tmp", new_ctrl)) 
+               else
+                  tile(stmt, level, math.ceil(upper/ty), level, "ty", new_ctrl, counted)
+		  --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level,  math.ceil(upper/ty),upper,ty, level, "tx", new_ctrl))
+               end
+               
+               --print_code()    
+               -- [Malik] If here we have the loop upper bound > tx, then we should tile once more after the next tile, to carve out the correct tx. 
+               lower1,upper1 = hard_loop_bounds(stmt,level)
+               level1 = level
+               stmt1 = stmt
+               -- [Malik] Do the tile after the second level tile with if condition. Just to keep the original order, the tile is being pushed to the end. 
+               
+               --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1)
+
+               --print_code()
+               --level = find_cur_level(stmt,"tmp")
+               --tile(stmt,level,level)
+               --print_code() 
+               
+               --[Malik] if you are moving the loop above the level1, you need to update level1 with new position which would be level1+2 or second_level
+               if(level <= level1) then level1 = level1+2 end
+ 	       --print(string.format("\n[Tile2]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx")) 
+               --print("\n----------------------------------")
+               --print_code()
+               --print("\n**********************************")
+               --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1)
+               -- [Malik] If the upper bound > tx, we do another tile to carve out the correct tx from a bigger loop. Else just normalize the bounds. 
+               if( upper1 > ty) then
+                  third_level = find_cur_level(stmt1,"tmp")
+                  --print("\n\n\n\t\t\t\tthirdlevel:"..third_level)
+                  tile(stmt1, third_level, ty, third_level, "ty", "tmp", counted)
+                  --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt1, third_level, ty,third_level, "ty", "tmp"))
+                  tile(stmt1,third_level+1,third_level+1)
+                  --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level+1))
+                  tile(stmt1,third_level+1,third_level)
+                  --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level))
+               else
+                  tile(stmt1,level1,level1)
+                  --print(string.format("\n[Tile3ELSE]tile(%d, %d, %d)",stmt1,level1,level1))
+               end
+               
+               --print("\nStarting tmp2\n");--print_code();
+               second_level = find_cur_level(stmt,"tmp2")
+               lower,upper = hard_loop_bounds(stmt,second_level)
+               level = second_level
+               --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..", level:"..level)
+               
+               if(math.ceil(upper/tx) > 1)then
+                  tile(stmt, second_level,math.ceil(upper/tx), level, "tmp", "tx", counted)
+                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tmp", "tx"))
+               else
+                  tile(stmt, second_level,math.ceil(upper/tx), level, "tx", "tx", counted)
+                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tx", "tx"))
+               end
+               --print_code()
+               lower2,upper2 = hard_loop_bounds(stmt,level)
+               level2 = level
+               stmt2 = stmt
+               --print("[Malik]-loop cleanup@tmp2: lower2, upper2: "..lower2..", "..upper2..", tx: "..tx..", level:"..level2)
+               -- now for the second level.
+               if( upper2 > tx) then
+                  forth_level = find_cur_level(stmt2,"tmp")
+                  --print("\n\n\n\t\t\t\tforthlevel:"..forth_level)
+                  --print_code()
+                  tile(stmt2, forth_level, 1, forth_level, "tx", "tmp", counted)
+                  --print(string.format("\n[Tile3B]tile(%d, %d, %d,%d,%s,%s,counted)",stmt2, forth_level, tx,forth_level, "ty", "tmp"))
+                  --print_code()
+                  --tile(stmt2,forth_level+1,forth_level+1)
+                  --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level+1))
+                  --tile(stmt2,forth_level+1,forth_level)
+                  --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level))
+               else
+                  new_level = find_cur_level(stmt2,"ty")
+                  tile(stmt2,level2,1,new_level,"tx","tx",counted)
+                  --print(string.format("\n[Tile3BELSE]tile(%d, %d, %d)",stmt2,level2,level2))
+                  tmp_level = find_cur_level(stmt2,"tmp")
+                  tile(stmt2,tmp_level,tmp_level)
+               end
+               
+               --print_code()
+               --print("\n----------------------------------")
+--]]
+               
+               --print_code() 
+               --print("\nStarting tmp2\n");--print_code();
+               first_level = find_cur_level(stmt,"tmp1")
+               second_level = find_cur_level(stmt,"tmp2")
+               lower,upper = hard_loop_bounds(stmt,second_level)
+               
+               --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..",first level:"..first_level..",second_level:"..second_level)
+               
+               -- Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
+               --print(string.format("\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,1,first_level, "tx", "tx"))
+               tile(stmt,second_level,1,first_level,"tx","tx",counted)
+               --print_code()
+               
+               first_level = find_cur_level(stmt,"tmp1")
+               lower_1,upper_1 = hard_loop_bounds(stmt,first_level)
+               tx_level = find_cur_level(stmt,"tx")
+               lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level)
+               --print(string.format("UL_1 %d %d     UL_tx %d %d", lower_1, upper_1, lower_tx, upper_tx))
+               
+               if(math.ceil(upper_tx/tx) > 1)then
+                  --print "ceil I say"
+                  --print(string.format("\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,tx,tx_level, "tx", "tmp1"))
+                  tile(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)
+                  --print_code()
+                  
+                  peat = find_cur_level(stmt,"tx")
+                  --print(string.format("\n[Tile1]tile(%d, %d, %d)",stmt, peat, peat))
+                  tile(stmt, peat, peat )  --find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))
+                  --print_code()
+                  
+                  if (find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx")) then
+                     --print(string.format("\nagain [Tile1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")))
+                     tile(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
+                     --print_code()
+                  end
+                  --else
+                  --tile(stmt, tx_level,1, tx_level, "tx", "tx", counted)
+                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,1,tx_level, "tx", "tx"))
+               end
+               --print_code()
+               --]]  -- this apparently is NOT the end of a block comment
+               
+               --print("\nStarting tmp1\n")
+               -- Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
+               tile(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))     
+               --print_code()  
+               
+               ty_level = find_cur_level(stmt,"tmp1")
+               lower_ty,upper_ty = hard_loop_bounds(stmt,ty_level)
+               
+               tx_level = find_cur_level(stmt,"tx")
+               lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level)
+               --print("[Malik]-loop cleanup@tmp1: lowerty, upperty: "..lower_ty..", "..upper_ty..", ty: "..ty..",ty level:"..ty_level..",tx_level:"..tx_level..", stmt: "..stmt)
+               
+               --print "before ceil"
+               if(math.ceil(upper_ty/ty) > 1)then
+                  --print "CEIL IF"
+                  --print("\n Inside upper_ty/ty > 1\n");
+                  
+                  --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,ty,ty_level, "ty", "tmp_ty"))
+                  tile(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
+                  --print_code()
+                  
+                  --print(string.format("\n[Tile2-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt  ,"ty"),find_cur_level(stmt,"ty")))
+                  tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))
+                  --print_code()
+                  
+                  -----------------------------------------------------------------------
+                  ----------------------------------------------------------------------
+                  cur_idxs = cur_indices(stmt)
+                  --print("\n cur indexes are "..list_to_string(cur_idxs))
+                  
+                  -- Putting ty before any tmp_tx   
+                  idx_flag = -1
+                  for num= 0,table.getn(cur_idxs) do
+                     if(cur[num] == "tmp_tx") then
+                        idx_flag = find_cur_level(stmt,cur[num])
+                        break
+                     end
+                  end
+                  --print(string.format("\n (1) so i have found out the value of idx flag as %d",idx_flag) )
+                  
+                  if(idx_flag >=0 ) then  
+                     if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then
+                        --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
+                        tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+                        --print_code()
+                     end
+                  end
+                  
+                  -- Now Putting ty before any tmp_ty
+                  idx_flag = -1
+                  for num= 0,table.getn(cur_idxs) do
+                     if(cur[num] == "tmp_ty") then
+                        idx_flag = find_cur_level(stmt,cur[num])
+                        break
+                     end
+                  end
+		  --print(string.format("\n IF  so i have found out the value of idx flag as %d",idx_flag) )
+                  if(idx_flag >=0 ) then  
+                     --print "one more test"
+                     if ((find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"))) then
+                        --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
+                        tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+                        --print_code()
+                     end
+                  end
+               else
+                  --print "CEIL ELSE"
+                  --cur_idxs = cur_indices(stmt)
+                  --print("\n Inside upper_ty/ty <= 1\n");
+                  
+                  --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,1,ty_level, "ty", "ty"))
+                  tile(stmt, ty_level,1, ty_level, "ty", "ty", counted)
+                  --print_code()
+                  
+                  --print(string.format("\n[Tile3-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1))
+                  tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
+                  --print_code()
+                  
+                  idx_flag = -1
+                  if(cur_idxs) then
+                     --print "CAN NEVER GET HERE?  cur_idxs"
+                     for num= 0,table.getn(cur_idxs) do
+                        if(cur[num] == "tmp_ty") then
+                           idx_flag = find_cur_level(stmt,cur[num])
+                           break
+                        end
+                     end
+                  end
+                  --print(string.format("\n ELSE so i have found out the value of idx flag as %d",idx_flag) )
+                  if(idx_flag >=0 ) then  
+                     if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then
+                        --print(string.format("tile( stmt %d, level ty %d, level ty %d",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) 
+                        tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+                        --print(string.format("\n[Tile3-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
+                     end
+                  end
+               end
+               
+               --print_code()
+         end
+         
+         
+         --print "\n\n *** at bottom of if in copy to shared, "
+         --print_code()
+         --print "end of if"
+         
+      else
+         --copy to shared only created one level, not two, so we use a different approach (MV & TMV)
+         --print("\nCopy to shared: [If was error]\n")
+         level = find_cur_level(stmt,"tmp1")
+         tile(stmt, level, level)
+         
+         --print(string.format("\n[Tile]tile(%d, %d, %d)",stmt, level, level)) 
+         tx,ty = thread_dims()
+         lower,upper = hard_loop_bounds(stmt, level)
+         upper = upper+1 --upper bound given as <=, compare to dimensions tx which is <
+         --print("upper "..upper.." tx "..tx)
+         if upper == tx then
+            rename_index(stmt, "tmp1", "tx")
+         else
+            --print("upper is not tx")
+            --TODO: Don't know, maybe do some tileing etc
+            --print_code()
+            --print("upper "..upper.." tx "..tx.." stmt: "..stmt.." level: "..level)
+            tile(stmt, level,tx,level, "tx", "tmp_tx", counted)
+            --print_code()
+            
+            --print("stmt:"..stmt.." level+1: "..level+1)
+            --print("TILE 7")
+            tile(stmt, level+1,1,level+1,"tx", "tx",counted)
+            --print("TILE 3")
+            tile(stmt,level+1,level)
+            --print_code()
+            
+            if(ty > 1) then
+               --print_code()
+               --print("GOING IN")
+               lower,upper = hard_loop_bounds(stmt, level+1)
+               --print(string.format("ty %d  lower %d  upper %d", ty, lower, upper))
+               --upper=125
+               --print("NOW FOR Y: upper "..upper.." ty "..ty.." stmt: "..stmt.." level: "..(level+1).." bound:"..math.ceil(upper/ty))
+               tile(stmt, level+1,math.ceil(upper/ty),level+1, "tmp_ty", "ty", counted)
+               --tile(stmt, level+2,math.ceil(upper/ty),level+2, "tmp_ty", "ty", counted)
+            end
+            --print_code()
+            --rename_index(stmt, "tmp1", "tx")
+            --print("Warning: Need to implement some logic here to tile the single level shared copy loop to match thread dimensions")
+         end
+      end
+      --Always add sync
+      add_sync(stmt,start_loop)
+      
+   end
+   --print("ending copy to shared\n")
+   --print_code()
+end
+
+function unroll_to_depth(max_depth)
+   -- For every statement that contains the guard index (the last entry of thread_indices()),
+   -- collects up to max_depth loop levels via next_clean_level() and calls unroll() on each one.
+   -- Whole passes are repeated until a pass leaves num_statements() unchanged.
+   cur = cur_indices(0)
+   thread_idxs = thread_indices()
+   guard_idx = thread_idxs[#thread_idxs]
+   -- NOTE(review): every variable assigned with '=' in this function is a global (no 'local'); 'cur' is not read here.
+   --print(string.format("cur    indices %s",list_to_string(cur)))
+   --print(string.format("thread indices %s",list_to_string(thread_idxs)))
+   --print(string.format("#thread_idxs = %d", #thread_idxs))
+   --print(string.format("guard_idx = %s", guard_idx))
+   
+   ---- Collect index names, positioned before "tx", that a statement shares with any later statement.
+   common_loops = {}
+   comm_loops_cnt = 0
+   num_stmts = num_statements()
+   --print(string.format("num statements %d", num_stmts))
+   
+   for stmt=0,num_stmts-1 do
+      cur_idxs = cur_indices(stmt)
+      
+      --print(string.format("\nSTMT %d Current Indices: %s",stmt,list_to_string(cur_idxs)))
+      
+      if(chk_cur_level(stmt,"tx")>0) then
+         for ii=1,find_cur_level(stmt,"tx")-1 do    -- started at 0
+            --print(string.format("ii = %d", ii)) -- index starts at 1, what does index 0 do?
+            --if cur_idxs[ii] == nil then print "cur_idxs[i]] is NIL" 
+            --else print(string.format("cur_idxs[%d] = '%s'", ii, cur_idxs[ii])) -- index starts at 1, what does index 0 do?
+            --end
+            -- Only consider names that are not block/thread indices and are neither nil nor empty.
+            if(cur_idxs[ii] ~= "bx" and cur_idxs[ii] ~= "by" and cur_idxs[ii] ~= nil and cur_idxs[ii] ~= "tx" and cur_idxs[ii] ~= "ty" and cur_idxs[ii] ~= "") then 
+               
+               --print(string.format("id %s is not in the list", cur_idxs[ii] ))
+               
+               for stmt1=stmt+1,num_stmts-1 do
+                  --print(string.format("\nii %d stmt1 is %d", ii, stmt1))          
+                  cur_idxs1 = cur_indices(stmt1)
+                  --print("\nstmt1 cur_idxs1 is "..list_to_string(cur_idxs1))   
+                  
+                  --print(string.format("cur level(%d, %s) = %d", stmt, "tx",  find_cur_level(stmt,"tx")))    
+                  -- endrange is only referenced by the commented-out trace below.
+                  endrange = find_cur_level(stmt,"tx")-1
+                  --print(string.format("for iii=1, %d do", endrange))
+                  
+                  for iii=1,find_cur_level(stmt,"tx")-1 do  -- started at 0
+                     --print(string.format("stmt %d   ii %d   iii %d ", stmt, ii, iii))
+                     --if(cur_idxs1[iii] ~= nil) then 
+                     --   print(string.format("stmt %d   ii %d   iii %d  cur_idxs1[%d] = '%s'", stmt, ii, iii, iii, cur_idxs1[iii]))  
+                     --else 
+                     --   print(string.format("stmt %d   ii %d   iii %d  cur_idxs1[%d] = NIL", stmt, ii, iii, iii))  
+                     --end
+                     
+                     if(cur_idxs1[iii] ~= "bx" and cur_idxs1[iii] ~= "by" and cur_idxs1[iii] ~= nil and cur_idxs1[iii] ~= "tx" and cur_idxs1[iii] ~= "ty" and cur_idxs1[iii] ~= "") then  
+                        if(cur_idxs[ii] == cur_idxs1[iii]) then
+                           --print("\nfound idx:"..cur_idxs[ii])
+                           -- common_loops is filled starting at index 0 (not Lua's usual 1); repeated names are not filtered out.
+                           common_loops[comm_loops_cnt] = cur_idxs[ii]
+                           --print(string.format("cl[%d] = '%s'", comm_loops_cnt,   common_loops[comm_loops_cnt]))
+                           comm_loops_cnt = comm_loops_cnt + 1
+                        end
+                     end  
+                  end
+               end  
+            end
+         end
+      end
+   end
+   ----
+   --if(comm_loops_cnt>0) then 
+   --   print("\n COMM LOOPS :TOTAL "..comm_loops_cnt..", and are "..list_to_string(common_loops).." this loop :"..common_loops[0])
+   --else
+   --   print "UNROLL can't unroll any loops?"
+   --end
+   -- NOTE(review): common_loops / comm_loops_cnt are not read again in this function; confirm whether another function consumes them.
+   
+   
+   -- Unroll passes: repeat until a pass leaves num_statements() unchanged.
+   repeat
+      old_num_stmts = num_statements()
+      --print(string.format("old_num_statements %d", old_num_stmts))
+      
+      for stmt=0,old_num_stmts-1 do
+         cur_idxs = cur_indices(stmt)
+         --print(string.format("stmt %d    cur_idxs = %s", stmt, list_to_string(cur_idxs)))
+         if(#cur_idxs > 0) then 
+            gaurd_level = -1
+            if(chk_cur_level(stmt,guard_idx)>0) then
+               gaurd_level = find_cur_level(stmt,guard_idx)
+            end
+            --print(string.format("guard_level(sp) = %d", gaurd_level))
+            -- gaurd_level (sic) stays -1 when chk_cur_level() does not return a positive value for the guard index.
+            if(gaurd_level>-1) then
+               level = next_clean_level(cur_idxs,gaurd_level)
+               --print(string.format("next clean level %d", level))
+               
+               --need to handle max_depth
+               num_unrolled = 0
+               level_unroll_comm = level
+               level_arr = {}
+               while level >= 0 do
+                  --print(string.format("while: level = %d", level))
+                  -- Stop collecting once max_depth levels have been recorded.
+                  if num_unrolled == max_depth then break end
+                  --print("Unrolling "..stmt.." at level "..(level).." index ".. cur_idxs[gaurd_level+1])
+                  -- level_arr is 0-based: entry k holds the k-th level returned by next_clean_level().
+                  level_arr[num_unrolled] = level
+                  num_unrolled = num_unrolled + 1
+                  -- NOTE(review): this assigns 'guard_level', not the 'gaurd_level' used above, and the value is not read in this function; likely a typo, confirm before changing.
+                  guard_level = find_cur_level(stmt,guard_idx)
+                  level = next_clean_level(cur_idxs,level+1)
+               end
+               --dies print("How many levels for unroll commands"..table.getn(level_arr).." which is "..level_arr[0].." and "..level_arr[#level_arr])
+               --if(table.getn(level_arr) ~= nil) then
+               
+               --print "OK, NOW WE UNROLL"
+               -- Walk level_arr backwards down to entry 0; table.getn() does not count the entry at index 0, so it yields the last filled index.
+               if(level_unroll_comm >= 0)then
+                  for i = table.getn(level_arr),0,-1 do
+                     --print(string.format("\ni=%d", i))
+                     --print(string.format("[Unroll]unroll(%d, %d, 0)",stmt, level_arr[i]))     
+                     
+                     unroll(stmt,level_arr[i],0)
+                     --print("finished unroll]]\n")
+                     --print_code()
+                  end
+               end
+------
+            end    
+--[[
+
+THERE WAS A BIG BLOCK OF COMMENTED OUT CODE HERE 
+
+
+--]]
+------
+         end
+      end
+      new_num_stmts = num_statements()
+
+   until old_num_stmts == new_num_stmts
+
+end
+
+
diff --git a/examples/cuda-chill/mm.c b/examples/cuda-chill/mm.c
new file mode 100644
index 0000000..0efbeeb
--- /dev/null
+++ b/examples/cuda-chill/mm.c
@@ -0,0 +1,10 @@
+#define N 1024
+
+void normalMM(float c[N][N], float a[N][N], float b[N][N]) {
+  int i, j, k;
+  /* For all i, j, k in [0, N): c[j][i] += a[k][i] * b[j][k]; c is accumulated into, not cleared first. */
+  for (i = 0; i < N; i++)  /* loop index names i, j, k and array names a, b, c are referenced by mm.lua */
+    for (j = 0; j < N; j++)
+      for (k = 0; k < N; k++)
+        c[j][i] = c[j][i] + a[k][i] * b[j][k];
+}
diff --git a/examples/cuda-chill/mm.lua b/examples/cuda-chill/mm.lua
new file mode 100644
index 0000000..5bde1b0
--- /dev/null
+++ b/examples/cuda-chill/mm.lua
@@ -0,0 +1,38 @@
+init("mm.c", "normalMM", 0)  -- work on procedure normalMM in mm.c; the 0 is presumably the loop number (TODO confirm)
+dofile("cudaize.lua")  -- defines custom tile_by_index, copy_to_registers, copy_to_shared methods
+N=1024
+Ti=128
+Tj=64
+Tk=16
+Tii=16
+Tjj=16
+
+
+
+-- NOTE(review): N is assigned 1024 a second time below and is not referenced again in this script.
+N=1024
+
+
+
+
+
+
+
+
+
+
+-- Tile i and j by Ti/Tj (control loops "ii"/"jj"), then k by Tk ("kk"), then i and j again by
+-- Tii/Tjj ("iii"/"jjj"); the fourth argument of each call lists the loop order after the tile.
+-- NOTE(review): each call is followed on the same line by a separate statement CU=<n>; the consumer of the CU global is not visible here.
+tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k"})CU=1
+
+tile_by_index({"k"},{Tk},{l1_control="kk"},{"ii","jj","kk","i","j","k"})CU=3
+
+tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","kk","i","iii","j","jjj","k"},1)CU=2
+-- Block loops are "ii","jj" and thread loops are "i","j"; each array size is 1024*1024 = 1048576 elements.
+cudaize("mm_GPU",{a=1048576,b=1048576,c=1048576},{block={"ii","jj"}, thread={"i","j"}})CU=2
+copy_to_shared("tx","a",-16)
+copy_to_shared("tx","b",-16)
+copy_to_registers("kk","c")
+--print_code()
+unroll_to_depth(2)
diff --git a/examples/cuda-chill/mpeg4.c b/examples/cuda-chill/mpeg4.c
new file mode 100755
index 0000000..7f83bf7
--- /dev/null
+++ b/examples/cuda-chill/mpeg4.c
@@ -0,0 +1,23 @@
+#define N1 4096
+#define N2 4096
+#define WINDOW_SIZE 16
+
+void mpeg4_cpu(float result[N1][N2], float prev[N2+WINDOW_SIZE][N2+WINDOW_SIZE], float  curr[WINDOW_SIZE*WINDOW_SIZE])
+{
+	unsigned int i;
+	unsigned int j;
+	unsigned int k;
+	unsigned int l;
+	/* For each (i, j): result[i][j] += sum over k, l in [0, WINDOW_SIZE) of prev[i+k][j+l] * curr[k*WINDOW_SIZE+l]. */
+	for ( i = 0; i < N1; ++i)    
+		for ( j = 0; j < N2; ++j) 
+                       for ( k = 0; k < WINDOW_SIZE; ++k) 
+				for ( l = 0; l < WINDOW_SIZE; ++l) 
+					result[i][j] += prev[i+k][j+l] * curr[k*WINDOW_SIZE+l];
+	/* NOTE(review): prev's first extent is written N2+WINDOW_SIZE although i ranges over N1; the two are equal today (both 4096). Confirm N1 was intended. */
+			
+
+		
+	
+}
+
diff --git a/examples/cuda-chill/mpeg4.lua b/examples/cuda-chill/mpeg4.lua
new file mode 100644
index 0000000..f025dc0
--- /dev/null
+++ b/examples/cuda-chill/mpeg4.lua
@@ -0,0 +1,45 @@
+--CUDA-CHiLL transformation script for mpeg4_cpu in mpeg4.c
+
+--This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is the procedure (given by name here) and third is loop #
+init("mpeg4.c", "mpeg4_cpu", 0) 
+
+--dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods
+
+N=4096
+M=4096
+W=16
+
+--Ti must be <= M
+--Tj must be <= Ti
+Ti=32
+Tj=32
+Tii=16
+Tjj=16
+Tk=4 --only referenced by the commented-out k/l tilings below
+--permute(0,{"j","i","k","l"})
+tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k","l"})
+--tile_by_index({"k","l"},{Tk*2,Tk*2},{l1_control="kk",l2_control="ll"},{"ii","jj","kk","ll","i","j","k","l"})
+--print_code()
+--tile_by_index({"k","l"},{Tk,Tk},{l1_control="kk",l2_control="ll"},{"ii","jj","i","j","kk","k","ll","l"})
+tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","iii","i","jjj","j","k","l"})
+--print_code()
+--normalize_index("j")
+--normalize_index("i")
+--print_code()
+cudaize("kernel_GPU",{curr=W*W,prev=(N+W)*(M+W),result=N*M},{block={"ii","jj"}, thread={"i","j"}})
+--print_code()
+copy_to_shared("iii","prev",16)
+
+copy_to_registers("jjj","result")
+
+--print_code()
+--copy_to_constant_no_tile("curr")
+unroll_to_depth(2)
+print_code()
+print_space()
+
+
diff --git a/examples/cuda-chill/mriq-fh.c b/examples/cuda-chill/mriq-fh.c
new file mode 100755
index 0000000..1e924b7
--- /dev/null
+++ b/examples/cuda-chill/mriq-fh.c
@@ -0,0 +1,38 @@
+#define X 32768
+#define K 256
+struct kValues {
+  float Kx;
+  float Ky;
+  float Kz;
+  float PhiMag;
+};
+extern float sinf(float);
+extern float cosf(float);
+
+void mriFH_cpu(float *rPhi,float *rRho,float *iRho, float *iPhi, float *rD, float *iD, float *kx, float *ky, float *kz, float *dx, float *dy, float *dz, float *rFHref, float *iFHref)
+{
+    /*
+     * For every x in [0, X), adds to rFHref[x] and iFHref[x] the sum over k in [0, K) of the
+     * real and imaginary parts of (rRho[k] + i*iRho[k]) * (cos(arg) + i*sin(arg)), where
+     * arg = 2 * 3.14159 * (kx[k]*dx[x] + ky[k]*dy[x] + kz[k]*dz[x]).
+     * rPhi, iPhi, rD and iD are accepted but not used. The single-precision sinf/cosf are declared
+     * because redeclaring sin/cos with float signatures conflicts with the standard double versions.
+     */
+    float exp;
+    float cArg;
+    float sArg;
+    unsigned int k;
+    unsigned int x;
+    for (x = 0; x < X; ++x) {
+        for (k = 0; k < K; ++k) {
+            /* Loop index names x and k are referenced by the companion mriq-fh.lua script. */
+            exp = 2 * 3.14159 * (kx[k]* dx[x] + ky[k]* dy[x] + kz[k]* dz[x]);
+            cArg = cosf(exp);
+            sArg = sinf(exp);
+            rFHref[x] += rRho[k]* cArg - iRho[k]* sArg;
+            iFHref[x] += iRho[k]*cArg + rRho[k]*sArg;
+        }
+
+    }
+}
+
diff --git a/examples/cuda-chill/mriq-fh.lua b/examples/cuda-chill/mriq-fh.lua
new file mode 100755
index 0000000..3277bac
--- /dev/null
+++ b/examples/cuda-chill/mriq-fh.lua
@@ -0,0 +1,73 @@
+--CUDA-CHiLL transformation script for mriFH_cpu in mriq-fh.c
+
+--This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is the procedure (given by name here) and third is loop #
+init("mriq-fh.c", "mriFH_cpu", 0) 
+
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+                      --copy_to_shared methods
+N=32768
+M=256
+Tx=256
+--N and M equal the X and K constants defined in mriq-fh.c; Tx is the tile size used for the x loop.
+--Tile x by Tx with control loop "xx"; the last table gives the loop order after the tile.
+print_code()
+--permute(0,{"j","i"})
+--tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
+tile_by_index({"x"},{Tx},{l1_control="xx"},{"xx","x","k"})
+--tile_by_index({"x"},{16},{l1_control="xx1"},{"xx","x","xx1","k"})
+--tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+print_code()
+
+normalize_index("x")
+--normalize_index("i")
+print_code()
+--tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
+--print_code()
+--cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
+cudaize("kernel_GPU",{dx=N,dy=N,dz=N,iRho=M,kx=M,ky=M,kz=M,rFHref=N,iFHref=N,rRho=M},{block={"xx"}, thread={"x"}})
+--copy_to_shared("tx","iRho",-16)
+--copy_to_shared("tx","dz",1)
+--copy_to_shared("tx","rRho",-16)
+--copy_to_registers("tx","rFHref")
+--copy_to_registers("tx","rRho")
+--copy_to_registers("tx","iRho")
+--copy_to_registers("tx","kx")
+--copy_to_registers("tx","dx")
+--copy_to_registers("tx","ky")
+--copy_to_registers("tx","dy")
+--copy_to_registers("tx","kz")
+--copy_to_registers("tx","dz")
+--copy_to_registers("tx","iFHref")
+--copy_to_texture("rRho")
+--copy_to_texture("kx")
+--copy_to_texture("dx")
+--copy_to_texture("ky")
+--copy_to_texture("dy")
+--copy_to_texture("kz")
+--copy_to_texture("dz")
+--copy_to_texture("iRho")
+--print_code()--]]
+--unroll(0,4,0)
+--copy_to_constant_no_tile("kx")
+--copy_to_constant_no_tile("ky")
+--copy_to_constant_no_tile("kz")
+--copy_to_constant_no_tile("rRho")
+--copy_to_constant_no_tile("iRho")
+
+--unroll_to_depth(1)
+print_code()
+--[[
+copy_to_Texture("rRho")
+copy_to_Texture("kx")
+copy_to_Texture("dx")
+copy_to_Texture("ky")
+copy_to_Texture("dy")
+copy_to_Texture("kz")
+copy_to_Texture("dz")
+copy_to_Texture("iRho")
+--unroll_to_depth(2)
+--]]
diff --git a/examples/cuda-chill/mriq.c b/examples/cuda-chill/mriq.c
new file mode 100644
index 0000000..ba4b87c
--- /dev/null
+++ b/examples/cuda-chill/mriq.c
@@ -0,0 +1,33 @@
+#define N 32768
+#define M 3072
+struct kValues {
+  float Kx;
+  float Ky;
+  float Kz;
+  float PhiMag;
+};
+extern float sinf(float);
+extern float cosf(float);
+
+void /* For each j in [0, N), adds over i in [0, M): Qr[j] += phi * cosf(expArg) and Qi[j] += phi * sinf(expArg). */
+ComputeQCPU(int numK, int numX,struct kValues kVals[M],float x[N], float y[N], float z[N],float Qr[N], float Qi[N]) {
+  float expArg;
+  float cosArg;
+  float sinArg;
+  float phi;
+  int i;
+  int j;
+  numK = M; /* overwrites the argument; numK is not read afterwards */
+  numX = N; /* overwrites the argument; numX is not read afterwards */
+  for ( i = 0; i < M; i++) { /* loop index names i and j are referenced by mriq.lua */
+    for ( j = 0; j < N; j++) {
+      expArg = 6.2831853071795864769252867665590058f * (kVals[i].Kx * x[j] +kVals[i].Ky * y[j] +kVals[i].Kz * z[j]); /* constant is 2*pi */
+      cosArg = cosf(expArg);
+      sinArg = sinf(expArg);
+      phi = kVals[i].PhiMag;
+      Qr[j] += phi * cosArg;
+      Qi[j] += phi * sinArg;
+    }
+  }
+}
+  
diff --git a/examples/cuda-chill/mriq.lua b/examples/cuda-chill/mriq.lua
new file mode 100644
index 0000000..1170111
--- /dev/null
+++ b/examples/cuda-chill/mriq.lua
@@ -0,0 +1,55 @@
+--CUDA-CHiLL transformation script for ComputeQCPU in mriq.c
+
+--This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is the procedure (given by name here) and third is loop #
+init("mriq.c", "ComputeQCPU", 0) 
+
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+                      --copy_to_shared methods
+N=32768
+M=3072
+TI=128
+TJ=128
+--NOTE(review): below, the "i" loop is tiled by TJ and the "j" loop by TI; both are 128, so the crossed names have no effect today.
+permute(0,{"j","i"})
+--tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
+tile_by_index({"i"}, {TJ}, {l1_control="ii",l1_tile="i"}, {"ii", "j","i"})
+tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+--print_code()
+
+normalize_index("j")
+normalize_index("i")
+--print_code()
+--tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
+--print_code()
+cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
+
+copy_to_shared("tx","kVals",1)
+--copy_to_shared("tx","x",1)
+--copy_to_shared("tx","y",1)
+--copy_to_shared("tx","z",1)
+
+--copy_to_texture("kVals")
+--datacopy(0, 3, "kVals", {"tt","t"},false,0,1,-16,true)
+--print_code()
+--datacopy_privatized(0,"tx","kVals",{"tx"})
+--copy_to_registers("tx","kVals")
+copy_to_registers("ii","x")
+copy_to_registers("ii","y")
+copy_to_registers("ii","z")
+copy_to_registers("ii","Qi")
+copy_to_registers("ii","Qr")
+--[[datacopy_privatized(0,"tx","x",{"tx"})
+datacopy_privatized(0,"tx","y",{"tx"})
+datacopy_privatized(0,"tx","z",{"tx"})
+datacopy_privatized(0,"tx","Qi",{"tx"})
+datacopy_privatized(0,"tx","Qr",{"tx"})
+
+
+]]--
+--unroll(0,5,64)
+print_code()
+--unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
diff --git a/examples/cuda-chill/mv-shadow.c b/examples/cuda-chill/mv-shadow.c
new file mode 100644
index 0000000..582b187
--- /dev/null
+++ b/examples/cuda-chill/mv-shadow.c
@@ -0,0 +1,9 @@
+#define N 1024
+
+void normalMV(float c[N][N], float a[N], float b[N]) {
+  int i, j;
+  /* For all i, j in [0, N): a[i] += c[j][i] * b[j]; a is accumulated into, not cleared first. */
+  for (i = 0; i < N; i++)  /* loop index names i and j are referenced by mv-shadow.lua */
+    for (j = 0; j < N; j++)
+      a[i] = a[i] + c[j][i] * b[j];
+}
diff --git a/examples/cuda-chill/mv-shadow.lua b/examples/cuda-chill/mv-shadow.lua
new file mode 100644
index 0000000..43e8491
--- /dev/null
+++ b/examples/cuda-chill/mv-shadow.lua
@@ -0,0 +1,65 @@
+init("mv-shadow.c","normalMV",0)
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+                      --copy_to_shared methods
+
+N=129
+TI=32
+TJ=64
+
+N=1024
+TI=16
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+--Tile the i and j loop, introducing "ii" as the control loop for the "i"
+--tile, "k" for the control loop for the "j" tile, with the final order
+--of {"ii", "k", "i", "j"}
+tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
+--tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+--print_code()
+--normalize_index will do a tile of size one over the loop level specified
+--by the input index. This is useful to get a zero lower bound and hard
+--upper bound on a loop instead of it being relative to previous loop
+--levels.
+--normalize_index("ii")
+normalize_index("i")
+print_code()
+
+--Cudaize now determines the grid dimensions from the loops themselves
+--(the upper bounds of the block and thread loops). It also renames the
+--given block and thread loops' indexes to the appropriate values from
+--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
+--sizes of the arrays to be copied in the CUDA scaffolding.
+cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
+--print_code()
+
+--Does a datacopy, tile, and add_sync to get a shared memory copy
+
+--copy_to_shared("tx", "b", 1)
+--copy_to_shared("tx", "c", -16)
+--print_code()
+--copy_to_texture("b")
+--copy_to_texture("c")
+copy_to_registers("k", "a")
+--print_code()
+
+unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
+--copy_to_texture("b")
+--print_code()
+--unroll(0,5,0)
+--print_code()
diff --git a/examples/cuda-chill/mv.c b/examples/cuda-chill/mv.c
new file mode 100644
index 0000000..582b187
--- /dev/null
+++ b/examples/cuda-chill/mv.c
@@ -0,0 +1,9 @@
+#define N 1024
+
+void normalMV(float c[N][N], float a[N], float b[N]) {
+  int i, j; /* mv.lua refers to these loops by the names "i" and "j" */
+  /* a[i] += sum over j of c[j][i] * b[j]; a is accumulated into, not cleared. */
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      a[i] = a[i] + c[j][i] * b[j]; /* the inner loop index j selects the row of c */
+}
diff --git a/examples/cuda-chill/mv.lua b/examples/cuda-chill/mv.lua
new file mode 100644
index 0000000..ca54501
--- /dev/null
+++ b/examples/cuda-chill/mv.lua
@@ -0,0 +1,65 @@
+init("mv.c","normalMV",0)
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+                      --copy_to_shared methods
+
+N=129
+TI=32
+TJ=64
+
+N=1024
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+--Tile the i and j loop, introducing "ii" as the control loop for the "i"
+--tile, "k" for the control loop for the "j" tile, with the final order
+--of {"ii", "k", "i", "j"}
+tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
+--tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+--print_code()
+--normalize_index will do a tile of size one over the loop level specified
+--by the input index. This is useful to get a zero lower bound and hard
+--upper bound on a loop instead of it being relative to previous loop
+--levels.
+--normalize_index("ii")
+normalize_index("i")
+print_code()
+
+--Cudaize now determines the grid dimensions from the loops themselves
+--(the upper bounds of the block and thread loops). It also renames the
+--given block and thread loops' indexes to the appropriate values from
+--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
+--sizes of the arrays to be copied in the CUDA scaffolding.
+cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
+
+--print_code()
+
+--Does a datacopy, tile, and add_sync to get a shared memory copy
+
+--copy_to_shared("tx", "b", 1)
+--copy_to_shared("tx", "c", -16)
+--print_code()
+--copy_to_texture("b")
+--copy_to_texture("c")
+copy_to_registers("k", "a")
+--print_code()
+
+unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
+--copy_to_texture("b")
+--print_code()
+--unroll(0,5,0)
+--print_code()
diff --git a/examples/cuda-chill/mv_try.c b/examples/cuda-chill/mv_try.c
new file mode 100644
index 0000000..7781f3b
--- /dev/null
+++ b/examples/cuda-chill/mv_try.c
@@ -0,0 +1,9 @@
+#define N 4096
+
+void normalMV(int n, float c[N][N], float a[N], float b[N]) {
+  int i, j; /* mv_try.lua refers to these loops by the names "i" and "j" */
+  /* a[i] += sum over j < n of c[i][j] * b[j]; n is not checked against N here. */
+  for (i = 0; i < n; i++)
+    for (j = 0; j < n; j++)
+      a[i] = a[i] + c[i][j] * b[j]; /* a is accumulated into, not cleared */
+}
diff --git a/examples/cuda-chill/mv_try.lua b/examples/cuda-chill/mv_try.lua
new file mode 100644
index 0000000..db4d9ad
--- /dev/null
+++ b/examples/cuda-chill/mv_try.lua
@@ -0,0 +1,14 @@
+init("mv_try.c","normalMV",0)
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+                      --copy_to_shared methods
+
+TI=96
+
+N=4096
+
+
+tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+cudaize("mv_GPU", {a=N, b=N, c=N*N},
+        {block={"ii"}, thread={"i"}})
+
+print_code()
diff --git a/examples/cuda-chill/nbody.c b/examples/cuda-chill/nbody.c
new file mode 100644
index 0000000..57899b6
--- /dev/null
+++ b/examples/cuda-chill/nbody.c
@@ -0,0 +1,66 @@
+#define NBODIES 16384
+#define SOFTENINGSQUARED 0.01f
+#define DELTATIME 0.001f
+#define DAMPING 1.0f
+
+#define NBLOCKSY 1
+#define NBLOCKSX (NBODIES/NTHREADSX)
+#define NTHREADSY 1 
+#define NTHREADSX 64
+
+#define BLOCKSIZE 128
+
+#define SHARED 1
+#define TIMER 1
+#define VERIFY 1
+
+extern float sqrtf(float);
+
+void nbody_cpu(float* oldpos,float* oldpos1, float *newpos, float *oldvel, float *newvel, float *force)
+{ /* Adds one term per (i, j) pair into force[]; each body occupies 4 consecutive floats. */
+    float r0,r1,r2; /* components of oldpos[j] - oldpos1[i] */
+    float invDist, invDistCube, mass, invMass; /* invMass is referenced only in the disabled loop at the end */
+    unsigned int i,j;
+    for(i = 0; i < NBODIES; ++i) { /* force[] is not reset here: the stores below are commented out */
+        //force[i*4  ] = 0;
+        //force[i*4+1] = 0;
+        //force[i*4+2] = 0;
+        //force[i*4+3] = 0;
+        for(j = 0; j < NBODIES; ++j) {
+	    r0 = oldpos[j*4]-oldpos1[i*4];
+	    r1 = oldpos[j*4+1]-oldpos1[i*4+1];
+	    r2 = oldpos[j*4+2]-oldpos1[i*4+2];
+	    /* 1.0 is a double constant, so the division below is carried out in double. */
+	    invDist = 1.0/sqrtf(r0 * r0 + r1 * r1 + r2 * r2 + SOFTENINGSQUARED);
+	    invDistCube =  invDist * invDist * invDist;
+	    mass = oldpos1[i*4+3]; /* fourth float of body i */
+	    /* Only the first three floats of each force entry are updated. */
+	    force[i*4] = force[i*4] + r0 * mass * invDistCube;
+	    force[i*4+1] = force[i*4+1] + r1 * mass * invDistCube;
+	    force[i*4+2] = force[i*4+2] + r2 * mass * invDistCube;
+
+        }
+    }
+    /* Disabled velocity/position update; newpos, oldvel and newvel are used only in it. */
+/*    for (i = 0; i < NBODIES; ++i) {
+        invMass = oldvel[4*i+3];
+
+        oldvel[4*i] += (force[4*i] * invMass) * DELTATIME * DAMPING;
+        oldvel[4*i+1] += (force[4*i+1] * invMass) * DELTATIME * DAMPING;
+        oldvel[4*i+2] += (force[4*i+2] * invMass) * DELTATIME * DAMPING;
+
+        oldpos[4*i] += oldvel[4*i] * DELTATIME;
+        oldpos[4*i+1] += oldvel[4*i+1] * DELTATIME;
+        oldpos[4*i+2] += oldvel[4*i+2] * DELTATIME;
+
+        newpos[4*i+0] = oldpos[4*i];
+        newpos[4*i+1] = oldpos[4*i+1];
+        newpos[4*i+2] = oldpos[4*i+2];
+        newpos[4*i+3] = oldpos[4*i+3];
+
+        newvel[4*i+0] = oldvel[4*i];
+        newvel[4*i+1] = oldvel[4*i+1];
+        newvel[4*i+2] = oldvel[4*i+2];
+        newvel[4*i+3] = oldvel[4*i+3];
+    }*/
+}
diff --git a/examples/cuda-chill/nbody.lua b/examples/cuda-chill/nbody.lua
new file mode 100644
index 0000000..08f88a9
--- /dev/null
+++ b/examples/cuda-chill/nbody.lua
@@ -0,0 +1,53 @@
+--N-body force loop (nbody_cpu in nbody.c)
+
+--This function form initializes "CUDAIZE v2", versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is the procedure (given here by name) and third is loop #
+init("nbody.c", "nbody_cpu" , 0) 
+
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+                     --copy_to_shared methods
+NBODIES=16384
+
+
+--Tj=128 CHANGE FOR BEST..... BEST IS 64BLOCKS 128THREADS
+--Ti=256
+Tj=64
+Ti=32
+Tjjj=1
+Tiii=1
+Tn=0.1
+--normalize_index("j")
+--
+--print_code()
+--normalize_index("n")
+-- TILE COMMANDS ZEROOOOOOOOOOO:3
+--tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j"})--CU=-1
+tile_by_index({"i"},{Ti},{l1_control="ii"},{"ii","i","j"})--CU=-1
+--normalize_index("i")
+--tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1
+
+--tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3
+--tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"})
+--tile_by_index({"i"}, {Ti/2}, {l1_control="iii"}, {"ii","iii", "jj","i","j"})
+--print_code()
+cudaize("kernel_GPU",{oldpos=4*NBODIES,oldpos1=4*NBODIES,oldvel=4*NBODIES,force=4*NBODIES,newpos=4*NBODIES,newvel=4*NBODIES},{block={"ii"}, thread={"i"}})--CU=3
+print_code()
+--tile(0,6,6)
+--copy_to_shared("tx","oldpos",-16)
+--copy_to_registers("j","oldpos")
+--copy_to_registers("j","oldpos1")
+--copy_to_registers("j","force")
+
+--copy_to_texture("oldpos")
+--tile(1,3,3)
+--tile(2,3,3)
+
+print_code()
+--unroll_to_depth(1)
+--
+--tile(2,3,3)
+--unroll(2,3,0)
+--unroll(0,5,0)
+--print_code()
diff --git a/examples/cuda-chill/tmv-shadow.c b/examples/cuda-chill/tmv-shadow.c
new file mode 100644
index 0000000..cb9ea8d
--- /dev/null
+++ b/examples/cuda-chill/tmv-shadow.c
@@ -0,0 +1,9 @@
+#define N 1024
+
+void normalMV(float c[N][N], float a[N], float b[N]) {
+  int i, j; /* tmv-shadow.lua refers to these loops by the names "i" and "j" */
+  /* a[i] += sum over j of c[i][j] * b[j]; a is accumulated into, not cleared. */
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      a[i] = a[i] + c[i][j] * b[j]; /* the inner loop index j selects the column of c */
+}
diff --git a/examples/cuda-chill/tmv-shadow.lua b/examples/cuda-chill/tmv-shadow.lua
new file mode 100644
index 0000000..196b939
--- /dev/null
+++ b/examples/cuda-chill/tmv-shadow.lua
@@ -0,0 +1,50 @@
+init("tmv-shadow.c","normalMV",0)
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+                      --copy_to_shared methods
+
+N=1024
+--N= 8209
+--N=129
+TI=64
+N=1024
+TI=32
+--Tile the i and j loops: "ii" is the control loop for the "i" tile and
+--"k" for the "j" tile, with the final order of {"ii", "k", "i", "j"}
+tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii",  "i", "j"})
+--print_code()
+--tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
+
+--print_code()
+--normalize_index will do a tile of size one over the loop level specified
+--by the input index. This is useful to get a zero lower bound and hard
+--upper bound on a loop instead of it being relative to previous loop
+--levels.
+--normalize_index("i")
+--print_code()
+
+--Cudaize now determines the grid dimensions from the loops themselves
+--(the upper bounds of the block and thread loops). It also renames the
+--given block and thread loops' indexes to the appropriate values from
+--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
+--sizes of the arrays to be copied in the CUDA scaffolding.
+cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}})
+
+--print_code()
+
+--Does a datacopy, tile, and add_sync to get a shared memory copy
+copy_to_shared("tx", "b", 1)
+--copy_to_texture("b")
+--print_code()
+
+copy_to_shared("tx", "c", -16)
+--copy_to_texture("c")
+--print_code()
+
+copy_to_registers("k", "a")
+print_code()
+--unroll(0,5,0)
+--unroll(0,4,0)
+--unroll(2,4,16)
+unroll_to_depth(1)
+--print_code()
diff --git a/examples/cuda-chill/tmv.c b/examples/cuda-chill/tmv.c
new file mode 100644
index 0000000..cb9ea8d
--- /dev/null
+++ b/examples/cuda-chill/tmv.c
@@ -0,0 +1,9 @@
+#define N 1024
+
+void normalMV(float c[N][N], float a[N], float b[N]) {
+  int i, j; /* tmv.lua refers to these loops by the names "i" and "j" */
+  /* a[i] += sum over j of c[i][j] * b[j]; a is accumulated into, not cleared. */
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      a[i] = a[i] + c[i][j] * b[j]; /* the inner loop index j selects the column of c */
+}
diff --git a/examples/cuda-chill/tmv.lua b/examples/cuda-chill/tmv.lua
new file mode 100644
index 0000000..5071108
--- /dev/null
+++ b/examples/cuda-chill/tmv.lua
@@ -0,0 +1,50 @@
+init("tmv.c","normalMV",0)
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+                      --copy_to_shared methods
+
+N=1024
+--N= 8209
+--N=129
+TI=64
+N=1024
+TI=32
+--Tile the i and j loops: "ii" is the control loop for the "i" tile and
+--"k" for the "j" tile, with the final order of {"ii", "k", "i", "j"}
+tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii",  "i", "j"})
+--print_code()
+--tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
+
+--print_code()
+--normalize_index will do a tile of size one over the loop level specified
+--by the input index. This is useful to get a zero lower bound and hard
+--upper bound on a loop instead of it being relative to previous loop
+--levels.
+--normalize_index("i")
+--print_code()
+
+--Cudaize now determines the grid dimensions from the loops themselves
+--(the upper bounds of the block and thread loops). It also renames the
+--given block and thread loops' indexes to the appropriate values from
+--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
+--sizes of the arrays to be copied in the CUDA scaffolding.
+cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}})
+
+--print_code()
+
+--Does a datacopy, tile, and add_sync to get a shared memory copy
+copy_to_shared("tx", "b", 1)
+--copy_to_texture("b")
+--print_code()
+
+copy_to_shared("tx", "c", -16)
+--copy_to_texture("c")
+--print_code()
+
+copy_to_registers("k", "a")
+print_code()
+--unroll(0,5,0)
+--unroll(0,4,0)
+--unroll(2,4,16)
+unroll_to_depth(1)
+--print_code()
diff --git a/examples/fortran/README b/examples/fortran/README
new file mode 100644
index 0000000..4f23bee
--- /dev/null
+++ b/examples/fortran/README
@@ -0,0 +1,10 @@
+// Manu
+
+1) Fortran support added to permute, tile, unroll and datacopy. Tested these w.r.t gemm.c using gemm.script. 
+   There might be other issues (like fusion due to unroll, ...) that have not been tested.
+
+2) To incorporate Fortran support I had to modify certain values in omega (include/omega/omega_core/oc.h).
+   To solve for a large number of unknowns, these values have to be reverted.
+
+3) Tested the existing chill scripts using Derick's python script. 
+   At least the existing chill scripts are not affected by the fortran related changes.
diff --git a/examples/fortran/ccd.f b/examples/fortran/ccd.f
new file mode 100644
index 0000000..12d834d
--- /dev/null
+++ b/examples/fortran/ccd.f
@@ -0,0 +1,32 @@
+c Adds t1sub(p4,h1)*v2sub(h3,h2,p6,p5) into triplesx(h3,h2,h1,p6,p5,p4).
+c Separated out from ccsd_t_singles_l.F and ccsd_t_doubles_l.F.
+c
+      subroutine clean_sd_t_s1_1(h3d,h2d,h1d,p6d,p5d,p4d,
+     2                     triplesx,t1sub,v2sub)
+      IMPLICIT NONE
+      integer h3d,h2d,h1d,p6d,p5d,p4d
+      integer h3,h2,h1,p6,p5,p4
+      integer N
+      double precision triplesx(16,16,16,16,16,16)
+      double precision t1sub(16,16)
+      double precision v2sub(16,16,16,16)
+c     The *d arguments and N are not read by the loop nest below.
+      N = 16
+c     Every loop runs 1..10 inside arrays declared with extent 16.
+      do p4=1,10
+      do p5=1,10
+      do p6=1,10
+      do h1=1,10
+      do h2=1,10
+      do h3=1,10
+       triplesx(h3,h2,h1,p6,p5,p4)=triplesx(h3,h2,h1,p6,p5,p4)
+     1   + t1sub(p4,h1)*v2sub(h3,h2,p6,p5)
+      enddo
+      enddo
+      enddo
+      enddo
+      enddo
+      enddo
+      return
+      end
+
diff --git a/examples/fortran/ccd.script b/examples/fortran/ccd.script
new file mode 100644
index 0000000..c2af500
--- /dev/null
+++ b/examples/fortran/ccd.script
@@ -0,0 +1,18 @@
+source: ccd.f
+procedure: clean_sd_t_s1_1
+format : rose
+loop: 0
+
+
+
+original()
+
+UN=4
+# Unroll loop levels 5 down to 1 of statement 0 by UN each.
+unroll(0,5,UN)
+unroll(0,4,UN)
+unroll(0,3,UN)
+unroll(0,2,UN)
+unroll(0,1,UN)
+
+print
diff --git a/examples/fortran/gemm.f90 b/examples/fortran/gemm.f90
new file mode 100644
index 0000000..b65bb58
--- /dev/null
+++ b/examples/fortran/gemm.f90
@@ -0,0 +1,58 @@
+program matmul
+    ! Compares subroutine gemm with an inline triple loop on 10x10 data.
+    integer N,i,j,k
+    real*8 a(10,10), b(10,10), c(10,10), ct(10,10),mysum
+    ! Fill a and b, clear both results, then put 1.0 on the diagonal of b.
+    do i=1,10,1
+      do j=1,10,1
+        a(i,j) = i+j
+        b(i,j) = i-j
+        c(i,j) = 0.0
+        ct(i,j) = 0.0
+      end do
+      b(i,i) = 1.0
+    end do
+
+    ! Reference product, accumulated into c.
+      DO j=1,10,1
+         DO k=1,10,1
+            DO i=1,10,1
+               c(i,j) = c(i,j)+a(i,k)*b(k,j)
+            end do
+        end do
+      end do
+
+
+    ! Same product from the subroutine under test, accumulated into ct.
+    call gemm(10,a,b,ct)
+    ! Sum |c - ct| over every element so that no mismatch is overwritten.
+    mysum = 0.0
+    do i=1,10,1
+      do j=1,10,1
+        mysum = mysum + abs(c(i,j) - ct(i,j))
+      end do
+    end do
+
+   if (mysum >= 0.00001) then
+     write (*,*) "Something wrong"
+   else
+     write (*,*) "Output matches"
+   end if
+
+end program matmul
+
+      SUBROUTINE gemm(N,A,B,C)
+      INTEGER N
+      REAL*8  A(N,N), B(N,N), C(N,N)
+      ! Adds the product A*B into C; C is not cleared first.
+      INTEGER I,J,K
+      ! Loop nest order is J, K, I with I innermost; gemm.script names this procedure.
+      DO J=1,N,1
+         DO K=1,N,1
+            DO I=1,N,1
+               C(I,J) = C(I,J)+A(I,K)*B(K,J)
+						end do
+				end do
+			end do
+
+      END subroutine
diff --git a/examples/fortran/gemm.script b/examples/fortran/gemm.script
new file mode 100644
index 0000000..01eb859
--- /dev/null
+++ b/examples/fortran/gemm.script
@@ -0,0 +1,30 @@
+#matrix multiply large array size for intel machine
+source: gemm.f90
+procedure: gemm
+format: rose
+loop: 0
+
+TI = 128
+#TI = 4
+TJ = 8
+#TK = 3
+TK = 512
+UI = 2
+UJ = 2
+
+permute([3,1,2])
+tile(0,2,TJ)
+#print space
+tile(0,2,TI)
+#print space
+tile(0,5,TK)
+#print space
+
+
+datacopy(0,3,A,false,-1)
+#print space
+
+datacopy(0,4,B)
+unroll(0,4,UI)                                                            
+unroll(0,5,UJ)  
+
diff --git a/examples/fortran/rose_gemm.f90 b/examples/fortran/rose_gemm.f90
new file mode 100644
index 0000000..d150922
--- /dev/null
+++ b/examples/fortran/rose_gemm.f90
@@ -0,0 +1,155 @@
+PROGRAM matmul
+INTEGER :: N, i, j, k
+REAL(kind=8) :: a(10,10), b(10,10), c(10,10), ct(10,10), mysum
+DO i = 1, 10, 1
+DO j = 1, 10, 1
+a(i,j) = i + j
+b(i,j) = i - j
+c(i,j) = 0.0
+ct(i,j) = 0.0
+END DO
+b(i,i) = 1.0
+END DO
+DO j = 1, 10, 1
+DO k = 1, 10, 1
+DO i = 1, 10, 1
+c(i,j) = c(i,j) + a(i,k) * b(k,j)
+END DO
+END DO
+END DO
+CALL gemm(10,a,b,ct)
+mysum = 0.0
+DO i = 1, 10, 1
+DO j = 1, 10, 1
+mysum = c(i,j) - ct(i,j)
+END DO
+END DO
+IF (abs(mysum) >= 0.00001) THEN
+WRITE (*, FMT=*) "Something wrong"
+ELSE
+WRITE (*, FMT=*) "Output matches"
+END IF
+END PROGRAM matmul
+
+SUBROUTINE gemm(N,A,B,C)
+INTEGER :: t12
+INTEGER :: t10
+INTEGER :: t8
+INTEGER :: t6
+INTEGER :: t4
+INTEGER :: t2
+INTEGER :: chill_t64
+INTEGER :: chill_t63
+INTEGER :: chill_t62
+INTEGER :: chill_t61
+INTEGER :: chill_t60
+INTEGER :: chill_t59
+INTEGER :: chill_t58
+INTEGER :: chill_t57
+INTEGER :: chill_t56
+INTEGER :: chill_t55
+INTEGER :: chill_t54
+INTEGER :: chill_t53
+INTEGER :: chill_t52
+INTEGER :: chill_t51
+INTEGER :: chill_t50
+INTEGER :: chill_t49
+INTEGER :: chill_t48
+INTEGER :: chill_t47
+INTEGER :: over2
+INTEGER :: chill_t46
+INTEGER :: chill_t45
+INTEGER :: chill_t44
+INTEGER :: chill_t43
+INTEGER :: chill_t42
+INTEGER :: chill_t41
+INTEGER :: chill_t40
+INTEGER :: chill_t39
+INTEGER :: chill_t38
+INTEGER :: chill_t37
+INTEGER :: chill_t36
+INTEGER :: chill_t35
+INTEGER :: chill_t34
+INTEGER :: chill_t33
+INTEGER :: chill_t32
+INTEGER :: chill_t31
+INTEGER :: chill_t30
+INTEGER :: chill_t29
+INTEGER :: chill_t28
+INTEGER :: chill_t27
+INTEGER :: chill_t26
+INTEGER :: chill_t25
+INTEGER :: chill_t24
+INTEGER :: chill_t23
+INTEGER :: over1
+INTEGER :: chill_t22
+INTEGER :: chill_t21
+INTEGER :: chill_t20
+INTEGER :: chill_t19
+INTEGER :: chill_t18
+INTEGER :: chill_t17
+INTEGER :: chill_t16
+INTEGER :: chill_t15
+REAL(kind=8), DIMENSION(8,512) :: f_P2
+INTEGER :: chill_t14
+INTEGER :: chill_t13
+INTEGER :: chill_t12
+INTEGER :: chill_t11
+INTEGER :: chill_t10
+INTEGER :: chill_t9
+INTEGER :: chill_t8
+INTEGER :: chill_t7
+REAL(kind=8), DIMENSION(512,128) :: f_P1
+INTEGER :: chill_t1
+INTEGER :: chill_t2
+INTEGER :: chill_t4
+INTEGER :: chill_t6
+INTEGER :: chill_t5
+INTEGER :: N
+REAL(kind=8) :: A(N,N), B(N,N), C(N,N)
+INTEGER :: I, J, K
+over1 = 0
+over2 = 0
+DO t2 = 1, N, 512
+DO t4 = 1, N, 128
+DO t6 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
+DO t8 = t4, merge(t4 + 127,N,t4 + 127 <= N), 1
+f_P1(t8 - t4 + 1,t6 - t2 + 1) = A(t8,t6)
+END DO
+END DO
+DO t6 = 1, N, 8
+DO t8 = t6, merge(N,t6 + 7,N <= t6 + 7), 1
+DO t10 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
+f_P2(t10 - t2 + 1,t8 - t6 + 1) = B(t10,t8)
+END DO
+END DO
+over1 = MOD(N,2)
+DO t8 = t4, merge(-over1 + N,t4 + 126,-over1 + N <= t4 + 126), 2
+over2 = MOD(N,2)
+DO t10 = t6, merge(t6 + 6,N - over2,t6 + 6 <= N - over2), 2
+DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1
+C(t8,t10) = C(t8,t10) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
+C(t8 + 1,t10) = C(t8 + 1,t10) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
+C(t8,t10 + 1) = C(t8,t10 + 1) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1)
+C(t8 + 1,t10 + 1) = C(t8 + 1,t10 + 1) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1)
+END DO
+END DO
+IF (N - 7 <= t6 .AND. 1 <= over2) THEN
+DO t12 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
+C(t8,N) = C(t8,N) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1)
+C(t8 + 1,N) = C(t8 + 1,N) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1)
+END DO
+END IF
+END DO
+IF (N - 127 <= t4 .AND. 1 <= over1) THEN
+DO t10 = t6, merge(t6 + 7,N,t6 + 7 <= N), 1
+DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1
+C(N,t10) = C(N,t10) + f_P1(N - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
+END DO
+END DO
+END IF
+END DO
+END DO
+END DO
+END SUBROUTINE 
+
author	dhuth <derickhuth@gmail.com>	2014-08-27 09:52:06 -0600
committer	dhuth <derickhuth@gmail.com>	2014-08-27 09:52:06 -0600
commit	bff810cc371a38f493d688c54f71013f5a7d53bf (patch)
tree	fbe86954bb3c01deb21da9e41ebff5baa2889a45 /examples
download	chill-bff810cc371a38f493d688c54f71013f5a7d53bf.tar.gz chill-bff810cc371a38f493d688c54f71013f5a7d53bf.tar.bz2 chill-bff810cc371a38f493d688c54f71013f5a7d53bf.zip