summaryrefslogtreecommitdiff
path: root/test-chill/test-cases
diff options
context:
space:
mode:
Diffstat (limited to 'test-chill/test-cases')
-rw-r--r--test-chill/test-cases/chill-lua.tclist19
-rw-r--r--test-chill/test-cases/chill-python.tclist19
-rw-r--r--test-chill/test-cases/chill-script.tclist36
-rw-r--r--test-chill/test-cases/chill/mm.c15
-rw-r--r--test-chill/test-cases/chill/test_distribute.py12
-rw-r--r--test-chill/test-cases/chill/test_distribute.script10
-rw-r--r--test-chill/test-cases/chill/test_distribute.stdout14
-rw-r--r--test-chill/test-cases/chill/test_fuse.py14
-rw-r--r--test-chill/test-cases/chill/test_fuse.script12
-rw-r--r--test-chill/test-cases/chill/test_fuse.stdout25
-rw-r--r--test-chill/test-cases/chill/test_known.py11
-rw-r--r--test-chill/test-cases/chill/test_known.script9
-rw-r--r--test-chill/test-cases/chill/test_known.stdout11
-rw-r--r--test-chill/test-cases/chill/test_known_2.py9
-rw-r--r--test-chill/test-cases/chill/test_original.py12
-rw-r--r--test-chill/test-cases/chill/test_original.script12
-rw-r--r--test-chill/test-cases/chill/test_original.stdout28
-rw-r--r--test-chill/test-cases/chill/test_peel.py10
-rw-r--r--test-chill/test-cases/chill/test_peel.script10
-rw-r--r--test-chill/test-cases/chill/test_peel.stdout14
-rw-r--r--test-chill/test-cases/chill/test_permute.py12
-rw-r--r--test-chill/test-cases/chill/test_permute.script10
-rw-r--r--test-chill/test-cases/chill/test_permute.stdout17
-rw-r--r--test-chill/test-cases/chill/test_print_code.py8
-rw-r--r--test-chill/test-cases/chill/test_print_code.script7
-rw-r--r--test-chill/test-cases/chill/test_print_code.stdout18
-rw-r--r--test-chill/test-cases/chill/test_print_dep.py8
-rw-r--r--test-chill/test-cases/chill/test_print_dep.script7
-rw-r--r--test-chill/test-cases/chill/test_print_dep.stdout4
-rw-r--r--test-chill/test-cases/chill/test_print_space.py8
-rw-r--r--test-chill/test-cases/chill/test_print_space.script7
-rw-r--r--test-chill/test-cases/chill/test_print_space.stdout3
-rw-r--r--test-chill/test-cases/chill/test_reverse.py12
-rw-r--r--test-chill/test-cases/chill/test_reverse.script12
-rw-r--r--test-chill/test-cases/chill/test_reverse.stdout14
-rw-r--r--test-chill/test-cases/chill/test_scale.py12
-rw-r--r--test-chill/test-cases/chill/test_scale.script13
-rw-r--r--test-chill/test-cases/chill/test_scale.stdout14
-rw-r--r--test-chill/test-cases/chill/test_shift.py12
-rw-r--r--test-chill/test-cases/chill/test_shift.script11
-rw-r--r--test-chill/test-cases/chill/test_shift.stdout24
-rw-r--r--test-chill/test-cases/chill/test_shift_to.py12
-rw-r--r--test-chill/test-cases/chill/test_shift_to.script11
-rw-r--r--test-chill/test-cases/chill/test_shift_to.stdout11
-rw-r--r--test-chill/test-cases/chill/test_skew.py12
-rw-r--r--test-chill/test-cases/chill/test_skew.script11
-rw-r--r--test-chill/test-cases/chill/test_skew.stdout22
-rw-r--r--test-chill/test-cases/chill/test_tile.py14
-rw-r--r--test-chill/test-cases/chill/test_tile.script7
-rw-r--r--test-chill/test-cases/chill/test_tile.stdout20
-rw-r--r--test-chill/test-cases/chill/test_unroll.py13
-rw-r--r--test-chill/test-cases/chill/test_unroll.script11
-rw-r--r--test-chill/test-cases/chill/test_unroll.stdout19
-rw-r--r--test-chill/test-cases/chill/test_unroll_extra.py12
-rw-r--r--test-chill/test-cases/chill/test_unroll_extra.script11
-rw-r--r--test-chill/test-cases/chill/test_unroll_extra.stdout28
-rw-r--r--test-chill/test-cases/cuda-chill-lua.tclist13
-rw-r--r--test-chill/test-cases/cuda-chill-python.tclist1
-rw-r--r--test-chill/test-cases/examples/chill/gemm.c25
-rw-r--r--test-chill/test-cases/examples/chill/gemm.script31
-rw-r--r--test-chill/test-cases/examples/chill/gemv.c21
-rw-r--r--test-chill/test-cases/examples/chill/gemv.script9
-rw-r--r--test-chill/test-cases/examples/chill/jacobi1.c19
-rw-r--r--test-chill/test-cases/examples/chill/jacobi1.script18
-rw-r--r--test-chill/test-cases/examples/chill/jacobi2.c15
-rw-r--r--test-chill/test-cases/examples/chill/jacobi2.script21
-rw-r--r--test-chill/test-cases/examples/chill/qr.c44
-rw-r--r--test-chill/test-cases/examples/chill/qr.script13
-rw-r--r--test-chill/test-cases/examples/chill/scalar_test.c16
-rw-r--r--test-chill/test-cases/examples/chill/scalar_test.script10
-rw-r--r--test-chill/test-cases/examples/chill/swim.c159
-rw-r--r--test-chill/test-cases/examples/chill/swim.script13
-rw-r--r--test-chill/test-cases/examples/chill/test_align.c20
-rw-r--r--test-chill/test-cases/examples/chill/test_align.script12
-rw-r--r--test-chill/test-cases/examples/chill/test_fusion.c13
-rw-r--r--test-chill/test-cases/examples/chill/test_fusion.script7
-rw-r--r--test-chill/test-cases/examples/chill/test_lex_order.c31
-rw-r--r--test-chill/test-cases/examples/chill/test_lex_order.script12
-rw-r--r--test-chill/test-cases/examples/chill/test_split.c14
-rw-r--r--test-chill/test-cases/examples/chill/test_split.script9
-rw-r--r--test-chill/test-cases/examples/chill/test_split2.c14
-rw-r--r--test-chill/test-cases/examples/chill/test_split2.script9
-rw-r--r--test-chill/test-cases/examples/chill/test_tile.c16
-rw-r--r--test-chill/test-cases/examples/chill/test_tile.script14
-rw-r--r--test-chill/test-cases/examples/chill/tile_violation.c12
-rw-r--r--test-chill/test-cases/examples/chill/tile_violation.script14
-rw-r--r--test-chill/test-cases/examples/chill/unroll.c31
-rw-r--r--test-chill/test-cases/examples/chill/unroll.script35
-rw-r--r--test-chill/test-cases/examples/chill/unroll_violation.c12
-rw-r--r--test-chill/test-cases/examples/chill/unroll_violation.script14
-rw-r--r--test-chill/test-cases/examples/cuda-chill/cp.c29
-rw-r--r--test-chill/test-cases/examples/cuda-chill/cp.lua46
-rw-r--r--test-chill/test-cases/examples/cuda-chill/cudaize.lua1004
-rwxr-xr-xtest-chill/test-cases/examples/cuda-chill/cudaize.py1047
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mm.c10
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mm.lua38
-rwxr-xr-xtest-chill/test-cases/examples/cuda-chill/mpeg4.c23
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mpeg4.lua45
-rwxr-xr-xtest-chill/test-cases/examples/cuda-chill/mriq-fh.c38
-rwxr-xr-xtest-chill/test-cases/examples/cuda-chill/mriq-fh.lua73
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mriq.c33
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mriq.lua55
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mv-shadow.c9
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mv-shadow.lua65
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mv.c9
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mv.lua65
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mv_try.c9
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mv_try.lua14
-rw-r--r--test-chill/test-cases/examples/cuda-chill/nbody.c66
-rw-r--r--test-chill/test-cases/examples/cuda-chill/nbody.lua53
-rw-r--r--test-chill/test-cases/examples/cuda-chill/tmv-shadow.c9
-rw-r--r--test-chill/test-cases/examples/cuda-chill/tmv-shadow.lua50
-rw-r--r--test-chill/test-cases/examples/cuda-chill/tmv.c9
-rw-r--r--test-chill/test-cases/examples/cuda-chill/tmv.lua50
-rw-r--r--test-chill/test-cases/unit/chill-basic-python.tclist20
-rw-r--r--test-chill/test-cases/unit/chill-basic-script.tclist20
-rw-r--r--test-chill/test-cases/unit/chill-basic.tclist4
117 files changed, 4359 insertions, 0 deletions
diff --git a/test-chill/test-cases/chill-lua.tclist b/test-chill/test-cases/chill-lua.tclist
new file mode 100644
index 0000000..39bd140
--- /dev/null
+++ b/test-chill/test-cases/chill-lua.tclist
@@ -0,0 +1,19 @@
+
+#chill-testcase test-cases/chill/test_distribute.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_fuse.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_known.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_original.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_peel.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_permute.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_print_code.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_print_dep.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_print_space.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_reverse.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_scale.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_shift.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_shift_to.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_skew.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_tile.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_unroll_extra.lua test-cases/chill/mm.c --check-run-script
+#chill-testcase test-cases/chill/test_unroll.lua test-cases/chill/mm.c --check-run-script
+
diff --git a/test-chill/test-cases/chill-python.tclist b/test-chill/test-cases/chill-python.tclist
new file mode 100644
index 0000000..ee6c54c
--- /dev/null
+++ b/test-chill/test-cases/chill-python.tclist
@@ -0,0 +1,19 @@
+
+chill-testcase test-cases/chill/test_distribute.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_fuse.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_known.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_original.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_peel.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_permute.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_print_code.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_print_dep.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_print_space.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_reverse.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_scale.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_shift.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_shift_to.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_skew.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_tile.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_unroll_extra.py test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_unroll.py test-cases/chill/mm.c --check-run-script
+
diff --git a/test-chill/test-cases/chill-script.tclist b/test-chill/test-cases/chill-script.tclist
new file mode 100644
index 0000000..117ee0d
--- /dev/null
+++ b/test-chill/test-cases/chill-script.tclist
@@ -0,0 +1,36 @@
+
+chill-testcase test-cases/chill/test_distribute.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_fuse.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_known.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_original.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_peel.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_permute.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_print_code.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_print_dep.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_print_space.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_reverse.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_scale.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_shift.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_shift_to.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_skew.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_tile.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_unroll_extra.script test-cases/chill/mm.c --check-run-script
+chill-testcase test-cases/chill/test_unroll.script test-cases/chill/mm.c --check-run-script
+
+chill-testcase test-cases/examples/chill/gemm.script test-cases/examples/chill/gemm.c
+chill-testcase test-cases/examples/chill/gemv.script test-cases/examples/chill/gemv.c
+chill-testcase test-cases/examples/chill/jacobi1.script test-cases/examples/chill/jacobi1.c
+chill-testcase test-cases/examples/chill/jacobi2.script test-cases/examples/chill/jacobi2.c
+chill-testcase test-cases/examples/chill/qr.script test-cases/examples/chill/qr.c
+#chill-testcase test-cases/examples/chill/scalar_test.script test-cases/examples/chill/scalar_test.c --check-run-script --fail-run-script
+chill-testcase test-cases/examples/chill/swim.script test-cases/examples/chill/swim.c
+chill-testcase test-cases/examples/chill/test_align.script test-cases/examples/chill/test_align.c
+#chill-testcase test-cases/examples/chill/test_fusion.script test-cases/examples/chill/test_fusion.c --check-run-script --fail-run-script
+#chill-testcase test-cases/examples/chill/test_lex_order.script test-cases/examples/chill/test_lex_order.c --check-run-script --fail-run-script
+chill-testcase test-cases/examples/chill/test_split.script test-cases/examples/chill/test_split.c
+#chill-testcase test-cases/examples/chill/test_split2.script test-cases/examples/chill/test_split2.c --check-run-script --fail-run-script
+chill-testcase test-cases/examples/chill/test_tile.script test-cases/examples/chill/test_tile.c
+#chill-testcase test-cases/examples/chill/tile_violation.script test-cases/examples/chill/tile_violation.c --check-run-script --fail-run-script
+chill-testcase test-cases/examples/chill/unroll.script test-cases/examples/chill/unroll.c
+#chill-testcase test-cases/examples/chill/unroll_violation.script test-cases/examples/chill/unroll_violation.c --check-run-script --fail-run-script
+
diff --git a/test-chill/test-cases/chill/mm.c b/test-chill/test-cases/chill/mm.c
new file mode 100644
index 0000000..354d929
--- /dev/null
+++ b/test-chill/test-cases/chill/mm.c
@@ -0,0 +1,15 @@
+
+
+void mm(float **A, float **B, float **C, int ambn, int an, int bm) {
+ int i, j, n;
+
+ for(i = 0; i < an; i++) {
+ for(j = 0; j < bm; j++) {
+ C[i][j] = 0.0f;
+ for(n = 0; n < ambn; n++) {
+ C[i][j] += A[i][n] * B[n][j];
+ }
+ }
+ }
+}
+
diff --git a/test-chill/test-cases/chill/test_distribute.py b/test-chill/test-cases/chill/test_distribute.py
new file mode 100644
index 0000000..760d29f
--- /dev/null
+++ b/test-chill/test-cases/chill/test_distribute.py
@@ -0,0 +1,12 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known('ambn > 0')
+known('an > 0')
+known('bm > 0')
+distribute([0,1], 1)
+print_code()
diff --git a/test-chill/test-cases/chill/test_distribute.script b/test-chill/test-cases/chill/test_distribute.script
new file mode 100644
index 0000000..2476e8d
--- /dev/null
+++ b/test-chill/test-cases/chill/test_distribute.script
@@ -0,0 +1,10 @@
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+distribute([0,1], 1)
+print
diff --git a/test-chill/test-cases/chill/test_distribute.stdout b/test-chill/test-cases/chill/test_distribute.stdout
new file mode 100644
index 0000000..f6aa1a8
--- /dev/null
+++ b/test-chill/test-cases/chill/test_distribute.stdout
@@ -0,0 +1,14 @@
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s0(t2,t4,0);
+ }
+}
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ for(t6 = 0; t6 <= ambn-1; t6++) {
+ s1(t2,t4,t6);
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_fuse.py b/test-chill/test-cases/chill/test_fuse.py
new file mode 100644
index 0000000..32c594c
--- /dev/null
+++ b/test-chill/test-cases/chill/test_fuse.py
@@ -0,0 +1,14 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known('ambn > 0')
+known('an > 0')
+known('bm > 0')
+distribute([0,1], 1)
+print_code()
+fuse([0,1], 1)
+print_code()
diff --git a/test-chill/test-cases/chill/test_fuse.script b/test-chill/test-cases/chill/test_fuse.script
new file mode 100644
index 0000000..6578ad2
--- /dev/null
+++ b/test-chill/test-cases/chill/test_fuse.script
@@ -0,0 +1,12 @@
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+distribute([0,1], 1)
+print
+fuse([0,1], 1)
+print
diff --git a/test-chill/test-cases/chill/test_fuse.stdout b/test-chill/test-cases/chill/test_fuse.stdout
new file mode 100644
index 0000000..922d511
--- /dev/null
+++ b/test-chill/test-cases/chill/test_fuse.stdout
@@ -0,0 +1,25 @@
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s0(t2,t4,0);
+ }
+}
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ for(t6 = 0; t6 <= ambn-1; t6++) {
+ s1(t2,t4,t6);
+ }
+ }
+}
+
+
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s0(t2,t4,0);
+ s1(t2,t4,0);
+ for(t6 = 1; t6 <= ambn-1; t6++) {
+ s1(t2,t4,t6);
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_known.py b/test-chill/test-cases/chill/test_known.py
new file mode 100644
index 0000000..662d7d0
--- /dev/null
+++ b/test-chill/test-cases/chill/test_known.py
@@ -0,0 +1,11 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known('ambn > 0')
+known('an > 0')
+known('bm > 0')
+print_code()
diff --git a/test-chill/test-cases/chill/test_known.script b/test-chill/test-cases/chill/test_known.script
new file mode 100644
index 0000000..6772e18
--- /dev/null
+++ b/test-chill/test-cases/chill/test_known.script
@@ -0,0 +1,9 @@
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+print
diff --git a/test-chill/test-cases/chill/test_known.stdout b/test-chill/test-cases/chill/test_known.stdout
new file mode 100644
index 0000000..6975a99
--- /dev/null
+++ b/test-chill/test-cases/chill/test_known.stdout
@@ -0,0 +1,11 @@
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s0(t2,t4,0);
+ s1(t2,t4,0);
+ for(t6 = 1; t6 <= ambn-1; t6++) {
+ s1(t2,t4,t6);
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_known_2.py b/test-chill/test-cases/chill/test_known_2.py
new file mode 100644
index 0000000..5b16325
--- /dev/null
+++ b/test-chill/test-cases/chill/test_known_2.py
@@ -0,0 +1,9 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known(['ambn > 0', 'an > 0', 'bm > 0'])
+print_code()
diff --git a/test-chill/test-cases/chill/test_original.py b/test-chill/test-cases/chill/test_original.py
new file mode 100644
index 0000000..2d17799
--- /dev/null
+++ b/test-chill/test-cases/chill/test_original.py
@@ -0,0 +1,12 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known(['ambn > 4', 'an > 0', 'bm > 0'])
+peel(1,3,4)
+print_code()
+original()
+print_code()
diff --git a/test-chill/test-cases/chill/test_original.script b/test-chill/test-cases/chill/test_original.script
new file mode 100644
index 0000000..8f07121
--- /dev/null
+++ b/test-chill/test-cases/chill/test_original.script
@@ -0,0 +1,12 @@
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 4)
+known(an > 0)
+known(bm > 0)
+peel(1,3,4)
+print
+original()
+print
diff --git a/test-chill/test-cases/chill/test_original.stdout b/test-chill/test-cases/chill/test_original.stdout
new file mode 100644
index 0000000..5121763
--- /dev/null
+++ b/test-chill/test-cases/chill/test_original.stdout
@@ -0,0 +1,28 @@
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s2(t2,t4,0);
+ s3(t2,t4,0);
+ s4(t2,t4,1);
+ s5(t2,t4,2);
+ s6(t2,t4,3);
+ for(t6 = 4; t6 <= ambn-1; t6++) {
+ s1(t2,t4,t6);
+ }
+ }
+}
+
+
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s2(t2,t4,0);
+ s3(t2,t4,0);
+ s4(t2,t4,1);
+ s5(t2,t4,2);
+ s6(t2,t4,3);
+ for(t6 = 4; t6 <= ambn-1; t6++) {
+ s1(t2,t4,t6);
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_peel.py b/test-chill/test-cases/chill/test_peel.py
new file mode 100644
index 0000000..bb6c583
--- /dev/null
+++ b/test-chill/test-cases/chill/test_peel.py
@@ -0,0 +1,10 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known(['ambn > 4', 'an > 0', 'bm > 0'])
+peel(1,3,4)
+print_code()
diff --git a/test-chill/test-cases/chill/test_peel.script b/test-chill/test-cases/chill/test_peel.script
new file mode 100644
index 0000000..121868e
--- /dev/null
+++ b/test-chill/test-cases/chill/test_peel.script
@@ -0,0 +1,10 @@
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 4)
+known(an > 0)
+known(bm > 0)
+peel(1,3,4)
+print
diff --git a/test-chill/test-cases/chill/test_peel.stdout b/test-chill/test-cases/chill/test_peel.stdout
new file mode 100644
index 0000000..7096b21
--- /dev/null
+++ b/test-chill/test-cases/chill/test_peel.stdout
@@ -0,0 +1,14 @@
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s2(t2,t4,0);
+ s3(t2,t4,0);
+ s4(t2,t4,1);
+ s5(t2,t4,2);
+ s6(t2,t4,3);
+ for(t6 = 4; t6 <= ambn-1; t6++) {
+ s1(t2,t4,t6);
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_permute.py b/test-chill/test-cases/chill/test_permute.py
new file mode 100644
index 0000000..c201d2f
--- /dev/null
+++ b/test-chill/test-cases/chill/test_permute.py
@@ -0,0 +1,12 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known('ambn > 0')
+known('an > 0')
+known('bm > 0')
+permute([3,1,2])
+print_code()
diff --git a/test-chill/test-cases/chill/test_permute.script b/test-chill/test-cases/chill/test_permute.script
new file mode 100644
index 0000000..946bff5
--- /dev/null
+++ b/test-chill/test-cases/chill/test_permute.script
@@ -0,0 +1,10 @@
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+permute([3,1,2])
+print
diff --git a/test-chill/test-cases/chill/test_permute.stdout b/test-chill/test-cases/chill/test_permute.stdout
new file mode 100644
index 0000000..0268d6b
--- /dev/null
+++ b/test-chill/test-cases/chill/test_permute.stdout
@@ -0,0 +1,17 @@
+for(t2 = 0; t2 <= ambn-1; t2++) {
+ for(t4 = 0; t4 <= an-1; t4++) {
+ if (t2 <= 0) {
+ for(t6 = 0; t6 <= bm-1; t6++) {
+ s0(t4,t6,t2);
+ s1(t4,t6,t2);
+ }
+ }
+ else {
+ for(t6 = 0; t6 <= bm-1; t6++) {
+ s1(t4,t6,t2);
+ }
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_print_code.py b/test-chill/test-cases/chill/test_print_code.py
new file mode 100644
index 0000000..004c46c
--- /dev/null
+++ b/test-chill/test-cases/chill/test_print_code.py
@@ -0,0 +1,8 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+print_code()
diff --git a/test-chill/test-cases/chill/test_print_code.script b/test-chill/test-cases/chill/test_print_code.script
new file mode 100644
index 0000000..20c8364
--- /dev/null
+++ b/test-chill/test-cases/chill/test_print_code.script
@@ -0,0 +1,7 @@
+
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+print
diff --git a/test-chill/test-cases/chill/test_print_code.stdout b/test-chill/test-cases/chill/test_print_code.stdout
new file mode 100644
index 0000000..b4ece20
--- /dev/null
+++ b/test-chill/test-cases/chill/test_print_code.stdout
@@ -0,0 +1,18 @@
+if (bm >= 1) {
+ for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ if (ambn >= 1) {
+ s0(t2,t4,0);
+ s1(t2,t4,0);
+ }
+ for(t6 = 1; t6 <= ambn-1; t6++) {
+ s1(t2,t4,t6);
+ }
+ if (ambn <= 0) {
+ s0(t2,t4,0);
+ }
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_print_dep.py b/test-chill/test-cases/chill/test_print_dep.py
new file mode 100644
index 0000000..a3dee29
--- /dev/null
+++ b/test-chill/test-cases/chill/test_print_dep.py
@@ -0,0 +1,8 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+print_dep()
diff --git a/test-chill/test-cases/chill/test_print_dep.script b/test-chill/test-cases/chill/test_print_dep.script
new file mode 100644
index 0000000..99dc567
--- /dev/null
+++ b/test-chill/test-cases/chill/test_print_dep.script
@@ -0,0 +1,7 @@
+
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+print dep
diff --git a/test-chill/test-cases/chill/test_print_dep.stdout b/test-chill/test-cases/chill/test_print_dep.stdout
new file mode 100644
index 0000000..ab679a3
--- /dev/null
+++ b/test-chill/test-cases/chill/test_print_dep.stdout
@@ -0,0 +1,4 @@
+dependence graph:
+1->2: C:flow(0, 0, +) C:flow(0, 0, 0) C:output(0, 0, +) C:output(0, 0, 0)
+2->2: C:anti(0, 0, +) C:output(0, 0, +)
+
diff --git a/test-chill/test-cases/chill/test_print_space.py b/test-chill/test-cases/chill/test_print_space.py
new file mode 100644
index 0000000..2f8f678
--- /dev/null
+++ b/test-chill/test-cases/chill/test_print_space.py
@@ -0,0 +1,8 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+print_space()
diff --git a/test-chill/test-cases/chill/test_print_space.script b/test-chill/test-cases/chill/test_print_space.script
new file mode 100644
index 0000000..d8c81df
--- /dev/null
+++ b/test-chill/test-cases/chill/test_print_space.script
@@ -0,0 +1,7 @@
+
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+print space
diff --git a/test-chill/test-cases/chill/test_print_space.stdout b/test-chill/test-cases/chill/test_print_space.stdout
new file mode 100644
index 0000000..f97c8c1
--- /dev/null
+++ b/test-chill/test-cases/chill/test_print_space.stdout
@@ -0,0 +1,3 @@
+s0: { Sym=[bm,an] [t1,t2,t3,t4,t5,t6,t7] : t1 = 0 && t3 = 0 && t5 = 0 && t7 = 0 && t6 = 0 && 0 <= t2 < an && 0 <= t4 < bm }
+s1: { Sym=[ambn,bm,an] [t1,t2,t3,t4,t5,t6,t7] : t1 = 0 && t3 = 0 && t5 = 0 && t7 = 0 && 0 <= t2 < an && 0 <= t6 < ambn && 0 <= t4 < bm }
+
diff --git a/test-chill/test-cases/chill/test_reverse.py b/test-chill/test-cases/chill/test_reverse.py
new file mode 100644
index 0000000..a97c611
--- /dev/null
+++ b/test-chill/test-cases/chill/test_reverse.py
@@ -0,0 +1,12 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known(['ambn > 0', 'an > 0', 'bm > 0'])
+distribute([0,1],1)
+reverse([1],1)
+reverse([1],2)
+print_code()
diff --git a/test-chill/test-cases/chill/test_reverse.script b/test-chill/test-cases/chill/test_reverse.script
new file mode 100644
index 0000000..fc04d5c
--- /dev/null
+++ b/test-chill/test-cases/chill/test_reverse.script
@@ -0,0 +1,12 @@
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+distribute([0,1],1)
+reverse([1],1)
+reverse([1],2)
+print
diff --git a/test-chill/test-cases/chill/test_reverse.stdout b/test-chill/test-cases/chill/test_reverse.stdout
new file mode 100644
index 0000000..182b822
--- /dev/null
+++ b/test-chill/test-cases/chill/test_reverse.stdout
@@ -0,0 +1,14 @@
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s0(t2,t4,0);
+ }
+}
+for(t2 = -an+1; t2 <= 0; t2++) {
+ for(t4 = -bm+1; t4 <= 0; t4++) {
+ for(t6 = 0; t6 <= ambn-1; t6++) {
+ s1(-t2,-t4,t6);
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_scale.py b/test-chill/test-cases/chill/test_scale.py
new file mode 100644
index 0000000..ee8455d
--- /dev/null
+++ b/test-chill/test-cases/chill/test_scale.py
@@ -0,0 +1,12 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known(['ambn > 0', 'an > 0', 'bm > 0'])
+distribute([0,1],1)
+scale([1],1,4)
+scale([1],2,4)
+print_code()
diff --git a/test-chill/test-cases/chill/test_scale.script b/test-chill/test-cases/chill/test_scale.script
new file mode 100644
index 0000000..20611ec
--- /dev/null
+++ b/test-chill/test-cases/chill/test_scale.script
@@ -0,0 +1,13 @@
+
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+distribute([0,1],1)
+scale([1],1,4)
+scale([1],2,4)
+print
diff --git a/test-chill/test-cases/chill/test_scale.stdout b/test-chill/test-cases/chill/test_scale.stdout
new file mode 100644
index 0000000..049451c
--- /dev/null
+++ b/test-chill/test-cases/chill/test_scale.stdout
@@ -0,0 +1,14 @@
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s0(t2,t4,0);
+ }
+}
+for(t2 = 0; t2 <= 4*an-4; t2 += 4) {
+ for(t4 = 0; t4 <= 4*bm-4; t4 += 4) {
+ for(t6 = 0; t6 <= ambn-1; t6++) {
+ s1(t2/4,t4/4,t6);
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_shift.py b/test-chill/test-cases/chill/test_shift.py
new file mode 100644
index 0000000..b3fc6ab
--- /dev/null
+++ b/test-chill/test-cases/chill/test_shift.py
@@ -0,0 +1,12 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known('ambn > 0')
+known('an > 0')
+known('bm > 0')
+shift([1],1,4)
+print_code()
diff --git a/test-chill/test-cases/chill/test_shift.script b/test-chill/test-cases/chill/test_shift.script
new file mode 100644
index 0000000..d3d67aa
--- /dev/null
+++ b/test-chill/test-cases/chill/test_shift.script
@@ -0,0 +1,11 @@
+
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+shift([1],1,4)
+print
diff --git a/test-chill/test-cases/chill/test_shift.stdout b/test-chill/test-cases/chill/test_shift.stdout
new file mode 100644
index 0000000..2b96895
--- /dev/null
+++ b/test-chill/test-cases/chill/test_shift.stdout
@@ -0,0 +1,24 @@
+for(t2 = 0; t2 <= an+3; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ if (an >= t2+1) {
+ s0(t2,t4,0);
+ if (t2 >= 4) {
+ s1(t2-4,t4,0);
+ }
+ if (t2 >= 4) {
+ for(t6 = 1; t6 <= ambn-1; t6++) {
+ s1(t2-4,t4,t6);
+ }
+ }
+ }
+ else {
+ if (t2 >= 4) {
+ for(t6 = 0; t6 <= ambn-1; t6++) {
+ s1(t2-4,t4,t6);
+ }
+ }
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_shift_to.py b/test-chill/test-cases/chill/test_shift_to.py
new file mode 100644
index 0000000..f3537c5
--- /dev/null
+++ b/test-chill/test-cases/chill/test_shift_to.py
@@ -0,0 +1,12 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known('ambn > 0')
+known('an > 0')
+known('bm > 0')
+shift_to(1,1,4)
+print_code()
diff --git a/test-chill/test-cases/chill/test_shift_to.script b/test-chill/test-cases/chill/test_shift_to.script
new file mode 100644
index 0000000..64a6443
--- /dev/null
+++ b/test-chill/test-cases/chill/test_shift_to.script
@@ -0,0 +1,11 @@
+
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+shift_to(1,1,4)
+print
diff --git a/test-chill/test-cases/chill/test_shift_to.stdout b/test-chill/test-cases/chill/test_shift_to.stdout
new file mode 100644
index 0000000..820d83f
--- /dev/null
+++ b/test-chill/test-cases/chill/test_shift_to.stdout
@@ -0,0 +1,11 @@
+for(t2 = 4; t2 <= an+3; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s0(t2-4,t4,0);
+ s1(t2-4,t4,0);
+ for(t6 = 1; t6 <= ambn-1; t6++) {
+ s1(t2-4,t4,t6);
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_skew.py b/test-chill/test-cases/chill/test_skew.py
new file mode 100644
index 0000000..c7271f4
--- /dev/null
+++ b/test-chill/test-cases/chill/test_skew.py
@@ -0,0 +1,12 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known('ambn > 0')
+known('an > 0')
+known('bm > 0')
+skew([1],1,[4])
+print_code()
diff --git a/test-chill/test-cases/chill/test_skew.script b/test-chill/test-cases/chill/test_skew.script
new file mode 100644
index 0000000..e9cf75a
--- /dev/null
+++ b/test-chill/test-cases/chill/test_skew.script
@@ -0,0 +1,11 @@
+
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+skew([1],1,[4])
+print
diff --git a/test-chill/test-cases/chill/test_skew.stdout b/test-chill/test-cases/chill/test_skew.stdout
new file mode 100644
index 0000000..a1b36f8
--- /dev/null
+++ b/test-chill/test-cases/chill/test_skew.stdout
@@ -0,0 +1,22 @@
+for(t2 = 0; t2 <= 4*an-4; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ if (an >= t2+1) {
+ s0(t2,t4,0);
+ if (intMod(t2,4) == 0) {
+ s1(t2/4,t4,0);
+ }
+ }
+ else {
+ if (intMod(t2,4) == 0) {
+ s1(t2/4,t4,0);
+ }
+ }
+ if (intMod(t2,4) == 0) {
+ for(t6 = 1; t6 <= ambn-1; t6++) {
+ s1(t2/4,t4,t6);
+ }
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_tile.py b/test-chill/test-cases/chill/test_tile.py
new file mode 100644
index 0000000..fbe0368
--- /dev/null
+++ b/test-chill/test-cases/chill/test_tile.py
@@ -0,0 +1,14 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+#known('ambn > 0')
+#known('an > 0')
+#known('bm > 0')
+#tile(1, 1, 4, 1)
+#tile(1, 3, 4, 2)
+tile(0,2,4)
+print_code()
diff --git a/test-chill/test-cases/chill/test_tile.script b/test-chill/test-cases/chill/test_tile.script
new file mode 100644
index 0000000..de27998
--- /dev/null
+++ b/test-chill/test-cases/chill/test_tile.script
@@ -0,0 +1,7 @@
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+tile(0, 2, 4)
+print
diff --git a/test-chill/test-cases/chill/test_tile.stdout b/test-chill/test-cases/chill/test_tile.stdout
new file mode 100644
index 0000000..0a2d89a
--- /dev/null
+++ b/test-chill/test-cases/chill/test_tile.stdout
@@ -0,0 +1,20 @@
+if (an >= 1) {
+ for(t2 = 0; t2 <= bm-1; t2 += 4) {
+ for(t4 = 0; t4 <= an-1; t4++) {
+ for(t6 = t2; t6 <= min(bm-1,t2+3); t6++) {
+ if (ambn >= 1) {
+ s0(t4,t6,0);
+ s1(t4,t6,0);
+ }
+ for(t8 = 1; t8 <= ambn-1; t8++) {
+ s1(t4,t6,t8);
+ }
+ if (ambn <= 0) {
+ s0(t4,t6,0);
+ }
+ }
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_unroll.py b/test-chill/test-cases/chill/test_unroll.py
new file mode 100644
index 0000000..39dd0db
--- /dev/null
+++ b/test-chill/test-cases/chill/test_unroll.py
@@ -0,0 +1,13 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known('ambn > 0')
+known('an > 0')
+known('bm > 0')
+distribute([0,1], 1)
+unroll(1, 3, 4)
+print_code()
diff --git a/test-chill/test-cases/chill/test_unroll.script b/test-chill/test-cases/chill/test_unroll.script
new file mode 100644
index 0000000..bd19bd1
--- /dev/null
+++ b/test-chill/test-cases/chill/test_unroll.script
@@ -0,0 +1,11 @@
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+distribute([0,1], 1)
+unroll(1, 3, 4)
+print
diff --git a/test-chill/test-cases/chill/test_unroll.stdout b/test-chill/test-cases/chill/test_unroll.stdout
new file mode 100644
index 0000000..71616bf
--- /dev/null
+++ b/test-chill/test-cases/chill/test_unroll.stdout
@@ -0,0 +1,19 @@
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s0(t2,t4,0);
+ }
+}
+for(t2 = 0; t2 <= an-1; t2++) {
+ for(t4 = 0; t4 <= bm-1; t4++) {
+ s2(t2,t4);
+ for(t6 = 0; t6 <= -over1+ambn-1; t6 += 4) {
+ s1(t2,t4,t6);
+ s4(t2,t4,t6);
+ }
+ for(t6 = max(0,ambn-over1); t6 <= ambn-1; t6++) {
+ s3(t2,t4,t6);
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/chill/test_unroll_extra.py b/test-chill/test-cases/chill/test_unroll_extra.py
new file mode 100644
index 0000000..929313c
--- /dev/null
+++ b/test-chill/test-cases/chill/test_unroll_extra.py
@@ -0,0 +1,12 @@
+from chill import *
+
+source('mm.c')
+procedure('mm')
+#format: rose
+loop(0)
+
+known('ambn > 0')
+known('an > 0')
+known('bm > 0')
+unroll_extra(1, 2, 4)
+print_code()
diff --git a/test-chill/test-cases/chill/test_unroll_extra.script b/test-chill/test-cases/chill/test_unroll_extra.script
new file mode 100644
index 0000000..fae244e
--- /dev/null
+++ b/test-chill/test-cases/chill/test_unroll_extra.script
@@ -0,0 +1,11 @@
+
+source: mm.c
+procedure: mm
+format: rose
+loop: 0
+
+known(ambn > 0)
+known(an > 0)
+known(bm > 0)
+unroll_extra(1, 2, 4)
+print
diff --git a/test-chill/test-cases/chill/test_unroll_extra.stdout b/test-chill/test-cases/chill/test_unroll_extra.stdout
new file mode 100644
index 0000000..eca65f2
--- /dev/null
+++ b/test-chill/test-cases/chill/test_unroll_extra.stdout
@@ -0,0 +1,28 @@
+for(t2 = 0; t2 <= an-1; t2++) {
+ s2(t2);
+ for(t4 = 0; t4 <= -over1+bm-1; t4 += 4) {
+ s0(t2,t4,0);
+ s1(t2,t4,0);
+ s5(t2,t4,0);
+ s6(t2,t4,0);
+ s7(t2,t4,0);
+ s8(t2,t4,0);
+ s9(t2,t4,0);
+ s10(t2,t4,0);
+ for(t6 = 1; t6 <= ambn-1; t6++) {
+ s1(t2,t4,t6);
+ s6(t2,t4,t6);
+ s8(t2,t4,t6);
+ s10(t2,t4,t6);
+ }
+ }
+ for(t4 = max(bm-over1,0); t4 <= bm-1; t4++) {
+ s3(t2,t4,0);
+ s4(t2,t4,0);
+ for(t6 = 1; t6 <= ambn-1; t6++) {
+ s4(t2,t4,t6);
+ }
+ }
+}
+
+
diff --git a/test-chill/test-cases/cuda-chill-lua.tclist b/test-chill/test-cases/cuda-chill-lua.tclist
new file mode 100644
index 0000000..d2e91dc
--- /dev/null
+++ b/test-chill/test-cases/cuda-chill-lua.tclist
@@ -0,0 +1,13 @@
+
+chill-testcase -u test-cases/examples/cuda-chill/cp.lua test-cases/examples/cuda-chill/cp.c --no-compile-gensrc
+chill-testcase -u test-cases/examples/cuda-chill/mm.lua test-cases/examples/cuda-chill/mm.c --no-compile-gensrc
+chill-testcase -u test-cases/examples/cuda-chill/mpeg4.lua test-cases/examples/cuda-chill/mpeg4.c --no-compile-gensrc
+chill-testcase -u test-cases/examples/cuda-chill/mriq.lua test-cases/examples/cuda-chill/mriq.c --no-compile-gensrc
+chill-testcase -u test-cases/examples/cuda-chill/mriq-fh.lua test-cases/examples/cuda-chill/mriq-fh.c --no-compile-gensrc
+chill-testcase -u test-cases/examples/cuda-chill/mv.lua test-cases/examples/cuda-chill/mv.c --no-compile-gensrc
+chill-testcase -u test-cases/examples/cuda-chill/mv-shadow.lua test-cases/examples/cuda-chill/mv-shadow.c --no-compile-gensrc
+chill-testcase -u test-cases/examples/cuda-chill/mv_try.lua test-cases/examples/cuda-chill/mv_try.c --no-compile-gensrc
+chill-testcase -u test-cases/examples/cuda-chill/nbody.lua test-cases/examples/cuda-chill/nbody.c --no-compile-gensrc
+chill-testcase -u test-cases/examples/cuda-chill/tmv.lua test-cases/examples/cuda-chill/tmv.c --no-compile-gensrc
+chill-testcase -u test-cases/examples/cuda-chill/tmv-shadow.lua test-cases/examples/cuda-chill/tmv-shadow.c --no-compile-gensrc
+
diff --git a/test-chill/test-cases/cuda-chill-python.tclist b/test-chill/test-cases/cuda-chill-python.tclist
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/test-chill/test-cases/cuda-chill-python.tclist
@@ -0,0 +1 @@
+
diff --git a/test-chill/test-cases/examples/chill/gemm.c b/test-chill/test-cases/examples/chill/gemm.c
new file mode 100644
index 0000000..2c90ea5
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/gemm.c
@@ -0,0 +1,25 @@
+
+#ifndef N
+#define N 512
+#endif
+
+/*
+<test name=gemm define="{'N':512}">
+procedure int gemm(
+ in float[N][N] a = matrix([,], lambda i,j: random(2,-2)),
+ in float[N][N] b = matrix([,], lambda i,j: random(2,-2)),
+ out float[N][N] c = matrix([,], lambda i,j: 0))
+</test>
+*/
+int gemm(float a[N][N], float b[N][N], float c[N][N]) {
+ int i, j, k;
+ int n = N;
+ for (j = 0; j < n; j++)
+ for (k = 0; k < n; k++)
+ for (i = 0; i < n; i++) {
+ c[i][j] = c[i][j] + a[i][k] * b[k][j];
+ }
+
+ return 0;
+}
+
diff --git a/test-chill/test-cases/examples/chill/gemm.script b/test-chill/test-cases/examples/chill/gemm.script
new file mode 100644
index 0000000..393f236
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/gemm.script
@@ -0,0 +1,31 @@
+#matrix multiply large array size for intel machine
+source: gemm.c
+procedure: gemm
+format: rose
+loop: 0
+
+TI = 128
+TJ = 8
+TK = 512
+UI = 2
+UJ = 2
+
+permute([3,1,2])
+tile(0,2,TJ)
+#print space
+tile(0,2,TI)
+#print space
+tile(0,5,TK)
+#print space
+
+datacopy(0,3,a,false,1)
+#print space
+
+datacopy(0,4,b)
+print
+unroll(0,4,UI)#print space
+print
+unroll(0,5,UJ)
+#print space
+print
+
diff --git a/test-chill/test-cases/examples/chill/gemv.c b/test-chill/test-cases/examples/chill/gemv.c
new file mode 100644
index 0000000..39b083c
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/gemv.c
@@ -0,0 +1,21 @@
+#ifndef N
+#define N 512
+#endif
+
+/*
+<test name=gemv define="{'N':512}">
+procedure int gemv(
+ out float[N] a = matrix([], lambda i: random(2,-2)),
+ in float[N] b = matrix([], lambda i: random(2,-2)),
+ in float[N][N] c = matrix([,], lambda i,j: random(2,-2)))
+</test>
+*/
+int gemv(float a[N], float b[N], float c[N][N]) {
+ int i, j;
+
+ for (i = 1; i < N; i++)
+ for (j = 1; j < N; j++)
+ a[i] = a[i] + c[i][j] * b[j];
+
+ return 0;
+}
diff --git a/test-chill/test-cases/examples/chill/gemv.script b/test-chill/test-cases/examples/chill/gemv.script
new file mode 100644
index 0000000..73b3b58
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/gemv.script
@@ -0,0 +1,9 @@
+source: gemv.c # matrix-vector multiply
+procedure: gemv
+format : rose
+loop: 0
+
+
+
+original()
+print
diff --git a/test-chill/test-cases/examples/chill/jacobi1.c b/test-chill/test-cases/examples/chill/jacobi1.c
new file mode 100644
index 0000000..e7ff8f8
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/jacobi1.c
@@ -0,0 +1,19 @@
+
+#ifndef N
+#define N 512
+#endif
+
+/*
+<test name=jacobi define="{'N':512}">
+procedure int jacobi(
+ in out float[N][N] a = matrix [i,j] random(2,-2))
+</test>
+*/
+int jacobi(float a[N][N]) {
+ int t, i;
+ for (t = 2; t <= 100; t++)
+ for (i = 2; i <= N - 1; i++)
+ a[t][i] = a[t - 1][i - 1] + a[t - 1][i] + a[t - 1][i + 1];
+
+ return 0;
+}
diff --git a/test-chill/test-cases/examples/chill/jacobi1.script b/test-chill/test-cases/examples/chill/jacobi1.script
new file mode 100644
index 0000000..604f763
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/jacobi1.script
@@ -0,0 +1,18 @@
+#
+# tiling perfect jacobi loop nest with time step, use
+# unimodular transformation first (only applicable to the
+# perfect loop nest) to make tiling legal.
+#
+
+source: jacobi1.c
+procedure: jacobi
+format : rose
+loop: 0
+
+print dep
+
+nonsingular([[1,0],[1,1]]) # unimodular matrix, determinant is one
+tile(0,2,64)
+
+print dep
+print
diff --git a/test-chill/test-cases/examples/chill/jacobi2.c b/test-chill/test-cases/examples/chill/jacobi2.c
new file mode 100644
index 0000000..b8d8d7b
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/jacobi2.c
@@ -0,0 +1,15 @@
+#define N 512
+
+int main() {
+ double a[N];
+ double b[N];
+ int t, i;
+ for (t = 1; t <= 100; t++) {
+ for (i = 2; i <= N - 1; i++)
+ b[i] = (double) 0.25 * (a[i - 1] + a[i + 1]) + (double) 0.5 * a[i];
+
+ for (i = 2; i <= N - 1; i++)
+ a[i] = b[i];
+ }
+ return 0;
+}
diff --git a/test-chill/test-cases/examples/chill/jacobi2.script b/test-chill/test-cases/examples/chill/jacobi2.script
new file mode 100644
index 0000000..afe14c6
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/jacobi2.script
@@ -0,0 +1,21 @@
+#
+# tiling imperfect jacobi loop nest, more details in the paper
+# "Automatic Tiling of Iterative Stencil Loops" by Zhiyuan Li and
+# Yonghong Song, TOPLAS, 2004.
+#
+
+source: jacobi2.c
+procedure: main
+format: rose
+loop: 0
+
+print dep
+
+original()
+shift([1], 2, 1)
+fuse([0,1], 2) # optional
+skew([0,1], 2, [2,1])
+tile(0, 2, 32, 1)
+
+print dep
+print
diff --git a/test-chill/test-cases/examples/chill/qr.c b/test-chill/test-cases/examples/chill/qr.c
new file mode 100644
index 0000000..8d18b72
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/qr.c
@@ -0,0 +1,44 @@
+#include <math.h>
+
+int main() {
+
+ int M, N;
+ float** A;
+ float *s;
+ float *Rdiag;
+ float *nrm;
+ int i, j, k;
+ float t;
+ for (k = 0; k < N; k++) {
+ nrm[k] = 0;
+
+ for (i = k; i < M; i++)
+ nrm[k] = sqrt(nrm[k] * nrm[k] + A[i][k] * A[i][k]);
+ //t = A[k][k];
+
+ //if (t < 0)
+ // nrm[k] = -nrm[k];
+ for (i = k; i < M; i++)
+ A[i][k] = A[i][k] / nrm[k];
+
+ A[k][k] = A[k][k] + 1;
+
+ for (j = k + 1; j < N; j++) {
+ s[j] = 0; //S6
+
+ for (i = k; i < M; i++)
+ s[j] = s[j] + A[i][k] * A[i][j]; //S7
+
+ s[j] = -s[j] / A[k][k]; //S8
+
+ for (i = k; i < M; i++)
+ A[i][j] = A[i][j] + s[j] * A[i][k]; //S9
+
+ }
+
+ Rdiag[k] = -nrm[k];
+
+ }
+
+ return 0;
+}
diff --git a/test-chill/test-cases/examples/chill/qr.script b/test-chill/test-cases/examples/chill/qr.script
new file mode 100644
index 0000000..6b4cd46
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/qr.script
@@ -0,0 +1,13 @@
+#
+# Imperfectly nested loop nest from qr.c (statements S6-S9 sit in the
+# inner j loop). No further transformation is requested here: the
+# script only calls original() and prints the resulting code.
+#
+
+source: qr.c
+procedure: main
+format: rose
+loop: 0
+original()
+print
+
diff --git a/test-chill/test-cases/examples/chill/scalar_test.c b/test-chill/test-cases/examples/chill/scalar_test.c
new file mode 100644
index 0000000..733c882
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/scalar_test.c
@@ -0,0 +1,16 @@
+int a[10][10];
+int main() {
+
+ int temp;
+ int i, j;
+
+ for (i = 0; i < 10; i++) {
+ for (j = 0; j < 10; j++) {
+ a[i + 1][j - 1] = a[i][j];
+ }
+
+ }
+
+ return 0;
+
+}
diff --git a/test-chill/test-cases/examples/chill/scalar_test.script b/test-chill/test-cases/examples/chill/scalar_test.script
new file mode 100644
index 0000000..f5b0aa8
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/scalar_test.script
@@ -0,0 +1,10 @@
+#Simple Scalar dependence check
+source: scalar_test.c
+procedure: main
+format : rose
+loop: 0
+
+original()
+permute([2,1])
+print dep
+print space
diff --git a/test-chill/test-cases/examples/chill/swim.c b/test-chill/test-cases/examples/chill/swim.c
new file mode 100644
index 0000000..a21ef24
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/swim.c
@@ -0,0 +1,159 @@
+#define M 100
+#define N 100
+#define N3 10
+
+int main() {
+
+ int DX;
+ int DY;
+ int FSDX;
+ int FSDY;
+ int TDT;
+ int TDTS8;
+ int TDTSDX;
+ int TDTSDY;
+ int t, i, j;
+ double CU[M + 1][N + 1];
+ double CV[M + 1][N + 1];
+ double Z[M + 1][N + 1];
+ double H[M + 1][N + 1];
+ double P[M + 1][N + 1];
+ double U[M + 1][N + 1];
+ double V[M + 1][N + 1];
+ double UNEW[M + 1][N + 1];
+ double UOLD[M + 1][N + 1];
+ double PNEW[M + 1][N + 1];
+ double POLD[M + 1][N + 1];
+ double VNEW[M + 1][N + 1];
+ double VOLD[M + 1][N + 1];
+ double ALPHA;
+
+ for (t = 0; t < N3; t++) {
+
+ FSDX = 4 / DX;
+ FSDY = 4 / DY;
+
+ for (i = 0; i < M; i++) {
+ for (j = 0; j < N; j++) {
+ CU[i + 1][j] = (double) 0.5 * (P[i + 1][j] + P[i][j])
+ * U[i + 1][j];
+ CV[i][j + 1] = (double) 0.5 * (P[i][j + 1] + P[i][j])
+ * V[i][j + 1];
+ Z[i + 1][j + 1] =
+ (FSDX * (V[i + 1][j + 1] - V[i][j + 1])
+ - FSDY * (U[i + 1][j + 1] - U[i + 1][j]))
+ / (P[i][j] + P[i + 1][j] + P[i + 1][j + 1]
+ + P[i][j + 1]);
+ H[i][j] = P[i][j]
+ + (double) 0.25
+ * (U[i + 1][j] * U[i + 1][j] + U[i][j] * U[i][j]
+ + V[i][j + 1] * V[i][j + 1]
+ + V[i][j] * V[i][j]);
+ }
+ }
+
+ for (j = 0; j < N; j++) {
+ // CU[0][j] = CU[M+1][j];
+ CU[0][j] = CU[M][j];
+ CV[M][j + 1] = CV[0][j + 1];
+ Z[0][j + 1] = Z[M][j + 1];
+ H[M][j] = H[0][j];
+ }
+
+ for (i = 0; i < M; i++) {
+ CU[i + 1][N] = CU[i + 1][0];
+ CV[i][0] = CV[i][N];
+ Z[i + 1][0] = Z[i + 1][N];
+ H[i][N] = H[i][0];
+ }
+
+ CU[0][N] = CU[M][0];
+ CV[M][0] = CV[0][N];
+ Z[0][0] = Z[M][N];
+ H[M][N] = H[0][0];
+
+ TDTS8 = TDT / 8;
+ TDTSDX = TDT / DX;
+ TDTSDY = TDT / DY;
+
+ for (i = 0; i < M; i++) {
+ for (j = 0; j < N; j++) {
+ UNEW[i + 1][j] = UOLD[i + 1][j]
+ + TDTS8 * (Z[i + 1][j + 1] + Z[i + 1][j])
+ * (CV[i + 1][j + 1] + CV[i][j + 1] + CV[i][j]
+ + CV[i + 1][j])
+ - TDTSDX * (H[i + 1][j] - H[i][j]);
+ VNEW[i][j + 1] = VOLD[i][j + 1]
+ - TDTS8 * (Z[i + 1][j + 1] + Z[i][j + 1])
+ * (CU[i + 1][j + 1] + CU[i][j + 1] + CU[i][j]
+ + CU[i + 1][j])
+ - TDTSDY * (H[i][j + 1] - H[i][j]);
+ PNEW[i][j] = POLD[i][j] - TDTSDX * (CU[i + 1][j] - CU[i][j])
+ - TDTSDY * (CV[i][j + 1] - CV[i][j]);
+ }
+ }
+ for (j = 0; j < N; j++) {
+ UNEW[0][j] = UNEW[M][j];
+ VNEW[M][j + 1] = VNEW[0][j + 1];
+ PNEW[M][j] = PNEW[0][j];
+ }
+
+ for (i = 0; i < M; i++) {
+ UNEW[i + 1][N] = UNEW[i + 1][0];
+ VNEW[i][0] = VNEW[i][N];
+ PNEW[i][N] = PNEW[i][0];
+ }
+
+ UNEW[0][N] = UNEW[M][0];
+ VNEW[M][0] = VNEW[0][N];
+ PNEW[M][N] = PNEW[0][0];
+ // time = time + DT;
+
+ for (i = 0; i < M; i++) {
+ for (j = 0; j < N; j++) {
+ UOLD[i][j] = U[i][j]
+ + ALPHA
+ * (UNEW[i][j] - (double) 2 * U[i][j]
+ + UOLD[i][j]);
+ VOLD[i][j] = V[i][j]
+ + ALPHA
+ * (VNEW[i][j] - (double) 2 * V[i][j]
+ + VOLD[i][j]);
+ POLD[i][j] = P[i][j]
+ + ALPHA
+ * (PNEW[i][j] - (double) 2 * P[i][j]
+ + POLD[i][j]);
+ U[i][j] = UNEW[i][j];
+ V[i][j] = VNEW[i][j];
+ P[i][j] = PNEW[i][j];
+ }
+ }
+
+ for (j = 0; j < N; j++) {
+ UOLD[M][j] = UOLD[0][j];
+ VOLD[M][j] = VOLD[0][j];
+ POLD[M][j] = POLD[0][j];
+ U[M][j] = U[0][j];
+ V[M][j] = V[0][j];
+ P[M][j] = P[0][j];
+ }
+
+ for (i = 0; i < M; i++) {
+ UOLD[i][N] = UOLD[i][0];
+ VOLD[i][N] = VOLD[i][0];
+ POLD[i][N] = POLD[i][0];
+ U[i][N] = U[i][0];
+ V[i][N] = V[i][0];
+ P[i][N] = P[i][0];
+ }
+
+ UOLD[M][N] = UOLD[0][0];
+ VOLD[M][N] = VOLD[0][0];
+ POLD[M][N] = POLD[0][0];
+ U[M][N] = U[0][0];
+ V[M][N] = V[0][0];
+ P[M][N] = P[0][0];
+
+ }
+ return 0;
+}
diff --git a/test-chill/test-cases/examples/chill/swim.script b/test-chill/test-cases/examples/chill/swim.script
new file mode 100644
index 0000000..79de9d9
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/swim.script
@@ -0,0 +1,13 @@
+#
+# Loop nests from swim.c (a time-step loop t enclosing several i/j
+# array-update nests). No further transformation is requested here:
+# the script only calls original() and prints the resulting code.
+#
+
+source: swim.c
+procedure: main
+format: rose
+loop: 0
+original()
+#print space
+print
diff --git a/test-chill/test-cases/examples/chill/test_align.c b/test-chill/test-cases/examples/chill/test_align.c
new file mode 100644
index 0000000..d1365ca
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_align.c
@@ -0,0 +1,20 @@
+int main() {
+
+ int m, n;
+ int a[10], b[10];
+ int i, j;
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < n; j++) {
+ a[i] = 1;
+ }
+
+ for (j = 0; j < n; j++) {
+ b[i] -= 1;
+ }
+
+ }
+
+ return 0;
+
+}
+
diff --git a/test-chill/test-cases/examples/chill/test_align.script b/test-chill/test-cases/examples/chill/test_align.script
new file mode 100644
index 0000000..c990e22
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_align.script
@@ -0,0 +1,12 @@
+# two sibling j loops inside one i loop (test_align.c); only original() is applied before print
+source: test_align.c
+procedure: main
+format: rose
+loop: 0
+
+original()
+
+
+
+print
+
diff --git a/test-chill/test-cases/examples/chill/test_fusion.c b/test-chill/test-cases/examples/chill/test_fusion.c
new file mode 100644
index 0000000..bd2c4f2
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_fusion.c
@@ -0,0 +1,13 @@
+int main() {
+
+ int a[10][10];
+ int i, j;
+ for (i = 0; i < 10; i++) {
+ for (j = 0; j < 10; j++)
+ a[i][j] = a[i][j] + 5;
+ for (j = 0; j < 10; j++)
+ a[i][j + 1] = a[i][j + 1] + 5;
+
+ }
+
+}
diff --git a/test-chill/test-cases/examples/chill/test_fusion.script b/test-chill/test-cases/examples/chill/test_fusion.script
new file mode 100644
index 0000000..41f6cc0
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_fusion.script
@@ -0,0 +1,7 @@
+source: test_fusion.c
+procedure: main
+loop: 0
+original()
+fuse([0,1],2)
+print
+
diff --git a/test-chill/test-cases/examples/chill/test_lex_order.c b/test-chill/test-cases/examples/chill/test_lex_order.c
new file mode 100644
index 0000000..1a3b26d
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_lex_order.c
@@ -0,0 +1,31 @@
+int main() {
+
+ int m, n;
+ int a[10];
+ int b[10];
+ int c[10];
+ int i, j;
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < n; j++) {
+ b[j] = a[j];
+ }
+
+
+
+ for (j = 0; j < n; j++) {
+ a[j+1] = 6;
+ }
+
+ for (j = 0; j < n; j++) {
+ c[j] = a[j];
+ }
+
+
+
+
+ }
+
+ return 0;
+
+}
+
diff --git a/test-chill/test-cases/examples/chill/test_lex_order.script b/test-chill/test-cases/examples/chill/test_lex_order.script
new file mode 100644
index 0000000..2629e50
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_lex_order.script
@@ -0,0 +1,12 @@
+# three sibling j loops inside one i loop (test_lex_order.c); only original() is applied before print
+source: test_lex_order.c
+procedure: main
+format: rose
+loop: 0
+
+original()
+
+
+
+print
+
diff --git a/test-chill/test-cases/examples/chill/test_split.c b/test-chill/test-cases/examples/chill/test_split.c
new file mode 100644
index 0000000..6ca62cc
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_split.c
@@ -0,0 +1,14 @@
+int main() {
+
+ int a[10][10][10][10];
+ int i, j, k, l;
+
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 10; j++)
+ for (k = 0; k < 10; k++)
+ for (l = 0; l < 10; l++)
+ a[i][j][k + 1][l] = a[i][j][k][l];
+ // a[i+1][j-1] = a[i][j];
+
+ return 0;
+}
diff --git a/test-chill/test-cases/examples/chill/test_split.script b/test-chill/test-cases/examples/chill/test_split.script
new file mode 100644
index 0000000..e1ebba9
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_split.script
@@ -0,0 +1,9 @@
+source: test_split.c
+procedure: main
+format: rose
+loop: 0
+original()
+N=10
+split(0,1, L3-L2-L4 <= 5)
+print
+
diff --git a/test-chill/test-cases/examples/chill/test_split2.c b/test-chill/test-cases/examples/chill/test_split2.c
new file mode 100644
index 0000000..1ab8e43
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_split2.c
@@ -0,0 +1,14 @@
+int main() {
+
+ int a[10][10][10][10];
+ int i, j, k, l;
+
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 10; j++)
+ for (k = 0; k < 10; k++)
+ for (l = 0; l < 10; l++)
+ a[i][j][k + 1][l - 1] = a[i][j][k][l];
+ // a[i+1][j-1] = a[i][j];
+
+ return 0;
+}
diff --git a/test-chill/test-cases/examples/chill/test_split2.script b/test-chill/test-cases/examples/chill/test_split2.script
new file mode 100644
index 0000000..bcaa2a0
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_split2.script
@@ -0,0 +1,9 @@
+source: test_split2.c
+procedure: main
+format: rose
+loop: 0
+original()
+N=10
+split(0,1, L4 <= 5)
+print
+
diff --git a/test-chill/test-cases/examples/chill/test_tile.c b/test-chill/test-cases/examples/chill/test_tile.c
new file mode 100644
index 0000000..aeaaefc
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_tile.c
@@ -0,0 +1,16 @@
+void func(int n) {
+
+ int i;
+ int a[10];
+
+ for (i = 0; i < n; i++)
+ a[i] = 2;
+
+}
+
+int main() {
+
+ func(10);
+
+ return 0;
+}
diff --git a/test-chill/test-cases/examples/chill/test_tile.script b/test-chill/test-cases/examples/chill/test_tile.script
new file mode 100644
index 0000000..d437145
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/test_tile.script
@@ -0,0 +1,14 @@
+# apply tile(0,1,4) to the single loop of func() in test_tile.c
+source: test_tile.c
+procedure: func
+format : rose
+loop: 0
+
+original()
+#permute([3,2,1])
+tile(0,1,4)
+
+
+
+print
+
diff --git a/test-chill/test-cases/examples/chill/tile_violation.c b/test-chill/test-cases/examples/chill/tile_violation.c
new file mode 100644
index 0000000..d719e52
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/tile_violation.c
@@ -0,0 +1,12 @@
+int main() {
+
+ int i, j, k;
+ int a[10][10][10];
+
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 10; j++)
+ for (k = 0; k < 10; k++)
+ a[i][j + 1][k - 1] = a[i][j][k];
+
+ return 0;
+}
diff --git a/test-chill/test-cases/examples/chill/tile_violation.script b/test-chill/test-cases/examples/chill/tile_violation.script
new file mode 100644
index 0000000..57d1423
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/tile_violation.script
@@ -0,0 +1,14 @@
+# apply tile(0,3,2,1) to the 3-deep loop nest in tile_violation.c (a[i][j+1][k-1] = a[i][j][k])
+source: tile_violation.c
+procedure: main
+format :rose
+loop: 0
+
+original()
+#permute([3,2,1])
+tile(0,3,2,1)
+
+
+
+print
+
diff --git a/test-chill/test-cases/examples/chill/unroll.c b/test-chill/test-cases/examples/chill/unroll.c
new file mode 100644
index 0000000..68f4633
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/unroll.c
@@ -0,0 +1,31 @@
+#define N 14
+/*
+ * Four consecutive loop nests over caller-supplied float arrays:
+ *   loop 0: x[i] = 1.0 for i = 1..14, unit stride
+ *   loop 1: y[i] = 1.0 for i = 1..14, stride 3
+ *   loop 2: z[i] = 1.0 for i = N+1..N+20, stride 3
+ *   loop 3: for i = 0..N, f3[i] accumulates f1[j] * w[j - i] over
+ *           j = i..i+N and is then multiplied by dt
+ *
+ * Highest indices touched: x[14], y[13], z[N+19], f3[N], f1[2*N], w[N];
+ * the caller must provide arrays at least that large.
+ * The parameter n is not referenced in the body.
+ *
+ * NOTE(review): dt is declared but never assigned before it is read in
+ * loop 3 -- confirm whether it should be initialised or passed in.
+ */
+void foo(int n, float* x, float* y, float* z, float* f3, float* f1, float* w) {
+    int dt;
+
+    int i, j;
+
+    for (i = 1; i <= 14; i++)
+        x[i] = 1.0;
+
+    for (i = 1; i <= 14; i += 3)
+        y[i] = 1.0;
+
+    for (i = N + 1; i <= N + 20; i += 3)
+        z[i] = 1.0;
+
+    for (i = 0; i <= N; i++) {
+        for (j = i; j <= i + N; j++)
+            f3[i] = f3[i] + f1[j] * w[j - i];
+        f3[i] = f3[i] * dt;
+    }
+
+    /* foo() returns void, so it must not return a value. */
+    return;
+}
+
+/*
+ * Driver for foo(). Each array is sized to cover the highest index that
+ * foo() touches (x[14], y[13], z[N+19], f3[N], f1[2*N], w[N]) and is
+ * zero-initialised because foo() reads f3, f1 and w before writing them.
+ */
+int main() {
+    float x[N + 1] = {0};
+    float y[N + 1] = {0};
+    float z[N + 21] = {0};
+    float f3[N + 1] = {0};
+    float f1[2 * N + 1] = {0};
+    float w[N + 1] = {0};
+
+    foo(N, x, y, z, f3, f1, w);
+    return 0;
+}
+
diff --git a/test-chill/test-cases/examples/chill/unroll.script b/test-chill/test-cases/examples/chill/unroll.script
new file mode 100644
index 0000000..e64acb6
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/unroll.script
@@ -0,0 +1,35 @@
+#
+# Test unroll-and-jam. The last loop adapted from the simple
+# convolution example from p463 of "Optimizing Compilers for
+# Modern Architectures", by Randy Allen and Ken Kennedy.
+#
+
+source: unroll.c
+procedure: foo
+format: rose
+# fully unroll a loop with known iteration count
+loop: 0
+original()
+unroll(0,1,3)
+print
+print space
+
+
+# a strided loop
+loop: 1
+original()
+unroll(0,1,2)
+print
+print space
+
+# lower and upper bounds are not constant
+loop: 2
+original()
+unroll(0,1,20)
+print
+
+# parallelogram iteration space
+loop: 3
+original()
+unroll(0,1,2)
+print
diff --git a/test-chill/test-cases/examples/chill/unroll_violation.c b/test-chill/test-cases/examples/chill/unroll_violation.c
new file mode 100644
index 0000000..d719e52
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/unroll_violation.c
@@ -0,0 +1,12 @@
+int main() {
+
+ int i, j, k;
+ int a[10][10][10];
+
+ for (i = 0; i < 10; i++)
+ for (j = 0; j < 10; j++)
+ for (k = 0; k < 10; k++)
+ a[i][j + 1][k - 1] = a[i][j][k];
+
+ return 0;
+}
diff --git a/test-chill/test-cases/examples/chill/unroll_violation.script b/test-chill/test-cases/examples/chill/unroll_violation.script
new file mode 100644
index 0000000..019473d
--- /dev/null
+++ b/test-chill/test-cases/examples/chill/unroll_violation.script
@@ -0,0 +1,14 @@
+# apply unroll(0,2,2) to the 3-deep loop nest in unroll_violation.c (a[i][j+1][k-1] = a[i][j][k])
+source: unroll_violation.c
+procedure: main
+format: rose
+loop: 0
+
+original()
+#permute([3,2,1])
+unroll(0,2,2)
+
+
+
+print
+
diff --git a/test-chill/test-cases/examples/cuda-chill/cp.c b/test-chill/test-cases/examples/cuda-chill/cp.c
new file mode 100644
index 0000000..837d7a6
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/cp.c
@@ -0,0 +1,29 @@
+#define N 1
+
+#define VOLSIZEY 512
+#define VOLSIZEX 512
+#define VOLSIZEZ 1
+#define ATOMCOUNT 4000
+#define GRIDSPACING 0.1
+#define zDim 0
+
+extern float sqrtf(float);
+
+void cenergy_cpu(float atoms[ATOMCOUNT*4],float *energy,float z)
+{
+int i,j,n;float dx,dy,dz;
+
+ for (j=0; j<VOLSIZEY; j++) {
+ for (i=0; i<VOLSIZEX; i++) {
+ for (n=0;n<ATOMCOUNT;n+=4) {
+ dx = (GRIDSPACING * i) - atoms[n];
+ dy = (GRIDSPACING * j) - atoms[n+1];
+ dz = z - atoms[n+2];
+ energy[(j*VOLSIZEX + i)+VOLSIZEX*VOLSIZEY*zDim] += atoms[n+3]/sqrtf( (dx*dx) + (dy*dy)+ (dz*dz) ) ;
+ }
+
+
+ }
+ }
+}
+
diff --git a/test-chill/test-cases/examples/cuda-chill/cp.lua b/test-chill/test-cases/examples/cuda-chill/cp.lua
new file mode 100644
index 0000000..1ef2264
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/cp.lua
@@ -0,0 +1,46 @@
+--CUBLAS 2 MM Multiply
+
+--This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is procedure # and third is loop #
+init("cp.c", "cenergy_cpu", 0)
+
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+ --copy_to_shared methods
+V=512
+N=4000
+N=1
+
+Tj=32
+Ti=16
+Tii=16
+Tjj=16
+
+--normalize_index("j")
+--normalize_index("i")
+print_code()
+normalize_index("n")
+-- TILE COMMANDS ZEROOOOOOOOOOO:3
+--permute(0,{"i","j","n"})
+--tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","n"})--CU=-1
+tile_by_index({"j","i"},{Tj,Ti},{l1_control="jj",l2_control="ii"},{"jj","ii","j","i","n"})--CU=-1
+--tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1
+
+--tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3
+--tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","i","iii","j","jjj","n"})--CU=3
+--tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"})
+--tile_by_index({"i"}, {Tii}, {l1_control="iii",l1_tile="i"}, {"ii", "jj", "iii","i","j","n"})
+print_code()
+cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"jj","ii"}, thread={"j","i"}})--CU=3
+--cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"ii","jj"}, thread={"i","j"}})--CU=3
+print_code()
+copy_to_shared("tx","atoms",-16)
+copy_to_registers("tx","energy")
+--copy_to_texture("atoms")
+--unroll_to_depth(1)
+--unroll(0,9,0)
+--unroll(0,5,0)
+
+--unroll(0,8,256)
+print_code()
diff --git a/test-chill/test-cases/examples/cuda-chill/cudaize.lua b/test-chill/test-cases/examples/cuda-chill/cudaize.lua
new file mode 100644
index 0000000..7359cca
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/cudaize.lua
@@ -0,0 +1,1004 @@
+
+-- THIS IS CUDAIZE.LUA
+
+function table.contains_key(table, key)
+  -- Return true when `key` is present as a key of `table`, false otherwise.
+  -- A key is present exactly when its raw value is non-nil, so one rawget
+  -- replaces the original linear scan over pairs(). rawget bypasses any
+  -- __index metamethod, so only keys really stored in the table count.
+  -- (`table` here is the parameter, which shadows the standard library.)
+  return rawget(table, key) ~= nil
+end
+
+function valid_indices(stmt, indices)
+  -- True when every name in `indices` is a current index of `stmt`.
+  -- Fix: the original walked the KEYS of `indices` (1,2,..) and tested them
+  -- as keys of `cur`, so it only compared list lengths, never the names.
+  cur = cur_indices(stmt) -- shared global, refreshed exactly as before
+  local present = {}
+  for _, name in ipairs(cur) do present[name] = true end
+  for _, wanted in ipairs(indices) do
+    if not present[wanted] then return false end
+  end
+  return true
+end
+
+function next_clean_level(cur_idxs,level)
+  -- Scan the levels after `level` and return the position of the first
+  -- entry whose name is non-empty; -1 is the sentinel for "none left".
+  local pos = level + 1
+  local last = #cur_idxs
+  while pos <= last do
+    local name = cur_idxs[pos]
+    if #name ~= 0 then
+      return pos
+    end
+    pos = pos + 1
+  end
+  -- every remaining level was a dummy (empty-string) index
+  return -1
+end
+
+function build_order(final_order, tile_idx_names, ctrl_idx_names, tile_idx_map, cur_level)
+  -- Build the loop order to use at tiling step cur_level from final_order.
+  --   * A name equal to ctrl_idx_names[j] for some j >= cur_level+2 is
+  --     left out (control loops below our current level should not be in
+  --     the current order).
+  --   * A name that is a key of tile_idx_map is replaced by its mapped
+  --     value when that value equals tile_idx_names[j] for some
+  --     j >= cur_level+2; otherwise the name is kept unchanged.
+  -- Returns a new list; none of the arguments is modified.
+  --
+  -- Fix: every temporary is local now. They used to be globals (order,
+  -- skip, cur, approved_sub, sub_string), and the one named cur replaced
+  -- the index list other functions in this file keep in the global cur
+  -- with a plain string.
+  local order = {}
+  for _, name in ipairs(final_order) do
+    local skip = false
+    for j = cur_level + 2, #ctrl_idx_names do
+      if ctrl_idx_names[j] == name then
+        skip = true
+      end
+    end
+
+    --possibly substitute tile indices if necessary
+    local chosen = name
+    if table.contains_key(tile_idx_map, name) then
+      local substitute = tile_idx_map[name]
+      for j = cur_level + 2, #tile_idx_names do
+        if tile_idx_names[j] == substitute then
+          chosen = substitute
+        end
+      end
+    end
+
+    if not skip then
+      table.insert(order, chosen)
+    end
+  end
+  return order
+end
+
+function list_to_string(str_list)
+  --Helpful debug output: join the array part of `str_list` with ", ".
+  --Returns "" for an empty list. Entries are read with ipairs, so the
+  --join stops at the first nil entry.
+  --Fix: the accumulator is now local; it used to be the global `l`, the
+  --same global normalize_index() keeps its loop level in.
+  local parts = {}
+  for i, str in ipairs(str_list) do
+    parts[i] = tostring(str)
+  end
+  return table.concat(parts, ", ")
+end
+
+
+function find_cur_level(stmt,idx)
+  --Return the 1-based position of `idx` among the current indices of
+  --`stmt`; raise an error when it is absent. The list is also left in the
+  --global `cur`, which other functions in this file read afterwards.
+  cur = cur_indices(stmt)
+  local pos = 1
+  while cur[pos] ~= nil do
+    if cur[pos] == idx then return pos end
+    pos = pos + 1
+  end
+  error("Unable to find "..idx.." in current list of indices")
+end
+
+
+function chk_cur_level(stmt,idx)
+  --Non-raising lookup: position of `idx` among the current indices of
+  --`stmt`, or -1 when absent. Refreshes the global `cur` as a side effect.
+  cur = cur_indices(stmt)
+  local found = -1
+  for pos, name in ipairs(cur) do
+    if found < 0 and name == idx then found = pos end
+  end
+  return found
+end
+
+
+function find_offset(cur_order, tile, control)
+  -- Locate `tile` and `control` in the list `cur_order` and return their
+  -- relative offset:
+  --    0  control is directly before tile
+  --   <0  control is further before tile (-1 = two positions before)
+  --   >0  control comes after tile (plain distance)
+  -- When a name occurs more than once the last occurrence is used.
+  -- Raises an error when either name is missing.
+  -- Fix: positions are locals now (they leaked as globals idx1/idx2) and
+  -- the error text is built with tostring() so a nil name still produces
+  -- the intended message rather than a concatenation error.
+  local tile_pos, ctrl_pos = -1, -1
+  for pos, name in ipairs(cur_order) do
+    if name == tile then tile_pos = pos end
+    if name == control then ctrl_pos = pos end
+  end
+  if tile_pos < 0 then
+    error("Unable to find tile " .. tostring(tile) .. " in current list of indices")
+  end
+  if ctrl_pos < 0 then
+    error("Unable to find control " .. tostring(control) .. " in current list of indices")
+  end
+  if ctrl_pos < tile_pos then return ctrl_pos - tile_pos + 1 end
+  return ctrl_pos - tile_pos
+end
+
+function tile_by_index(tile_indices, sizes, index_names, final_order, tile_method)
+ -- Tile the loops of statement 0 named in tile_indices, using sizes[i] as the tile size for tile_indices[i].
+ -- index_names maps "l<i>_control" (control-loop name) and "l<i>_tile" (tile-loop name) keys to names for entry i; any other key is an error.
+ -- After each step the loops are permuted to final_order as filtered by build_order(); tile_method is handed to tile() and defaults to counted.
+ --print "STARTING TILE BY INDEX"
+ --io.flush()
+ stmt = 0 --assume stmt 0
+ cur = cur_indices(stmt)
+ --print("Cur indices "..list_to_string(cur))
+ if not valid_indices(stmt,tile_indices) then
+ error('One of the indices in the first parameter were not '..
+ 'found in the current set of indices.')
+ end
+ if not tile_method then tile_method = counted end
+ tile_idx_names = {}
+ for i,s in ipairs(tile_indices) do tile_idx_names[i]=s end --shallow copy
+ --print("tile_index_names: ['"..list_to_string(tile_indices).."']")
+ --print("index_names: ")
+ --for k,v in pairs(index_names) do print(k,v) end
+ --io.flush()
+
+ ctrl_idx_names = {}
+ tile_idx_map = {}
+ for k,v in pairs(index_names) do
+ valid = false
+ if(string.sub(k,1,1) == "l") then
+ if string.sub(k,-8) == "_control" then
+ i = tonumber(string.sub(k,2,-9)) --the digits between "l" and "_control"
+ if i and i >= 1 and i <= (# tile_indices) then
+ ctrl_idx_names[i] = v
+ --print(string.format("Handling control %s for loop level %d",v,i))
+ --print("control "..k.." name "..v.." ")
+ valid = true
+ end
+ elseif string.sub(k,-5) == "_tile" then
+ i = tonumber(string.sub(k,2,-6)) --the digits between "l" and "_tile"
+ if i and i >= 1 and i <= (# tile_indices) then
+ --print(string.format("tile %s -> %s",tile_indices[i], v))
+ tile_idx_names[i] = v
+ tile_idx_map[v] = tile_indices[i]
+ --print(string.format("tile %s -> %s",tile_indices[i], v))
+ valid = true
+ end
+ end
+ end
+ if not valid then error(string.format("%s is not a proper key for specifying "..
+ "tile or control loop indices\n", k)) end
+ end
+
+ --filter out control indices (and do name substitution of unprocessed tile indices) for a given level
+ cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, -1) -- NOTE(review): passes tile_indices, not the renamed tile_idx_names -- confirm intended
+ permute(stmt, cur_order)
+
+ for i,cur_idx in ipairs(tile_indices) do
+ --print(string.format("i %d cur_idx %s calling build order ********", i-1, cur_idx))
+ cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
+ --Find a offset between tile loop and control loop
+ -- 0 = control loop one level above tile loop
+ -- -1 = control loop two levels above tile loop
+ -- > 0 = tile loop above control loop
+ -- In the last case, we do two extra tile commands to get the control
+ -- above the tile and then rely on the final permute to handle the
+ -- rest
+ level = find_cur_level(stmt,cur_idx)
+ offset = find_offset(cur_order, tile_idx_names[i], ctrl_idx_names[i]) -- ctrl_idx_names[i] is nil when no "l<i>_control" key was given
+ --print(string.format("offset %d", offset))
+
+ if (offset <= 0) then
+ --print(string.format("[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %s)",stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method))
+ tile(stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method)
+ else
+ --print(string.format("2tile(%d, %d, %d, %d, %s, %s, %s)", stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method))
+ tile(stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method);--regular level
+ --flip tile and control loop
+ --print(string.format("3tile(%d, %d, %d)",stmt, level+1, level+1))
+ tile(stmt, level+1, level+1);
+ --print(string.format("4tile(%d, %d, %d)",stmt, level+1, level))
+ tile(stmt, level+1, level);
+ --print(string.format("\n[offset>0]tile(%d, %d, %d, %d,%s,%s,%s)",stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method))
+ --print_code()
+ end
+
+ --Do permutation based on cur_order
+ --print "permute based on build order calling build_order()"
+ --print "cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)"
+ cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
+ --print "permute(stmt, cur_order);"
+ permute(stmt, cur_order);
+ --print "\nafter permute(), code is:"
+ --print_code()
+ end
+ --print "ENDING TILE BY INDEX"
+ --print_code()
+end
+
+function normalize_index(index)
+  --Normalize loop `index` of statement 0 via tile(stmt, level, level).
+  --Fix: level is now local (was global `l`, also list_to_string()'s accumulator).
+  stmt = 0 --assume stmt 0
+  local level = find_cur_level(stmt, index)
+  tile(stmt, level, level)
+end
+
+function is_in_indices(stmt, idx)
+  --Return true when `idx` is one of the current index names of `stmt`.
+  --Fix: the scan starts at 1. Lua lists are 1-based, so the old start of
+  --0 read cur[0] (nil) and made a nil `idx` compare equal and return true.
+  cur = cur_indices(stmt) --shared global, refreshed exactly as before
+  for i = 1, #cur do
+    if idx ~= nil and cur[i] == idx then return true end
+  end
+  return false
+end
+
+
+function copy_to_registers(start_loop, array_name)
+ -- Calls datacopy_privatized(stmt, start_loop, array_name, hold_constant) for statement 0 after preparing the loop nest.
+ -- Preparation ([Malik] section): tile(stmt, level_ty, level_tx+1) is issued when "ty" is present; then, only if how_many_levels stays below 2, the loops after ty are re-tiled one at a time.
+ -- hold_constant collects every block/thread index whose level is >= the level of start_loop.
+ -- NOTE: stmt, cur, level_tx, level_ty and the other temporaries are assigned without `local`, so they are globals.
+
+ --print("\n\n****** starting copy to registers")
+ io.flush()
+
+ stmt = 0 --assume stmt 0
+
+ -- [Malik] first we make sure that tx and ty are consecutive loops in the 2D thread setup, otherwise all levels for subsequent operations are messed up. Start logic.
+ cur = cur_indices(stmt)
+ table_Size = table.getn(cur) -- NOTE(review): table.getn is the Lua 5.0/5.1 spelling (gone in 5.2+); #cur is the current form
+
+ --print(string.format("Cur indices %s,",list_to_string(cur)))
+ --print(string.format("The table size is %d", table_Size))
+ --table.foreach(cur, print)
+ --print_code()
+
+ level_tx = -1
+ level_ty = -1
+ if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end
+ if is_in_indices(stmt,"ty") then level_ty = find_cur_level(stmt,"ty") end
+ --print(string.format("level_tx %d level_ty %d", level_tx, level_ty))
+
+ ty_lookup_idx = ""
+ org_level_ty = level_ty
+
+ --if(cur[level_tx+1]~=nil and cur[level_tx+1]~="") then ty_lookup = ty_lookup+1 end
+ if(cur[level_ty+1]~=nil and cur[level_ty+1]~="") then
+ --print(string.format("IF cur[%d] = %s", level_ty+1, cur[level_ty+1]))
+ ty_lookup_idx = cur[level_ty+1]
+ else
+ --if cur[level_ty] ~= nil then print(string.format("ELSE ty_lookup_idx = cur[%d] = %s", level_ty, cur[level_ty])) -- TODO
+ --else print "ELSE (dangerous)" end
+ ty_lookup_idx = cur[level_ty] -- may assign nil !? (level_ty is still -1 when "ty" is absent, so this reads cur[-1])
+ end
+ --if ty_lookup_idx ~= nil then print(string.format("ty_lookup_idx '%s'", ty_lookup_idx)) -- TODO
+ --else print "ty_lookup_idx is NIL"
+ --end
+
+ if level_ty > 0 then
+ --print(string.format("\ntile3(%d,%d,%d)",stmt,level_ty,level_tx+1))
+ tile(stmt,level_ty,level_tx+1)
+ end
+ --print_code()
+
+ --print("\ntylookup is %d",ty_lookup)
+ --exit(0)
+ --
+ cur = cur_indices(stmt)
+ table_Size = table.getn(cur)
+ --print(string.format("Cur indices %s,",list_to_string(cur)))
+ --print("The table size is "..table.getn(cur))
+ --table.foreach(cur, print)
+
+ if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end
+ if ty_lookup_idx then
+ if is_in_indices(stmt,ty_lookup_idx) then level_ty = find_cur_level(stmt,ty_lookup_idx) end
+ end
+
+ ty_lookup = 1
+ idx_flag = -1
+ -- find the level of the next valid index after ty+1
+ --print(string.format("\nlevel_ty %d", level_ty))
+ if level_ty > 0 then
+ --print(string.format("table_Size %d", table_Size))
+ for num= level_ty+ty_lookup,table_Size do
+ --print(string.format("num=%d cur[num] = '%s'",num, cur[num]))
+ if(cur[num] ~= "") then
+ idx_flag = find_cur_level(stmt,cur[num])
+ --print (string.format("idx_flag = %d", idx_flag))
+ break
+ end
+ end
+ end
+
+ --print(string.format("\n(first) I am checking all indexes after ty+1 %s",idx_flag))
+ --print_code()
+ --print ""
+
+ how_many_levels = 1
+ startat = idx_flag + 1
+ if startat == 0 then startat = 1 end -- avoid attempt to examine an illegal array offset
+ --print(string.format("idx_flag = %d I will check levels starting with %d", idx_flag, idx_flag+1))
+
+ for ch_lev = startat,table_Size,1 do -- was for ch_lev = idx_flag+1,table_Size,1 do
+ --print(string.format("ch_lev %d", ch_lev))
+ if(cur[ch_lev] ~= nil and cur[ch_lev] ~= "") then
+ --print(string.format("cur[%d] = '%s'", ch_lev, cur[ch_lev]))
+ how_many_levels = how_many_levels+1
+ end
+ end
+ --print("\nHow Many Levels",how_many_levels)
+
+ -- change this all to reflect the real logic which is to normalize all loops inside the thread loops.
+ if(how_many_levels <2) then
+ while( idx_flag >= 0) do -- idx_flag is reset to -1 inside and only set again when another non-empty index follows ty
+ for num = level_ty+ty_lookup,(table_Size) do
+ --print(string.format("at top of loop, num is %d", num))
+ --print(string.format("num %d", num))
+ --print(string.format("cur[num] = '%s'", cur[num]))
+ if(cur[num] ~= "") then
+ idx=cur[num]
+ --print(string.format("idx '%s'", idx))
+
+ curlev = find_cur_level(stmt,idx)
+ --print(string.format("curlev %d", curlev))
+
+ --print_code()
+ --print(string.format("\n[COPYTOREG]tile(%d,%d,%d)",stmt,find_cur_level(stmt,idx),level_tx))
+ tile(stmt,find_cur_level(stmt,idx),find_cur_level(stmt,idx))
+ curlev = find_cur_level(stmt,idx)
+ --print(string.format("curlev %d", curlev))
+ tile(stmt,find_cur_level(stmt,idx),level_tx)
+ --print(string.format("hehe '%s'",cur[num]))
+
+ cur = cur_indices(stmt)
+ --print("Cur indices INSIDE"..list_to_string(cur))
+ table_Size = table.getn(cur)
+ --print(string.format("Table Size is: %d",table_Size))
+ level_tx = find_cur_level(stmt,"tx")
+ --print(string.format("\n level TX is: %d",level_tx))
+ level_ty = find_cur_level(stmt,ty_lookup_idx)
+ --print(string.format("\n level TY is: %d",level_ty))
+ idx_flag = -1
+ --print "idx_flag = -1"
+
+ -- find the level of the next valid index after ty+1
+
+ -- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?)
+ for num= level_ty+ty_lookup,table_Size do
+ --print(string.format("num mucking num = %d", num))
+ if(cur[num] ~= nil and cur[num] ~= "") then
+ idx_flag = find_cur_level(stmt,cur[num])
+ --print("\n(second) I am checking all indexes after ty+1 %s",cur[num])
+ break
+ end
+ end
+ --print(string.format("num mucked to %d idx_flag = %d", num, idx_flag))
+
+ end
+ --print(string.format("at bottom of loop, num is %d", num))
+ end
+ end
+ end
+ --print "done with levels"
+
+ --print "ARE WE SYNCED HERE?"
+ --print_code()
+ --print("\ntile(%d,%d,%d)",stmt,level_k,level_k)
+ --tile(stmt,level_k,level_k)
+
+ -- [Malik] end logic
+ --print_code()
+ start_level = find_cur_level(stmt, start_loop)
+ --We should hold constant any block or tile loop
+ block_idxs = block_indices()
+ thread_idxs = thread_indices()
+ --print("\nblock indices are")
+ --table.foreach(block_idxs, print)
+ --print("\nthread indices are")
+ --table.foreach(thread_idxs, print)
+ --print(string.format("\nStart Level: %d",start_level))
+
+ hold_constant = {}
+ --print("\n Now in Blocks")
+ for i,idx in ipairs(block_idxs) do
+ --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx)))
+ if find_cur_level(stmt,idx) >= start_level then
+ table.insert(hold_constant, idx)
+ --print(string.format("\nJust inserted block %s in hold_constant",idx))
+ end
+ end
+
+
+ --print("\n Now in Threads")
+ for i,idx in ipairs(thread_idxs) do
+ --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx)))
+ if find_cur_level(stmt,idx) >= start_level then
+ table.insert(hold_constant, idx)
+ --print(string.format("\nJust inserted thread %s in hold_constant",idx))
+ end
+ end
+
+ --print "\nhold constant table is: "
+ --table.foreach(hold_constant, print)
+
+ --print("\nbefore datacopy pvt")
+ old_num_stmts = num_statements()
+ --print_code()
+ --print(string.format("\n[DataCopy]datacopy_privatized(%d, %s, %s, vector having privatized levels)",stmt, start_loop, array_name))
+ --table.foreach(hold_constant, print)
+ datacopy_privatized(stmt, start_loop, array_name, hold_constant)
+
+ --print(hold_constant)
+ new_num_stmts = num_statements()
+ --print("\nthe num of statements:%d\n",new_num_stmt)
+ --print_code()
+ --exit(0)
+ -- [Malik] normalize the copy loops created. (NOTE: the tile() call inside the loop below is commented out, so the loop currently does nothing)
+ cur = cur_indices(old_num_stmts)
+ --print("Cur indices "..list_to_string(cur))
+ for cidx,i in ipairs(cur) do
+ if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
+ --tile(old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i))
+ --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i))
+ end
+ end
+ --print_code()
+ --print("\nthe num of statements OLD+1 :",(old_num_stmts+1))
+
+--[[
+ is this commented out? why yes, yes it is block comment
+ if( (old_num_stmts+1) <= new_num_stmts) then
+ cur = cur_indices(old_num_stmts+1)
+ --print("Cur indices+1 "..list_to_string(cur))
+ for cidx,i in ipairs(cur) do
+ if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
+ tile(old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
+ --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
+ end
+ end
+ end
+--]]
+
+
+ --Unroll to the last thread level
+ --for stmt=old_num_stmts,new_num_stmts-1 do
+ -- level = find_cur_level(stmt,thread_idxs[#thread_idxs])--get last thread level
+ --if level < #cur_indices(stmt) then
+ -- unroll(stmt,level+1,0)
+ --print(string.format("\n[Unroll]unroll(%d, %d, 0)",stmt, level+1))
+ ----print_code()
+ --end
+ --end
+ io.flush()
+ --print("****** ending copy to registers\n\n")
+ --io.flush()
+end
+
+function copy_to_shared(start_loop, array_name, alignment)
+ -- Issue datacopy() for array_name at the level of loop start_loop (statement 0), add a sync, then reshape every statement the copy created.
+ -- For each new statement: when it has a "tmp2" loop, tmp2 is tiled into "tx" and tmp1 into "ty"; otherwise only tmp1 exists and it is mapped to "tx".
+ -- alignment is forwarded to datacopy(). NOTE: temporaries are assigned without `local`; some lookups read the global cur, which find_cur_level() refreshes.
+ --print(string.format("\nstarting copy to shared(%s, %s, %d )",start_loop,array_name,alignment))
+ stmt = 0 --assume stmt 0
+ cur = cur_indices(stmt)
+ --print("Cur indices "..list_to_string(cur))
+
+ start_level = find_cur_level(stmt, start_loop)
+ --print(string.format("start_level %d", start_level))
+
+ old_num_stmts = num_statements()
+ --print(string.format("old_num_statements %d", old_num_stmts))
+
+ --Now, we give it indices for up to two dimensions for copy loop
+ copy_loop_idxs = {"tmp1","tmp2"}
+ --print(string.format("\n[DataCopy]datacopy(%d, %d, %s, {\"tmp1\",\"tmp2\"},false,0,1,%d,true)",stmt, start_level, array_name, alignment))
+ datacopy(stmt, start_level, array_name, copy_loop_idxs, false, 0, 1, alignment,true)
+
+ add_sync(stmt,start_loop)
+ new_num_stmts = num_statements()
+
+ --This is fairly CUBLAS2 specific, not sure how well it generalizes,
+ --but for a 2D copy, what we want to do is "normalize" the first loop
+ --"tmp1" then get its hard upper bound. We then want to tile it to
+ --make the control loop of that tile "ty". We then tile "tmp2" with a
+ --size of 1 and make it "tx".
+ --print(string.format("fairly CUBLAS2 specific, OLD %d NEW %d", old_num_stmts, new_num_stmts ))
+
+ for stmt=old_num_stmts,new_num_stmts-1 do
+ --print(string.format("for stmt = %d", stmt))
+ was_no_error, level = pcall(find_cur_level, stmt, "tmp2") -- pcall turns the "not found" error into was_no_error == false
+
+ if was_no_error then
+ --print_code()
+ --print("\nCopy to shared: [If was no error]\n")
+ find_cur_level(stmt,"tmp2") --result unused; the call still refreshes the global cur
+ tile(stmt, level, level)
+
+ lower,upper = hard_loop_bounds(stmt, level)
+ upper = upper + 1
+ --print(string.format("lower %d upper %d", lower, upper))
+
+ tx,ty = thread_dims()
+ --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx)
+
+ level = find_cur_level(stmt,"tmp1")
+ --print(string.format("level %d", level))
+
+ if tx == upper and ty == 1 then
+ --print(string.format("tx = %d upper = %d ty = %d", tx, upper, ty))
+ --print "Don't need"
+
+ --Don't need an extra tile level, just move this loop up
+ second_level = find_cur_level(stmt,"tmp2")
+ --print(string.format("\n[Tile0]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx"))
+ tile(stmt, second_level, 1, level, "tx", "tx", counted)
+ else
+ --print "DO need?"
+ --print_code()
+ if(ty == 1) then new_ctrl = "tmp3" else new_ctrl = "ty" end --new_ctrl is only read inside the commented-out block below
+
+--[[ Commenting out a block of Gabe's code in this control flow
+ -- level = find_cur_level(stmt,"tmp1")
+ tile(stmt, level, level)
+
+ lower,upper = hard_loop_bounds(stmt, level)
+ upper = upper + 1
+ --print_code()
+ --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx..", level: "..level)
+ if(math.ceil(upper/ty) > 1)then
+ tile(stmt, level, math.ceil(upper/ty), level, "tmp", new_ctrl, counted)
+ --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level, math.ceil(upper/ty),upper,ty, level, "tmp", new_ctrl))
+ else
+ tile(stmt, level, math.ceil(upper/ty), level, "ty", new_ctrl, counted)
+ --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level, math.ceil(upper/ty),upper,ty, level, "tx", new_ctrl))
+ end
+
+ --print_code()
+ -- [Malik] If here we have the loop upper bound > tx, then we should tile once more after the next tile, to carve out the correct tx.
+ lower1,upper1 = hard_loop_bounds(stmt,level)
+ level1 = level
+ stmt1 = stmt
+ -- [Malik] Do the tile after the second level tile with if condition. Just to keep the original order, the tile is being pushed to the end.
+
+ --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1)
+
+ --print_code()
+ --level = find_cur_level(stmt,"tmp")
+ --tile(stmt,level,level)
+ --print_code()
+
+ --[Malik] if you are moving the loop above the level1, you need to update level1 with new position which would be level1+2 or second_level
+ if(level <= level1) then level1 = level1+2 end
+ --print(string.format("\n[Tile2]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx"))
+ --print("\n----------------------------------")
+ --print_code()
+ --print("\n**********************************")
+ --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1)
+ -- [Malik] If the upper bound > tx, we do another tile to carve out the correct tx from a bigger loop. Else just normalize the bounds.
+ if( upper1 > ty) then
+ third_level = find_cur_level(stmt1,"tmp")
+ --print("\n\n\n\t\t\t\tthirdlevel:"..third_level)
+ tile(stmt1, third_level, ty, third_level, "ty", "tmp", counted)
+ --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt1, third_level, ty,third_level, "ty", "tmp"))
+ tile(stmt1,third_level+1,third_level+1)
+ --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level+1))
+ tile(stmt1,third_level+1,third_level)
+ --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level))
+ else
+ tile(stmt1,level1,level1)
+ --print(string.format("\n[Tile3ELSE]tile(%d, %d, %d)",stmt1,level1,level1))
+ end
+
+ --print("\nStarting tmp2\n");--print_code();
+ second_level = find_cur_level(stmt,"tmp2")
+ lower,upper = hard_loop_bounds(stmt,second_level)
+ level = second_level
+ --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..", level:"..level)
+
+ if(math.ceil(upper/tx) > 1)then
+ tile(stmt, second_level,math.ceil(upper/tx), level, "tmp", "tx", counted)
+ --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tmp", "tx"))
+ else
+ tile(stmt, second_level,math.ceil(upper/tx), level, "tx", "tx", counted)
+ --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tx", "tx"))
+ end
+ --print_code()
+ lower2,upper2 = hard_loop_bounds(stmt,level)
+ level2 = level
+ stmt2 = stmt
+ --print("[Malik]-loop cleanup@tmp2: lower2, upper2: "..lower2..", "..upper2..", tx: "..tx..", level:"..level2)
+ -- now for the second level.
+ if( upper2 > tx) then
+ forth_level = find_cur_level(stmt2,"tmp")
+ --print("\n\n\n\t\t\t\tforthlevel:"..forth_level)
+ --print_code()
+ tile(stmt2, forth_level, 1, forth_level, "tx", "tmp", counted)
+ --print(string.format("\n[Tile3B]tile(%d, %d, %d,%d,%s,%s,counted)",stmt2, forth_level, tx,forth_level, "ty", "tmp"))
+ --print_code()
+ --tile(stmt2,forth_level+1,forth_level+1)
+ --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level+1))
+ --tile(stmt2,forth_level+1,forth_level)
+ --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level))
+ else
+ new_level = find_cur_level(stmt2,"ty")
+ tile(stmt2,level2,1,new_level,"tx","tx",counted)
+ --print(string.format("\n[Tile3BELSE]tile(%d, %d, %d)",stmt2,level2,level2))
+ tmp_level = find_cur_level(stmt2,"tmp")
+ tile(stmt2,tmp_level,tmp_level)
+ end
+
+ --print_code()
+ --print("\n----------------------------------")
+--]]
+
+ --print_code()
+ --print("\nStarting tmp2\n");--print_code();
+ first_level = find_cur_level(stmt,"tmp1")
+ second_level = find_cur_level(stmt,"tmp2")
+ lower,upper = hard_loop_bounds(stmt,second_level)
+
+ --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..",first level:"..first_level..",second_level:"..second_level)
+
+ -- Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
+ --print(string.format("\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,1,first_level, "tx", "tx"))
+ tile(stmt,second_level,1,first_level,"tx","tx",counted)
+ --print_code()
+
+ first_level = find_cur_level(stmt,"tmp1")
+ lower_1,upper_1 = hard_loop_bounds(stmt,first_level)
+ tx_level = find_cur_level(stmt,"tx")
+ lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level)
+ --print(string.format("UL_1 %d %d UL_tx %d %d", lower_1, upper_1, lower_tx, upper_tx))
+
+ if(math.ceil(upper_tx/tx) > 1)then
+ --print "ceil I say"
+ --print(string.format("\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,tx,tx_level, "tx", "tmp1"))
+ tile(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)
+ --print_code()
+
+ peat = find_cur_level(stmt,"tx")
+ --print(string.format("\n[Tile1]tile(%d, %d, %d)",stmt, peat, peat))
+ tile(stmt, peat, peat ) --find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))
+ --print_code()
+
+ if (find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx")) then
+ --print(string.format("\nagain [Tile1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")))
+ tile(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
+ --print_code()
+ end
+ --else
+ --tile(stmt, tx_level,1, tx_level, "tx", "tx", counted)
+ --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,1,tx_level, "tx", "tx"))
+ end
+ --print_code()
+ --]] -- this apparently is NOT the end of a block comment
+
+ --print("\nStarting tmp1\n")
+ -- Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
+ tile(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))
+ --print_code()
+
+ ty_level = find_cur_level(stmt,"tmp1")
+ lower_ty,upper_ty = hard_loop_bounds(stmt,ty_level)
+
+ tx_level = find_cur_level(stmt,"tx")
+ lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level)
+ --print("[Malik]-loop cleanup@tmp1: lowerty, upperty: "..lower_ty..", "..upper_ty..", ty: "..ty..",ty level:"..ty_level..",tx_level:"..tx_level..", stmt: "..stmt)
+
+ --print "before ceil"
+ if(math.ceil(upper_ty/ty) > 1)then
+ --print "CEIL IF"
+ --print("\n Inside upper_ty/ty > 1\n");
+
+ --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,ty,ty_level, "ty", "tmp_ty"))
+ tile(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
+ --print_code()
+
+ --print(string.format("\n[Tile2-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt ,"ty"),find_cur_level(stmt,"ty")))
+ tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))
+ --print_code()
+
+ -----------------------------------------------------------------------
+ ----------------------------------------------------------------------
+ cur_idxs = cur_indices(stmt)
+ --print("\n cur indexes are "..list_to_string(cur_idxs))
+
+ -- Putting ty before any tmp_tx
+ idx_flag = -1
+ for num= 0,table.getn(cur_idxs) do
+ if(cur[num] == "tmp_tx") then -- NOTE(review): indexes the global cur while the loop bound comes from cur_idxs, and num starts at 0 -- confirm intended
+ idx_flag = find_cur_level(stmt,cur[num])
+ break
+ end
+ end
+ --print(string.format("\n (1) so i have found out the value of idx flag as %d",idx_flag) )
+
+ if(idx_flag >=0 ) then
+ if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then
+ --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
+ tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+ --print_code()
+ end
+ end
+
+ -- Now Putting ty before any tmp_ty
+ idx_flag = -1
+ for num= 0,table.getn(cur_idxs) do
+ if(cur[num] == "tmp_ty") then -- NOTE(review): again reads cur, not cur_idxs -- confirm intended
+ idx_flag = find_cur_level(stmt,cur[num])
+ break
+ end
+ end
+ --print(string.format("\n IF so i have found out the value of idx flag as %d",idx_flag) )
+ if(idx_flag >=0 ) then
+ --print "one more test"
+ if ((find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"))) then
+ --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
+ tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+ --print_code()
+ end
+ end
+ else
+ --print "CEIL ELSE"
+ --cur_idxs = cur_indices(stmt)
+ --print("\n Inside upper_ty/ty <= 1\n");
+
+ --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,1,ty_level, "ty", "ty"))
+ tile(stmt, ty_level,1, ty_level, "ty", "ty", counted)
+ --print_code()
+
+ --print(string.format("\n[Tile3-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1))
+ tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
+ --print_code()
+
+ idx_flag = -1
+ if(cur_idxs) then -- cur_idxs is not assigned on this path (the assignment above is commented out); as a global it may still hold an earlier value
+ --print "CAN NEVER GET HERE? cur_idxs"
+ for num= 0,table.getn(cur_idxs) do
+ if(cur[num] == "tmp_ty") then
+ idx_flag = find_cur_level(stmt,cur[num])
+ break
+ end
+ end
+ end
+ --print(string.format("\n ELSE so i have found out the value of idx flag as %d",idx_flag) )
+ if(idx_flag >=0 ) then
+ if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then
+ --print(string.format("tile( stmt %d, level ty %d, level ty %d",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
+ tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+ --print(string.format("\n[Tile3-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
+ end
+ end
+ end
+
+ --print_code()
+ end
+
+ --print "\n\n *** at bottom of if in copy to shared, "
+ --print_code()
+ --print "end of if"
+
+ else
+ --copy to shared only created one level, not two, so we use a different approach (MV & TMV)
+ --print("\nCopy to shared: [If was error]\n")
+ level = find_cur_level(stmt,"tmp1")
+ tile(stmt, level, level)
+
+ --print(string.format("\n[Tile]tile(%d, %d, %d)",stmt, level, level))
+ tx,ty = thread_dims()
+ lower,upper = hard_loop_bounds(stmt, level)
+ upper = upper+1 --upper bound given as <=, compare to dimensions tx which is <
+ --print("upper "..upper.." tx "..tx)
+ if upper == tx then
+ rename_index(stmt, "tmp1", "tx")
+ else
+ --print("upper is not tx")
+ --TODO: Don't know, maybe do some tiling etc
+ --print_code()
+ --print("upper "..upper.." tx "..tx.." stmt: "..stmt.." level: "..level)
+ tile(stmt, level,tx,level, "tx", "tmp_tx", counted)
+ --print_code()
+
+ --print("stmt:"..stmt.." level+1: "..level+1)
+ --print("TILE 7")
+ tile(stmt, level+1,1,level+1,"tx", "tx",counted)
+ --print("TILE 3")
+ tile(stmt,level+1,level)
+ --print_code()
+
+ if(ty > 1) then
+ --print_code()
+ --print("GOING IN")
+ lower,upper = hard_loop_bounds(stmt, level+1)
+ --print(string.format("ty %d lower %d upper %d", ty, lower, upper))
+ --upper=125
+ --print("NOW FOR Y: upper "..upper.." ty "..ty.." stmt: "..stmt.." level: "..(level+1).." bound:"..math.ceil(upper/ty))
+ tile(stmt, level+1,math.ceil(upper/ty),level+1, "tmp_ty", "ty", counted)
+ --tile(stmt, level+2,math.ceil(upper/ty),level+2, "tmp_ty", "ty", counted)
+ end
+ --print_code()
+ --rename_index(stmt, "tmp1", "tx")
+ --print("Warning: Need to implement some logic here to tile the single level shared copy loop to match thread dimensions")
+ end
+ end
+ --Always add sync
+ add_sync(stmt,start_loop)
+ end
+ --print("ending copy to shared\n")
+ --print_code()
+end
+
+function unroll_to_depth(max_depth)
+   -- For every statement, unroll up to max_depth of the "clean" loop levels
+   -- that follow the last thread index (as reported by thread_indices()).
+   -- The whole pass is repeated until a pass leaves num_statements()
+   -- unchanged.
+   --
+   -- Names assigned without `local` were globals in the original script and
+   -- are deliberately left global here.
+   -- NOTE(review): whether other functions read them is not visible from
+   -- this function - confirm before making them local.
+
+   cur = cur_indices(0)
+   thread_idxs = thread_indices()
+   guard_idx = thread_idxs[#thread_idxs]
+
+   -- True for an ordinary loop name: not nil, not empty, and not one of the
+   -- block/thread index names.
+   local function is_plain_loop(name)
+      return name ~= nil and name ~= ""
+         and name ~= "bx" and name ~= "by"
+         and name ~= "tx" and name ~= "ty"
+   end
+
+   ---- HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS
+   -- common_loops is filled starting at index 0 (not the usual Lua 1);
+   -- comm_loops_cnt holds the number of entries.
+   common_loops = {}
+   comm_loops_cnt = 0
+   num_stmts = num_statements()
+
+   for stmt=0,num_stmts-1 do
+      cur_idxs = cur_indices(stmt)
+
+      if(chk_cur_level(stmt,"tx")>0) then
+         -- Only the levels in front of "tx" are compared (Lua indices start at 1).
+         endrange = find_cur_level(stmt,"tx")-1
+
+         for ii=1,endrange do
+            if is_plain_loop(cur_idxs[ii]) then
+               for stmt1=stmt+1,num_stmts-1 do
+                  cur_idxs1 = cur_indices(stmt1)
+
+                  for iii=1,endrange do
+                     if is_plain_loop(cur_idxs1[iii]) and cur_idxs[ii] == cur_idxs1[iii] then
+                        common_loops[comm_loops_cnt] = cur_idxs[ii]
+                        comm_loops_cnt = comm_loops_cnt + 1
+                     end
+                  end
+               end
+            end
+         end
+      end
+   end
+
+   repeat
+      old_num_stmts = num_statements()
+
+      for stmt=0,old_num_stmts-1 do
+         cur_idxs = cur_indices(stmt)
+         if(#cur_idxs > 0) then
+            -- The original spelled this both "gaurd_level" and "guard_level",
+            -- so the assignment inside the collection loop went to a second,
+            -- unused variable. A single local is used instead.
+            local guard_level = -1
+            if(chk_cur_level(stmt,guard_idx)>0) then
+               guard_level = find_cur_level(stmt,guard_idx)
+            end
+
+            if(guard_level>-1) then
+               level = next_clean_level(cur_idxs,guard_level)
+
+               -- Collect at most max_depth levels; level_arr is 0-based.
+               num_unrolled = 0
+               level_arr = {}
+               while level >= 0 and num_unrolled ~= max_depth do
+                  level_arr[num_unrolled] = level
+                  num_unrolled = num_unrolled + 1
+                  level = next_clean_level(cur_idxs,level+1)
+               end
+
+               -- Walk the collected levels from the last one found back to the
+               -- first. Counting with num_unrolled (instead of table.getn on a
+               -- 0-based table) means nothing is unrolled when no level was
+               -- collected, e.g. max_depth == 0; the original then called
+               -- unroll(stmt, nil, 0).
+               for i = num_unrolled-1,0,-1 do
+                  unroll(stmt,level_arr[i],0)
+               end
+            end
+         end
+      end
+      new_num_stmts = num_statements()
+
+   until old_num_stmts == new_num_stmts
+
+end
+
+
diff --git a/test-chill/test-cases/examples/cuda-chill/cudaize.py b/test-chill/test-cases/examples/cuda-chill/cudaize.py
new file mode 100755
index 0000000..ffef009
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/cudaize.py
@@ -0,0 +1,1047 @@
+#! /usr/bin/python
+
+# THIS IS CUDAIZE.PY
+
+import chill
+import sys
+import math
+
+strided = 0
+counted = 1
+
+def print_code():
+ chill.print_code()
+ print ""
+ sys.stdout.flush()
+
+
+def table_contains_key( table, key ): # use a dict for the 'table'?
+ return table.has_key(key) # (key in table)?
+
+def print_array( arr ): # a useful function to mimic lua output
+ for a in arr[:-1]:
+ print "%s," % a,
+ print "%s" % arr[-1]
+ sys.stdout.flush()
+
+def valid_indices( statement, indices ):
+ #print "valid_indices() python calling C cur_indices"
+ #print statement
+ cur = chill.cur_indices(statement) # calls C
+ #print "python valid_indices(), cur = ",
+ #print cur
+ #print "indices = ",
+ #print indices
+
+ for index in indices:
+ if not index in cur:
+ return False
+ return True
+
+def next_clean_level( indices_at_each_level, level):
+ #print "next_clean_level( ..., %d )" % level
+ #print "indices_at_each_level ",
+ print_array( indices_at_each_level )
+
+ numlevels = len(indices_at_each_level)
+ #print "loop to %d" % numlevels
+ for i in range(level+1, numlevels+1):
+ pythoni = i-1 # LUA index starts at 1
+ #print "Checking level %d = '%s'" % (i, indices_at_each_level[pythoni])
+ sys.stdout.flush()
+ if len(indices_at_each_level[pythoni]) > 0: # LUA INDEX STARTS AT 1
+ #print "returning %d" % i
+ return i # MATCH lua return value, LUA index starts at one
+ return -1 # no non-dummy indices
+
+
+
+
+def build_order( final_order, tile_index_names, control_index_names, tile_index_map, current_level):
+ order = []
+ #print "\nbuild_order()"
+ #print "build_order(): final_order = (",
+ count = 0
+ for f in final_order:
+ #if count+1 == len(final_order):
+ # print "%s )" % f
+ #else:
+ # print "%s," % f ,
+ count += 1
+
+ keys = control_index_names.keys()
+ keys.sort()
+ #if (2 == len(keys)):
+ # print "build_order(): ctrl_idx_names = (%s, %s)" % (control_index_names[0], control_index_names[1])
+ #else:
+ # print "build_order(): ctrl_idx_names = (%s" % control_index_names[0],
+ # for k in keys[1:]:
+ # print ", %s" % control_index_names[k],
+ # print ")"
+
+ #print control_index_names
+ #print "cur_level %d" % current_level
+
+ #print "tile index map: ",
+ #print tile_index_map
+
+
+ for i in range(len(final_order)):
+ k = final_order[i] # not used?
+ skip = False
+ cur = final_order[i]
+ # control loops below our current level should not be in the current order
+
+ # skip = cur in control_index_names[current_level+2:]
+ #print "\n%d control_index_names, " % len(control_index_names)
+ #print control_index_names
+
+ for j in range(current_level+1, len(control_index_names)):
+ #print "comparing cur %s with cin[%d] %s" % ( cur, j, control_index_names[j])
+ if control_index_names[j] == cur:
+ skip = True
+ #print "SKIP %s " % cur
+
+ # possibly substitute tile indices if necessary
+ if tile_index_map.has_key(cur):
+ approved_sub = False
+ sub_string = tile_index_map[cur]
+ #print "sub_string = ",
+ #print sub_string
+
+ # approved_sub = sub_string in tile_index_names[current_level+2:]
+ for j in range(current_level+1, len(tile_index_names)):
+ if tile_index_names[j] == sub_string:
+ approved_sub = True
+ if approved_sub:
+ cur = sub_string
+
+ if not skip:
+ order.append( cur)
+ #print "build_order() returning order (",
+ #print order
+ #for o in order:
+ # print "%s," % o,
+ #print ")"
+ return order
+
+def find_cur_level( stmt, idx ):
+ #print "find_cur_level(stmt %d, idx %s) Cur indices" % ( stmt, idx ),
+
+ cur = chill.cur_indices(stmt)
+ #for c in cur[:-1]:
+ # print "%s," % c,
+ #print "%s" % cur[ -1 ]
+
+ index = 1 # lua starts indices at 1 !!
+ for c in cur:
+ if c == idx:
+ #print "found it at index %d" % index
+ #sys.stdout.flush()
+ #print "in find_cur_level, returning ",
+ #print index
+ return index
+ index += 1
+ #print "find_cur_level(), Unable to find index %s in" % idx,
+ #print cur
+ #print "in find_cur_level, returning -1"
+ return -1 # special meaning "it's not there"
+
+def chk_cur_level( stmt, idx ):
+ # search cur_indices for a ind at stmt
+ cur = chill.cur_indices(stmt)
+ if idx in cur:
+ return 1 + cur.index(idx) # lua index starts at 1 !
+ return -1
+
+def find_offset( cur_order, tile, control):
+ #print "Looking for tile '%s' and control '%s' in (" % (tile, control),
+ #print cur_order
+ #for o in cur_order:
+ # print "%s," % o,
+ #print ")"
+
+ idx1 = -1
+ idx2 = -1
+ if tile in cur_order:
+ idx1 = 1 + cur_order.index(tile) # lua indexes from 1!
+ else:
+ print "find_offset(), unable to find tile %s in current list of indices" % tile
+ sys.exit(-1)
+
+ if control in cur_order:
+ idx2 = 1 + cur_order.index(control) # lua indexes from 1!
+ else:
+ print "find_offset(), unable to find control %s in current list of indices" % control
+ sys.exit(-1)
+
+ #print "found at level %d and %d" % ( idx2, idx1 )
+ # this appears horrible
+ if idx2 < idx1:
+ return idx2-idx1+1 # bad ordering
+ else:
+ return idx2-idx1
+
+
+
+def tile_by_index( tile_indices, sizes, index_names, final_order, tile_method):
+ # Tile statement 0 once per entry of tile_indices (tile size sizes[i]) through
+ # chill.tile7 / chill.tile3, calling chill.permute with the order computed by
+ # build_order() before the first tile and again as part of the tiling loop.
+ #   tile_indices -- names of the indices to tile; all must currently exist
+ #   sizes        -- tile size for each entry of tile_indices
+ #   index_names  -- (key, name) pairs; key is "l<N>_control" or "l<N>_tile"
+ #   final_order  -- desired order of index names, passed to build_order()
+ #   tile_method  -- passed through to chill.tile7; None is replaced by 1
+ # Exits the process (sys.exit(-1)) when tile_indices contains an unknown index.
+ #print "STARTING TILE BY INDEX"
+ #print "tile_by_index() tile_method ",
+ #print tile_method
+ #print "index_names: ",
+ #print index_names
+
+ stmt = 0 # assume statement 0
+ if not valid_indices( stmt, tile_indices):
+ print "python tile_by_index() one or more of ",
+ print tile_indices,
+ print " is not valid"
+ sys.exit(-1)
+
+ if tile_method == None:
+ #print "CREATING tile_method = 1"
+ tile_method = 1 # "counted"
+
+ tile_index_names = []
+ for ti in tile_indices:
+ tile_index_names.append( ti ) # make a copy?
+ #print "tile_index_names:",
+ #print tile_index_names
+
+ # control_index_names: 0-based level -> control loop name
+ # tile_index_map: new tile name -> original tile index it replaces
+ control_index_names = {} # a dictionary?
+ tile_index_map = {}
+
+ #print "index_names: "
+ #print index_names
+
+ for pair in index_names:
+ valid = False
+ control = pair[0]
+ name = pair[1]
+ #print "control %s name %s" % ( control, name )
+
+ # NOTE(review): control[1] raises IndexError for a one-character key.
+ if control[0] == "l" and control[1].isdigit():
+ if control.endswith("_control"):
+ # drop the leading "l" and the 8-character "_control" suffix
+ index = int(control[1: -8])
+ control_index_names[index-1] = name
+ valid = True
+
+ elif control.endswith("_tile"):
+ # drop the leading "l" and the 5-character "_tile" suffix
+ index = int(control[1: -5])
+ #print "index %d" % index
+ tile_index_names[index-1] = name # ??
+ tile_index_map[name] = tile_indices[index-1]
+ valid = True
+ # An unrecognised key only produces this message; processing continues.
+ if not valid:
+ print "%s is not a proper key for specifying tile or control loop indices\n" % control
+
+ #print "control_index_names = ",
+ #print control_index_names
+
+ #print "tile_index_names = ",
+ #print tile_index_names
+
+ #print "before call to build_order(), tile_index_map = ",
+ #print tile_index_map
+
+
+ # filter out control indices (and do name substitution of unprocessed tile indices) for a given level
+ cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1)
+
+ #print "returned from build_order python\n\n"
+
+ # print("permute("..stmt..", {"..list_to_string(cur_order).."})")
+ #print "permute(%d, {" % stmt,
+ #print "cur_order = ",
+ #print cur_order,
+ #print "})"
+
+ # chill.permute receives one tuple: the statement number followed by the order
+ cur_order.insert(0, stmt)
+ #print cur_order
+ chill.permute( tuple( cur_order))
+ #print "in cudaize.py, returned from C code chill.permute()\n"
+
+ for i in range(len(tile_indices)):
+ cur_idx = tile_indices[i]
+ #print "i %d cur_idx %s calling build order ********" % (i, cur_idx)
+ cur_order = build_order( final_order, tile_indices, control_index_names, tile_index_map, i)
+ #print "cur_idx %s return from build order" % cur_idx
+
+ # Find an offset between tile loop and control loop
+ # 0 = control loop one level above tile loop
+ # -1 = control loop two levels above tile loop
+ # > 0 = tile loop above control loop
+ # In the last case, we do two extra tile commands to get the control
+ # above the tile and then rely on the final permute to handle the
+ # rest
+ level = find_cur_level(stmt,cur_idx)
+ #print "level %d\n" % level
+
+ # NOTE(review): control_index_names[i] raises KeyError when no "l<i+1>_control" pair was given.
+ offset = find_offset(cur_order, tile_index_names[i], control_index_names[i])
+ #print "offset %d" % offset
+
+ if offset <= 0:
+ #print "[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method )
+ chill.tile7( stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method )
+ #print "in cudaize.py, returned from C code chill.tile7\n"
+
+ else:
+ #print "2tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method )
+ chill.tile7( stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method ) # regular level
+
+ # flip and tile control loop
+ #print "3tile(%d, %d, %d)" % ( stmt, level+1, level+1)
+ chill.tile3( stmt, level+1, level+1)
+
+ #print "4tile(%d, %d, %d)" % ( stmt, level+1, level)
+ chill.tile3( stmt, level+1, level)
+
+ #print_code()
+
+ # Do permutation based on cur_order
+ #print("permute based on build order calling build_order()")
+ cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i)
+
+ #print("permute based on build order return from build_order()")
+
+ # print("permute("..stmt..", {"..list_to_string(cur_order).."})")
+ # topermute is the same list object as cur_order, not a copy
+ topermute = cur_order
+ topermute.insert(0, stmt)
+ chill.permute( tuple(topermute) )
+ #print "\nafter permute(), code is:"
+ #print_code()
+
+def normalize_index( index ):
+ #print "in cudaize.py, normalize_index( %s )" % index
+ stmt = 0 # assume stmt 0
+ l = find_cur_level( stmt, index )
+ chill.tile3( stmt, l, l )
+
+def is_in_indices( stmt, idx):
+ cur = chill.cur_indices(stmt)
+ return idx in cur
+
+def copy_to_registers( start_loop, array_name ):
+ # Rearranges the loops of statement 0 around the "tx"/"ty" indices with
+ # chill.tile3, then calls chill.datacopy_privatized for `array_name` at
+ # `start_loop`, holding constant every block/thread index whose level is at
+ # or after the level of `start_loop`.
+ #   start_loop -- name of a current loop index of statement 0
+ #   array_name -- passed unchanged to chill.datacopy_privatized
+ # Levels returned by find_cur_level() are 1-based (Lua convention), while
+ # `cur` is a 0-based Python sequence; much of the +1/-1 arithmetic below
+ # converts between the two.
+ #print "\n\n****** starting copy to registers"
+ #sys.stdout.flush()
+
+ stmt = 0 # assume stmt 0
+ cur = chill.cur_indices(stmt) # calls C
+ table_Size = len(cur)
+
+ #print "Cur indices",
+ #print_array(cur)
+ #print "\nThe table size is %d" % table_Size
+ #count=1
+ #for c in cur:
+ # print "%d\t%s" % (count,c)
+ # count += 1
+
+ #print_code()
+
+ # would be much cleaner if not translating this code from lua!
+ # -1 means "index not present"
+ level_tx = -1
+ level_ty = -1
+ if is_in_indices(stmt,"tx"):
+ level_tx = find_cur_level(stmt,"tx")
+ if is_in_indices(stmt,"ty"):
+ level_ty = find_cur_level(stmt,"ty")
+ #print "level_tx %d level_ty %d" % ( level_tx, level_ty )
+ #sys.stdout.flush()
+
+ ty_lookup_idx = ""
+ # NOTE(review): org_level_ty is not read again in this function.
+ org_level_ty = level_ty
+
+ # UGLY logic. Lua index starts at 1, so all tests etc here are off by 1 from the lua code
+ # level_ty initializes to -1 , which is not a valid index, and so there is added code to
+ # make it not try to access offset -1. -1 IS a valid python array index
+ # to top it off, the else below can assign a NIL to ty_lookup_idx!
+ # NOTE(review): level_ty is 1-based, so cur[level_ty] is the entry after "ty";
+ # it raises IndexError when "ty" is the last index. When level_ty is -1 the
+ # else branch reads cur[-2].
+ if level_ty != -1 and cur[level_ty] != "":
+ #print "IF cur[%d] = %s" % ( level_ty, cur[level_ty] )
+ ty_lookup_idx = cur[level_ty]
+ else:
+ #print "ELSE ty_lookup_idx = cur[%d] = %s" % ( level_ty, cur[level_ty-1])
+ ty_lookup_idx = cur[level_ty-1]
+ #print "ty_lookup_idx '%s'" % ty_lookup_idx
+
+ if level_ty > -1:
+ #print "\ntile3(%d,%d,%d)" % (stmt,level_ty,level_tx+1)
+ chill.tile3(stmt,level_ty,level_tx+1)
+ #print_code()
+
+ # re-read the indices after the tile3 call above
+ cur = chill.cur_indices(stmt) # calls C
+ table_Size = len(cur)
+ #print "Cur indices ",
+ #for c in cur:
+ # print "%s," % c,
+ #print "\nThe table size is %d" % len(cur)
+ #count=1
+ #for c in cur:
+ # print "%d\t%s" % (count,c)
+ # count += 1
+ #sys.stdout.flush()
+
+ if is_in_indices(stmt,"tx"):
+ level_tx = find_cur_level(stmt,"tx")
+ if ty_lookup_idx != "": # perhaps incorrect test
+ if is_in_indices(stmt,ty_lookup_idx):
+ level_ty = find_cur_level(stmt,ty_lookup_idx)
+
+ ty_lookup = 1
+ idx_flag = -1
+ # find the level of the next valid index after ty+1
+ #print "\nlevel_ty %d" % level_ty
+ if level_ty > -1:
+ #print "table_Size %d" % table_Size
+ for num in range(-1 + level_ty+ty_lookup,table_Size): # ?? off by one?
+ #print "num=%d cur[num] = '%s'" % (num+1, cur[num]) # num+1 is lua index ????
+ sys.stdout.flush()
+ if cur[num] != "":
+ idx_flag = find_cur_level(stmt,cur[num])
+ #print "idx_flag = %d" % idx_flag
+ break
+
+ #print "\n(first) I am checking all indexes after ty+1 %s" % idx_flag
+ #print_code()
+ #print ""
+
+ # starts at 1 and is incremented for each non-empty index found from `startat` on
+ how_many_levels = 1
+
+ #print "idx_flag = %d I will check levels starting with %d" % (idx_flag, idx_flag+1)
+ # lua arrays start at index 1. the next loop in lua starts at offset 0, since idx_flag can be -1
+ # thus the check for "not equal nil" in lua (bad idea)
+ # python arrays start at 0, so will check for things that lua doesn't (?)
+ startat = idx_flag + 1
+ if idx_flag == -1:
+ startat = 1 # pretend we're lua for now. TODO: fix the logic
+
+ for ch_lev in range(startat,table_Size+1): # logic may be wrong (off by one)
+ #print "ch_lev %d" % ch_lev
+ if ch_lev <= table_Size and cur[ch_lev-1] != "":
+ #print "cur[%d] = '%s'" % ( ch_lev, cur[ch_lev-1] )
+ how_many_levels += 1
+
+ #print "\nHow Many Levels %d" % how_many_levels
+ sys.stdout.flush()
+ sys.stdout.flush()
+
+ if how_many_levels< 2:
+ while( idx_flag >= 0):
+ for num in range(level_ty+ty_lookup,table_Size+1):
+ #print "at top of loop, num is %d" % num
+ #print "cur[num] = '%s'" % cur[num-1]
+ if cur[num-1] != "":
+ idx = cur[num-1]
+ #print "idx '%s'" % idx
+ sys.stdout.flush()
+ curlev = find_cur_level(stmt,idx)
+ #print "curlev %d" % curlev
+
+ #print "\n[COPYTOREG]tile(%d,%d,%d)"%(stmt,curlev,level_tx)
+
+ chill.tile3(stmt, curlev, curlev)
+ # the level of idx is looked up again after the tile3 call above
+ curlev = find_cur_level(stmt,idx)
+ #print "curlev %d" % curlev
+ chill.tile3(stmt,curlev,level_tx)
+ #print "hehe '%s'" % cur[num-1]
+
+ cur = chill.cur_indices(stmt)
+ #print "Cur indices INSIDE",
+ #for c in cur:
+ # print "%s," % c,
+ table_Size = len(cur)
+ #print "\nTable Size is: %d" % len(cur)
+
+ level_tx = find_cur_level(stmt,"tx")
+ #print "\n level TX is: %d" % level_tx
+ level_ty = find_cur_level(stmt,ty_lookup_idx)
+ #print "\n level TY is: %d" %level_ty
+ idx_flag = -1
+ #print "idx_flag = -1"
+
+
+ #- find the level of the next valid index after ty+1
+ #- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?)
+ for num2 in range( -1 + level_ty+ty_lookup ,table_Size): # lua starts index at one
+ #print "num mucking num = %d" % num2
+ if(cur[num2] != ""):
+ #print "cur[%d] = '%s'" % ( num2, cur[num2] )
+ idx_flag = find_cur_level(stmt,cur[num2])
+ #print("\n(second) I am checking all indexes after ty+1 %s",cur[num2])
+ break
+
+ #print "num mucked to %d idx_flag = %d" % (num, idx_flag)
+
+ #print "at bottom of loop, num is %d" % num
+
+ #print "done with levels"
+
+ # this was a block comment ???
+
+# for num in range(level_ty+1, table_Size+1):
+# print "num %d" % num
+# if cur[num-1] != "":
+# idx_flag = find_cur_level(stmt,cur[num-1]) ## ugly
+# print "idx_flag = %d" % idx_flag
+
+ # change this all to reflect the real logic which is to normalize all loops inside the thread loops.
+# print "change this all ...\n"
+# print "level_ty+1 %d table_Size-1 %d idx_flag %d" %( level_ty+1, table_Size-1, idx_flag)
+# sys.stdout.flush()
+# sys.stdout.flush()
+
+# while level_ty+1 < (table_Size-1) and idx_flag >= 0:
+# print "*** level_ty %d" % level_ty
+# for num in range(level_ty+2,table_Size+1): # lua for includes second value
+# print "num %d cur[num] %s" % (num, cur[num])
+# if cur[num] != "":
+# idx = cur[num]
+# print "idx='%s'" % idx
+# #print_code()
+
+
+
+
+ #print "ARE WE SYNCED HERE?"
+ #print_code()
+
+ # [Malik] end logic
+ start_level = find_cur_level(stmt, start_loop) # start_loop was passed parameter!
+
+ # We should hold constant any block or tile loop
+ block_idxs = chill.block_indices()
+ thread_idxs = chill.thread_indices()
+ #print"\nblock indices are"
+ #for index, val in enumerate(block_idxs):
+ # print "%d\t%s" % ( int(index)+1 , val )
+ #print"\nthread indices are"
+ #for index, val in enumerate(thread_idxs):
+ # print "%d\t%s" % ( int(index)+1 , val )
+ #print "\nStart Level: %d" % start_level
+
+ # block indices first, then thread indices, each kept only when its level
+ # is at or after start_level
+ hold_constant = []
+ #print("\n Now in Blocks")
+ for idx in block_idxs:
+ blocklevel = find_cur_level(stmt,idx)
+ if blocklevel >= start_level:
+ hold_constant.append(idx)
+ #print "\nJust inserted block %s in hold_constant" %idx
+
+ #print("\n Now in Threads")
+ for idx in thread_idxs:
+ blocklevel = find_cur_level(stmt,idx)
+ if blocklevel >= start_level:
+ hold_constant.append(idx)
+ #print "\nJust inserted thread %s in hold_constant" %idx
+ #print "\nhold constant table is: "
+ #for index, val in enumerate(hold_constant):
+ # print "%d\t%s" % ( int(index)+1 , val )
+
+ #print("\nbefore datacopy pvt")
+ # NOTE(review): only the commented-out unroll loop at the end would use this
+ # value, and it spells the name old_num_statements.
+ old_num_stmts = chill.num_statements()
+ #sys.stdout.flush()
+
+ #print "\n[DataCopy]datacopy_privatized(%d, %s, %s, " % (stmt, start_loop, array_name),
+ #print hold_constant,
+ #print ")"
+ # Arguments go to C as one flat tuple:
+ # (stmt, start_loop, array_name, number of held indices, held indices...)
+ passtoC = [stmt, start_loop, array_name ] # a list
+ passtoC.append( len(hold_constant ) )
+ for h in hold_constant:
+ passtoC.append( h )
+ chill.datacopy_privatized( tuple( passtoC ))
+ sys.stdout.flush()
+ sys.stdout.flush()
+
+ new_num_statements = chill.num_statements()
+ #print "new num statements %d" % new_num_statements
+
+ # Unroll to the last thread level
+# for stmt in range(old_num_statements, new_num_statements):
+# print "unrolling statement %d" % stmt
+# level = find_cur_level(stmt,thread_idxs[-1]) #get last thread level
+# print "level is %d" % level
+# idxs = chill.cur_indices(stmt)
+# if level < len(idxs):
+# chill.unroll(stmt,level+1,0)
+
+
+
+def copy_to_shared( start_loop, array_name, alignment ):
+ # Calls chill.datacopy_9arg for `array_name` at the level of `start_loop` in
+ # statement 0 with copy-loop index names "tmp1"/"tmp2", adds a sync, and then
+ # re-tiles every statement created by the copy so that its loops are named
+ # "tx"/"ty" (with "tmp_tx"/"tmp_ty"/"tmp3" as extra names), using the sizes
+ # reported by chill.thread_dims().
+ #   start_loop -- name of a current loop index of statement 0
+ #   array_name -- passed unchanged to chill.datacopy_9arg
+ #   alignment  -- passed unchanged to chill.datacopy_9arg
+ # Levels returned by find_cur_level() are 1-based; -1 means "not found".
+ #print "\nstarting copy to shared( %s, %s, %d)" % (start_loop, array_name, alignment )
+ #print "copy_to_shared( %s, %s, %d) in cudaize.py" % ( start_loop, array_name, alignment )
+ stmt = 0 # assume statement 0
+
+ cur = chill.cur_indices(stmt)
+ #print "Cur indices ",
+ #print_array( cur )
+
+ start_level = find_cur_level( stmt, start_loop )
+ #print "start_level %d" % start_level
+
+ old_num_statements = chill.num_statements()
+ #print "old_num_statements %d" % old_num_statements
+
+
+ # Now, we give it indices for up to two dimensions for copy loop
+ copy_loop_idxs = ["tmp1","tmp2"]
+ #chill.datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs, False, 0, 1, alignment,True)
+ # Arguments go to C as one flat tuple; the index-name list is sent as its
+ # length followed by the names, and the booleans as 0/1.
+ passtoC = [stmt, start_level, array_name] # a list
+ passtoC.append( len(copy_loop_idxs))
+ for i in copy_loop_idxs:
+ passtoC.append(i)
+ passtoC.append( 0 ) # False
+ passtoC.append( 0 )
+ passtoC.append( 1 )
+ passtoC.append( alignment )
+ passtoC.append( 1 ) # True
+ #print "\n[DataCopy]datacopy( ",
+ #print passtoC,
+ #print ")"
+
+ #if array_name == "b":
+ # chill.cheat(1)
+ #if array_name == "c":
+ # chill.cheat(2)
+
+ chill.datacopy_9arg( tuple( passtoC ))
+
+ #print "back from datacopy_9arg\n\n\n"
+ #sys.stdout.flush()
+
+
+ #print "calling add_sync( %d, %s )" % ( stmt, start_loop )
+ chill.add_sync( stmt, start_loop )
+ #print "back from add_sync()\n\n"
+
+ new_num_statements = chill.num_statements()
+
+ # This is fairly CUBLAS2 specific, not sure how well it generalizes,
+ # but for a 2D copy, what we want to do is "normalize" the first loop
+ # "tmp1" then get its hard upper bound. We then want to tile it to
+ # make the control loop of that tile "ty". We then tile "tmp2" with a
+ # size of 1 and make it "tx".
+
+ #print "fairly CUBLAS2 specific, OLD %d NEW %d" % ( old_num_statements, new_num_statements)
+ sys.stdout.flush()
+ sys.stdout.flush()
+
+ # visit only the statements added since old_num_statements was read
+ for stmt in range(old_num_statements, new_num_statements):
+ #print "for stmt = %d" % stmt
+ level = find_cur_level( stmt, "tmp2")
+ #print "FOUND CUR LEVEL? level '",
+ #print level,
+ #print "'"
+
+ #print "in loop, stmt %d level %d" % ( stmt, level )
+ # "tmp2" present: the copy produced a two-level loop nest
+ if level != -1:
+ #print "\nCopy to shared: [If was no error]\n"
+ # NOTE(review): the result of this call is discarded.
+ find_cur_level(stmt,"tmp2")
+ chill.tile3( stmt, level, level )
+
+ #print "hard_loop_bounds( %d, %d )" % (stmt, level)
+ bounds = chill.hard_loop_bounds(stmt, level)
+ lower = bounds[0]
+ # +1 so that `upper` can be compared with the thread dimension tx below
+ upper = 1+ bounds[1]
+ #print "lower %d upper %d" % ( lower, upper )
+
+ dims = chill.thread_dims()
+ #print "in cudaize.py copy_to_shared, dims =",
+ #print dims
+ tx = dims[0]
+ ty = dims[1]
+ #print "2-loop cleanup: lower, upper: %d, %d, tx: %d" % ( lower, upper, tx)
+
+ level = find_cur_level(stmt,"tmp1")
+ #print "level %d" % level
+ if tx == upper and ty == 1:
+ #print "tx = %d upper = %d ty = %d"% (tx, upper, ty)
+ #print "Don't need"
+
+ # Don't need an extra tile level, just move this loop up
+ second_level = find_cur_level(stmt,"tmp2")
+ chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted)
+
+ else:
+ #print "DO need?"
+ # NOTE(review): new_ctrl is not read again in this function.
+ if ty == 1:
+ new_ctrl = "tmp3"
+ else:
+ new_ctrl = "ty"
+
+ # LOTS of commented out code here in cudaize.lua
+
+ #print_code()
+ #print "\nStarting tmp2\n"
+ first_level = find_cur_level(stmt,"tmp1")
+ second_level = find_cur_level(stmt,"tmp2")
+ bounds = chill.hard_loop_bounds(stmt, second_level)
+ lower = bounds[0]
+ upper = 1 + bounds[1] # BROKEN?
+
+ #print "[Malik]-loop cleanup@tmp2: lower, upper: %d, %d, tx: %d,first level:%d,second_level:%d" % ( lower, upper-1, tx, first_level, second_level)
+
+ # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
+ #print "\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, second_level,1,first_level, "tx", "tx")
+ chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted)
+ #print_code()
+
+ first_level = find_cur_level(stmt,"tmp1")
+ bounds = chill.hard_loop_bounds(stmt, first_level)
+ lower_1 = bounds[0]
+ upper_1 = 1 + bounds[1]
+ tx_level = find_cur_level(stmt,"tx")
+ bounds = chill.hard_loop_bounds(stmt,tx_level)
+ lower_tx = bounds[0]
+ upper_tx = 1+bounds[1]
+ #print "UL_1 %d %d UL_tx %d %d" % ( lower_1, upper_1-1, lower_tx, upper_tx-1)
+
+ # the "tx" loop is split only when it has more than tx iterations
+ if int(math.ceil( float(upper_tx)/float(tx))) > 1:
+ #print "ceil I say"
+ #print "\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, tx_level,tx,tx_level, "tx", "tmp1")
+ chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)
+ #print_code()
+
+ repeat = find_cur_level(stmt,"tx")
+ #print "\n[Tile1]tile(%d, %d, %d)" % (stmt, repeat, repeat)
+ chill.tile3(stmt, repeat, repeat) #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))
+ #print_code()
+
+ if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"):
+ #print "\nagain [Tile1]tile(%d, %d, %d)" % (stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
+ chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
+ #print_code()
+
+ #print_code()
+
+ #print "\nStarting tmp1\n"
+ # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
+ chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))
+ #print_code()
+
+ ty_level = find_cur_level(stmt,"tmp1")
+ bounds = chill.hard_loop_bounds(stmt,ty_level)
+ lower_ty = bounds[0]
+ upper_ty = 1 + bounds[1]
+
+ tx_level = find_cur_level(stmt,"tx")
+ bounds = chill.hard_loop_bounds(stmt,tx_level)
+ lower_tx = bounds[0]
+ upper_tx = 1 + bounds[1]
+
+ #print "[Malik]-loop cleanup@tmp1: lowerty, upperty: %d, %d, ty: %d,ty level:%d,tx_level:%d, stmt: %d" % ( lower_ty, upper_ty-1, ty, ty_level, tx_level, stmt)
+
+ #print "before ceil"
+ #sys.stdout.flush()
+
+ # the "tmp1" loop is split only when it has more than ty iterations
+ if(math.ceil(float(upper_ty)/float(ty)) > 1):
+ #print "CEIL IF"
+ #print "\n Inside upper_ty/ty > 1\n"
+
+ #print "\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, ty_level,ty,ty_level, "ty", "tmp_ty")
+ chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
+ #print_code()
+
+ #print "\n[Tile2-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt ,"ty"),find_cur_level(stmt,"ty"))
+ chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))
+ #print_code()
+
+ cur_idxs = chill.cur_indices(stmt)
+ #print "\n cur indexes are ",
+ #print_array( cur_idxs)
+ #sys.stdout.flush()
+
+ # Putting ty before any tmp_tx
+ idx_flag = -1
+ if "tmp_tx" in cur_idxs:
+ idx_flag = 1 + cur_idxs.index("tmp_tx") # lua index starts at 1
+ #print "\n (1) so i have found out the value of idx flag as %d" % idx_flag
+ #sys.stdout.flush()
+
+ # NOTE(review): the comment above mentions tmp_tx, but the comparison and the tile3 call below use "tmp_ty".
+ if idx_flag >= 0:
+ if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"):
+ #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+ chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+ #print_code()
+
+
+ # Now Putting ty before any tmp_ty
+ sys.stdout.flush()
+ idx_flag = -1
+ # NOTE(review): cur_idxs was read before the tile3 call above and is not refreshed here.
+ if "tmp_ty" in cur_idxs:
+ idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1
+ #print "\n IF so i have found out the value of idx flag as %d" % idx_flag
+ #sys.stdout.flush()
+
+ if idx_flag >= 0:
+ #print "one more test"
+ sys.stdout.flush()
+ if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"):
+ #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+ #sys.stdout.flush()
+ chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+ #print_code()
+
+
+
+ else:
+ #print "CEIL ELSE"
+ #print "\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, ty_level,1,ty_level, "ty", "ty")
+ #sys.stdout.flush()
+ chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted )
+ #print_code()
+
+ #print "\n[Tile3-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
+ sys.stdout.flush()
+
+ chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
+ #print_code()
+
+
+ idx_flag = -1
+ # LUA code checks to see if cur_idxs exists? it is unused except in the other clause of this is
+ #if(cur_idxs) then
+ #print "CAN NEVER GET HERE? cur_idxs"
+ #for num= 0,table.getn(cur_idxs) do
+ #if(cur[num] == "tmp_ty") then
+ #idx_flag = find_cur_level(stmt,cur[num])
+ #break
+ #end
+ #end
+ # NOTE(review): unlike the neighbouring debug output, this print is not commented out.
+ print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag
+ if idx_flag >= 0: # can't happen
+ print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+ #chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
+
+
+
+
+
+ #print "\n\n *** at bottom of if in copy to shared, "
+ #print_code()
+ #print "end of if"
+
+ else:
+ # copy to shared only created one level, not two, so we use a different approach (MV & TMV)
+ #print "\nCopy to shared: [If was error]\n"
+ level = find_cur_level(stmt,"tmp1")
+ chill.tile3(stmt, level, level)
+
+ dims = chill.thread_dims()
+ #print dims
+ tx = dims[0]
+ ty = dims[1]
+
+ bounds = chill.hard_loop_bounds(stmt, level)
+ lower = bounds[0]
+ upper = bounds[1]
+
+ #print "bounds lower %d upper %d" % (lower, upper)
+ upper = upper+1 # upper bound given as <=, compare to dimensions tx which is <
+ if upper == tx:
+ #print "upper == tx"
+ chill.rename_index( stmt, "tmp1", "tx")
+ else:
+ #print "upper is not tx"
+ #print "upper %d tx %d stmt: %d level: %d" % ( upper, tx, stmt, level)
+ chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted)
+ #print_code()
+
+ #print "stmt:%d level+1: %d" % ( stmt, level+1)
+ #print("TILE 7")
+ chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted)
+ #print("TILE 3")
+ chill.tile3( stmt, level+1, level)
+ #print_code()
+
+
+ if ty > 1:
+ #print "GOING IN"
+ bounds = chill.hard_loop_bounds(stmt, level+1)
+ lower = bounds[0]
+ # NOTE(review): no +1 is applied to this upper bound, unlike the other hard_loop_bounds uses in this function.
+ upper = bounds[1]
+ #print "ty %d lower %d upper %d" % ( ty, lower, upper )
+ # NOTE(review): floatdiv is not read again; `bound` repeats the same division.
+ floatdiv = float(upper)/float(ty)
+ bound = int(math.ceil(float(upper)/float(ty)))
+ #print "NOW FOR Y: upper %d ty %d stmt: %d level: %d bound: %d" % ( upper, ty, stmt, level+1, bound)
+ chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted)
+
+ # Always add sync
+ chill.add_sync( stmt, start_loop )
+ #print "ending copy to shared\n"
+ #sys.stdout.flush()
+ #print_code()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def unroll_to_depth( max_depth ):
+ print "\n\nunroll_to_depth(%d)" % max_depth
+ print "SYNC UP"
+ sys.stdout.flush()
+
+ cur = chill.cur_indices(0)
+ thread_idxs = chill.thread_indices()
+ guard_idx = thread_idxs[-1] # last one
+
+ print "cur indices",
+ print_array(cur)
+ print "thread indices",
+ print_array(thread_idxs)
+ print "guard_idx = %s" % guard_idx
+
+ #print "thread_idxs = ",
+ #print thread_idxs
+ guard_idx = thread_idxs[-1]
+ #print "guard_idx = %s" % guard_idx
+
+ # HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS
+ common_loops = []
+ comm_loops_cnt = 0
+ num_stmts = chill.num_statements()
+ print "num statements %d" % num_stmts
+
+ for stmt in range(num_stmts):
+ sys.stdout.flush()
+ print "\nSTMT %d" % stmt,
+ cur_idxs = chill.cur_indices(stmt)
+ print "Current Indices:",
+ for c in cur_idxs[:-1]:
+ print "%s," % c,
+ print "%s" % cur_idxs[-1] # last one
+ sys.stdout.flush()
+ #print_code()
+
+ if chk_cur_level(stmt, "tx") > 0:
+
+ for ii in range(find_cur_level(stmt,"tx")-1):
+ print "ii = %d\ncur_idxs[%d] = '%s'" % (ii+1, ii+1, cur_idxs[ii]) # print to match lua
+ id = cur_idxs[ii]
+ if id not in ["bx", "by", "", "tx", "ty"]:
+
+ print "id %s is not in the list" % id
+
+ for stmt1 in range(stmt+1, num_stmts):
+ print "\nii %d stmt1 is %d" % (ii+1, stmt1) # print to match lua
+ cur_idxs1 = chill.cur_indices(stmt1)
+ print "\nstmt1 cur_idxs1 is ",
+ for ind in cur_idxs1[:-1]:
+ print "%s," % ind,
+ print "%s" % cur_idxs1[-1]
+
+ print "cur level(%d, %s) = %d" % (stmt, "tx", find_cur_level(stmt,"tx") )
+ sys.stdout.flush()
+
+ endrange = find_cur_level(stmt,"tx")-1
+ print "for iii=1, %d do" % endrange
+ sys.stdout.flush()
+ for iii in range(endrange): # off by one? TODO
+ print "stmt %d ii %d iii %d\n" % (stmt, ii+1, iii+1),
+ sys.stdout.flush()
+
+ if iii >= len(cur_idxs1):
+ print "stmt %d ii %d iii %d cur_idxs1[%d] = NIL" % (stmt, ii+1, iii+1, iii+1, ) # print to match lua
+ else:
+ print "stmt %d ii %d iii %d cur_idxs1[%d] = '%s'" % (stmt, ii+1, iii+1, iii+1, cur_idxs1[iii]) # print to match lua
+ sys.stdout.flush()
+
+ # this will still probably die
+ if iii < len(cur_idxs1) and [iii] not in ["bx", "by", "tx", "ty", ""]:
+ if cur_idxs[ii] == cur_idxs1[iii]:
+ print "\nfound idx:%s" % cur_idxs[ii]
+ common_loops.append(cur_idxs[ii])
+ print "cl[%d] = '%s'" % ( comm_loops_cnt, cur_idxs[ii] )
+ comm_loops_cnt = len(common_loops)
+
+ if len(common_loops) > 0:
+ print "\n COMM LOOPS :TOTAL %d, and are " % comm_loops_cnt,
+ print common_loops,
+ print " this loop : %s" % common_loops[0]
+ else:
+ print "UNROLL can't unroll any loops?"
+
+
+ while True: # break at bottom of loop (repeat in lua)
+ old_num_statements = chill.num_statements()
+ print "old_num_statements %d" % old_num_statements
+
+ for stmt in range(old_num_statements):
+ cur_idxs = chill.cur_indices(stmt)
+ print "stmt %d cur_idxs =" % stmt,
+ index = 0
+ for i in cur_idxs:
+ index +=1
+ if index == len(cur_idxs):
+ print "%s" %i
+ else:
+ print "%s," % i,
+
+ if len(cur_idxs) > 0:
+ guard_level = -1
+ if chk_cur_level(stmt, guard_idx) > 0:
+ guard_level = find_cur_level(stmt,guard_idx)
+ print "guard_level(sp) = %d" % guard_level
+ if guard_level > -1:
+ level = next_clean_level(cur_idxs,guard_level)
+ print "next clean level %d" % level
+
+
+ #print "looking at %d" % stmt
+ #print "comparing %d and %d in" % (guard_level, level),
+ #index = 0
+ #for i in cur_idxs:
+ #index +=1
+ #if index == len(cur_idxs):
+ # print "%s" %i
+ #else:
+ # print "%s," % i,
+
+ # need to handle max_depth
+ num_unrolled = 0
+ level_unroll_comm = level
+ level_arr = []
+
+ #print "before while, level = %d" % level
+ while level >= 0:
+ print "while: level = %d" % level
+ if num_unrolled == max_depth:
+ break
+
+ print "Unrolling %d at level %d index %s" % ( stmt, level, cur_idxs[guard_level]) # ???
+ level_arr.append(level)
+
+ guard_level = find_cur_level(stmt,guard_idx)
+ level = next_clean_level(cur_idxs,level+1)
+
+ print "OK, NOW WE UNROLL"
+ if level_unroll_comm >= 0:
+ level_arr.reverse()
+ for i,lev in enumerate(level_arr):
+ print "\ni=%d" % i
+ print "[Unroll]unroll(%d, %d, 0)" % (stmt, lev)
+ chill.unroll(stmt, lev, 0)
+
+
+ new_num_statements = chill.num_statements()
+ if old_num_statements == new_num_statements:
+ break # exit infinite loop
+
+
+# all other calls to C have a routine in this file (?)
+def unroll( statement, level, unroll_amount ):
+ # Pass-through: forwards the three arguments unchanged to chill.unroll.
+ # Nothing is returned (the result of chill.unroll, if any, is discarded).
+ chill.unroll( statement, level, unroll_amount )
+
diff --git a/test-chill/test-cases/examples/cuda-chill/mm.c b/test-chill/test-cases/examples/cuda-chill/mm.c
new file mode 100644
index 0000000..0efbeeb
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mm.c
@@ -0,0 +1,10 @@
+#define N 1024
+
+void normalMM(float c[N][N], float a[N][N], float b[N][N]) {
+ int i, j, k;
+
+ for (i = 0; i < N; i++)
+ for (j = 0; j < N; j++)
+ for (k = 0; k < N; k++)
+ c[j][i] = c[j][i] + a[k][i] * b[j][k];
+}
diff --git a/test-chill/test-cases/examples/cuda-chill/mm.lua b/test-chill/test-cases/examples/cuda-chill/mm.lua
new file mode 100644
index 0000000..5bde1b0
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mm.lua
@@ -0,0 +1,38 @@
+init("mm.c", "normalMM", 0)
+dofile("cudaize.lua")
+N=1024
+Ti=128
+Tj=64
+Tk=16
+Tii=16
+Tjj=16
+
+
+
+
+N=1024
+
+
+
+
+
+
+
+
+
+
+
+
+
+tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k"})CU=1
+
+tile_by_index({"k"},{Tk},{l1_control="kk"},{"ii","jj","kk","i","j","k"})CU=3
+
+tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","kk","i","iii","j","jjj","k"},1)CU=2
+
+cudaize("mm_GPU",{a=1048576,b=1048576,c=1048576},{block={"ii","jj"}, thread={"i","j"}})CU=2
+copy_to_shared("tx","a",-16)
+copy_to_shared("tx","b",-16)
+copy_to_registers("kk","c")
+--print_code()
+unroll_to_depth(2)
diff --git a/test-chill/test-cases/examples/cuda-chill/mpeg4.c b/test-chill/test-cases/examples/cuda-chill/mpeg4.c
new file mode 100755
index 0000000..7f83bf7
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mpeg4.c
@@ -0,0 +1,23 @@
+#define N1 4096
+#define N2 4096
+#define WINDOW_SIZE 16
+
+void mpeg4_cpu(float result[N1][N2], float prev[N2+WINDOW_SIZE][N2+WINDOW_SIZE], float curr[WINDOW_SIZE*WINDOW_SIZE])
+{
+ unsigned int i;
+ unsigned int j;
+ unsigned int k;
+ unsigned int l;
+
+ for ( i = 0; i < N1; ++i)
+ for ( j = 0; j < N2; ++j)
+ for ( k = 0; k < WINDOW_SIZE; ++k)
+ for ( l = 0; l < WINDOW_SIZE; ++l)
+ result[i][j] += prev[i+k][j+l] * curr[k*WINDOW_SIZE+l];
+
+
+
+
+
+}
+
diff --git a/test-chill/test-cases/examples/cuda-chill/mpeg4.lua b/test-chill/test-cases/examples/cuda-chill/mpeg4.lua
new file mode 100644
index 0000000..f025dc0
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mpeg4.lua
@@ -0,0 +1,45 @@
+--CUBLAS 2 MM Multiply
+
+--This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is procedure # and third is loop #
+init("mpeg4.c", "mpeg4_cpu", 0)
+
+--dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods
+
+N=4096
+M=4096
+W=16
+
+--TI must be <= M
+--TJ must be <=TI
+Ti=32
+Tj=32
+Tii=16
+Tjj=16
+Tk=4
+--permute(0,{"j","i","k","l"})
+tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k","l"})
+--tile_by_index({"k","l"},{Tk*2,Tk*2},{l1_control="kk",l2_control="ll"},{"ii","jj","kk","ll","i","j","k","l"})
+--print_code()
+--tile_by_index({"k","l"},{Tk,Tk},{l1_control="kk",l2_control="ll"},{"ii","jj","i","j","kk","k","ll","l"})
+tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","iii","i","jjj","j","k","l"})
+--print_code()
+--normalize_index("j")
+--normalize_index("i")
+--print_code()
+cudaize("kernel_GPU",{curr=W*W,prev=(N+W)*(M+W),result=N*M},{block={"ii","jj"}, thread={"i","j"}})
+--print_code()
+copy_to_shared("iii","prev",16)
+
+copy_to_registers("jjj","result")
+
+--print_code()
+--copy_to_constant_no_tile("curr")
+unroll_to_depth(2)
+print_code()
+print_space()
+
+
diff --git a/test-chill/test-cases/examples/cuda-chill/mriq-fh.c b/test-chill/test-cases/examples/cuda-chill/mriq-fh.c
new file mode 100755
index 0000000..1e924b7
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mriq-fh.c
@@ -0,0 +1,38 @@
+#define X 32768
+#define K 256
+struct kValues {
+ float Kx;
+ float Ky;
+ float Kz;
+ float PhiMag;
+};
+extern float sin(float);
+extern float cos(float);
+
+void mriFH_cpu(float *rPhi,float *rRho,float *iRho, float *iPhi, float *rD, float *iD, float *kx, float *ky, float *kz, float *dx, float *dy, float *dz, float *rFHref, float *iFHref)
+{
+
+ float rfh;
+ float ifh;
+ float exp;
+ float cArg;
+ float sArg;
+ //float rRho[K];
+ //float iRho[K];
+ unsigned int k;
+ unsigned int x;
+
+
+ for (x = 0; x < X; ++x) {
+ for (k = 0; k < K; ++k) {
+
+ exp = 2 * 3.14159 * (kx[k]* dx[x] + ky[k]* dy[x] + kz[k]* dz[x]);
+ cArg = cos(exp);
+ sArg = sin(exp);
+ rFHref[x] += rRho[k]* cArg - iRho[k]* sArg;
+ iFHref[x] += iRho[k]*cArg + rRho[k]*sArg;
+ }
+
+ }
+}
+
diff --git a/test-chill/test-cases/examples/cuda-chill/mriq-fh.lua b/test-chill/test-cases/examples/cuda-chill/mriq-fh.lua
new file mode 100755
index 0000000..3277bac
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mriq-fh.lua
@@ -0,0 +1,73 @@
+--CUBLAS 2 MM Multiply
+
+--This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is procedure # and third is loop #
+init("mriq-fh.c", "mriFH_cpu", 0)
+
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+ --copy_to_shared methods
+N=32768
+M=256
+Tx=256
+
+
+print_code()
+--permute(0,{"j","i"})
+--tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
+tile_by_index({"x"},{Tx},{l1_control="xx"},{"xx","x","k"})
+--tile_by_index({"x"},{16},{l1_control="xx1"},{"xx","x","xx1","k"})
+--tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+print_code()
+
+normalize_index("x")
+--normalize_index("i")
+print_code()
+--tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
+--print_code()
+--cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
+cudaize("kernel_GPU",{dx=N,dy=N,dz=N,iRho=M,kx=M,ky=M,kz=M,rFHref=N,iFHref=N,rRho=M},{block={"xx"}, thread={"x"}})
+--copy_to_shared("tx","iRho",-16)
+--copy_to_shared("tx","dz",1)
+--copy_to_shared("tx","rRho",-16)
+--copy_to_registers("tx","rFHref")
+--copy_to_registers("tx","rRho")
+--copy_to_registers("tx","iRho")
+--copy_to_registers("tx","kx")
+--copy_to_registers("tx","dx")
+--copy_to_registers("tx","ky")
+--copy_to_registers("tx","dy")
+--copy_to_registers("tx","kz")
+--copy_to_registers("tx","dz")
+--copy_to_registers("tx","iFHref")
+--copy_to_texture("rRho")
+--copy_to_texture("kx")
+--copy_to_texture("dx")
+--copy_to_texture("ky")
+--copy_to_texture("dy")
+--copy_to_texture("kz")
+--copy_to_texture("dz")
+--copy_to_texture("iRho")
+--print_code()--]]
+--unroll(0,4,0)
+--copy_to_constant_no_tile("kx")
+--copy_to_constant_no_tile("ky")
+--copy_to_constant_no_tile("kz")
+--copy_to_constant_no_tile("rRho")
+--copy_to_constant_no_tile("iRho")
+
+--unroll_to_depth(1)
+print_code()
+--[[
+copy_to_Texture("rRho")
+copy_to_Texture("kx")
+copy_to_Texture("dx")
+copy_to_Texture("ky")
+copy_to_Texture("dy")
+copy_to_Texture("kz")
+copy_to_Texture("dz")
+copy_to_Texture("iRho")
+--unroll_to_depth(2)
+--]]
diff --git a/test-chill/test-cases/examples/cuda-chill/mriq.c b/test-chill/test-cases/examples/cuda-chill/mriq.c
new file mode 100644
index 0000000..ba4b87c
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mriq.c
@@ -0,0 +1,33 @@
+#define N 32768
+#define M 3072
+struct kValues {
+ float Kx;
+ float Ky;
+ float Kz;
+ float PhiMag;
+};
+extern float sinf(float);
+extern float cosf(float);
+
+void
+ComputeQCPU(int numK, int numX,struct kValues kVals[M],float x[N], float y[N], float z[N],float Qr[N], float Qi[N]) {
+ float expArg;
+ float cosArg;
+ float sinArg;
+ float phi;
+ int i;
+ int j;
+ numK = M;
+ numX = N;
+ for ( i = 0; i < M; i++) {
+ for ( j = 0; j < N; j++) {
+ expArg = 6.2831853071795864769252867665590058f * (kVals[i].Kx * x[j] +kVals[i].Ky * y[j] +kVals[i].Kz * z[j]);
+ cosArg = cosf(expArg);
+ sinArg = sinf(expArg);
+ phi = kVals[i].PhiMag;
+ Qr[j] += phi * cosArg;
+ Qi[j] += phi * sinArg;
+ }
+ }
+}
+
diff --git a/test-chill/test-cases/examples/cuda-chill/mriq.lua b/test-chill/test-cases/examples/cuda-chill/mriq.lua
new file mode 100644
index 0000000..1170111
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mriq.lua
@@ -0,0 +1,55 @@
+--CUBLAS 2 MM Multiply
+
+--This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is procedure # and third is loop #
+init("mriq.c", "ComputeQCPU", 0)
+
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+ --copy_to_shared methods
+N=32768
+M=3072
+TI=128
+TJ=128
+
+permute(0,{"j","i"})
+--tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
+tile_by_index({"i"}, {TJ}, {l1_control="ii",l1_tile="i"}, {"ii", "j","i"})
+tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+--print_code()
+
+normalize_index("j")
+normalize_index("i")
+--print_code()
+--tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
+--print_code()
+cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
+
+copy_to_shared("tx","kVals",1)
+--copy_to_shared("tx","x",1)
+--copy_to_shared("tx","y",1)
+--copy_to_shared("tx","z",1)
+
+--copy_to_texture("kVals")
+--datacopy(0, 3, "kVals", {"tt","t"},false,0,1,-16,true)
+--print_code()
+--datacopy_privatized(0,"tx","kVals",{"tx"})
+--copy_to_registers("tx","kVals")
+copy_to_registers("ii","x")
+copy_to_registers("ii","y")
+copy_to_registers("ii","z")
+copy_to_registers("ii","Qi")
+copy_to_registers("ii","Qr")
+--[[datacopy_privatized(0,"tx","x",{"tx"})
+datacopy_privatized(0,"tx","y",{"tx"})
+datacopy_privatized(0,"tx","z",{"tx"})
+datacopy_privatized(0,"tx","Qi",{"tx"})
+datacopy_privatized(0,"tx","Qr",{"tx"})
+
+
+]]--
+--unroll(0,5,64)
+print_code()
+--unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
diff --git a/test-chill/test-cases/examples/cuda-chill/mv-shadow.c b/test-chill/test-cases/examples/cuda-chill/mv-shadow.c
new file mode 100644
index 0000000..582b187
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mv-shadow.c
@@ -0,0 +1,9 @@
+#define N 1024
+
+void normalMV(float c[N][N], float a[N], float b[N]) {
+ int i, j;
+
+ for (i = 0; i < N; i++)
+ for (j = 0; j < N; j++)
+ a[i] = a[i] + c[j][i] * b[j];
+}
diff --git a/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua b/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua
new file mode 100644
index 0000000..43e8491
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua
@@ -0,0 +1,65 @@
+init("mv-shadow.c","normalMV",0)
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+ --copy_to_shared methods
+
+N=129
+TI=32
+TJ=64
+
+N=1024
+TI=16
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+--Tile the i and j loop, introducing "ii" as the control loop for the "i"
+--tile, "k" for the control loop for the "j" tile, with the final order
+--of {"ii", "k", "i", "j"}
+tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
+--tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+--print_code()
+--Normalize index will do a tile size of one over the loop level specified
+--by the input index. This is useful to get a zero lower bound and hard
+--upper bound on a loop instead of it being relative to previous loop
+--levels.
+--normalize_index("ii")
+normalize_index("i")
+print_code()
+
+--Cudaize now determines the grid dimensions from the loops themselves
+--(the upper bounds of the block and thread loops). It also renames the
+--given block and thread loops' indexes to the appropriate values from
+--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
+--size of the arrays to be copied in the CUDA scaffolding.
+cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
+--print_code()
+
+--Does a datacopy, tile, and add_sync to get a shared memory copy
+
+--copy_to_shared("tx", "b", 1)
+--copy_to_shared("tx", "c", -16)
+--print_code()
+--copy_to_texture("b")
+--copy_to_texture("c")
+copy_to_registers("k", "a")
+--print_code()
+
+unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
+--copy_to_texture("b")
+--print_code()
+--unroll(0,5,0)
+--print_code()
diff --git a/test-chill/test-cases/examples/cuda-chill/mv.c b/test-chill/test-cases/examples/cuda-chill/mv.c
new file mode 100644
index 0000000..582b187
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mv.c
@@ -0,0 +1,9 @@
+#define N 1024
+
+void normalMV(float c[N][N], float a[N], float b[N]) {
+ int i, j;
+
+ for (i = 0; i < N; i++)
+ for (j = 0; j < N; j++)
+ a[i] = a[i] + c[j][i] * b[j];
+}
diff --git a/test-chill/test-cases/examples/cuda-chill/mv.lua b/test-chill/test-cases/examples/cuda-chill/mv.lua
new file mode 100644
index 0000000..ca54501
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mv.lua
@@ -0,0 +1,65 @@
+init("mv.c","normalMV",0)
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+ --copy_to_shared methods
+
+N=129
+TI=32
+TJ=64
+
+N=1024
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+--Tile the i and j loop, introducing "ii" as the control loop for the "i"
+--tile, "k" for the control loop for the "j" tile, with the final order
+--of {"ii", "k", "i", "j"}
+tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
+--tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+--print_code()
+--Normalize index will do a tile size of one over the loop level specified
+--by the input index. This is useful to get a zero lower bound and hard
+--upper bound on a loop instead of it being relative to previous loop
+--levels.
+--normalize_index("ii")
+normalize_index("i")
+print_code()
+
+--Cudaize now determines the grid dimensions from the loops themselves
+--(the upper bounds of the block and thread loops). It also renames the
+--given block and thread loops' indexes to the appropriate values from
+--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
+--size of the arrays to be copied in the CUDA scaffolding.
+cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
+
+--print_code()
+
+--Does a datacopy, tile, and add_sync to get a shared memory copy
+
+--copy_to_shared("tx", "b", 1)
+--copy_to_shared("tx", "c", -16)
+--print_code()
+--copy_to_texture("b")
+--copy_to_texture("c")
+copy_to_registers("k", "a")
+--print_code()
+
+unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
+--copy_to_texture("b")
+--print_code()
+--unroll(0,5,0)
+--print_code()
diff --git a/test-chill/test-cases/examples/cuda-chill/mv_try.c b/test-chill/test-cases/examples/cuda-chill/mv_try.c
new file mode 100644
index 0000000..7781f3b
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mv_try.c
@@ -0,0 +1,9 @@
+#define N 4096
+
+void normalMV(int n, float c[N][N], float a[N], float b[N]) {
+ int i, j;
+
+ for (i = 0; i < n; i++)
+ for (j = 0; j < n; j++)
+ a[i] = a[i] + c[i][j] * b[j];
+}
diff --git a/test-chill/test-cases/examples/cuda-chill/mv_try.lua b/test-chill/test-cases/examples/cuda-chill/mv_try.lua
new file mode 100644
index 0000000..db4d9ad
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mv_try.lua
@@ -0,0 +1,14 @@
+init("mv_try.c","normalMV",0)
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+ --copy_to_shared methods
+
+TI=96
+
+N=4096
+
+
+tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+cudaize("mv_GPU", {a=N, b=N, c=N*N},
+ {block={"ii"}, thread={"i"}})
+
+print_code()
diff --git a/test-chill/test-cases/examples/cuda-chill/nbody.c b/test-chill/test-cases/examples/cuda-chill/nbody.c
new file mode 100644
index 0000000..57899b6
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/nbody.c
@@ -0,0 +1,66 @@
+#define NBODIES 16384
+#define SOFTENINGSQUARED 0.01f
+#define DELTATIME 0.001f
+#define DAMPING 1.0f
+
+#define NBLOCKSY 1
+#define NBLOCKSX (NBODIES/NTHREADSX)
+#define NTHREADSY 1
+#define NTHREADSX 64
+
+#define BLOCKSIZE 128
+
+#define SHARED 1
+#define TIMER 1
+#define VERIFY 1
+
+extern float sqrtf(float);
+
+void nbody_cpu(float* oldpos,float* oldpos1, float *newpos, float *oldvel, float *newvel, float *force)
+{
+ float r0,r1,r2;
+ float invDist, invDistCube, mass, invMass;
+ unsigned int i,j;
+ for(i = 0; i < NBODIES; ++i) {
+ //force[i*4 ] = 0;
+ //force[i*4+1] = 0;
+ //force[i*4+2] = 0;
+ //force[i*4+3] = 0;
+ for(j = 0; j < NBODIES; ++j) {
+ r0 = oldpos[j*4]-oldpos1[i*4];
+ r1 = oldpos[j*4+1]-oldpos1[i*4+1];
+ r2 = oldpos[j*4+2]-oldpos1[i*4+2];
+
+ invDist = 1.0/sqrtf(r0 * r0 + r1 * r1 + r2 * r2 + SOFTENINGSQUARED);
+ invDistCube = invDist * invDist * invDist;
+ mass = oldpos1[i*4+3];
+
+ force[i*4] = force[i*4] + r0 * mass * invDistCube;
+ force[i*4+1] = force[i*4+1] + r1 * mass * invDistCube;
+ force[i*4+2] = force[i*4+2] + r2 * mass * invDistCube;
+
+ }
+ }
+
+/* for (i = 0; i < NBODIES; ++i) {
+ invMass = oldvel[4*i+3];
+
+ oldvel[4*i] += (force[4*i] * invMass) * DELTATIME * DAMPING;
+ oldvel[4*i+1] += (force[4*i+1] * invMass) * DELTATIME * DAMPING;
+ oldvel[4*i+2] += (force[4*i+2] * invMass) * DELTATIME * DAMPING;
+
+ oldpos[4*i] += oldvel[4*i] * DELTATIME;
+ oldpos[4*i+1] += oldvel[4*i+1] * DELTATIME;
+ oldpos[4*i+2] += oldvel[4*i+2] * DELTATIME;
+
+ newpos[4*i+0] = oldpos[4*i];
+ newpos[4*i+1] = oldpos[4*i+1];
+ newpos[4*i+2] = oldpos[4*i+2];
+ newpos[4*i+3] = oldpos[4*i+3];
+
+ newvel[4*i+0] = oldvel[4*i];
+ newvel[4*i+1] = oldvel[4*i+1];
+ newvel[4*i+2] = oldvel[4*i+2];
+ newvel[4*i+3] = oldvel[4*i+3];
+ }*/
+}
diff --git a/test-chill/test-cases/examples/cuda-chill/nbody.lua b/test-chill/test-cases/examples/cuda-chill/nbody.lua
new file mode 100644
index 0000000..08f88a9
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/nbody.lua
@@ -0,0 +1,53 @@
+--CUBLAS 2 MM Multiply
+
+--This function form initializes "CUDAIZE v2" versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is procedure # and third is loop #
+init("nbody.c", "nbody_cpu" , 0)
+
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+ --copy_to_shared methods
+NBODIES=16384
+
+
+--Tj=128 CHANGE FOR BEST..... BEST IS 64BLOCKS 128THREADS
+--Ti=256
+Tj=64
+Ti=32
+Tjjj=1
+Tiii=1
+Tn=0.1
+--normalize_index("j")
+--
+--print_code()
+--normalize_index("n")
+-- TILE COMMANDS ZEROOOOOOOOOOO:3
+--tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j"})--CU=-1
+tile_by_index({"i"},{Ti},{l1_control="ii"},{"ii","i","j"})--CU=-1
+--normalize_index("i")
+--tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1
+
+--tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3
+--tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"})
+--tile_by_index({"i"}, {Ti/2}, {l1_control="iii"}, {"ii","iii", "jj","i","j"})
+--print_code()
+cudaize("kernel_GPU",{oldpos=4*NBODIES,oldpos1=4*NBODIES,oldvel=4*NBODIES,force=4*NBODIES,newpos=4*NBODIES,newvel=4*NBODIES},{block={"ii"}, thread={"i"}})--CU=3
+print_code()
+--tile(0,6,6)
+--copy_to_shared("tx","oldpos",-16)
+--copy_to_registers("j","oldpos")
+--copy_to_registers("j","oldpos1")
+--copy_to_registers("j","force")
+
+--copy_to_texture("oldpos")
+--tile(1,3,3)
+--tile(2,3,3)
+
+print_code()
+--unroll_to_depth(1)
+--
+--tile(2,3,3)
+--unroll(2,3,0)
+--unroll(0,5,0)
+--print_code()
diff --git a/test-chill/test-cases/examples/cuda-chill/tmv-shadow.c b/test-chill/test-cases/examples/cuda-chill/tmv-shadow.c
new file mode 100644
index 0000000..cb9ea8d
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/tmv-shadow.c
@@ -0,0 +1,9 @@
+#define N 1024
+
+void normalMV(float c[N][N], float a[N], float b[N]) {
+ int i, j;
+
+ for (i = 0; i < N; i++)
+ for (j = 0; j < N; j++)
+ a[i] = a[i] + c[i][j] * b[j];
+}
diff --git a/test-chill/test-cases/examples/cuda-chill/tmv-shadow.lua b/test-chill/test-cases/examples/cuda-chill/tmv-shadow.lua
new file mode 100644
index 0000000..196b939
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/tmv-shadow.lua
@@ -0,0 +1,50 @@
+init("tmv-shadow.c","normalMV",0)
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+ --copy_to_shared methods
+
+N=1024
+--N= 8209
+--N=129
+TI=64
+N=1024
+TI=32
+--Tile the i and j loops, with "ii" as the control loop for the "i" tile and
+--"k" as the control loop for the "j" tile; final order {"ii", "k", "i", "j"}
+tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+--print_code()
+--tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
+
+--print_code()
+--Normalize index will do a tile size of one over the loop level specified
+--by the input index. This is useful to get a zero lower bound and hard
+--upper bound on a loop instead of it being relative to previous loop
+--levels.
+--normalize_index("i")
+--print_code()
+
+--Cudaize now determines the grid dimensions from the loops themselves
+--(the upper bounds of the block and thread loops). It also renames the
+--given block and thread loops' indexes to the appropriate values from
+--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
+--size of the arrays to be copied in the CUDA scaffolding.
+cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}})
+
+--print_code()
+
+--Does a datacopy, tile, and add_sync to get a shared memory copy
+copy_to_shared("tx", "b", 1)
+--copy_to_texture("b")
+--print_code()
+
+copy_to_shared("tx", "c", -16)
+--copy_to_texture("c")
+--print_code()
+
+copy_to_registers("k", "a")
+print_code()
+--unroll(0,5,0)
+--unroll(0,4,0)
+--unroll(2,4,16)
+unroll_to_depth(1)
+--print_code()
diff --git a/test-chill/test-cases/examples/cuda-chill/tmv.c b/test-chill/test-cases/examples/cuda-chill/tmv.c
new file mode 100644
index 0000000..cb9ea8d
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/tmv.c
@@ -0,0 +1,9 @@
+#define N 1024
+
+void normalMV(float c[N][N], float a[N], float b[N]) {
+ int i, j;
+
+ for (i = 0; i < N; i++)
+ for (j = 0; j < N; j++)
+ a[i] = a[i] + c[i][j] * b[j];
+}
diff --git a/test-chill/test-cases/examples/cuda-chill/tmv.lua b/test-chill/test-cases/examples/cuda-chill/tmv.lua
new file mode 100644
index 0000000..5071108
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/tmv.lua
@@ -0,0 +1,50 @@
+init("tmv.c","normalMV",0)
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+ --copy_to_shared methods
+
+N=1024
+--N= 8209
+--N=129
+TI=64
+N=1024
+TI=32
+--Tile the i and j loops, with "ii" as the control loop for the "i" tile and
+--"k" as the control loop for the "j" tile; final order {"ii", "k", "i", "j"}
+tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+--print_code()
+--tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
+
+--print_code()
+--Normalize index will do a tile size of one over the loop level specified
+--by the input index. This is useful to get a zero lower bound and hard
+--upper bound on a loop instead of it being relative to previous loop
+--levels.
+--normalize_index("i")
+--print_code()
+
+--Cudaize now determines the grid dimensions from the loops themselves
+--(the upper bounds of the block and thread loops). It also renames the
+--given block and thread loops' indexes to the appropriate values from
+--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
+--size of the arrays to be copied in the CUDA scaffolding.
+cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}})
+
+--print_code()
+
+--Does a datacopy, tile, and add_sync to get a shared memory copy
+copy_to_shared("tx", "b", 1)
+--copy_to_texture("b")
+--print_code()
+
+copy_to_shared("tx", "c", -16)
+--copy_to_texture("c")
+--print_code()
+
+copy_to_registers("k", "a")
+print_code()
+--unroll(0,5,0)
+--unroll(0,4,0)
+--unroll(2,4,16)
+unroll_to_depth(1)
+--print_code()
diff --git a/test-chill/test-cases/unit/chill-basic-python.tclist b/test-chill/test-cases/unit/chill-basic-python.tclist
new file mode 100644
index 0000000..555fa25
--- /dev/null
+++ b/test-chill/test-cases/unit/chill-basic-python.tclist
@@ -0,0 +1,20 @@
+build-chill-testcase -v dev -i python
+
+chill-testcase test-cases/chill/test_distribute.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_fuse.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_known.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_original.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_peel.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_permute.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_print_code.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_print_dep.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_print_space.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_reverse.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_scale.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_shift.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_shift_to.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_skew.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_tile.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_unroll_extra.py test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_unroll.py test-cases/chill/mm.c
+
diff --git a/test-chill/test-cases/unit/chill-basic-script.tclist b/test-chill/test-cases/unit/chill-basic-script.tclist
new file mode 100644
index 0000000..8bc34dc
--- /dev/null
+++ b/test-chill/test-cases/unit/chill-basic-script.tclist
@@ -0,0 +1,20 @@
+build-chill-testcase -v dev
+
+chill-testcase test-cases/chill/test_distribute.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_fuse.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_known.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_original.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_peel.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_permute.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_print_code.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_print_dep.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_print_space.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_reverse.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_scale.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_shift.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_shift_to.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_skew.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_tile.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_unroll_extra.script test-cases/chill/mm.c
+chill-testcase test-cases/chill/test_unroll.script test-cases/chill/mm.c
+
diff --git a/test-chill/test-cases/unit/chill-basic.tclist b/test-chill/test-cases/unit/chill-basic.tclist
new file mode 100644
index 0000000..57cddbd
--- /dev/null
+++ b/test-chill/test-cases/unit/chill-basic.tclist
@@ -0,0 +1,4 @@
+
+-w $STAGING_DIR_WD -O $OMEGA_DEV_SRC -C $CHILL_DEV_SRC -b $STAGING_DIR_BIN batch test-cases/unit/chill-basic-script.tclist
+-w $STAGING_DIR_WD -O $OMEGA_DEV_SRC -C $CHILL_DEV_SRC -b $STAGING_DIR_BIN batch test-cases/unit/chill-basic-python.tclist
+