diff options
| author | Derick Huth <derickhuth@gmail.com> | 2016-01-18 15:43:52 -0700 | 
|---|---|---|
| committer | Derick Huth <derickhuth@gmail.com> | 2016-01-18 15:43:52 -0700 | 
| commit | 983749787ee0dc1beb1107873e8a13ebdaeba576 (patch) | |
| tree | e9bff337b4d5582b87ad2edc25baa4d3b0c163fa /test-chill/test-cases/examples | |
| parent | 0cff3f9a3c4ccd434900162ebef4bd814850f481 (diff) | |
| download | chill-983749787ee0dc1beb1107873e8a13ebdaeba576.tar.gz chill-983749787ee0dc1beb1107873e8a13ebdaeba576.tar.bz2 chill-983749787ee0dc1beb1107873e8a13ebdaeba576.zip | |
restore test suite
Diffstat (limited to 'test-chill/test-cases/examples')
56 files changed, 3552 insertions, 0 deletions
| diff --git a/test-chill/test-cases/examples/chill/gemm.c b/test-chill/test-cases/examples/chill/gemm.c new file mode 100644 index 0000000..2c90ea5 --- /dev/null +++ b/test-chill/test-cases/examples/chill/gemm.c @@ -0,0 +1,25 @@ + +#ifndef N +#define N 512 +#endif + +/* +<test name=gemm define="{'N':512}"> +procedure int gemm( +    in  float[N][N] a = matrix([,], lambda i,j: random(2,-2)), +    in  float[N][N] b = matrix([,], lambda i,j: random(2,-2)), +    out float[N][N] c = matrix([,], lambda i,j: 0)) +</test> +*/ +int gemm(float a[N][N], float b[N][N], float c[N][N]) { +	int i, j, k; +	int n = N; +	for (j = 0; j < n; j++) +		for (k = 0; k < n; k++) +			for (i = 0; i < n; i++) { +				c[i][j] = c[i][j] + a[i][k] * b[k][j]; +			} + +	return 0; +} + diff --git a/test-chill/test-cases/examples/chill/gemm.script b/test-chill/test-cases/examples/chill/gemm.script new file mode 100644 index 0000000..393f236 --- /dev/null +++ b/test-chill/test-cases/examples/chill/gemm.script @@ -0,0 +1,31 @@ +#matrix multiply large array size for intel machine +source: gemm.c +procedure: gemm +format: rose +loop: 0 + +TI = 128 +TJ = 8 +TK = 512 +UI = 2 +UJ = 2 + +permute([3,1,2]) +tile(0,2,TJ) +#print space +tile(0,2,TI) +#print space +tile(0,5,TK) +#print space + +datacopy(0,3,a,false,1) +#print space + +datacopy(0,4,b) +print +unroll(0,4,UI)#print space +print  +unroll(0,5,UJ) +#print space +print + diff --git a/test-chill/test-cases/examples/chill/gemv.c b/test-chill/test-cases/examples/chill/gemv.c new file mode 100644 index 0000000..39b083c --- /dev/null +++ b/test-chill/test-cases/examples/chill/gemv.c @@ -0,0 +1,21 @@ +#ifndef N +#define N 512 +#endif + +/* +<test name=gemv define="{'N':512}"> +procedure int gemv( +    out float[N]    a = matrix([],  lambda i:   random(2,-2)), +    in  float[N]    b = matrix([],  lambda i:   random(2,-2)), +    in  float[N][N] c = matrix([,], lambda i,j: random(2,-2))) +</test> +*/ +int gemv(float a[N], float b[N], float c[N][N]) { +    int i, 
j; + +    for (i = 1; i < N; i++) +        for (j = 1; j < N; j++) +            a[i] = a[i] + c[i][j] * b[j]; + +    return 0; +} diff --git a/test-chill/test-cases/examples/chill/gemv.script b/test-chill/test-cases/examples/chill/gemv.script new file mode 100644 index 0000000..73b3b58 --- /dev/null +++ b/test-chill/test-cases/examples/chill/gemv.script @@ -0,0 +1,9 @@ +source: gemv.c # matrix-vector multiply +procedure: gemv +format : rose +loop: 0 + + + +original() +print diff --git a/test-chill/test-cases/examples/chill/jacobi1.c b/test-chill/test-cases/examples/chill/jacobi1.c new file mode 100644 index 0000000..e7ff8f8 --- /dev/null +++ b/test-chill/test-cases/examples/chill/jacobi1.c @@ -0,0 +1,19 @@ + +#ifndef N +#define N 512 +#endif + +/* +<test name=jacobi define="{'N':512}"> +procedure int jacobi( +    in out float[N][N] a = matrix [i,j] random(2,-2)) +</test> +*/ +int jacobi(float a[N][N]) { +    int t, i; +	for (t = 2; t <= 100; t++) +		for (i = 2; i <= N - 1; i++) +			a[t][i] = a[t - 1][i - 1] + a[t - 1][i] + a[t - 1][i + 1]; + +	return 0; +} diff --git a/test-chill/test-cases/examples/chill/jacobi1.script b/test-chill/test-cases/examples/chill/jacobi1.script new file mode 100644 index 0000000..604f763 --- /dev/null +++ b/test-chill/test-cases/examples/chill/jacobi1.script @@ -0,0 +1,18 @@ +# +# tiling perfect jacobi loop nest with time step, use +# unimodular transformation first (only applicable to the +# perfect loop nest) to make tiling legal. 
+# + +source: jacobi1.c +procedure: jacobi +format : rose +loop: 0 + +print dep + +nonsingular([[1,0],[1,1]])  # unimodular matrix, determinant is one +tile(0,2,64) + +print dep +print diff --git a/test-chill/test-cases/examples/chill/jacobi2.c b/test-chill/test-cases/examples/chill/jacobi2.c new file mode 100644 index 0000000..b8d8d7b --- /dev/null +++ b/test-chill/test-cases/examples/chill/jacobi2.c @@ -0,0 +1,15 @@ +#define N 512 + +int main() { +	double a[N]; +	double b[N]; +	int t, i; +	for (t = 1; t <= 100; t++) { +		for (i = 2; i <= N - 1; i++) +			b[i] = (double) 0.25 * (a[i - 1] + a[i + 1]) + (double) 0.5 * a[i]; + +		for (i = 2; i <= N - 1; i++) +			a[i] = b[i]; +	} +	return 0; +} diff --git a/test-chill/test-cases/examples/chill/jacobi2.script b/test-chill/test-cases/examples/chill/jacobi2.script new file mode 100644 index 0000000..afe14c6 --- /dev/null +++ b/test-chill/test-cases/examples/chill/jacobi2.script @@ -0,0 +1,21 @@ +# +# tiling imperfect jacobi loop nest, more details in the paper +# "Automatic Tiling of Iterative Stencil Loops" by Zhiyuan Li and +# Yonghong Song, TOPLAS, 2004. 
+# + +source: jacobi2.c +procedure: main +format: rose +loop: 0 + +print dep + +original() +shift([1], 2, 1) +fuse([0,1], 2)  # optional +skew([0,1], 2, [2,1]) +tile(0, 2, 32, 1) + +print dep +print diff --git a/test-chill/test-cases/examples/chill/qr.c b/test-chill/test-cases/examples/chill/qr.c new file mode 100644 index 0000000..8d18b72 --- /dev/null +++ b/test-chill/test-cases/examples/chill/qr.c @@ -0,0 +1,44 @@ +#include <math.h> + +int main() { + +	int M, N; +	float** A; +	float *s; +	float *Rdiag; +	float *nrm; +	int i, j, k; +        float t; +	for (k = 0; k < N; k++) { +		nrm[k] = 0; + +		for (i = k; i < M; i++) +			nrm[k] = sqrt(nrm[k] * nrm[k] + A[i][k] * A[i][k]); +                //t = A[k][k]; + +		//if (t < 0) +		//	nrm[k] = -nrm[k]; +		for (i = k; i < M; i++) +			A[i][k] = A[i][k] / nrm[k]; + +		A[k][k] = A[k][k] + 1; + +		for (j = k + 1; j < N; j++) { +			s[j] = 0; //S6 + +			for (i = k; i < M; i++) +				s[j] = s[j] + A[i][k] * A[i][j]; //S7 + +			s[j] = -s[j] / A[k][k]; //S8 + +			for (i = k; i < M; i++) +				A[i][j] = A[i][j] + s[j] * A[i][k]; //S9 + +		} + +		Rdiag[k] = -nrm[k]; + +	} + +	return 0; +} diff --git a/test-chill/test-cases/examples/chill/qr.script b/test-chill/test-cases/examples/chill/qr.script new file mode 100644 index 0000000..6b4cd46 --- /dev/null +++ b/test-chill/test-cases/examples/chill/qr.script @@ -0,0 +1,13 @@ +# +# tiling imperfect jacobi loop nest, more details in the paper +# "Automatic Tiling of Iterative Stencil Loops" by Zhiyuan Li and +# Yonghong Song, TOPLAS, 2004. 
+# + +source: qr.c +procedure: main +format: rose +loop: 0 +original() +print  + diff --git a/test-chill/test-cases/examples/chill/scalar_test.c b/test-chill/test-cases/examples/chill/scalar_test.c new file mode 100644 index 0000000..733c882 --- /dev/null +++ b/test-chill/test-cases/examples/chill/scalar_test.c @@ -0,0 +1,16 @@ +int a[10][10]; +int main() { + +	int temp; +	int i, j; + +	for (i = 0; i < 10; i++) { +		for (j = 0; j < 10; j++) { +			a[i + 1][j - 1] = a[i][j]; +		} + +	} + +	return 0; + +} diff --git a/test-chill/test-cases/examples/chill/scalar_test.script b/test-chill/test-cases/examples/chill/scalar_test.script new file mode 100644 index 0000000..f5b0aa8 --- /dev/null +++ b/test-chill/test-cases/examples/chill/scalar_test.script @@ -0,0 +1,10 @@ +#Simple Scalar dependence check +source: scalar_test.c +procedure: main +format : rose +loop: 0 + +original() +permute([2,1]) +print dep +print space diff --git a/test-chill/test-cases/examples/chill/swim.c b/test-chill/test-cases/examples/chill/swim.c new file mode 100644 index 0000000..a21ef24 --- /dev/null +++ b/test-chill/test-cases/examples/chill/swim.c @@ -0,0 +1,159 @@ +#define M 100 +#define N 100 +#define  N3 10  + +int main() { + +	int DX; +	int DY; +	int FSDX; +	int FSDY; +	int TDT; +	int TDTS8; +	int TDTSDX; +	int TDTSDY; +	int t, i, j; +	double CU[M + 1][N + 1]; +	double CV[M + 1][N + 1]; +	double Z[M + 1][N + 1]; +	double H[M + 1][N + 1]; +	double P[M + 1][N + 1]; +	double U[M + 1][N + 1]; +	double V[M + 1][N + 1]; +	double UNEW[M + 1][N + 1]; +	double UOLD[M + 1][N + 1]; +	double PNEW[M + 1][N + 1]; +	double POLD[M + 1][N + 1]; +	double VNEW[M + 1][N + 1]; +	double VOLD[M + 1][N + 1]; +	double ALPHA; + +	for (t = 0; t < N3; t++) { + +		FSDX = 4 / DX; +		FSDY = 4 / DY; + +		for (i = 0; i < M; i++) { +			for (j = 0; j < N; j++) { +				CU[i + 1][j] = (double) 0.5 * (P[i + 1][j] + P[i][j]) +						* U[i + 1][j]; +				CV[i][j + 1] = (double) 0.5 * (P[i][j + 1] + P[i][j]) +						* V[i][j + 1]; +				
Z[i + 1][j + 1] = +						(FSDX * (V[i + 1][j + 1] - V[i][j + 1]) +								- FSDY * (U[i + 1][j + 1] - U[i + 1][j])) +								/ (P[i][j] + P[i + 1][j] + P[i + 1][j + 1] +										+ P[i][j + 1]); +				H[i][j] = P[i][j] +						+ (double) 0.25 +								* (U[i + 1][j] * U[i + 1][j] + U[i][j] * U[i][j] +										+ V[i][j + 1] * V[i][j + 1] +										+ V[i][j] * V[i][j]); +			} +		} + +		for (j = 0; j < N; j++) { +			// CU[0][j] = CU[M+1][j]; +			CU[0][j] = CU[M][j]; +			CV[M][j + 1] = CV[0][j + 1]; +			Z[0][j + 1] = Z[M][j + 1]; +			H[M][j] = H[0][j]; +		} + +		for (i = 0; i < M; i++) { +			CU[i + 1][N] = CU[i + 1][0]; +			CV[i][0] = CV[i][N]; +			Z[i + 1][0] = Z[i + 1][N]; +			H[i][N] = H[i][0]; +		} + +		CU[0][N] = CU[M][0]; +		CV[M][0] = CV[0][N]; +		Z[0][0] = Z[M][N]; +		H[M][N] = H[0][0]; + +		TDTS8 = TDT / 8; +		TDTSDX = TDT / DX; +		TDTSDY = TDT / DY; + +		for (i = 0; i < M; i++) { +			for (j = 0; j < N; j++) { +				UNEW[i + 1][j] = UOLD[i + 1][j] +						+ TDTS8 * (Z[i + 1][j + 1] + Z[i + 1][j]) +								* (CV[i + 1][j + 1] + CV[i][j + 1] + CV[i][j] +										+ CV[i + 1][j]) +						- TDTSDX * (H[i + 1][j] - H[i][j]); +				VNEW[i][j + 1] = VOLD[i][j + 1] +						- TDTS8 * (Z[i + 1][j + 1] + Z[i][j + 1]) +								* (CU[i + 1][j + 1] + CU[i][j + 1] + CU[i][j] +										+ CU[i + 1][j]) +						- TDTSDY * (H[i][j + 1] - H[i][j]); +				PNEW[i][j] = POLD[i][j] - TDTSDX * (CU[i + 1][j] - CU[i][j]) +						- TDTSDY * (CV[i][j + 1] - CV[i][j]); +			} +		} +		for (j = 0; j < N; j++) { +			UNEW[0][j] = UNEW[M][j]; +			VNEW[M][j + 1] = VNEW[0][j + 1]; +			PNEW[M][j] = PNEW[0][j]; +		} + +		for (i = 0; i < M; i++) { +			UNEW[i + 1][N] = UNEW[i + 1][0]; +			VNEW[i][0] = VNEW[i][N]; +			PNEW[i][N] = PNEW[i][0]; +		} + +		UNEW[0][N] = UNEW[M][0]; +		VNEW[M][0] = VNEW[0][N]; +		PNEW[M][N] = PNEW[0][0]; +		// time = time + DT; + +		for (i = 0; i < M; i++) { +			for (j = 0; j < N; j++) { +				UOLD[i][j] = U[i][j] +						+ ALPHA +								* (UNEW[i][j] - (double) 2 * U[i][j] +								
		+ UOLD[i][j]); +				VOLD[i][j] = V[i][j] +						+ ALPHA +								* (VNEW[i][j] - (double) 2 * V[i][j] +										+ VOLD[i][j]); +				POLD[i][j] = P[i][j] +						+ ALPHA +								* (PNEW[i][j] - (double) 2 * P[i][j] +										+ POLD[i][j]); +				U[i][j] = UNEW[i][j]; +				V[i][j] = VNEW[i][j]; +				P[i][j] = PNEW[i][j]; +			} +		} + +		for (j = 0; j < N; j++) { +			UOLD[M][j] = UOLD[0][j]; +			VOLD[M][j] = VOLD[0][j]; +			POLD[M][j] = POLD[0][j]; +			U[M][j] = U[0][j]; +			V[M][j] = V[0][j]; +			P[M][j] = P[0][j]; +		} + +		for (i = 0; i < M; i++) { +			UOLD[i][N] = UOLD[i][0]; +			VOLD[i][N] = VOLD[i][0]; +			POLD[i][N] = POLD[i][0]; +			U[i][N] = U[i][0]; +			V[i][N] = V[i][0]; +			P[i][N] = P[i][0]; +		} + +		UOLD[M][N] = UOLD[0][0]; +		VOLD[M][N] = VOLD[0][0]; +		POLD[M][N] = POLD[0][0]; +		U[M][N] = U[0][0]; +		V[M][N] = V[0][0]; +		P[M][N] = P[0][0]; + +	} +	return 0; +} diff --git a/test-chill/test-cases/examples/chill/swim.script b/test-chill/test-cases/examples/chill/swim.script new file mode 100644 index 0000000..79de9d9 --- /dev/null +++ b/test-chill/test-cases/examples/chill/swim.script @@ -0,0 +1,13 @@ +# +# tiling imperfect jacobi loop nest, more details in the paper +# "Automatic Tiling of Iterative Stencil Loops" by Zhiyuan Li and +# Yonghong Song, TOPLAS, 2004. 
+# + +source: swim.c +procedure: main +format: rose +loop: 0 +original() +#print space +print diff --git a/test-chill/test-cases/examples/chill/test_align.c b/test-chill/test-cases/examples/chill/test_align.c new file mode 100644 index 0000000..d1365ca --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_align.c @@ -0,0 +1,20 @@ +int main() { + +	int m, n; +	int a[10], b[10]; +	int i, j; +	for (i = 0; i < n; i++) { +		for (j = 0; j < n; j++) { +			a[i] = 1; +		} + +		for (j = 0; j < n; j++) { +			b[i] -= 1; +		} + +	} + +	return 0; + +} + diff --git a/test-chill/test-cases/examples/chill/test_align.script b/test-chill/test-cases/examples/chill/test_align.script new file mode 100644 index 0000000..c990e22 --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_align.script @@ -0,0 +1,12 @@ +#matrix multiply large array size for intel machine +source: test_align.c +procedure: main +format: rose +loop: 0 + +original() + + + +print + diff --git a/test-chill/test-cases/examples/chill/test_fusion.c b/test-chill/test-cases/examples/chill/test_fusion.c new file mode 100644 index 0000000..bd2c4f2 --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_fusion.c @@ -0,0 +1,13 @@ +int main() { + +	int a[10][10]; +	int i, j; +	for (i = 0; i < 10; i++) { +		for (j = 0; j < 10; j++) +			a[i][j] = a[i][j] + 5; +		for (j = 0; j < 10; j++) +			a[i][j + 1] = a[i][j + 1] + 5; + +	} + +} diff --git a/test-chill/test-cases/examples/chill/test_fusion.script b/test-chill/test-cases/examples/chill/test_fusion.script new file mode 100644 index 0000000..41f6cc0 --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_fusion.script @@ -0,0 +1,7 @@ +source: test_fusion.c +procedure: main +loop: 0 +original() +fuse([0,1],2) +print + diff --git a/test-chill/test-cases/examples/chill/test_lex_order.c b/test-chill/test-cases/examples/chill/test_lex_order.c new file mode 100644 index 0000000..1a3b26d --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_lex_order.c 
@@ -0,0 +1,31 @@ +int main() { + +	int m, n; +	int a[10]; +        int b[10];  +        int c[10]; +	int i, j; +	for (i = 0; i < n; i++) { +		for (j = 0; j < n; j++) { +			b[j] = a[j]; +		} + +            +                 +                for (j = 0; j < n; j++) { +			a[j+1] = 6; +		} + +                for (j = 0; j < n; j++) { +			c[j] = a[j]; +		} + + +        + +	} + +	return 0; + +} + diff --git a/test-chill/test-cases/examples/chill/test_lex_order.script b/test-chill/test-cases/examples/chill/test_lex_order.script new file mode 100644 index 0000000..2629e50 --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_lex_order.script @@ -0,0 +1,12 @@ +#matrix multiply large array size for intel machine +source: test_lex_order.c +procedure: main +format: rose +loop: 0 + +original() + + + +print + diff --git a/test-chill/test-cases/examples/chill/test_split.c b/test-chill/test-cases/examples/chill/test_split.c new file mode 100644 index 0000000..6ca62cc --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_split.c @@ -0,0 +1,14 @@ +int main() { + +	int a[10][10][10][10]; +	int i, j, k, l; + +	for (i = 0; i < 10; i++) +		for (j = 0; j < 10; j++) +			for (k = 0; k < 10; k++) +				for (l = 0; l < 10; l++) +					a[i][j][k + 1][l] = a[i][j][k][l]; +	//    a[i+1][j-1] = a[i][j]; + +	return 0; +} diff --git a/test-chill/test-cases/examples/chill/test_split.script b/test-chill/test-cases/examples/chill/test_split.script new file mode 100644 index 0000000..e1ebba9 --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_split.script @@ -0,0 +1,9 @@ +source: test_split.c +procedure: main +format: rose +loop: 0 +original() +N=10 +split(0,1, L3-L2-L4 <= 5)   +print + diff --git a/test-chill/test-cases/examples/chill/test_split2.c b/test-chill/test-cases/examples/chill/test_split2.c new file mode 100644 index 0000000..1ab8e43 --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_split2.c @@ -0,0 +1,14 @@ +int main() { + +	int a[10][10][10][10]; +	int 
i, j, k, l; + +	for (i = 0; i < 10; i++) +		for (j = 0; j < 10; j++) +			for (k = 0; k < 10; k++) +				for (l = 0; l < 10; l++) +					a[i][j][k + 1][l - 1] = a[i][j][k][l]; +	//    a[i+1][j-1] = a[i][j]; + +	return 0; +} diff --git a/test-chill/test-cases/examples/chill/test_split2.script b/test-chill/test-cases/examples/chill/test_split2.script new file mode 100644 index 0000000..bcaa2a0 --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_split2.script @@ -0,0 +1,9 @@ +source: test_split2.c +procedure: main +format: rose +loop: 0 +original() +N=10 +split(0,1, L4 <= 5)   +print + diff --git a/test-chill/test-cases/examples/chill/test_tile.c b/test-chill/test-cases/examples/chill/test_tile.c new file mode 100644 index 0000000..aeaaefc --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_tile.c @@ -0,0 +1,16 @@ +void func(int n) { + +	int i; +	int a[10]; + +	for (i = 0; i < n; i++) +		a[i] = 2; + +} + +int main() { + +	func(10); + +	return 0; +} diff --git a/test-chill/test-cases/examples/chill/test_tile.script b/test-chill/test-cases/examples/chill/test_tile.script new file mode 100644 index 0000000..d437145 --- /dev/null +++ b/test-chill/test-cases/examples/chill/test_tile.script @@ -0,0 +1,14 @@ +#matrix multiply large array size for intel machine +source: test_tile.c +procedure: func +format : rose +loop: 0 + +original() +#permute([3,2,1]) +tile(0,1,4) + + + +print + diff --git a/test-chill/test-cases/examples/chill/tile_violation.c b/test-chill/test-cases/examples/chill/tile_violation.c new file mode 100644 index 0000000..d719e52 --- /dev/null +++ b/test-chill/test-cases/examples/chill/tile_violation.c @@ -0,0 +1,12 @@ +int main() { + +	int i, j, k; +	int a[10][10][10]; + +	for (i = 0; i < 10; i++) +		for (j = 0; j < 10; j++) +			for (k = 0; k < 10; k++) +				a[i][j + 1][k - 1] = a[i][j][k]; + +	return 0; +} diff --git a/test-chill/test-cases/examples/chill/tile_violation.script b/test-chill/test-cases/examples/chill/tile_violation.script new 
file mode 100644 index 0000000..57d1423 --- /dev/null +++ b/test-chill/test-cases/examples/chill/tile_violation.script @@ -0,0 +1,14 @@ +#matrix multiply large array size for intel machine +source: tile_violation.c +procedure: main +format :rose +loop: 0 + +original() +#permute([3,2,1]) +tile(0,3,2,1) + + + +print + diff --git a/test-chill/test-cases/examples/chill/unroll.c b/test-chill/test-cases/examples/chill/unroll.c new file mode 100644 index 0000000..68f4633 --- /dev/null +++ b/test-chill/test-cases/examples/chill/unroll.c @@ -0,0 +1,31 @@ +#define N 14 +void foo(int n, float* x, float* y, float* z, float* f3, float* f1, float* w) { +	int dt; + +	int i, j; + +	for (i = 1; i <= 14; i++) +		x[i] = 1.0; + +	for (i = 1; i <= 14; i += 3) +		y[i] = 1.0; + +	for (i = N + 1; i <= N + 20; i += 3) +		z[i] = 1.0; + +	for (i = 0; i <= N; i++) { +		for (j = i; j <= i + N; j++) +			f3[i] = f3[i] + f1[j] * w[j - i]; +		f3[i] = f3[i] * dt; +	} + +	return 0; +} + +int main() { +	float x[N], y[N], z[N], f3[N], f1[N], w[N]; + +	foo(N, x, y, z, f3, f1, w); +	return 0; +} + diff --git a/test-chill/test-cases/examples/chill/unroll.script b/test-chill/test-cases/examples/chill/unroll.script new file mode 100644 index 0000000..e64acb6 --- /dev/null +++ b/test-chill/test-cases/examples/chill/unroll.script @@ -0,0 +1,35 @@ +# +# Test unroll-and-jam. The last loop adapted from the simple +# convolution example from p463 of "Optimizing Compilers for +# Modern Architectures", by Randy Allen and Ken Kennedy. 
+# + +source: unroll.c +procedure: foo +format: rose +# fully unroll a loop with known iteration count +loop: 0 +original() +unroll(0,1,3) +print +print space + + +# a strided loop +loop: 1 +original() +unroll(0,1,2) +print +print space + +# lower and upper bounds are not constant +loop: 2 +original() +unroll(0,1,20) +print + +# parallelogram iteration space +loop: 3 +original() +unroll(0,1,2) +print diff --git a/test-chill/test-cases/examples/chill/unroll_violation.c b/test-chill/test-cases/examples/chill/unroll_violation.c new file mode 100644 index 0000000..d719e52 --- /dev/null +++ b/test-chill/test-cases/examples/chill/unroll_violation.c @@ -0,0 +1,12 @@ +int main() { + +	int i, j, k; +	int a[10][10][10]; + +	for (i = 0; i < 10; i++) +		for (j = 0; j < 10; j++) +			for (k = 0; k < 10; k++) +				a[i][j + 1][k - 1] = a[i][j][k]; + +	return 0; +} diff --git a/test-chill/test-cases/examples/chill/unroll_violation.script b/test-chill/test-cases/examples/chill/unroll_violation.script new file mode 100644 index 0000000..019473d --- /dev/null +++ b/test-chill/test-cases/examples/chill/unroll_violation.script @@ -0,0 +1,14 @@ +#matrix multiply large array size for intel machine +source: unroll_violation.c +procedure: main +format: rose +loop: 0 + +original() +#permute([3,2,1]) +unroll(0,2,2) + + + +print + diff --git a/test-chill/test-cases/examples/cuda-chill/cp.c b/test-chill/test-cases/examples/cuda-chill/cp.c new file mode 100644 index 0000000..837d7a6 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/cp.c @@ -0,0 +1,29 @@ +#define N 1 + +#define VOLSIZEY 512 +#define VOLSIZEX 512 +#define VOLSIZEZ 1 +#define ATOMCOUNT 4000 +#define GRIDSPACING 0.1 +#define zDim 0 + +extern float sqrtf(float); + +void cenergy_cpu(float atoms[ATOMCOUNT*4],float *energy,float z) +{ +int i,j,n;float dx,dy,dz;  +    +    for (j=0; j<VOLSIZEY; j++) { +        for (i=0; i<VOLSIZEX; i++) { +            	  for (n=0;n<ATOMCOUNT;n+=4) { +				dx = (GRIDSPACING * i) - atoms[n]; +			
	dy = (GRIDSPACING * j) - atoms[n+1]; +				dz = z - atoms[n+2]; +        		        energy[(j*VOLSIZEX + i)+VOLSIZEX*VOLSIZEY*zDim] += atoms[n+3]/sqrtf( (dx*dx) + (dy*dy)+ (dz*dz) ) ; +            } +               + +        } +    } +} + diff --git a/test-chill/test-cases/examples/cuda-chill/cp.lua b/test-chill/test-cases/examples/cuda-chill/cp.lua new file mode 100644 index 0000000..1ef2264 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/cp.lua @@ -0,0 +1,46 @@ +--CUBLAS 2 MM Multiply + +--This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you +--call init() and use global variables to specify procedure and loop + +--Second parameter is procedure # and third is loop # +init("cp.c", "cenergy_cpu", 0)  + +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, +                     --copy_to_shared methods +V=512 +N=4000 +N=1 + +Tj=32 +Ti=16 +Tii=16 +Tjj=16 + +--normalize_index("j") +--normalize_index("i") +print_code() +normalize_index("n") +-- TILE COMMANDS ZEROOOOOOOOOOO:3 +--permute(0,{"i","j","n"}) +--tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","n"})--CU=-1 +tile_by_index({"j","i"},{Tj,Ti},{l1_control="jj",l2_control="ii"},{"jj","ii","j","i","n"})--CU=-1 +--tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1 + +--tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3 +--tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","i","iii","j","jjj","n"})--CU=3 +--tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"}) +--tile_by_index({"i"}, {Tii}, {l1_control="iii",l1_tile="i"}, {"ii", "jj", "iii","i","j","n"}) +print_code() +cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"jj","ii"}, thread={"j","i"}})--CU=3 +--cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"ii","jj"}, thread={"i","j"}})--CU=3 +print_code() 
+copy_to_shared("tx","atoms",-16) +copy_to_registers("tx","energy") +--copy_to_texture("atoms") +--unroll_to_depth(1) +--unroll(0,9,0) +--unroll(0,5,0) + +--unroll(0,8,256) +print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/cudaize.lua b/test-chill/test-cases/examples/cuda-chill/cudaize.lua new file mode 100644 index 0000000..7359cca --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/cudaize.lua @@ -0,0 +1,1004 @@ + +-- THIS IS CUDAIZE.LUA + +function table.contains_key(table, key) +   for k in pairs(table) do +      if k == key then +         return true +      end +   end +   return false +end + +function valid_indices(stmt, indices) +   --print( "valid_indices() lua calling C cur_indices") +   --io.flush() +   cur = cur_indices(stmt)  +   --print("Cur indices "..list_to_string(cur)) +   for idx in pairs(indices) do +      if not table.contains_key(cur,idx) then +         return false +      end +   end +   return true +end + +function next_clean_level(cur_idxs,level) +   --print("next_clean_level( ..., "..level.." )") +   --print(string.format("indices_at_each_level %s ",list_to_string(cur_idxs) )) +    +   --print("loop to "..#cur_idxs) +   for i=level+1,#cur_idxs do +      --print("Checking level "..i.." = '"..cur_idxs[i].."'") +      if (# cur_idxs[i] > 0) then +         --print("Good enough"..(# cur_idxs[i])) +         --print("returning "..i) +         return i +      end +   end +   return -1 --sentinal that there were no non-dummy indices left +end + +function build_order(final_order, tile_idx_names, ctrl_idx_names, tile_idx_map, cur_level) +   order = {} +   --print("\nbuild_order()") +   --print("build_order(): final_order = ( "..list_to_string(final_order).." )") +   --print("build_order(): ctrl_idx_names = ("..list_to_string(ctrl_idx_names).." 
)") +   --print("cur_level "..cur_level.."") +   --io.flush() +    +   for i,k in ipairs(final_order) do +      skip = false +      cur = final_order[i] +      --print("\ncur "..cur.." = final_order["..i.."] = "..final_order[i].."  ") +      --control loops below our current level should not be in the current order +      for j=cur_level+2,# ctrl_idx_names do +         --print("j "..j.." final_order["..i.."] = "..final_order[i].."  ") +         if ctrl_idx_names[j] == final_order[i] then +            skip = true +            --print("SKIP "..final_order[i].."  ") +            --io.flush() +         end +      end +      --possibly substitute tile indices ifn necessar +      if table.contains_key(tile_idx_map,final_order[i]) then +         approved_sub = false +         sub_string = tile_idx_map[final_order[i]] +         for j=cur_level+2,# tile_idx_names do +            if tile_idx_names[j] == sub_string then +               approved_sub = true +            end +         end +         if approved_sub then +            cur = sub_string +         end +      end +      if not skip then +         table.insert(order,cur) +      end +   end +   return order +end + +function list_to_string(str_list) +   --Helpful debug output +   l = "" +   for i,str in ipairs(str_list) do +      if i > 1 then +         l = l .. ", " .. str +      else +         l = str +      end +   end +   return l +end + + +function find_cur_level(stmt,idx) +   --Search cur_indices for a idx at stmt +   cur = cur_indices(stmt) +   --print(string.format("find_cur_level(stmt %d, idx %s)  Cur indices %s", stmt, idx, list_to_string(cur))) +   for i,cidx in ipairs(cur) do +      if cidx == idx then +         --print(string.format("found it at index %d", i)) +         return i +      end +   end +   error("Unable to find "..idx.." 
in current list of indices") +end + + +function chk_cur_level(stmt,idx) +   --Search cur_indices for a idx at stmt +   cur = cur_indices(stmt) +   for i,cidx in ipairs(cur) do +      if cidx == idx then +         return i +      end +   end +   return -1 +end + + +function find_offset(cur_order, tile, control) +   --print("Looking for tile '"..tile.."' and control '"..control.."' in ( "..list_to_string(cur_order)..", )") +   idx1 = -1 +   idx2 = -1 +   for i,cur in ipairs(cur_order) do +      if(cur == tile) then +         idx1 = i +      end +      if(cur == control) then +         idx2 = i +      end +   end +   if(idx1 < 0) then +      error("Unable to find tile " .. tile .. " in current list of indices") +   end +   if(idx2 < 0) then +      error("Unable to find control " .. control .. " in current list of indices") +   end +   --print("found at level " .. idx2 .. " and " .. idx1) +   if(idx2 < idx1) then +      return idx2-idx1+1 +   else +      return idx2-idx1 +   end +end + +function tile_by_index(tile_indices, sizes, index_names, final_order, tile_method) +   --print "STARTING TILE BY INDEX" +   --io.flush() +   stmt = 0 --assume stmt 0 +   cur = cur_indices(stmt) +   --print("Cur indices "..list_to_string(cur)) +   if not valid_indices(stmt,tile_indices) then +      error('One of the indices in the first parameter were not '.. 
+            'found in the current set of indices.') +   end +   if not tile_method then tile_method = counted end +   tile_idx_names = {} +   for i,s in ipairs(tile_indices) do tile_idx_names[i]=s end --shallow copy +   --print("tile_index_names: ['"..list_to_string(tile_indices).."']") +    +   --print("index_names:  ")  +   --for k,v in pairs(index_names) do print(k,v) end +    +   --io.flush() +    +   ctrl_idx_names = {} +   tile_idx_map = {} +   for k,v in pairs(index_names) do +      valid = false +      if(string.sub(k,1,1) == "l") then +         if string.sub(k,-8) == "_control" then +            i = tonumber(string.sub(k,2,-9)) +            if i and i >= 1 and i <= (# tile_indices) then +               ctrl_idx_names[i] = v +               --print(string.format("Handling control %s for loop level %d",v,i)) +               --print("control "..k.."   name  "..v.." ") +               valid = true +            end +         elseif string.sub(k,-5) == "_tile" then +            i = tonumber(string.sub(k,2,-6)) +            if i and i >= 1 and i <= (# tile_indices) then +               --print(string.format("tile %s -> %s",tile_indices[i], v)) +               tile_idx_names[i] = v +               tile_idx_map[v] = tile_indices[i] +               --print(string.format("tile %s -> %s",tile_indices[i], v)) +               valid = true +            end +         end +      end +      if not valid then error(string.format("%s is not a proper key for specifying ".. 
+                                            "tile or control loop indices\n", k)) end +   end +    +   --filter out control indices (and do name substitution of unprocessed tile indices) for a given level +   cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, -1) +   permute(stmt, cur_order) +    +   for i,cur_idx in ipairs(tile_indices) do +      --print(string.format("i %d  cur_idx %s calling build order ********", i-1, cur_idx)) +      cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1) +      --Find a offset between tile loop and control loop +      -- 0   = control loop one level above tile loop +      -- -1  = control loop two levels above tile loop +      -- > 0 = tile loop above control loop +      -- In the last case, we do two extra tile commands to get the control +      -- above the tile and then rely on the final permute to handle the +      -- rest +      level = find_cur_level(stmt,cur_idx) +      offset = find_offset(cur_order, tile_idx_names[i], ctrl_idx_names[i]) +      --print(string.format("offset %d", offset)) +       +      if (offset <= 0) then +         --print(string.format("[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %s)",stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method))  +         tile(stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method) +      else +         --print(string.format("2tile(%d, %d, %d, %d, %s, %s, %s)", stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method)) +         tile(stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method);--regular level +         --flip tile and control loop +         --print(string.format("3tile(%d, %d, %d)",stmt, level+1, level+1)) +         tile(stmt, level+1, level+1); +         --print(string.format("4tile(%d, %d, %d)",stmt, level+1, level)) +         tile(stmt, level+1, level); +         
--print(string.format("\n[offset>0]tile(%d, %d, %d, %d,%s,%s,%s)",stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method))
	 --print_code()

      end

      --Do permutation based on cur_order
      --print "permute based on build order calling build_order()"
      --print "cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)"
      cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
      --print "permute(stmt, cur_order);"
      permute(stmt, cur_order);
      --print "\nafter permute(), code is:"
      --print_code()
   end
   --print "ENDING TILE BY INDEX"
   --print_code()
end

-- Normalize the loop of statement 0 that carries index name `index` by
-- tiling its level onto itself: tile(stmt, level, level).
--   index: name of a loop index; the level lookup is delegated to
--          find_cur_level (NOTE(review): presumably raises when the name is
--          not found -- confirm against its definition).
function normalize_index(index)
   local stmt = 0 --assume stmt 0
   local level = find_cur_level(stmt, index)
   tile(stmt, level, level)
end

-- Return true when `idx` is one of the current index names of statement
-- `stmt`, false otherwise.
-- The scan starts at 1 (Lua sequences are 1-based); starting at 0 made a nil
-- `idx` compare equal to the nil slot cur[0] and be reported as present.
function is_in_indices(stmt, idx)
   -- `cur` is assigned as a global, as it always was in this file.
   -- NOTE(review): confirm nobody reads it before making it local.
   cur = cur_indices(stmt)
   for i = 1, #cur do
      if cur[i] == idx then
         return true
      end
   end
   return false
end


-- Prepare the loop nest of statement 0 and then privatize `array_name` at
-- loop `start_loop` through datacopy_privatized().
--   start_loop: index name of the loop at which the copy is placed
--   array_name: name of the array handed to datacopy_privatized
-- Steps, in order:
--   1. make "tx" and "ty" consecutive and normalize the loops after them;
--   2. collect every block/thread index at or below start_loop's level into
--      the "hold constant" list;
--   3. call datacopy_privatized(stmt, start_loop, array_name, hold_constant).
-- The statement order of step 1 is preserved exactly: each tile() call changes
-- the levels that the following find_cur_level() calls observe.
function copy_to_registers(start_loop, array_name)

   io.flush()

   local stmt = 0 --assume stmt 0

   -- [Malik] first we make sure that tx and ty are consecutive loops in the 2D
   -- thread setup, otherwise all levels for subsequent operations are messed
   -- up. Start logic.
   -- `cur` stays global (see is_in_indices, which assigns the same global).
   cur = cur_indices(stmt)
   local table_Size = #cur

   local level_tx = -1
   local level_ty = -1
   if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end
   if is_in_indices(stmt,"ty") then level_ty = find_cur_level(stmt,"ty") end

   -- Remember the index that follows "ty" (or "ty" itself when nothing usable
   -- follows) so the thread level can be found again after re-tiling.
   local ty_lookup_idx
   if cur[level_ty+1] ~= nil and cur[level_ty+1] ~= "" then
      ty_lookup_idx = cur[level_ty+1]
   else
      ty_lookup_idx = cur[level_ty]  -- may be nil, e.g. when level_ty is still -1
   end

   if level_ty > 0 then
      tile(stmt,level_ty,level_tx+1)
   end

   cur = cur_indices(stmt)
   table_Size = #cur

   if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end
   if ty_lookup_idx then
      if is_in_indices(stmt,ty_lookup_idx) then level_ty = find_cur_level(stmt,ty_lookup_idx) end
   end

   local ty_lookup = 1
   local idx_flag = -1
   -- find the level of the next valid index after ty+1
   if level_ty > 0 then
      for num = level_ty+ty_lookup,table_Size do
         if cur[num] ~= "" then
            idx_flag = find_cur_level(stmt,cur[num])
            break
         end
      end
   end

   -- Count the named indices from idx_flag+1 onwards (the count starts at 1).
   local how_many_levels = 1
   local startat = idx_flag + 1
   if startat == 0 then startat = 1 end  -- avoid attempt to examine an illegal array offset

   for ch_lev = startat,table_Size do
      if cur[ch_lev] ~= nil and cur[ch_lev] ~= "" then
         how_many_levels = how_many_levels+1
      end
   end

   -- change this all to reflect the real logic which is to normalize all loops
   -- inside the thread loops.
   if how_many_levels < 2 then
      while idx_flag >= 0 do
         -- Loop bounds are evaluated once, before the body updates
         -- level_ty / table_Size.
         for num = level_ty+ty_lookup,table_Size do
            if cur[num] ~= "" then
               local idx = cur[num]

               -- normalize the loop, then re-tile it to the tx level
               tile(stmt,find_cur_level(stmt,idx),find_cur_level(stmt,idx))
               tile(stmt,find_cur_level(stmt,idx),level_tx)

               cur = cur_indices(stmt)
               table_Size = #cur
               level_tx = find_cur_level(stmt,"tx")
               level_ty = find_cur_level(stmt,ty_lookup_idx)
               idx_flag = -1

               -- find the level of the next valid index after ty+1
               -- (separate loop variable: the original reused `num` here)
               for nxt = level_ty+ty_lookup,table_Size do
                  if cur[nxt] ~= nil and cur[nxt] ~= "" then
                     idx_flag = find_cur_level(stmt,cur[nxt])
                     break
                  end
               end
            end
         end
      end
   end
   -- [Malik] end logic

   local start_level = find_cur_level(stmt, start_loop)
   --We should hold constant any block or tile loop
   local block_idxs = block_indices()
   local thread_idxs = thread_indices()

   local hold_constant = {}
   for _,idx in ipairs(block_idxs) do
      if find_cur_level(stmt,idx) >= start_level then
         table.insert(hold_constant, idx)
      end
   end

   for _,idx in ipairs(thread_idxs) do
      if find_cur_level(stmt,idx) >= start_level then
         table.insert(hold_constant, idx)
      end
   end

   local old_num_stmts = num_statements()
   datacopy_privatized(stmt, start_loop, array_name, hold_constant)

   -- [Malik] normalize the copy loops created.
   -- The per-index tile() call of this pass is disabled; only the index list
   -- of statement number old_num_stmts is still fetched.
   cur = cur_indices(old_num_stmts)


--[[ 
   is this commented out? 
why yes, yes it is   block comment 
   if( (old_num_stmts+1) <= new_num_stmts) then
      cur = cur_indices(old_num_stmts+1)
      --print("Cur indices+1 "..list_to_string(cur))
      for cidx,i in ipairs(cur) do
         if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
            tile(old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
	    --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
         end
      end
   end
--]]


   --Unroll to the last thread level
   --for stmt=old_num_stmts,new_num_stmts-1 do
   -- level = find_cur_level(stmt,thread_idxs[#thread_idxs])--get last thread level
   --if level < #cur_indices(stmt) then
   -- unroll(stmt,level+1,0)
   --print(string.format("\n[Unroll]unroll(%d, %d, 0)",stmt, level+1)) 
   ----print_code()
   --end
   --end
   io.flush()
   --print("****** ending copy to registers\n\n")
   --io.flush()
end

-- Return true when `name` occurs in the index-name list `names`.
local function has_index_name(names, name)
   for i = 1, #names do
      if names[i] == name then
         return true
      end
   end
   return false
end

-- When index `inner` is currently at a deeper level than index `outer` in
-- statement `stmt`, issue tile(stmt, level(inner), level(outer)).
local function hoist_above(stmt, inner, outer)
   if find_cur_level(stmt, inner) > find_cur_level(stmt, outer) then
      tile(stmt, find_cur_level(stmt, inner), find_cur_level(stmt, outer))
   end
end

-- Copy `array_name` via datacopy() at loop `start_loop` of statement 0, add a
-- sync, and then re-tile the copy statement(s) datacopy created so that their
-- loops are named after the thread indices "tx"/"ty".
--   start_loop: index name of the loop at which the copy is placed
--   array_name: name of the array handed to datacopy
--   alignment:  forwarded unchanged to datacopy
-- The copy loops are requested with index names "tmp1" and "tmp2".  A
-- statement that has a "tmp2" loop takes the two-loop path; otherwise the
-- single-loop path ("MV & TMV" in the original notes) is used.
-- Fixes relative to the previous version:
--   * the "put ty before tmp_tx / tmp_ty" searches walked the unrelated global
--     `cur` instead of the freshly fetched `cur_idxs`;
--   * the second two-loop branch read a `cur_idxs` it never assigned;
--   * those searches started at index 0.
-- An older, disabled implementation of the two-loop branch was removed; it is
-- available from version control.
function copy_to_shared(start_loop, array_name, alignment)
   local stmt = 0 --assume stmt 0
   -- kept as a global assignment, as before; nothing below reads it
   cur = cur_indices(stmt)

   local start_level = find_cur_level(stmt, start_loop)
   local old_num_stmts = num_statements()

   --Now, we give it indices for up to two dimensions for copy loop
   local copy_loop_idxs = {"tmp1","tmp2"}
   datacopy(stmt, start_level, array_name, copy_loop_idxs, false, 0, 1, alignment, true)

   add_sync(stmt,start_loop)
   local new_num_stmts = num_statements()

   --This is fairly CUBLAS2 specific, not sure how well it generalizes,
   --but for a 2D copy, what we want to do is "normalize" the first loop
   --"tmp1" then get its hard upper bound. We then want to tile it to
   --make the control loop of that tile "ty". We then tile "tmp2" with a
   --size of 1 and make it "tx".
   for stmt=old_num_stmts,new_num_stmts-1 do
      -- pcall: a statement without a "tmp2" loop takes the else branch
      local was_no_error, level = pcall(find_cur_level, stmt, "tmp2")

      if was_no_error then
         tile(stmt, level, level)

         local lower, upper = hard_loop_bounds(stmt, level)
         upper = upper + 1

         local tx, ty = thread_dims()

         level = find_cur_level(stmt,"tmp1")

         if tx == upper and ty == 1 then
            --Don't need an extra tile level, just move this loop up
            local second_level = find_cur_level(stmt,"tmp2")
            tile(stmt, second_level, 1, level, "tx", "tx", counted)
         else
            local first_level = find_cur_level(stmt,"tmp1")
            local second_level = find_cur_level(stmt,"tmp2")
            -- NOTE(review): several hard_loop_bounds results below are unused;
            -- the calls are kept in case the query itself can fail -- confirm.
            hard_loop_bounds(stmt,second_level)

            -- Move the fastest changing dimension loop to the outermost,
            -- identified by "tmp2" and to be identified as tx.
            tile(stmt,second_level,1,first_level,"tx","tx",counted)

            first_level = find_cur_level(stmt,"tmp1")
            hard_loop_bounds(stmt,first_level)
            local tx_level = find_cur_level(stmt,"tx")
            local _, upper_tx = hard_loop_bounds(stmt,tx_level)

            if math.ceil(upper_tx/tx) > 1 then
               tile(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)

               local peat = find_cur_level(stmt,"tx")
               tile(stmt, peat, peat)

               hoist_above(stmt, "tx", "tmp_tx")
            end

            -- Handle the other slower changing dimension, the original
            -- outermost loop, now identified by "tmp1", to be identified as "ty".
            tile(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))

            local ty_level = find_cur_level(stmt,"tmp1")
            local _, upper_ty = hard_loop_bounds(stmt,ty_level)

            tx_level = find_cur_level(stmt,"tx")
            hard_loop_bounds(stmt,tx_level)

            if math.ceil(upper_ty/ty) > 1 then
               tile(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
               tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))

               -- `cur_idxs` stays global, as before.
               cur_idxs = cur_indices(stmt)

               -- Putting ty before any tmp_tx
               if has_index_name(cur_idxs, "tmp_tx") then
                  hoist_above(stmt, "ty", "tmp_ty")
               end

               -- Now Putting ty before any tmp_ty
               if has_index_name(cur_idxs, "tmp_ty") then
                  hoist_above(stmt, "ty", "tmp_ty")
               end
            else
               tile(stmt, ty_level,1, ty_level, "ty", "ty", counted)
               tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)

               -- Fetch this statement's indices here; the old code tested a
               -- possibly stale global left behind by an earlier call.
               cur_idxs = cur_indices(stmt)
               if has_index_name(cur_idxs, "tmp_ty") then
                  hoist_above(stmt, "ty", "tmp_ty")
               end
            end
         end

      else
         --copy to shared only created one level, not two, so we use a different approach (MV & TMV)
         level = find_cur_level(stmt,"tmp1")
         tile(stmt, level, level)

         local tx, ty = thread_dims()
         local lower, upper = hard_loop_bounds(stmt, level)
         upper = upper+1 --upper bound given as <=, compare to dimensions tx which is <
         if upper == tx then
            rename_index(stmt, "tmp1", "tx")
         else
            --TODO: Don't know, maybe do some tileing etc
            tile(stmt, level,tx,level, "tx", "tmp_tx", counted)
            tile(stmt, level+1,1,level+1,"tx", "tx",counted)
            tile(stmt,level+1,level)

            if(ty > 1) then
               lower,upper = hard_loop_bounds(stmt, level+1)
               --print("NOW FOR Y: upper "..upper.." ty "..ty.." stmt: "..stmt.." level: "..(level+1).." 
bound:"..math.ceil(upper/ty))
               tile(stmt, level+1,math.ceil(upper/ty),level+1, "tmp_ty", "ty", counted)
               --tile(stmt, level+2,math.ceil(upper/ty),level+2, "tmp_ty", "ty", counted)
            end
            --print_code()
            --rename_index(stmt, "tmp1", "tx")
            --print("Warning: Need to implement some logic here to tile the single level shared copy loop to match thread dimensions")
         end
      end
      --Always add sync
      add_sync(stmt,start_loop)
      
   end
   --print("ending copy to shared\n")
   --print_code()
end

-- True for an index name that is set and is not one of the block/thread
-- indices "bx", "by", "tx", "ty".
local function is_plain_index(name)
   return name ~= nil and name ~= ""
      and name ~= "bx" and name ~= "by"
      and name ~= "tx" and name ~= "ty"
end

-- Unroll up to `max_depth` loop levels in every statement that contains the
-- last thread index, repeating whole passes until a pass no longer changes
-- the number of statements.
--   max_depth: maximum number of levels unrolled per statement and pass.
--              0 now unrolls nothing (it used to call unroll with a nil
--              level); a negative value never matches the counter, so every
--              level that next_clean_level reports is unrolled.
-- Levels come from next_clean_level(), starting at the level of the last
-- thread index (the "guard" index), and are unrolled in reverse order of
-- discovery with unroll(stmt, level, 0).
function unroll_to_depth(max_depth)
   -- kept as a global assignment, as before; nothing below reads it
   cur = cur_indices(0)
   local thread_idxs = thread_indices()
   local guard_idx = thread_idxs[#thread_idxs]

   ---- HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS
   -- NOTE(review): the result is not used anywhere in this function.  It is
   -- still published in the globals common_loops (0-based) and comm_loops_cnt
   -- exactly as before, in case something else reads them -- confirm, then
   -- consider deleting this pass.
   common_loops = {}
   comm_loops_cnt = 0
   local num_stmts = num_statements()

   for stmt=0,num_stmts-1 do
      -- `cur_idxs` stays global, as before.
      cur_idxs = cur_indices(stmt)

      if chk_cur_level(stmt,"tx") > 0 then
         -- only the loops before "tx" are compared; computed once per statement
         local last_outer = find_cur_level(stmt,"tx") - 1
         for ii=1,last_outer do
            if is_plain_index(cur_idxs[ii]) then
               for stmt1=stmt+1,num_stmts-1 do
                  local cur_idxs1 = cur_indices(stmt1)
                  for iii=1,last_outer do
                     if is_plain_index(cur_idxs1[iii]) and cur_idxs[ii] == cur_idxs1[iii] then
                        common_loops[comm_loops_cnt] = cur_idxs[ii]
                        comm_loops_cnt = comm_loops_cnt + 1
                     end
                  end
               end
            end
         end
      end
   end

   local old_num_stmts, new_num_stmts
   repeat
      old_num_stmts = num_statements()

      for stmt=0,old_num_stmts-1 do
         cur_idxs = cur_indices(stmt)
         if #cur_idxs > 0 then
            local guard_level = -1
            if chk_cur_level(stmt,guard_idx) > 0 then
               guard_level = find_cur_level(stmt,guard_idx)
            end

            if guard_level > -1 then
               -- Collect the levels first (1-based list), then unroll, so the
               -- lookups all see the index list fetched above.
               local levels = {}
               local level = next_clean_level(cur_idxs,guard_level)
               while level >= 0 do
                  if #levels == max_depth then break end
                  levels[#levels+1] = level
                  level = next_clean_level(cur_idxs,level+1)
               end

               for i = #levels,1,-1 do
                  unroll(stmt,levels[i],0)
               end
            end
         end
      end
      new_num_stmts = num_statements()

   until old_num_stmts == new_num_stmts

end


diff --git a/test-chill/test-cases/examples/cuda-chill/cudaize.py b/test-chill/test-cases/examples/cuda-chill/cudaize.py new file mode 100755 index 0000000..ffef009 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/cudaize.py @@ -0,0 +1,1047 @@ +#! /usr/bin/python + +# THIS IS CUDAIZE.PY + +import chill +import sys +import math  + +strided = 0 +counted = 1 + +def print_code(): +    chill.print_code() +    print "" +    sys.stdout.flush() + +     +def table_contains_key( table, key ):  # use a dict for the 'table'? +    return table.has_key(key) # (key in table)? 
def print_array(arr):
    """Print the items of ``arr`` on one line as ``a, b, c`` and flush stdout.

    Mimics the output format of the Lua version of this script.  An empty
    sequence prints an empty line (the previous implementation raised
    IndexError on ``arr[-1]``).
    """
    sys.stdout.write(", ".join("%s" % (item,) for item in arr) + "\n")
    sys.stdout.flush()


def valid_indices(statement, indices):
    """Return True when every name in ``indices`` is among the current
    indices that ``chill.cur_indices(statement)`` reports."""
    cur = chill.cur_indices(statement)  # calls C
    return all(index in cur for index in indices)


def next_clean_level(indices_at_each_level, level):
    """Return the first 1-based level greater than ``level`` whose index name
    is non-empty, or -1 when there is none.

    Levels are 1-based to match the Lua implementation's return values.
    """
    # NOTE(review): this unconditional dump looks like leftover debug output
    # (every neighbouring print was commented out); kept because callers or
    # expected-output files may depend on it -- confirm before removing.
    print_array(indices_at_each_level)

    numlevels = len(indices_at_each_level)
    for lua_level in range(level + 1, numlevels + 1):
        sys.stdout.flush()
        # Lua indices start at 1, Python's at 0.
        if len(indices_at_each_level[lua_level - 1]) > 0:
            return lua_level
    return -1  # no non-dummy indices


def build_order(final_order, tile_index_names, control_index_names, tile_index_map, current_level):
    """Filter and rename ``final_order`` for the given tiling level.

    For each name in ``final_order``:
      * it is dropped when it equals ``control_index_names[j]`` for some
        ``j > current_level`` (control loops below the current level should
        not be in the current order);
      * otherwise, when ``tile_index_map`` maps it to a name that equals
        ``tile_index_names[j]`` for some ``j > current_level``, the mapped
        name is emitted in its place.

    ``control_index_names`` is indexed with the integers
    ``current_level+1 .. len(control_index_names)-1`` (a dict keyed by those
    integers or a list both work).  Returns a new list.
    """
    first = current_level + 1
    order = []

    for name in final_order:
        # The skip test uses the original name, before any substitution.
        skip = False
        for j in range(first, len(control_index_names)):
            if control_index_names[j] == name:
                skip = True

        # possibly substitute tile indices if necessary
        cur = name
        if name in tile_index_map:
            sub_string = tile_index_map[name]
            approved_sub = False
            for j in range(first, len(tile_index_names)):
                if tile_index_names[j] == sub_string:
                    approved_sub = True
            if approved_sub:
                cur = sub_string

        if not skip:
            order.append(cur)
    return order


def find_cur_level(stmt, idx):
    """Return the 1-based position of ``idx`` in ``chill.cur_indices(stmt)``,
    or -1 when it is not present.  (1-based to match the Lua code.)"""
    cur = chill.cur_indices(stmt)
    for level, name in enumerate(cur, 1):
        if name == idx:
            return level
    return -1  # special meaning "it's not there"


def chk_cur_level(stmt, idx):
    """Same lookup as find_cur_level(): 1-based level of ``idx`` or -1."""
    return find_cur_level(stmt, idx)


def find_offset(cur_order, tile, control):
    """Return the signed distance between ``control`` and ``tile`` in
    ``cur_order``.

    With 1-based positions ``p_tile`` and ``p_control`` the result is
    ``p_control - p_tile``, plus 1 when the control comes before the tile.
    Prints a message and exits the process with status -1 when either name
    is missing from ``cur_order``.
    """
    if tile not in cur_order:
        print("find_offset(), unable to find tile %s in current list of indices" % tile)
        sys.exit(-1)
    if control not in cur_order:
        print("find_offset(), unable to find control %s in current list of indices" % control)
        sys.exit(-1)

    idx1 = 1 + cur_order.index(tile)     # lua indexes from 1!
    idx2 = 1 + cur_order.index(control)  # lua indexes from 1!

    # this appears horrible
    if idx2 < idx1:
        return idx2 - idx1 + 1  # bad ordering
    return idx2 - idx1


def tile_by_index(tile_indices, sizes, index_names, final_order, tile_method):
    """Tile statement 0 over ``tile_indices`` and permute towards ``final_order``.

    tile_indices -- index names to tile; all must be current indices of
                    statement 0, otherwise a message is printed and the
                    process exits with status -1.
    sizes        -- tile size for each entry of ``tile_indices``.
    index_names  -- iterable of (key, name) pairs; keys of the form
                    ``l<N>_control`` / ``l<N>_tile`` name the control / tile
                    loop of the N-th (1-based) tiled index.  Other keys are
                    reported on stdout and ignored.
    final_order  -- desired loop order, passed to build_order().
    tile_method  -- forwarded to chill.tile7; None selects 1 ("counted").
    """
    stmt = 0  # assume statement 0
    if not valid_indices(stmt, tile_indices):
        # Same bytes the three chained Python 2 print statements produced.
        sys.stdout.write("python tile_by_index() one or more of  %s  is not valid\n" % (tile_indices,))
        sys.exit(-1)

    if tile_method is None:
        tile_method = 1  # "counted"

    tile_index_names = list(tile_indices)  # private copy; entries are overwritten below
    control_index_names = {}               # 0-based tiled-index position -> control loop name
    tile_index_map = {}                    # tile loop name -> original index name

    for pair in index_names:
        valid = False
        control = pair[0]
        name = pair[1]

        if control[0] == "l" and control[1].isdigit():
            if control.endswith("_control"):
                index = int(control[1:-8])   # digits between "l" and "_control"
                control_index_names[index - 1] = name
                valid = True
            elif control.endswith("_tile"):
                index = int(control[1:-5])   # digits between "l" and "_tile"
                tile_index_names[index - 1] = name
                tile_index_map[name] = tile_indices[index - 1]
                valid = True
        if not valid:
            print("%s is not a proper key for specifying tile or control loop indices\n" % control)

    # filter out control indices (and do name substitution of unprocessed
    # tile indices) for a given level
    cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1)
    cur_order.insert(0, stmt)
    chill.permute(tuple(cur_order))

    for i in range(len(tile_indices)):
        cur_idx = tile_indices[i]
        cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i)

        # Find an offset between tile loop and control loop
        #  0   = control loop one level above tile loop
        #  -1  = control loop two levels above tile loop
        #  > 0 = tile loop above control loop
        #  In the last case, we do two extra tile commands to get the control
        #  above the tile and then rely on the final permute to handle the
        #  rest
        level = find_cur_level(stmt, cur_idx)
        offset = find_offset(cur_order, tile_index_names[i], control_index_names[i])

        if offset <= 0:
            chill.tile7(stmt, level, sizes[i], level + offset,
                        tile_index_names[i], control_index_names[i], tile_method)
        else:
            chill.tile7(stmt, level, sizes[i], level + offset - 1,
                        tile_index_names[i], control_index_names[i], tile_method)  # regular level

            # flip and tile control loop
            chill.tile3(stmt, level + 1, level + 1)
            chill.tile3(stmt, level + 1, level)

        # Do permutation based on cur_order
        topermute = build_order(final_order, tile_indices, control_index_names, tile_index_map, i)
        topermute.insert(0, stmt)
        chill.permute(tuple(topermute))


def normalize_index(index):
    """Call chill.tile3(stmt, l, l) on statement 0, where ``l`` is the
    current level of ``index``."""
    stmt = 0  # assume stmt 0
    l = find_cur_level(stmt, index)
    chill.tile3(stmt, l, l)


def is_in_indices(stmt, idx):
    """Return True when ``idx`` is one of ``chill.cur_indices(stmt)``."""
    cur = chill.cur_indices(stmt)
    return idx in cur


def copy_to_registers(start_loop, array_name):
    """Privatize ``array_name`` for statement 0 starting at loop ``start_loop``.

    Steps, as far as this function itself shows:
      1. re-tile the loops that follow the "ty" level so they sit at the
         "tx" level (chill.tile3 calls);
      2. collect every block/thread index whose level is at or below
         ``start_loop``'s level into ``hold_constant``;
      3. call chill.datacopy_privatized with
         (stmt, start_loop, array_name, len(hold_constant), *hold_constant).

    The index arithmetic is a direct translation of 1-based Lua code; the
    statement order below is intentionally left untouched.
    """
    stmt = 0    # assume stmt 0
    cur = chill.cur_indices(stmt)  # calls C
    table_Size = len(cur)

    # would be much cleaner if not translating this code from lua!
    level_tx = -1
    level_ty = -1
    if is_in_indices(stmt, "tx"):
        level_tx = find_cur_level(stmt, "tx")
    if is_in_indices(stmt, "ty"):
        level_ty = find_cur_level(stmt, "ty")

    ty_lookup_idx = ""
    org_level_ty = level_ty

    # UGLY logic. Lua index starts at 1, so all tests etc here are off by 1 from the lua code
    # level_ty initializes to -1 , which is not a valid index, and so there is added code to
    # make it not try to access offset -1.   -1 IS a valid python array index
    # to top it off, the else below can assign a NIL to ty_lookup_idx!
    # level_ty is 1-based, so cur[level_ty] is the entry AFTER "ty".
    if level_ty != -1 and cur[level_ty] != "":
        ty_lookup_idx = cur[level_ty]
    else:
        # NOTE(review): when "ty" is absent (level_ty == -1) this reads
        # cur[-2], i.e. the second-to-last index -- confirm that is intended.
        ty_lookup_idx = cur[level_ty - 1]

    if level_ty > -1:
        chill.tile3(stmt, level_ty, level_tx + 1)

    # The tile above may have changed the loop nest; refresh.
    cur = chill.cur_indices(stmt)  # calls C
    table_Size = len(cur)

    if is_in_indices(stmt, "tx"):
        level_tx = find_cur_level(stmt, "tx")
    if ty_lookup_idx != "":                      # perhaps incorrect test
        if is_in_indices(stmt, ty_lookup_idx):
            level_ty = find_cur_level(stmt, ty_lookup_idx)

    ty_lookup = 1
    idx_flag = -1
    # find the level of the next valid index after ty+1
    if level_ty > -1:
        for num in range(-1 + level_ty + ty_lookup, table_Size):   # ??  off by one?
            sys.stdout.flush()
            if cur[num] != "":
                idx_flag = find_cur_level(stmt, cur[num])
                break

    how_many_levels = 1

    # lua arrays start at index 1. the next loop in lua starts at offset 0, since idx_flag can be -1
    # thus the check for "not equal nil" in lua (bad idea)
    # python arrays start at 0, so will check for things that lua doesn't (?)
    startat = idx_flag + 1
    if idx_flag == -1:
        startat = 1  # pretend we're lua for now.   TODO: fix the logic

    for ch_lev in range(startat, table_Size + 1):       # logic may be wrong (off by one)
        if ch_lev <= table_Size and cur[ch_lev - 1] != "":
            how_many_levels += 1

    sys.stdout.flush()
    sys.stdout.flush()

    if how_many_levels < 2:
        while idx_flag >= 0:
            # NOTE(review): the range is computed once, but cur/table_Size are
            # reassigned inside the loop -- cur[num-1] relies on the nest not
            # shrinking; confirm.
            for num in range(level_ty + ty_lookup, table_Size + 1):
                if cur[num - 1] != "":
                    idx = cur[num - 1]
                    sys.stdout.flush()
                    curlev = find_cur_level(stmt, idx)

                    chill.tile3(stmt, curlev, curlev)
                    curlev = find_cur_level(stmt, idx)
                    chill.tile3(stmt, curlev, level_tx)

                    cur = chill.cur_indices(stmt)
                    table_Size = len(cur)

                    level_tx = find_cur_level(stmt, "tx")
                    level_ty = find_cur_level(stmt, ty_lookup_idx)
                    idx_flag = -1

                    # find the level of the next valid index after ty+1
                    # (separate loop variable: "num" belongs to the enclosing loop)
                    for num2 in range(-1 + level_ty + ty_lookup, table_Size):  # lua starts index at one
                        if cur[num2] != "":
                            idx_flag = find_cur_level(stmt, cur[num2])
                            break

    #  [Malik] end logic
    start_level = find_cur_level(stmt, start_loop)  # start_loop was passed parameter!

    # We should hold constant any block or tile loop
    block_idxs = chill.block_indices()
    thread_idxs = chill.thread_indices()

    hold_constant = []
    for idx in block_idxs:
        blocklevel = find_cur_level(stmt, idx)
        if blocklevel >= start_level:
            hold_constant.append(idx)

    for idx in thread_idxs:
        blocklevel = find_cur_level(stmt, idx)
        if blocklevel >= start_level:
            hold_constant.append(idx)

    old_num_stmts = chill.num_statements()

    # Flattened argument tuple: stmt, start_loop, array_name, count, names...
    passtoC = [stmt, start_loop, array_name]  # a list
    passtoC.append(len(hold_constant))
    for h in hold_constant:
        passtoC.append(h)
    chill.datacopy_privatized(tuple(passtoC))
    sys.stdout.flush()
    sys.stdout.flush()

    new_num_statements = chill.num_statements()

    # Unroll to the last thread level
#    for stmt in range(old_num_statements, new_num_statements):
#        print "unrolling statement %d" % stmt
#        level = find_cur_level(stmt,thread_idxs[-1]) #get last thread level
#        print "level is %d" % level
#        idxs = chill.cur_indices(stmt)
#        if level < len(idxs):
#            chill.unroll(stmt,level+1,0)



def copy_to_shared( start_loop, array_name, alignment ):
    #print "\nstarting copy to shared( %s, %s, %d)" % (start_loop, array_name, alignment )
    #print "copy_to_shared( %s, %s, %d) in cudaize.py" % ( start_loop, array_name, alignment )
    stmt = 0 # assume statement 0

    cur = chill.cur_indices(stmt)
    #print "Cur indices ",
    #print_array( cur )

    start_level = find_cur_level( stmt, start_loop )
    #print "start_level %d" % start_level

    old_num_statements = chill.num_statements()
    #print "old_num_statements %d" % old_num_statements


    # Now, we give it indices for up to two dimensions for copy loop
    copy_loop_idxs = ["tmp1","tmp2"]
    #chill.datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs, False, 0, 1, alignment,True)
    passtoC = [stmt, start_level, array_name]   # a list
    passtoC.append( len(copy_loop_idxs))
    for i in copy_loop_idxs:
        passtoC.append(i)
    passtoC.append( 0 ) # False
    passtoC.append( 0 )
    passtoC.append( 1 )
passtoC.append( alignment ) +    passtoC.append( 1 )   # True +    #print "\n[DataCopy]datacopy( ", +    #print passtoC, +    #print ")" + +    #if array_name == "b": +    #    chill.cheat(1) +    #if array_name == "c": +    #    chill.cheat(2) +     +    chill.datacopy_9arg( tuple( passtoC )) + +    #print "back from datacopy_9arg\n\n\n" +    #sys.stdout.flush() + + +    #print "calling add_sync( %d, %s )" % ( stmt, start_loop ) +    chill.add_sync( stmt, start_loop ) +    #print "back from add_sync()\n\n" + +    new_num_statements = chill.num_statements() +     +    #  This is fairly CUBLAS2 specific, not sure how well it generalizes, +    #  but for a 2D copy, what we want to do is "normalize" the first loop +    #  "tmp1" then get its hard upper bound. We then want to tile it to +    #  make the control loop of that tile "ty". We then tile "tmp2" with a +    #  size of 1 and make it "tx". + +    #print "fairly CUBLAS2 specific, OLD %d  NEW %d" % ( old_num_statements, new_num_statements) +    sys.stdout.flush() +    sys.stdout.flush() + +    for stmt in range(old_num_statements, new_num_statements): +        #print "for stmt = %d" % stmt +        level = find_cur_level( stmt, "tmp2") +        #print "FOUND CUR LEVEL?  
level '", +        #print level, +        #print "'" + +        #print "in loop, stmt %d   level %d" % ( stmt, level ) +        if level != -1: +            #print "\nCopy to shared: [If was no error]\n" +            find_cur_level(stmt,"tmp2") +            chill.tile3( stmt, level, level ) +             +            #print "hard_loop_bounds( %d, %d )" % (stmt, level) +            bounds = chill.hard_loop_bounds(stmt, level) +            lower = bounds[0] +            upper = 1+ bounds[1] +            #print "lower %d  upper %d" % ( lower, upper ) + +            dims = chill.thread_dims() +            #print "in cudaize.py copy_to_shared, dims =", +            #print dims +            tx = dims[0] +            ty = dims[1] +            #print "2-loop cleanup: lower, upper: %d, %d,  tx: %d" % ( lower, upper, tx) + +            level = find_cur_level(stmt,"tmp1") +            #print "level %d" % level +            if tx == upper and ty == 1: +                #print "tx = %d    upper = %d     ty = %d"% (tx, upper, ty) +                #print "Don't need" + +                # Don't need an extra tile level, just move this loop up +                second_level = find_cur_level(stmt,"tmp2") +                chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted) + +            else: +                #print "DO need?" +                if ty == 1: +                    new_ctrl = "tmp3"  +                else: +                    new_ctrl = "ty" + +                # LOTS of commented out code here in cudaize.lua  + +                #print_code() +                #print "\nStarting tmp2\n" +                first_level  = find_cur_level(stmt,"tmp1") +                second_level = find_cur_level(stmt,"tmp2") +                bounds = chill.hard_loop_bounds(stmt, second_level) +                lower = bounds[0] +                upper = 1 + bounds[1]   # BROKEN? 
+                         +                #print "[Malik]-loop cleanup@tmp2: lower, upper: %d, %d, tx: %d,first level:%d,second_level:%d" % ( lower, upper-1, tx, first_level, second_level)  + +                # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx. +                #print "\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, second_level,1,first_level, "tx", "tx") +                chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted) +                #print_code() + +                first_level = find_cur_level(stmt,"tmp1") +                bounds = chill.hard_loop_bounds(stmt, first_level) +                lower_1 =     bounds[0] +                upper_1 = 1 + bounds[1] +                tx_level = find_cur_level(stmt,"tx") +                bounds = chill.hard_loop_bounds(stmt,tx_level) +                lower_tx =   bounds[0] +                upper_tx = 1+bounds[1] +                #print "UL_1 %d %d     UL_tx %d %d" % ( lower_1, upper_1-1, lower_tx, upper_tx-1) + +                if int(math.ceil( float(upper_tx)/float(tx))) > 1: +                     #print "ceil I say" +                     #print "\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, tx_level,tx,tx_level, "tx", "tmp1") +                     chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted) +                     #print_code() + +                     repeat = find_cur_level(stmt,"tx") +                     #print "\n[Tile1]tile(%d, %d, %d)" % (stmt, repeat, repeat) +                     chill.tile3(stmt, repeat, repeat)  #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx")) +                     #print_code() + +                     if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"): +                        #print "\nagain [Tile1]tile(%d, %d, %d)" % (stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) +                        
chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) +                        #print_code() + +                #print_code() + +                #print "\nStarting tmp1\n" +                # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty". +                chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))       +                #print_code() + +                ty_level = find_cur_level(stmt,"tmp1") +                bounds = chill.hard_loop_bounds(stmt,ty_level) +                lower_ty = bounds[0] +                upper_ty = 1 + bounds[1] + +                tx_level = find_cur_level(stmt,"tx") +                bounds = chill.hard_loop_bounds(stmt,tx_level) +                lower_tx = bounds[0] +                upper_tx = 1 + bounds[1] + +                #print "[Malik]-loop cleanup@tmp1: lowerty, upperty: %d, %d, ty: %d,ty level:%d,tx_level:%d, stmt: %d" % ( lower_ty, upper_ty-1, ty, ty_level, tx_level, stmt) +                 +                #print "before ceil" +                #sys.stdout.flush() + +                if(math.ceil(float(upper_ty)/float(ty)) > 1): +                    #print "CEIL IF" +                    #print "\n Inside upper_ty/ty > 1\n" + +                    #print "\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, ty_level,ty,ty_level, "ty", "tmp_ty") +                    chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted) +                    #print_code() + +                    #print "\n[Tile2-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt  ,"ty"),find_cur_level(stmt,"ty")) +                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty")) +                    #print_code() + +                    cur_idxs = chill.cur_indices(stmt) +                    #print "\n cur indexes are ", +                    #print_array( cur_idxs) +                    #sys.stdout.flush() + +                    
# Putting ty before any tmp_tx +                    idx_flag = -1 +                    if "tmp_tx" in cur_idxs: +                        idx_flag = 1 + cur_idxs.index("tmp_tx")   # lua index starts at 1 +                    #print "\n (1) so i have found out the value of idx flag as %d" % idx_flag +                    #sys.stdout.flush()       +                     +                    if idx_flag >= 0: +                         if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"): +                             #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                             chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                             #print_code() +                     +                     +                    #  Now Putting ty before any tmp_ty +                    sys.stdout.flush()       +                    idx_flag = -1 +                    if "tmp_ty" in cur_idxs: +                        idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1 +                    #print "\n IF  so i have found out the value of idx flag as %d" % idx_flag +                    #sys.stdout.flush()       +                                             +                    if idx_flag >= 0: +                        #print "one more test" +                        sys.stdout.flush() +                        if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"): +                            #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                            #sys.stdout.flush() +                            chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                            #print_code() + + + +                else: +                    #print "CEIL ELSE" +                    #print "\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, ty_level,1,ty_level, "ty", 
"ty") +                    #sys.stdout.flush() +                    chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted ) +                    #print_code() + +                    #print "\n[Tile3-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) +                    sys.stdout.flush() + +                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) +                    #print_code() + + +                    idx_flag = -1 +                    # LUA code checks to see if cur_idxs exists?  it is unused except in the other clause of this is +                    #if(cur_idxs) then +                        #print "CAN NEVER GET HERE?  cur_idxs" +                        #for num= 0,table.getn(cur_idxs) do +                            #if(cur[num] == "tmp_ty") then +                            #idx_flag = find_cur_level(stmt,cur[num]) +                            #break +                        #end +                    #end +                    print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag +                    if idx_flag >= 0:  # can't happen +                        print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                        #chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) +                     +                         +                     + +                     +            #print "\n\n *** at bottom of if in copy to shared, " +            #print_code() +            #print "end of if" + +        else: +            #  copy to shared only created one level, not two, so we use a different approach (MV & TMV) +            #print "\nCopy to shared: [If was error]\n" +            level = find_cur_level(stmt,"tmp1") +            chill.tile3(stmt, level, level) + +            dims = chill.thread_dims() +            #print dims +            tx = dims[0] +            ty = 
dims[1] + +            bounds = chill.hard_loop_bounds(stmt, level) +            lower = bounds[0]    +            upper = bounds[1] + +            #print "bounds  lower %d    upper %d" % (lower, upper) +            upper = upper+1 # upper bound given as <=, compare to dimensions tx which is < +            if upper == tx: +                #print "upper == tx" +                chill.rename_index( stmt, "tmp1", "tx") +            else: +                #print "upper is not tx" +                #print "upper %d tx %d stmt: %d level: %d" % ( upper, tx, stmt, level) +                chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted) +                #print_code() + +                #print "stmt:%d level+1: %d" % ( stmt, level+1)  +                #print("TILE 7") +                chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted) +                #print("TILE 3") +                chill.tile3( stmt, level+1, level) +                #print_code()            + + +                if ty > 1: +                   #print "GOING IN" +                   bounds = chill.hard_loop_bounds(stmt, level+1) +                   lower = bounds[0]    +                   upper = bounds[1]    +                   #print "ty %d  lower %d  upper %d" % ( ty, lower, upper ) +                   floatdiv = float(upper)/float(ty) +                   bound =  int(math.ceil(float(upper)/float(ty))) +                   #print "NOW FOR Y: upper %d ty %d stmt: %d level: %d bound: %d" % ( upper, ty, stmt, level+1,   bound) +                   chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted) + +        # Always add sync +        chill.add_sync( stmt, start_loop ) +    #print "ending copy to shared\n" +    #sys.stdout.flush() +    #print_code()      + + + + + + + + + + + + + + + + + + + +def unroll_to_depth( max_depth ): +    print "\n\nunroll_to_depth(%d)" % max_depth +    print "SYNC UP" +    sys.stdout.flush() + +    cur = chill.cur_indices(0) +    thread_idxs = 
chill.thread_indices() +    guard_idx = thread_idxs[-1]  # last one + +    print "cur    indices", +    print_array(cur) +    print "thread indices",  +    print_array(thread_idxs) +    print "guard_idx = %s" % guard_idx + +    #print "thread_idxs = ", +    #print thread_idxs +    guard_idx = thread_idxs[-1] +    #print "guard_idx = %s" % guard_idx + +    #  HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS +    common_loops = [] +    comm_loops_cnt = 0 +    num_stmts = chill.num_statements() +    print "num statements %d" % num_stmts + +    for stmt in range(num_stmts): +        sys.stdout.flush() +        print "\nSTMT %d" % stmt, +        cur_idxs = chill.cur_indices(stmt) +        print "Current Indices:", +        for c in cur_idxs[:-1]: +            print "%s," % c, +        print "%s" % cur_idxs[-1]   # last one +        sys.stdout.flush() +        #print_code() +         +        if chk_cur_level(stmt, "tx") > 0: +             +            for ii in range(find_cur_level(stmt,"tx")-1): +                print "ii = %d\ncur_idxs[%d] = '%s'" % (ii+1, ii+1, cur_idxs[ii]) # print to match lua +                id = cur_idxs[ii] +                if id not in ["bx", "by", "", "tx", "ty"]: + +                    print "id %s is not in the list" % id + +                    for stmt1 in range(stmt+1, num_stmts): +                        print "\nii %d stmt1 is %d" % (ii+1, stmt1)  # print to match lua  +                        cur_idxs1 = chill.cur_indices(stmt1) +                        print "\nstmt1 cur_idxs1 is ", +                        for ind in cur_idxs1[:-1]: +                            print "%s," % ind, +                        print "%s" % cur_idxs1[-1] + +                        print "cur level(%d, %s) = %d" % (stmt, "tx", find_cur_level(stmt,"tx") ) +                        sys.stdout.flush() + +                        endrange = find_cur_level(stmt,"tx")-1 +                        print "for iii=1, %d do" % endrange +                        
sys.stdout.flush() +                        for iii in range(endrange):   # off by one?  TODO  +                            print "stmt %d   ii %d   iii %d\n" % (stmt, ii+1, iii+1), +                            sys.stdout.flush() +                             +                            if iii >= len(cur_idxs1): +                                print "stmt %d   ii %d   iii %d  cur_idxs1[%d] = NIL" % (stmt, ii+1, iii+1, iii+1, )  # print to match lua  +                            else: +                                print "stmt %d   ii %d   iii %d  cur_idxs1[%d] = '%s'" % (stmt, ii+1, iii+1, iii+1, cur_idxs1[iii])  # print to match lua  +                            sys.stdout.flush() + +                            # this will still probably die  +                            if iii < len(cur_idxs1) and [iii] not in ["bx", "by", "tx", "ty", ""]: +                                if cur_idxs[ii] == cur_idxs1[iii]: +                                    print "\nfound idx:%s" % cur_idxs[ii] +                                    common_loops.append(cur_idxs[ii]) +                                    print "cl[%d] = '%s'" % ( comm_loops_cnt, cur_idxs[ii] ) +                                    comm_loops_cnt = len(common_loops) + +    if len(common_loops) > 0: +        print "\n COMM LOOPS :TOTAL %d, and are " % comm_loops_cnt, +        print common_loops,  +        print " this loop : %s" % common_loops[0] +    else: +        print "UNROLL can't unroll any loops?" 
+ + +    while True:  # break at bottom of loop   (repeat in lua) +        old_num_statements = chill.num_statements() +        print "old_num_statements %d" % old_num_statements + +        for stmt in range(old_num_statements): +            cur_idxs = chill.cur_indices(stmt) +            print "stmt %d    cur_idxs =" % stmt, +            index = 0 +            for i in cur_idxs: +                index +=1 +                if index == len(cur_idxs): +                    print "%s" %i +                else: +                    print "%s," % i, + +            if len(cur_idxs) > 0: +                guard_level = -1 +                if chk_cur_level(stmt, guard_idx) > 0: +                    guard_level = find_cur_level(stmt,guard_idx) +                print "guard_level(sp) = %d" % guard_level +                if guard_level > -1: +                    level = next_clean_level(cur_idxs,guard_level) +                    print "next clean level %d" % level + +                     +                    #print "looking at %d" % stmt +                    #print "comparing %d and %d in" % (guard_level, level), +                    #index = 0 +                    #for i in cur_idxs: +                    #index +=1 +                    #if index == len(cur_idxs): +                    #    print "%s" %i +                    #else: +                    #    print "%s," % i, + +                    # need to handle max_depth +                    num_unrolled = 0 +                    level_unroll_comm = level +                    level_arr = [] + +                    #print "before while, level = %d" % level  +                    while level >= 0: +                        print "while: level = %d" % level  +                        if num_unrolled == max_depth: +                            break + +                        print "Unrolling %d at level %d index %s" % ( stmt, level, cur_idxs[guard_level])  # ???  
+                        level_arr.append(level) + +                        guard_level = find_cur_level(stmt,guard_idx) +                        level = next_clean_level(cur_idxs,level+1) + +                    print "OK, NOW WE UNROLL" +                    if level_unroll_comm >= 0: +                        level_arr.reverse()   +                        for i,lev in enumerate(level_arr): +                            print "\ni=%d" % i +                            print "[Unroll]unroll(%d, %d, 0)" % (stmt, lev) +                            chill.unroll(stmt, lev, 0) + + +        new_num_statements = chill.num_statements() +        if old_num_statements == new_num_statements: +            break  # exit infinite loop + + +#  all other calls to C have a routine in this file   (?) +def unroll( statement, level, unroll_amount ): +    chill.unroll( statement, level, unroll_amount ) + diff --git a/test-chill/test-cases/examples/cuda-chill/mm.c b/test-chill/test-cases/examples/cuda-chill/mm.c new file mode 100644 index 0000000..0efbeeb --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mm.c @@ -0,0 +1,10 @@ +#define N 1024 + +void normalMM(float c[N][N], float a[N][N], float b[N][N]) { +  int i, j, k; + +  for (i = 0; i < N; i++) +    for (j = 0; j < N; j++) +      for (k = 0; k < N; k++) +        c[j][i] = c[j][i] + a[k][i] * b[j][k]; +} diff --git a/test-chill/test-cases/examples/cuda-chill/mm.lua b/test-chill/test-cases/examples/cuda-chill/mm.lua new file mode 100644 index 0000000..5bde1b0 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mm.lua @@ -0,0 +1,38 @@ +init("mm.c", "normalMM", 0) +dofile("cudaize.lua") +N=1024 +Ti=128 +Tj=64 +Tk=16 +Tii=16 +Tjj=16 + + + + +N=1024 + + + + + + + + + + + + + +tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k"})CU=1 + +tile_by_index({"k"},{Tk},{l1_control="kk"},{"ii","jj","kk","i","j","k"})CU=3 + 
+tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","kk","i","iii","j","jjj","k"},1)CU=2 + +cudaize("mm_GPU",{a=1048576,b=1048576,c=1048576},{block={"ii","jj"}, thread={"i","j"}})CU=2 +copy_to_shared("tx","a",-16) +copy_to_shared("tx","b",-16) +copy_to_registers("kk","c") +--print_code() +unroll_to_depth(2) diff --git a/test-chill/test-cases/examples/cuda-chill/mpeg4.c b/test-chill/test-cases/examples/cuda-chill/mpeg4.c new file mode 100755 index 0000000..7f83bf7 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mpeg4.c @@ -0,0 +1,23 @@ +#define N1 4096 +#define N2 4096 +#define WINDOW_SIZE 16 + +void mpeg4_cpu(float result[N1][N2], float prev[N2+WINDOW_SIZE][N2+WINDOW_SIZE], float  curr[WINDOW_SIZE*WINDOW_SIZE]) +{ +	unsigned int i; +	unsigned int j; +	unsigned int k; +	unsigned int l; + +	for ( i = 0; i < N1; ++i)     +		for ( j = 0; j < N2; ++j)  +                       for ( k = 0; k < WINDOW_SIZE; ++k)  +				for ( l = 0; l < WINDOW_SIZE; ++l)  +					result[i][j] += prev[i+k][j+l] * curr[k*WINDOW_SIZE+l]; +				 +			 + +		 +	 +} + diff --git a/test-chill/test-cases/examples/cuda-chill/mpeg4.lua b/test-chill/test-cases/examples/cuda-chill/mpeg4.lua new file mode 100644 index 0000000..f025dc0 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mpeg4.lua @@ -0,0 +1,45 @@ +--CUBLAS 2 MM Multiply + +--This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you +--call init() and use global variables to specify procedure and loop + +--Second parameter is procedure # and third is loop # +init("mpeg4.c", "mpeg4_cpu", 0)  + +--dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods + +N=4096 +M=4096 +W=16 + +--TI 4ust be <= M +--TJ must be <=TI +Ti=32 +Tj=32 +Tii=16 +Tjj=16 +Tk=4 +--permute(0,{"j","i","k","l"}) 
+tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k","l"}) +--tile_by_index({"k","l"},{Tk*2,Tk*2},{l1_control="kk",l2_control="ll"},{"ii","jj","kk","ll","i","j","k","l"}) +--print_code() +--tile_by_index({"k","l"},{Tk,Tk},{l1_control="kk",l2_control="ll"},{"ii","jj","i","j","kk","k","ll","l"}) +tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","iii","i","jjj","j","k","l"}) +--print_code() +--normalize_index("j") +--normalize_index("i") +--print_code() +cudaize("kernel_GPU",{curr=W*W,prev=(N+W)*(M+W),result=N*M},{block={"ii","jj"}, thread={"i","j"}}) +--print_code() +copy_to_shared("iii","prev",16) + +copy_to_registers("jjj","result") + +--print_code() +--copy_to_constant_no_tile("curr") +unroll_to_depth(2) +print_code() +print_space() + + diff --git a/test-chill/test-cases/examples/cuda-chill/mriq-fh.c b/test-chill/test-cases/examples/cuda-chill/mriq-fh.c new file mode 100755 index 0000000..1e924b7 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mriq-fh.c @@ -0,0 +1,38 @@ +#define X 32768 +#define K 256 +struct kValues { +  float Kx; +  float Ky; +  float Kz; +  float PhiMag; +}; +extern float sin(float); +extern float cos(float); + +void mriFH_cpu(float *rPhi,float *rRho,float *iRho, float *iPhi, float *rD, float *iD, float *kx, float *ky, float *kz, float *dx, float *dy, float *dz, float *rFHref, float *iFHref) +{ + +    	float rfh; +	float ifh; +	float exp; +	float cArg; +	float sArg; +    	//float rRho[K]; +	//float iRho[K]; +        unsigned int k; +	unsigned int x; +  +       +    for (x = 0; x < X; ++x) { +        for (k = 0; k < K; ++k) { +             +	       exp = 2 * 3.14159 * (kx[k]* dx[x] + ky[k]* dy[x] + kz[k]* dz[x]); +	       cArg = cos(exp); +	       sArg = sin(exp); +            rFHref[x] += rRho[k]* cArg - iRho[k]* sArg; +            iFHref[x] += iRho[k]*cArg + rRho[k]*sArg; +        } +          +    } +} + diff --git 
a/test-chill/test-cases/examples/cuda-chill/mriq-fh.lua b/test-chill/test-cases/examples/cuda-chill/mriq-fh.lua new file mode 100755 index 0000000..3277bac --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mriq-fh.lua @@ -0,0 +1,73 @@ +--CUBLAS 2 MM Multiply + +--This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you +--call init() and use global variables to specify procedure and loop + +--Second parameter is procedure # and third is loop # +init("mriq-fh.c", "mriFH_cpu", 0)  + +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, +                      --copy_to_shared methods +N=32768 +M=256 +Tx=256 + + +print_code() +--permute(0,{"j","i"}) +--tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"}) +tile_by_index({"x"},{Tx},{l1_control="xx"},{"xx","x","k"}) +--tile_by_index({"x"},{16},{l1_control="xx1"},{"xx","x","xx1","k"}) +--tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +print_code() + +normalize_index("x") +--normalize_index("i") +print_code() +--tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"}) +--print_code() +--cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}}) +cudaize("kernel_GPU",{dx=N,dy=N,dz=N,iRho=M,kx=M,ky=M,kz=M,rFHref=N,iFHref=N,rRho=M},{block={"xx"}, thread={"x"}}) +--copy_to_shared("tx","iRho",-16) +--copy_to_shared("tx","dz",1) +--copy_to_shared("tx","rRho",-16) +--copy_to_registers("tx","rFHref") +--copy_to_registers("tx","rRho") +--copy_to_registers("tx","iRho") +--copy_to_registers("tx","kx") +--copy_to_registers("tx","dx") +--copy_to_registers("tx","ky") +--copy_to_registers("tx","dy") +--copy_to_registers("tx","kz") +--copy_to_registers("tx","dz") +--copy_to_registers("tx","iFHref") +--copy_to_texture("rRho") +--copy_to_texture("kx") +--copy_to_texture("dx") +--copy_to_texture("ky") 
+--copy_to_texture("dy") +--copy_to_texture("kz") +--copy_to_texture("dz") +--copy_to_texture("iRho") +--print_code()--]] +--unroll(0,4,0) +--copy_to_constant_no_tile("kx") +--copy_to_constant_no_tile("ky") +--copy_to_constant_no_tile("kz") +--copy_to_constant_no_tile("rRho") +--copy_to_constant_no_tile("iRho") + +--unroll_to_depth(1) +print_code() +--[[ +copy_to_Texture("rRho") +copy_to_Texture("kx") +copy_to_Texture("dx") +copy_to_Texture("ky") +copy_to_Texture("dy") +copy_to_Texture("kz") +copy_to_Texture("dz") +copy_to_Texture("iRho") +--unroll_to_depth(2) +--]] diff --git a/test-chill/test-cases/examples/cuda-chill/mriq.c b/test-chill/test-cases/examples/cuda-chill/mriq.c new file mode 100644 index 0000000..ba4b87c --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mriq.c @@ -0,0 +1,33 @@ +#define N 32768 +#define M 3072 +struct kValues { +  float Kx; +  float Ky; +  float Kz; +  float PhiMag; +}; +extern float sinf(float); +extern float cosf(float); + +void +ComputeQCPU(int numK, int numX,struct kValues kVals[M],float x[N], float y[N], float z[N],float Qr[N], float Qi[N]) { +  float expArg; +  float cosArg; +  float sinArg; +  float phi; +  int i; +  int j; +  numK = M; +  numX = N; +  for ( i = 0; i < M; i++) { +    for ( j = 0; j < N; j++) { +      expArg = 6.2831853071795864769252867665590058f * (kVals[i].Kx * x[j] +kVals[i].Ky * y[j] +kVals[i].Kz * z[j]); +      cosArg = cosf(expArg); +      sinArg = sinf(expArg); +      phi = kVals[i].PhiMag; +      Qr[j] += phi * cosArg; +      Qi[j] += phi * sinArg; +    } +  } +} +   diff --git a/test-chill/test-cases/examples/cuda-chill/mriq.lua b/test-chill/test-cases/examples/cuda-chill/mriq.lua new file mode 100644 index 0000000..1170111 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mriq.lua @@ -0,0 +1,55 @@ +--CUBLAS 2 MM Multiply + +--This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you +--call init() and use global variables to specify procedure and loop + +--Second 
parameter is procedure # and third is loop # +init("mriq.c", "ComputeQCPU", 0)  + +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, +                      --copy_to_shared methods +N=32768 +M=3072 +TI=128 +TJ=128 + +permute(0,{"j","i"}) +--tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"}) +tile_by_index({"i"}, {TJ}, {l1_control="ii",l1_tile="i"}, {"ii", "j","i"}) +tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +--print_code() + +normalize_index("j") +normalize_index("i") +--print_code() +--tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"}) +--print_code() +cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}}) + +copy_to_shared("tx","kVals",1) +--copy_to_shared("tx","x",1) +--copy_to_shared("tx","y",1) +--copy_to_shared("tx","z",1) + +--copy_to_texture("kVals") +--datacopy(0, 3, "kVals", {"tt","t"},false,0,1,-16,true) +--print_code() +--datacopy_privatized(0,"tx","kVals",{"tx"}) +--copy_to_registers("tx","kVals") +copy_to_registers("ii","x") +copy_to_registers("ii","y") +copy_to_registers("ii","z") +copy_to_registers("ii","Qi") +copy_to_registers("ii","Qr") +--[[datacopy_privatized(0,"tx","x",{"tx"}) +datacopy_privatized(0,"tx","y",{"tx"}) +datacopy_privatized(0,"tx","z",{"tx"}) +datacopy_privatized(0,"tx","Qi",{"tx"}) +datacopy_privatized(0,"tx","Qr",{"tx"}) + + +]]-- +--unroll(0,5,64) +print_code() +--unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels diff --git a/test-chill/test-cases/examples/cuda-chill/mv-shadow.c b/test-chill/test-cases/examples/cuda-chill/mv-shadow.c new file mode 100644 index 0000000..582b187 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mv-shadow.c @@ -0,0 +1,9 @@ +#define N 1024 + +void normalMV(float c[N][N], float a[N], float b[N]) { +  int i, j; + +  for (i = 0; i 
< N; i++) +    for (j = 0; j < N; j++) +      a[i] = a[i] + c[j][i] * b[j]; +} diff --git a/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua b/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua new file mode 100644 index 0000000..43e8491 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua @@ -0,0 +1,65 @@ +init("mv-shadow.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, +                      --copy_to_shared methods + +N=129 +TI=32 +TJ=64 + +N=1024 +TI=16 + + + + + + + + + + + + + + + + +--Tile the i and j loop, introducing "ii" as the control loop for the "i" +--tile, "k" for the control loop fo the "j" tile, with the final order +--of {"ii", "k", "i", "j"} +tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) +--tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +--print_code() +--Normalize indx will do a tile size of one over the loop level specified +--by the input index. This is useful to get a zero lower bound and hard +--upper bound on a loop instead of it being relative to previous loop +--levels. +--normalize_index("ii") +normalize_index("i") +print_code() + +--Cudaize now determines the grid dimentions from the loops themselves +--(the upper bounds of the block and thread loops). It also renames the +--given block and thread loops's indexes to the approviate values from +--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the +--size of the arrays to be copied in the CUDA scaffolding. 
+cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}}) +--print_code() + +--Does a datacopy, tile, and add_sync to get a shared memory copy + +--copy_to_shared("tx", "b", 1) +--copy_to_shared("tx", "c", -16) +--print_code() +--copy_to_texture("b") +--copy_to_texture("c") +copy_to_registers("k", "a") +--print_code() + +unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels +--copy_to_texture("b") +--print_code() +--unroll(0,5,0) +--print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/mv.c b/test-chill/test-cases/examples/cuda-chill/mv.c new file mode 100644 index 0000000..582b187 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mv.c @@ -0,0 +1,9 @@ +#define N 1024 + +void normalMV(float c[N][N], float a[N], float b[N]) { +  int i, j; + +  for (i = 0; i < N; i++) +    for (j = 0; j < N; j++) +      a[i] = a[i] + c[j][i] * b[j]; +} diff --git a/test-chill/test-cases/examples/cuda-chill/mv.lua b/test-chill/test-cases/examples/cuda-chill/mv.lua new file mode 100644 index 0000000..ca54501 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mv.lua @@ -0,0 +1,65 @@ +init("mv.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, +                      --copy_to_shared methods + +N=129 +TI=32 +TJ=64 + +N=1024 + + + + + + + + + + + + + + + + +--Tile the i and j loop, introducing "ii" as the control loop for the "i" +--tile, "k" for the control loop fo the "j" tile, with the final order +--of {"ii", "k", "i", "j"} +tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) +--tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +--print_code() +--Normalize indx will do a tile size of one over the loop level specified +--by the input index. 
This is useful to get a zero lower bound and hard +--upper bound on a loop instead of it being relative to previous loop +--levels. +--normalize_index("ii") +normalize_index("i") +print_code() + +--Cudaize now determines the grid dimensions from the loops themselves +--(the upper bounds of the block and thread loops). It also renames the +--given block and thread loops' indexes to the appropriate values from +--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the +--size of the arrays to be copied in the CUDA scaffolding. +cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}}) + +--print_code() + +--Does a datacopy, tile, and add_sync to get a shared memory copy + +--copy_to_shared("tx", "b", 1) +--copy_to_shared("tx", "c", -16) +--print_code() +--copy_to_texture("b") +--copy_to_texture("c") +copy_to_registers("k", "a") +--print_code() + +unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels +--copy_to_texture("b") +--print_code() +--unroll(0,5,0) +--print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/mv_try.c b/test-chill/test-cases/examples/cuda-chill/mv_try.c new file mode 100644 index 0000000..7781f3b --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mv_try.c @@ -0,0 +1,9 @@ +#define N 4096 + +void normalMV(int n, float c[N][N], float a[N], float b[N]) { +  int i, j; + +  for (i = 0; i < n; i++) +    for (j = 0; j < n; j++) +      a[i] = a[i] + c[i][j] * b[j]; +} diff --git a/test-chill/test-cases/examples/cuda-chill/mv_try.lua b/test-chill/test-cases/examples/cuda-chill/mv_try.lua new file mode 100644 index 0000000..db4d9ad --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/mv_try.lua @@ -0,0 +1,14 @@ +init("mv_try.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, +                      --copy_to_shared methods + +TI=96 + +N=4096 + + +tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +cudaize("mv_GPU", 
{a=N, b=N, c=N*N}, +        {block={"ii"}, thread={"i"}}) + +print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/nbody.c b/test-chill/test-cases/examples/cuda-chill/nbody.c new file mode 100644 index 0000000..57899b6 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/nbody.c @@ -0,0 +1,66 @@ +#define NBODIES 16384 +#define SOFTENINGSQUARED 0.01f +#define DELTATIME 0.001f +#define DAMPING 1.0f + +#define NBLOCKSY 1 +#define NBLOCKSX (NBODIES/NTHREADSX) +#define NTHREADSY 1  +#define NTHREADSX 64 + +#define BLOCKSIZE 128 + +#define SHARED 1 +#define TIMER 1 +#define VERIFY 1 + +extern float sqrtf(float); + +void nbody_cpu(float* oldpos,float* oldpos1, float *newpos, float *oldvel, float *newvel, float *force) +{ +    float r0,r1,r2; +    float invDist, invDistCube, mass, invMass; +    unsigned int i,j; +    for(i = 0; i < NBODIES; ++i) { +        //force[i*4  ] = 0; +        //force[i*4+1] = 0; +        //force[i*4+2] = 0; +        //force[i*4+3] = 0; +        for(j = 0; j < NBODIES; ++j) { +	    r0 = oldpos[j*4]-oldpos1[i*4]; +	    r1 = oldpos[j*4+1]-oldpos1[i*4+1]; +	    r2 = oldpos[j*4+2]-oldpos1[i*4+2]; + +	    invDist = 1.0/sqrtf(r0 * r0 + r1 * r1 + r2 * r2 + SOFTENINGSQUARED); +	    invDistCube =  invDist * invDist * invDist; +	    mass = oldpos1[i*4+3]; + +	    force[i*4] = force[i*4] + r0 * mass * invDistCube; +	    force[i*4+1] = force[i*4+1] + r1 * mass * invDistCube; +	    force[i*4+2] = force[i*4+2] + r2 * mass * invDistCube; + +        } +    } + +/*    for (i = 0; i < NBODIES; ++i) { +        invMass = oldvel[4*i+3]; + +        oldvel[4*i] += (force[4*i] * invMass) * DELTATIME * DAMPING; +        oldvel[4*i+1] += (force[4*i+1] * invMass) * DELTATIME * DAMPING; +        oldvel[4*i+2] += (force[4*i+2] * invMass) * DELTATIME * DAMPING; + +        oldpos[4*i] += oldvel[4*i] * DELTATIME; +        oldpos[4*i+1] += oldvel[4*i+1] * DELTATIME; +        oldpos[4*i+2] += oldvel[4*i+2] * DELTATIME; + +        newpos[4*i+0] = oldpos[4*i]; 
+        newpos[4*i+1] = oldpos[4*i+1]; +        newpos[4*i+2] = oldpos[4*i+2]; +        newpos[4*i+3] = oldpos[4*i+3]; + +        newvel[4*i+0] = oldvel[4*i]; +        newvel[4*i+1] = oldvel[4*i+1]; +        newvel[4*i+2] = oldvel[4*i+2]; +        newvel[4*i+3] = oldvel[4*i+3]; +    }*/ +} diff --git a/test-chill/test-cases/examples/cuda-chill/nbody.lua b/test-chill/test-cases/examples/cuda-chill/nbody.lua new file mode 100644 index 0000000..08f88a9 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/nbody.lua @@ -0,0 +1,53 @@ +--CUBLAS 2 MM Multiply + +--This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you +--call init() and use global variables to specify procedure and loop + +--Second parameter is procedure # and third is loop # +init("nbody.c", "nbody_cpu" , 0)  + +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, +                     --copy_to_shared methods +NBODIES=16384 + + +--Tj=128 CHANGE FOR BEST..... BEST IS 64BLOCKS 128THREADS +--Ti=256 +Tj=64 +Ti=32 +Tjjj=1 +Tiii=1 +Tn=0.1 +--normalize_index("j") +-- +--print_code() +--normalize_index("n") +-- TILE COMMANDS ZEROOOOOOOOOOO:3 +--tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j"})--CU=-1 +tile_by_index({"i"},{Ti},{l1_control="ii"},{"ii","i","j"})--CU=-1 +--normalize_index("i") +--tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1 + +--tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3 +--tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"}) +--tile_by_index({"i"}, {Ti/2}, {l1_control="iii"}, {"ii","iii", "jj","i","j"}) +--print_code() +cudaize("kernel_GPU",{oldpos=4*NBODIES,oldpos1=4*NBODIES,oldvel=4*NBODIES,force=4*NBODIES,newpos=4*NBODIES,newvel=4*NBODIES},{block={"ii"}, thread={"i"}})--CU=3 +print_code() +--tile(0,6,6) +--copy_to_shared("tx","oldpos",-16) 
+--copy_to_registers("j","oldpos") +--copy_to_registers("j","oldpos1") +--copy_to_registers("j","force") + +--copy_to_texture("oldpos") +--tile(1,3,3) +--tile(2,3,3) + +print_code() +--unroll_to_depth(1) +-- +--tile(2,3,3) +--unroll(2,3,0) +--unroll(0,5,0) +--print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/tmv-shadow.c b/test-chill/test-cases/examples/cuda-chill/tmv-shadow.c new file mode 100644 index 0000000..cb9ea8d --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/tmv-shadow.c @@ -0,0 +1,9 @@ +#define N 1024 + +void normalMV(float c[N][N], float a[N], float b[N]) { +  int i, j; + +  for (i = 0; i < N; i++) +    for (j = 0; j < N; j++) +      a[i] = a[i] + c[i][j] * b[j]; +} diff --git a/test-chill/test-cases/examples/cuda-chill/tmv-shadow.lua b/test-chill/test-cases/examples/cuda-chill/tmv-shadow.lua new file mode 100644 index 0000000..196b939 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/tmv-shadow.lua @@ -0,0 +1,50 @@ +init("tmv-shadow.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, +                      --copy_to_shared methods + +N=1024 +--N= 8209 +--N=129 +TI=64 +N=1024 +TI=32 +--tile, "k" for the control loop for the "j" tile, with the final order +--of {"ii", "k", "i", "j"} +tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii",  "i", "j"}) +--print_code() +--tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) + +--print_code() +--Normalize indx will do a tile size of one over the loop level specified +--by the input index. This is useful to get a zero lower bound and hard +--upper bound on a loop instead of it being relative to previous loop +--levels. +--normalize_index("i") +--print_code() + +--Cudaize now determines the grid dimentions from the loops themselves +--(the upper bounds of the block and thread loops). 
It also renames the +--given block and thread loops's indexes to the approviate values from +--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the +--size of the arrays to be copied in the CUDA scaffolding. +cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}}) + +--print_code() + +--Does a datacopy, tile, and add_sync to get a shared memory copy +copy_to_shared("tx", "b", 1) +--copy_to_texture("b") +--print_code() + +copy_to_shared("tx", "c", -16) +--copy_to_texture("c") +--print_code() + +copy_to_registers("k", "a") +print_code() +--unroll(0,5,0) +--unroll(0,4,0) +--unroll(2,4,16) +unroll_to_depth(1) +--print_code() diff --git a/test-chill/test-cases/examples/cuda-chill/tmv.c b/test-chill/test-cases/examples/cuda-chill/tmv.c new file mode 100644 index 0000000..cb9ea8d --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/tmv.c @@ -0,0 +1,9 @@ +#define N 1024 + +void normalMV(float c[N][N], float a[N], float b[N]) { +  int i, j; + +  for (i = 0; i < N; i++) +    for (j = 0; j < N; j++) +      a[i] = a[i] + c[i][j] * b[j]; +} diff --git a/test-chill/test-cases/examples/cuda-chill/tmv.lua b/test-chill/test-cases/examples/cuda-chill/tmv.lua new file mode 100644 index 0000000..5071108 --- /dev/null +++ b/test-chill/test-cases/examples/cuda-chill/tmv.lua @@ -0,0 +1,50 @@ +init("tmv.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, +                      --copy_to_shared methods + +N=1024 +--N= 8209 +--N=129 +TI=64 +N=1024 +TI=32 +--tile, "k" for the control loop for the "j" tile, with the final order +--of {"ii", "k", "i", "j"} +tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii",  "i", "j"}) +--print_code() +--tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) + +--print_code() +--Normalize indx will do a tile size of one over the loop level specified +--by the input 
index. This is useful to get a zero lower bound and hard +--upper bound on a loop instead of it being relative to previous loop +--levels. +--normalize_index("i") +--print_code() + +--Cudaize now determines the grid dimentions from the loops themselves +--(the upper bounds of the block and thread loops). It also renames the +--given block and thread loops's indexes to the approviate values from +--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the +--size of the arrays to be copied in the CUDA scaffolding. +cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}}) + +--print_code() + +--Does a datacopy, tile, and add_sync to get a shared memory copy +copy_to_shared("tx", "b", 1) +--copy_to_texture("b") +--print_code() + +copy_to_shared("tx", "c", -16) +--copy_to_texture("c") +--print_code() + +copy_to_registers("k", "a") +print_code() +--unroll(0,5,0) +--unroll(0,4,0) +--unroll(2,4,16) +unroll_to_depth(1) +--print_code() | 
