summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDerick Huth <derickhuth@gmail.com>2015-09-24 11:26:53 -0600
committerDerick Huth <derickhuth@gmail.com>2015-09-24 11:26:53 -0600
commitc285135eb903c31cd221f90f03e288a6b67770cd (patch)
tree1f6ea3120a09feef7236dac579d5a2d5b774aaa7
parentf5c39e4c6ff55520948c2ef331c968cd84b817d9 (diff)
downloadchill-c285135eb903c31cd221f90f03e288a6b67770cd.tar.gz
chill-c285135eb903c31cd221f90f03e288a6b67770cd.tar.bz2
chill-c285135eb903c31cd221f90f03e288a6b67770cd.zip
pre-v0.2.1
-rw-r--r--INSTALL18
-rw-r--r--Makefile-Old251
-rw-r--r--chill_run.cc15
-rw-r--r--chillmodule.cc4
-rw-r--r--dep.cc2
-rw-r--r--examples/chill/gemm.c18
-rw-r--r--examples/chill/gemm.script31
-rw-r--r--examples/chill/gemv.c15
-rw-r--r--examples/chill/gemv.script9
-rw-r--r--examples/chill/jacobi1.c13
-rw-r--r--examples/chill/jacobi1.script18
-rw-r--r--examples/chill/jacobi2.c15
-rw-r--r--examples/chill/jacobi2.script21
-rw-r--r--examples/chill/unroll.c33
-rw-r--r--examples/chill/unroll.script35
-rw-r--r--examples/cuda-chill/cp.c29
-rw-r--r--examples/cuda-chill/cp.lua46
-rw-r--r--examples/cuda-chill/cudaize.lua1004
-rwxr-xr-xexamples/cuda-chill/cudaize.py1047
-rw-r--r--examples/cuda-chill/mm.c10
-rw-r--r--examples/cuda-chill/mm.lua38
-rwxr-xr-xexamples/cuda-chill/mpeg4.c23
-rw-r--r--examples/cuda-chill/mpeg4.lua45
-rwxr-xr-xexamples/cuda-chill/mriq-fh.c38
-rwxr-xr-xexamples/cuda-chill/mriq-fh.lua73
-rw-r--r--examples/cuda-chill/mriq.c33
-rw-r--r--examples/cuda-chill/mriq.lua55
-rw-r--r--examples/cuda-chill/mv-shadow.c9
-rw-r--r--examples/cuda-chill/mv-shadow.lua65
-rw-r--r--examples/cuda-chill/mv.c9
-rw-r--r--examples/cuda-chill/mv.lua65
-rw-r--r--examples/cuda-chill/mv_try.c9
-rw-r--r--examples/cuda-chill/mv_try.lua14
-rw-r--r--examples/cuda-chill/nbody.c66
-rw-r--r--examples/cuda-chill/nbody.lua53
-rw-r--r--examples/cuda-chill/tmv-shadow.c9
-rw-r--r--examples/cuda-chill/tmv-shadow.lua50
-rw-r--r--examples/cuda-chill/tmv.c9
-rw-r--r--examples/cuda-chill/tmv.lua50
-rw-r--r--examples/fortran/README10
-rw-r--r--examples/fortran/ccd.f32
-rw-r--r--examples/fortran/ccd.script18
-rw-r--r--examples/fortran/gemm.f9058
-rw-r--r--examples/fortran/gemm.script30
-rw-r--r--examples/fortran/rose_gemm.f90155
-rw-r--r--graph-test.cc148
-rw-r--r--graph.hh3
l---------include/ir_suif.hh1
l---------include/ir_suif_utils.hh1
-rw-r--r--ir_cuda_rose_utils.cc191
-rw-r--r--ir_cuda_suif_utils.cc54
-rw-r--r--ir_cudarose.cc165
-rw-r--r--ir_cudarose.hh46
-rw-r--r--ir_cudasuif.cc144
-rw-r--r--ir_cudasuif.hh36
-rw-r--r--loop.cc1
-rw-r--r--loop_backup.cc3311
-rw-r--r--loop_cuda.cc2123
-rw-r--r--loop_cuda_rose.cc3734
-rw-r--r--loop_modified.cc4234
-rw-r--r--mem_mapping_utils.cc76
-rw-r--r--mem_mapping_utils.hh59
-rw-r--r--omega/INSTALL34
-rw-r--r--omega/README96
-rw-r--r--omega/ROSE_INSTALL.txt77
l---------omega/bin/oc1
-rw-r--r--orig_loop_datacopy.cc1175
67 files changed, 12 insertions, 19348 deletions
diff --git a/INSTALL b/INSTALL
deleted file mode 100644
index aef619a..0000000
--- a/INSTALL
+++ /dev/null
@@ -1,18 +0,0 @@
-BUILD
-=====
-
-1. Edit Makefile. Change SUIFHOME and OMEGAHOME to correct paths.
-
-2. Do "make depend" in the chill directory.
-
-3. Optional, do "make clean" or "make veryclean" which removes additional
- target files and flex/bison generated files.
-
-4. Do "make".
-
-
-INSTALLATION
-============
-
-You can use CHiLL in source directory since all links are already
-created in bin/, lib/ and include/ directories.
diff --git a/Makefile-Old b/Makefile-Old
deleted file mode 100644
index 7f2c8b5..0000000
--- a/Makefile-Old
+++ /dev/null
@@ -1,251 +0,0 @@
-
-.SUFFIXES:
-.PHONY: all depend depend-cuda-chill clean veryclean cuda-chill
-.PHONY: chill
-
-CC = g++
-CFLAGS = -g -Wno-write-strings
-DEPENDENCE_CFLAGS = -M
-OMEGAHOME=./omega
-
-ifdef TEST_COVERAGE
- CFLAGS := $(CFLAGS) -fprofile-arcs -ftest-coverage
-endif
-
-# TODO auto-generate using config.h generated by autoconf?
-CHILLVERSION = "\"0.2.0\""
-PYTHON=python #=$(shell `which python` )
-PYVERSION=$(shell $(PYTHON) -c "import sys; print(sys.version[:3])") # 2.6
-PYTHONVER = python$(PYVERSION)
-PYTHONINCLUDE = $(shell $(PYTHON) -c "from distutils import sysconfig; print(sysconfig.get_python_inc())")
-PYTHONLIBDIR = $(shell $(PYTHON) -c "from distutils import sysconfig; print(sysconfig.get_config_var('LIBDIR'))")
-PYTHONCONFIG = $(shell $(PYTHON) -c "from distutils import sysconfig; print(sysconfig.get_config_var('LIBPL'))")
-# SCRIPT_LANG = lua <-- supplied by the command line
-
-
-# this creates a LUAHOME even if you don't have such a directory
-ifeq ($(strip $(wildcard $(LUAHOME))),)
-LUAHOME = $(HOME)/lua
-endif
-LUA_PATH = -L${LUAHOME}/lib
-
-
-# where do include files live
-INC_PATH = -I${PYTHONINCLUDE} -I${OMEGAHOME}/include -I${LUAHOME}/include
-
-# where do libraries live
-LIB_PATH = -L${OMEGAHOME}/code_gen/obj -L${OMEGAHOME}/omega_lib/obj
-# seemingly not needed -L${PYTHONCONFIG}
-
-
-
-CORE_LIBS = -lm -lcodegen -lomega
-RUNNER_LIBS = -llua -ldl -lreadline -lhistory -lpthread -ldl -lutil -lm -l${PYTHONVER}
-
-TDLHOME = ${ROSEHOME}/libltdl
-
-BOOST_DATE_TIME_LIB = -lboost_date_time
-BOOST_FILESYSTEM_LIB = -lboost_filesystem
-BOOST_LDFLAGS = -L${BOOSTHOME}/lib
-BOOST_PROGRAM_OPTIONS_LIB = -lboost_program_options
-BOOST_REGEX_LIB = -lboost_regex
-BOOST_SYSTEM_LIB = -lboost_system
-BOOST_THREAD_LIB = -lboost_thread
-BOOST_WAVE_LIB = -lboost_wave
-
-ROSE_LIBS = -lrose $(BOOST_LDFLAGS) $(BOOST_DATE_TIME_LIB)\
- $(BOOST_THREAD_LIB) $(BOOST_FILESYSTEM_LIB) $(BOOST_PROGRAM_OPTIONS_LIB)\
- $(BOOST_REGEX_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) \
- $(BOOST_WAVE_LIB) -lrt -ldl
-
-
-# Source files common to both chill and cuda-chill
-CORE_SRCS = dep.cc omegatools.cc irtools.cc loop.cc loop_basic.cc loop_datacopy.cc loop_unroll.cc loop_tile.cc loop_extra.cc
-LIB_SRCS = $(CORE_SRCS)
-
-# files that will be generated by bison, flex, and make that need to be removed at clean.
-GENERATED_SRCS = parser.tab.hh parser.tab.cc parse_expr.yy.cc parse_expr.ll.hh parse_expr.tab.cc parse_expr.tab.hh Makefile.deps
-# object files that are specific to lua or python builds. -- This is used so that SCRIPT_LANG does not need to be specified during clean
-ORPHAN_OBJS = chill_run_util.o chillmodule.o parse_expr.tab.o parse_expr.yy.o
-
-# files used in chill and cuda-chill interfaces
-ifeq ($(SCRIPT_LANG),lua)
- RUNNER_SRCS = chill_run.cc chill_env.cc
-else
- ifeq ($(SCRIPT_LANG),python)
- RUNNER_SRCS = chill_run.cc chillmodule.cc
- else
- RUNNER_SRCS = chill_run.cc chill_env.cc
- endif
-endif
-
-# files used in chill but not cuda-chill
-IR_CHILL_SRCS = ir_rose.cc ir_rose_utils.cc
-ifeq ($(SCRIPT_LANG),lua)
- YACC_SRCS = parse_expr.yy.cc parse_expr.tab.cc
- CHILL_RUNNER_SRCS = chill_run_util.cc
- CHILL_SRCS = $(CORE_SRCS) $(IR_CHILL_SRCS) $(CHILL_RUNNER_SRCS) $(RUNNER_SRCS)
-else
- ifeq ($(SCRIPT_LANG),python)
- YACC_SRCS = parse_expr.yy.cc parse_expr.tab.cc
- CHILL_RUNNER_SRCS = chill_run_util.cc
- CHILL_SRCS = $(CORE_SRCS) $(IR_CHILL_SRCS) $(CHILL_RUNNER_SRCS) $(RUNNER_SRCS)
- else
- YACC_SRCS = lex.yy.cc parser.tab.cc
- CHILL_RUNNER_SRCS =
- CHILL_SRCS = $(CORE_SRCS) $(IR_CHILL_SRCS) $(YACC_SRCS) $(RUNNER_SRCS)
- endif
-endif
-
-# source files for cuda-chill but not chill
-CUDACHILL_ONLY_SRCS = mem_mapping_utils.cc loop_cuda_rose.cc
-IR_CUDACHILL_SRCS = ir_rose.cc ir_rose_utils.cc ir_cudarose.cc ir_cuda_rose_utils.cc
-CUDACHILL_RUNNER_SRCS =
-CUDACHILL_SRCS = $(CORE_SRCS) $(CUDACHILL_ONLY_SRCS) $(IR_CUDACHILL_SRCS) $(RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS)
-
-# set interface language flags
-ifeq ($(SCRIPT_LANG),lua)
- RUNNER_EXTRA_CFLAGS = -DLUA
-else
- ifeq ($(SCRIPT_LANG),python)
- RUNNER_EXTRA_CFLAGS = -DPYTHON
- endif
-endif
-
-depend-cuda-chill: CFLAGS := $(CFLAGS) -DCUDACHILL
-cuda-chill: CFLAGS := $(CFLAGS) -DCUDACHILL
-
-ALL_SRCS = $(CORE_SRCS) $(YACC_SRCS) $(IR_CHILL_SRCS) $(CUDACHILL_ONLY_SRCS) $(IR_CUDACHILL_SRCS) $(RUNNER_SRCS) $(CHILL_RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS)
-ALL_OBJS = $(ALL_SRCS:.cc=.o) $(ORPHAN_OBJS)
-
-RUNNER_DEFINES = -DLUA_USE_LINUX -DCHILL_BUILD_VERSION=$(CHILLVERSION) -DCHILL_BUILD_DATE="\"$(CHILL_BUILD_DATE)\""
-
-
-YACC_EXTRA_CFLAGS =
-
-#####################################################################
-# compiler intermediate code specific definitions
-#####################################################################
-
-
-
-#LIBS := $(LIBS) $(ROSE_LIBS)
-LIB_PATH := $(LIB_PATH) -L${ROSEHOME}/lib -L${TDLHOME}
-#LIB_SRCS := $(LIB_SRCS) # $(IR_SRCS)
-INC_PATH := $(INC_PATH) -I${ROSEHOME}/include -I${BOOSTHOME}/include
-YACC_EXTRA_CFLAGS := -DBUILD_ROSE
-RUNNER_EXTRA_CFLAGS := $(RUNNER_EXTRA_CFLAGS) -DBUILD_ROSE
-
-
-#####################################################################
-# build rules
-#####################################################################
-
-YACC_OBJS = $(YACC_SRCS:.cc=.o)
-RUNNER_OBJS = $(RUNNER_SRCS:.cc=.o)
-CHILL_RUNNER_OBJS = $(CHILL_RUNNER_SRCS:.cc=.o)
-CUDACHILL_RUNNER_OBJS = $(CUDACHILL_RUNNER_SRCS:.cc=.o)
-LIB_OBJS = $(LIB_SRCS:.cc=.o)
-IR_CHILL_OBJS = $(IR_CHILL_SRCS:.cc=.o)
-IR_CUDACHILL_OBJS = $(IR_CUDACHILL_SRCS:.cc=.o)
-CUDACHILL_ONLY_OBJS = $(CUDACHILL_ONLY_SRCS:.cc=.o)
-
-CHILL_OBJS = $(CHILL_SRCS:.cc=.o)
-CUDACHILL_OBJS = $(CUDACHILL_SRCS:.cc=.o)
-
-
-all:
- $(MAKE) depend-chill
- $(MAKE) chill
- $(MAKE) depend-cuda-chill
- $(MAKE) cuda-chill
-
-
-# can't these be combined to a superset of all source files?
-depend: depend-cuda-chill
-
-depend-chill: $(LIB_SRCS) $(RUNNER_SRCS) $(CHILL_RUNNER_SRCS) $(YACC_SRCS)
- $(CC) $(DEPENDENCE_CFLAGS) $(INC_PATH) $(LIB_SRCS) $(RUNNER_SRCS) $(CHILL_RUNNER_SRCS) $(YACC_SRCS) > Makefile.deps
-
-depend-cuda-chill: $(LIB_SRCS) $(RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS)
- $(CC) $(DEPENDENCE_CFLAGS) $(INC_PATH) $(LIB_SRCS) $(RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS) > Makefile.deps
-
-libchill_xform.a: $(LIB_OBJS) $(IR_CHILL_OBJS)
- ar -rs $@ $(LIB_OBJS) $(IR_CHILL_OBJS)
-
-libcudachill_xform.a: $(LIB_OBJS) $(IR_CUDACHILL_OBJS) $(CUDACHILL_ONLY_OBJS)
- ar -rs $@ $(LIB_OBJS) $(IR_CUDACHILL_OBJS) $(CUDACHILL_ONLY_OBJS)
-
-%.o: %.cc
- $(CC) $(CFLAGS) $(INC_PATH) $< -c -o $@
-
-
-clean:
- @rm -fr $(ALL_OBJS) $(YACC_SRCS) $(GENERATED_SRCS)
-
-veryclean:
- @rm -fr $(ALL_OBJS) $(YACC_SRCS) libchill_xform.a libcudachill_xform.a chill cuda-chill
-
-
-cuda-chill: libcudachill_xform.a $(CUDACHILL_RUNNER_OBJS) $(RUNNER_OBJS)
- $(CC) $(CFLAGS) $(LIB_PATH) $(LUA_PATH) $(CUDACHILL_RUNNER_OBJS) $(RUNNER_OBJS) $< $(CORE_LIBS) $(ROSE_LIBS) $(RUNNER_LIBS) -o $@
-
-ifeq ($(SCRIPT_LANG),lua)
-chill: libchill_xform.a $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $(YACC_OBJS)
- $(CC) $(CFLAGS) $(LIB_PATH) $(LUA_PATH) $(YACC_OBJS) $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $< $(CORE_LIBS) $(ROSE_LIBS) $(RUNNER_LIBS) -o $@
-else
-ifeq ($(SCRIPT_LANG),python)
-chill: libchill_xform.a $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $(YACC_OBJS)
- $(CC) $(CFLAGS) $(LIB_PATH) $(YACC_OBJS) $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $< $(CORE_LIBS) $(ROSE_LIBS) $(RUNNER_LIBS) -o $@
-
-else
-chill: libchill_xform.a $(YACC_OBJS)
- $(CC) $(CFLAGS) $(LIB_PATH) $(YACC_OBJS) $< $(CORE_LIBS) $(ROSE_LIBS) -o $@
-endif
-endif
-
-
-lex.yy.cc: parser.ll parser.tab.hh
- flex++ parser.ll
-
-lex.yy.o: lex.yy.cc
- $(CC) $(CFLAGS) -c $< -o $@
-
-parser.tab.hh parser.tab.cc: parser.yy
- bison -t -d $<
-
-parser.tab.o: parser.tab.cc
- $(CC) $(CFLAGS) $(YACC_EXTRA_CFLAGS) $(INC_PATH) -DCHILL_BUILD_DATE="\"$(CHILL_BUILD_DATE)\"" -c $< -o $@
-
-
-parse_expr.tab.cc: parse_expr.yy
- bison -t -d parse_expr.yy
-
-parse_expr.tab.o: parse_expr.tab.cc
- $(CC) $(CFLAGS) $(YACC_CFLAGS) $(INC_PATH) -o $@ -c parse_expr.tab.cc
-
-parse_expr.yy.cc: parse_expr.tab.cc parse_expr.ll
- flex -o parse_expr.yy.cc parse_expr.ll
-
-parse_expr.yy.o: parse_expr.yy.cc
- $(CC) $(CFLAGS) $(YACC_CFLAGS) $(INC_PATH) -o $@ -c parse_expr.yy.cc
-
-$(RUNNER_SRCS:.cc=.o): %.o: %.cc
- $(CC) $(CFLAGS) $(RUNNER_EXTRA_CFLAGS) $(INC_PATH) $(RUNNER_DEFINES) $< -c -o $@
-
-$(CHILL_RUNNER_SRCS:.cc=.o): %.o: %.cc
- $(CC) $(CFLAGS) $(RUNNER_EXTRA_CFLAGS) $(INC_PATH) $(RUNNER_DEFINES) $< -c -o $@
-
-$(CUDACHILL_RUNNER_SRCS:.cc=.o): %.o %.cc
- $(CC) $(CFLAGS) $(RUNNER_EXTRA_CFLAGS) $(INC_PATH) $(RUNNER_DEFINES) $< -c -o $@
-
-
-$(IR_SRCS:.cc=.o): %.o: %.cc
- $(CC) -Wno-write-strings $(CFLAGS) $(INC_PATH) $< -c -o $@
-
-ifeq ($(shell test -f Makefile.deps && echo "true"), true)
-include Makefile.deps
-endif
-
-CHILL_BUILD_DATE = $(shell date +%m/%d/%Y)
-
diff --git a/chill_run.cc b/chill_run.cc
index a3c9180..d33819b 100644
--- a/chill_run.cc
+++ b/chill_run.cc
@@ -281,14 +281,14 @@ int main( int argc, char* argv[] )
//---
// Run a CHiLL interpreter
//---
- printf("CUDA-CHiLL v0.2.0 (built on %s)\n", CHILL_BUILD_DATE);
+ printf("CHiLL v0.2.1 (built on %s)\n", CHILL_BUILD_DATE);
printf("Copyright (C) 2008 University of Southern California\n");
printf("Copyright (C) 2009-2012 University of Utah\n");
//is_interactive = true; // let the lua interpreter know.
fflush(stdout);
// TODO: read lines of python code.
//Not sure if we should set fail from interactive mode
- printf("CUDA-CHiLL ending...\n");
+ printf("CHiLL ending...\n");
fflush(stdout);
}
@@ -336,7 +336,7 @@ int main( int argc, char* argv[] )
//---
// Run a CHiLL interpreter
//---
- printf("CUDA-CHiLL v0.2.0 (built on %s)\n", CHILL_BUILD_DATE);
+ printf("CUDA-CHiLL v0.2.1 (built on %s)\n", CHILL_BUILD_DATE);
printf("Copyright (C) 2008 University of Southern California\n");
printf("Copyright (C) 2009-2012 University of Utah\n");
is_interactive = true; // let the lua interpreter know.
@@ -359,7 +359,6 @@ int main( int argc, char* argv[] )
#endif
#ifdef BUILD_ROSE
((IR_cudaroseCode *)(ir_code))->commit_loop(myloop, lnum);
- ((IR_roseCode*)(ir_code))->finalizeRose();
#elif BUILD_SUIF
((IR_cudasuifCode *)(ir_code))->commit_loop(myloop, lnum);
#endif
@@ -375,16 +374,14 @@ int main( int argc, char* argv[] )
lnum_end = get_loop_num_end(L);
DEBUG_PRINT("calling ROSE code gen? loop num %d - %d\n", lnum_start, lnum_end);
#endif
-
+#endif
#ifdef BUILD_ROSE
finalize_loop(lnum_start, lnum_end);
//((IR_roseCode*)(ir_cide))->commit_loop(myloop, lnum);
((IR_roseCode*)(ir_code))->finalizeRose();
- #elif BUILD_SUIF
- ((IR_suifCode*)(ir_code))->commit_loop(myloop, lnum);
+ //#elif BUILD_SUIF
+ //((IR_suifCode*)(ir_code))->commit_loop(myloop, lnum);
#endif
-
-#endif
delete ir_code;
}
#ifdef PYTHON
diff --git a/chillmodule.cc b/chillmodule.cc
index fa55199..fbeb477 100644
--- a/chillmodule.cc
+++ b/chillmodule.cc
@@ -1431,7 +1431,7 @@ static PyObject* chill_permute(PyObject* self, PyObject* args) {
int stmt_num = intArg(args, 1);
int level = intArg(args, 2);
std::vector<int> pi;
- if(!tointvector(args, 2, pi))
+ if(!tointvector(args, 3, pi))
throw std::runtime_error("the third argument in permute(stmt_num, level, pi) must be an int vector");
myloop->permute(stmt_num, level, pi);
}
@@ -1750,7 +1750,7 @@ static PyMethodDef ChillMethods[] = {
{"print_space", chill_print_space, METH_VARARGS, "print something or other "},
{"add_sync", chill_add_sync, METH_VARARGS, "add sync, whatever that is"},
{"rename_index", chill_rename_index, METH_VARARGS, "rename a loop index"},
- {"permute", chill_permute_v2, METH_VARARGS, "change the order of loops?"},
+ {"permute", chill_permute, METH_VARARGS, "change the order of loops?"},
{"tile3", chill_tile_v2_3arg, METH_VARARGS, "something to do with tile"},
{"tile7", chill_tile_v2_7arg, METH_VARARGS, "something to do with tile"},
{"thread_dims", thread_dims, METH_VARARGS, "tx, ty, tz "},
diff --git a/dep.cc b/dep.cc
index 7bf781a..a675d03 100644
--- a/dep.cc
+++ b/dep.cc
@@ -37,7 +37,7 @@ std::ostream& operator<<(std::ostream &os, const DependenceVector &d) {
switch (d.type) {
case DEP_W2R:
- os << "flow";
+ os << "true";
if (d.is_reduction)
os << "_reduction";
break;
diff --git a/examples/chill/gemm.c b/examples/chill/gemm.c
deleted file mode 100644
index a565511..0000000
--- a/examples/chill/gemm.c
+++ /dev/null
@@ -1,18 +0,0 @@
-
-#define N 512
-
-int main() {
-
- float a[N][N], b[N][N], c[N][N];
-
- int i, j, k;
-
- for (j = 0; j < N; j++)
- for (k = 0; k < N; k++)
- for (i = 0; i < N; i++) {
- c[i][j] = c[i][j] + a[i][k] * b[k][j];
- }
-
- return 0;
-}
-
diff --git a/examples/chill/gemm.script b/examples/chill/gemm.script
deleted file mode 100644
index ed91567..0000000
--- a/examples/chill/gemm.script
+++ /dev/null
@@ -1,31 +0,0 @@
-#matrix multiply large array size for intel machine
-source: gemm.c
-procedure: main
-format: rose
-loop: 0
-
-TI = 128
-TJ = 8
-TK = 512
-UI = 2
-UJ = 2
-
-permute([3,1,2])
-tile(0,2,TJ)
-#print space
-tile(0,2,TI)
-#print space
-tile(0,5,TK)
-#print space
-
-datacopy(0,3,a,false,1)
-#print space
-
-datacopy(0,4,b)
-print
-unroll(0,4,UI)#print space
-print
-unroll(0,5,UJ)
-#print space
-print
-
diff --git a/examples/chill/gemv.c b/examples/chill/gemv.c
deleted file mode 100644
index 610d4cb..0000000
--- a/examples/chill/gemv.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#define N 10
-
-int main() {
- // int n;
- float a[N];
- float b[N];
- float c[N][N];
-
- int i, j;
-
- for (i = 1; i < N; i++)
- for (j = 1; j < N; j++)
- a[i] = a[i] + c[i][j] * b[j];
-
-}
diff --git a/examples/chill/gemv.script b/examples/chill/gemv.script
deleted file mode 100644
index f1d5f89..0000000
--- a/examples/chill/gemv.script
+++ /dev/null
@@ -1,9 +0,0 @@
-source: gemv.c # matrix-vector multiply
-procedure: main
-format : rose
-loop: 0
-
-
-
-original()
-print
diff --git a/examples/chill/jacobi1.c b/examples/chill/jacobi1.c
deleted file mode 100644
index 0fcaee4..0000000
--- a/examples/chill/jacobi1.c
+++ /dev/null
@@ -1,13 +0,0 @@
-#define N 512
-
-int main() {
- int i, t;
-
- float a[N][N];
-
- for (t = 2; t <= 100; t++)
- for (i = 2; i <= N - 1; i++)
- a[t][i] = a[t - 1][i - 1] + a[t - 1][i] + a[t - 1][i + 1];
-
- return 0;
-}
diff --git a/examples/chill/jacobi1.script b/examples/chill/jacobi1.script
deleted file mode 100644
index c0dec8d..0000000
--- a/examples/chill/jacobi1.script
+++ /dev/null
@@ -1,18 +0,0 @@
-#
-# tiling perfect jacobi loop nest with time step, use
-# unimodular transformation first (only applicable to the
-# perfect loop nest) to make tiling legal.
-#
-
-source: jacobi1.c
-procedure: main
-format : rose
-loop: 0
-
-print dep
-
-nonsingular([[1,0],[1,1]]) # unimodular matrix, determinant is one
-tile(0,2,64)
-
-print dep
-print
diff --git a/examples/chill/jacobi2.c b/examples/chill/jacobi2.c
deleted file mode 100644
index b8d8d7b..0000000
--- a/examples/chill/jacobi2.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#define N 512
-
-int main() {
- double a[N];
- double b[N];
- int t, i;
- for (t = 1; t <= 100; t++) {
- for (i = 2; i <= N - 1; i++)
- b[i] = (double) 0.25 * (a[i - 1] + a[i + 1]) + (double) 0.5 * a[i];
-
- for (i = 2; i <= N - 1; i++)
- a[i] = b[i];
- }
- return 0;
-}
diff --git a/examples/chill/jacobi2.script b/examples/chill/jacobi2.script
deleted file mode 100644
index afe14c6..0000000
--- a/examples/chill/jacobi2.script
+++ /dev/null
@@ -1,21 +0,0 @@
-#
-# tiling imperfect jacobi loop nest, more details in the paper
-# "Automatic Tiling of Iterative Stencil Loops" by Zhiyuan Li and
-# Yonghong Song, TOPLAS, 2004.
-#
-
-source: jacobi2.c
-procedure: main
-format: rose
-loop: 0
-
-print dep
-
-original()
-shift([1], 2, 1)
-fuse([0,1], 2) # optional
-skew([0,1], 2, [2,1])
-tile(0, 2, 32, 1)
-
-print dep
-print
diff --git a/examples/chill/unroll.c b/examples/chill/unroll.c
deleted file mode 100644
index e74dea3..0000000
--- a/examples/chill/unroll.c
+++ /dev/null
@@ -1,33 +0,0 @@
-
-#define N 14
-#define DT 0.314
-
-void foo(int n, float* x, float* y, float* z, float* f3, float* f1, float* w) {
-
- int i, j;
-
- for (i = 1; i <= 14; i++)
- x[i] = 1.0;
-
- for (i = 1; i <= 14; i += 3)
- y[i] = 1.0;
-
- for (i = N + 1; i <= N + 20; i += 3)
- z[i] = 1.0;
-
- for (i = 0; i <= N; i++) {
- for (j = i; j <= i + N; j++)
- f3[i] = f3[i] + f1[j] * w[j - i];
- f3[i] = f3[i] * DT;
- }
-
- return 0;
-}
-
-int main() {
- float x[N], y[N], z[N], f3[N], f1[N], w[N];
-
- foo(N, x, y, z, f3, f1, w);
- return 0;
-}
-
diff --git a/examples/chill/unroll.script b/examples/chill/unroll.script
deleted file mode 100644
index e64acb6..0000000
--- a/examples/chill/unroll.script
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Test unroll-and-jam. The last loop adapted from the simple
-# convolution example from p463 of "Optimizing Compilers for
-# Modern Architectures", by Randy Allen and Ken Kennedy.
-#
-
-source: unroll.c
-procedure: foo
-format: rose
-# fully unroll a loop with known iteration count
-loop: 0
-original()
-unroll(0,1,3)
-print
-print space
-
-
-# a strided loop
-loop: 1
-original()
-unroll(0,1,2)
-print
-print space
-
-# lower and upper bounds are not constant
-loop: 2
-original()
-unroll(0,1,20)
-print
-
-# parallelogram iteration space
-loop: 3
-original()
-unroll(0,1,2)
-print
diff --git a/examples/cuda-chill/cp.c b/examples/cuda-chill/cp.c
deleted file mode 100644
index 837d7a6..0000000
--- a/examples/cuda-chill/cp.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#define N 1
-
-#define VOLSIZEY 512
-#define VOLSIZEX 512
-#define VOLSIZEZ 1
-#define ATOMCOUNT 4000
-#define GRIDSPACING 0.1
-#define zDim 0
-
-extern float sqrtf(float);
-
-void cenergy_cpu(float atoms[ATOMCOUNT*4],float *energy,float z)
-{
-int i,j,n;float dx,dy,dz;
-
- for (j=0; j<VOLSIZEY; j++) {
- for (i=0; i<VOLSIZEX; i++) {
- for (n=0;n<ATOMCOUNT;n+=4) {
- dx = (GRIDSPACING * i) - atoms[n];
- dy = (GRIDSPACING * j) - atoms[n+1];
- dz = z - atoms[n+2];
- energy[(j*VOLSIZEX + i)+VOLSIZEX*VOLSIZEY*zDim] += atoms[n+3]/sqrtf( (dx*dx) + (dy*dy)+ (dz*dz) ) ;
- }
-
-
- }
- }
-}
-
diff --git a/examples/cuda-chill/cp.lua b/examples/cuda-chill/cp.lua
deleted file mode 100644
index 1ef2264..0000000
--- a/examples/cuda-chill/cp.lua
+++ /dev/null
@@ -1,46 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("cp.c", "cenergy_cpu", 0)
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-V=512
-N=4000
-N=1
-
-Tj=32
-Ti=16
-Tii=16
-Tjj=16
-
---normalize_index("j")
---normalize_index("i")
-print_code()
-normalize_index("n")
--- TILE COMMANDS ZEROOOOOOOOOOO:3
---permute(0,{"i","j","n"})
---tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","n"})--CU=-1
-tile_by_index({"j","i"},{Tj,Ti},{l1_control="jj",l2_control="ii"},{"jj","ii","j","i","n"})--CU=-1
---tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1
-
---tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3
---tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","i","iii","j","jjj","n"})--CU=3
---tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"})
---tile_by_index({"i"}, {Tii}, {l1_control="iii",l1_tile="i"}, {"ii", "jj", "iii","i","j","n"})
-print_code()
-cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"jj","ii"}, thread={"j","i"}})--CU=3
---cudaize("kernel_GPU",{atoms=N*4,energy=V*V*1},{block={"ii","jj"}, thread={"i","j"}})--CU=3
-print_code()
-copy_to_shared("tx","atoms",-16)
-copy_to_registers("tx","energy")
---copy_to_texture("atoms")
---unroll_to_depth(1)
---unroll(0,9,0)
---unroll(0,5,0)
-
---unroll(0,8,256)
-print_code()
diff --git a/examples/cuda-chill/cudaize.lua b/examples/cuda-chill/cudaize.lua
deleted file mode 100644
index 7359cca..0000000
--- a/examples/cuda-chill/cudaize.lua
+++ /dev/null
@@ -1,1004 +0,0 @@
-
--- THIS IS CUDAIZE.LUA
-
-function table.contains_key(table, key)
- for k in pairs(table) do
- if k == key then
- return true
- end
- end
- return false
-end
-
-function valid_indices(stmt, indices)
- --print( "valid_indices() lua calling C cur_indices")
- --io.flush()
- cur = cur_indices(stmt)
- --print("Cur indices "..list_to_string(cur))
- for idx in pairs(indices) do
- if not table.contains_key(cur,idx) then
- return false
- end
- end
- return true
-end
-
-function next_clean_level(cur_idxs,level)
- --print("next_clean_level( ..., "..level.." )")
- --print(string.format("indices_at_each_level %s ",list_to_string(cur_idxs) ))
-
- --print("loop to "..#cur_idxs)
- for i=level+1,#cur_idxs do
- --print("Checking level "..i.." = '"..cur_idxs[i].."'")
- if (# cur_idxs[i] > 0) then
- --print("Good enough"..(# cur_idxs[i]))
- --print("returning "..i)
- return i
- end
- end
- return -1 --sentinal that there were no non-dummy indices left
-end
-
-function build_order(final_order, tile_idx_names, ctrl_idx_names, tile_idx_map, cur_level)
- order = {}
- --print("\nbuild_order()")
- --print("build_order(): final_order = ( "..list_to_string(final_order).." )")
- --print("build_order(): ctrl_idx_names = ("..list_to_string(ctrl_idx_names).." )")
- --print("cur_level "..cur_level.."")
- --io.flush()
-
- for i,k in ipairs(final_order) do
- skip = false
- cur = final_order[i]
- --print("\ncur "..cur.." = final_order["..i.."] = "..final_order[i].." ")
- --control loops below our current level should not be in the current order
- for j=cur_level+2,# ctrl_idx_names do
- --print("j "..j.." final_order["..i.."] = "..final_order[i].." ")
- if ctrl_idx_names[j] == final_order[i] then
- skip = true
- --print("SKIP "..final_order[i].." ")
- --io.flush()
- end
- end
- --possibly substitute tile indices ifn necessar
- if table.contains_key(tile_idx_map,final_order[i]) then
- approved_sub = false
- sub_string = tile_idx_map[final_order[i]]
- for j=cur_level+2,# tile_idx_names do
- if tile_idx_names[j] == sub_string then
- approved_sub = true
- end
- end
- if approved_sub then
- cur = sub_string
- end
- end
- if not skip then
- table.insert(order,cur)
- end
- end
- return order
-end
-
-function list_to_string(str_list)
- --Helpful debug output
- l = ""
- for i,str in ipairs(str_list) do
- if i > 1 then
- l = l .. ", " .. str
- else
- l = str
- end
- end
- return l
-end
-
-
-function find_cur_level(stmt,idx)
- --Search cur_indices for a idx at stmt
- cur = cur_indices(stmt)
- --print(string.format("find_cur_level(stmt %d, idx %s) Cur indices %s", stmt, idx, list_to_string(cur)))
- for i,cidx in ipairs(cur) do
- if cidx == idx then
- --print(string.format("found it at index %d", i))
- return i
- end
- end
- error("Unable to find "..idx.." in current list of indices")
-end
-
-
-function chk_cur_level(stmt,idx)
- --Search cur_indices for a idx at stmt
- cur = cur_indices(stmt)
- for i,cidx in ipairs(cur) do
- if cidx == idx then
- return i
- end
- end
- return -1
-end
-
-
-function find_offset(cur_order, tile, control)
- --print("Looking for tile '"..tile.."' and control '"..control.."' in ( "..list_to_string(cur_order)..", )")
- idx1 = -1
- idx2 = -1
- for i,cur in ipairs(cur_order) do
- if(cur == tile) then
- idx1 = i
- end
- if(cur == control) then
- idx2 = i
- end
- end
- if(idx1 < 0) then
- error("Unable to find tile " .. tile .. " in current list of indices")
- end
- if(idx2 < 0) then
- error("Unable to find control " .. control .. " in current list of indices")
- end
- --print("found at level " .. idx2 .. " and " .. idx1)
- if(idx2 < idx1) then
- return idx2-idx1+1
- else
- return idx2-idx1
- end
-end
-
-function tile_by_index(tile_indices, sizes, index_names, final_order, tile_method)
- --print "STARTING TILE BY INDEX"
- --io.flush()
- stmt = 0 --assume stmt 0
- cur = cur_indices(stmt)
- --print("Cur indices "..list_to_string(cur))
- if not valid_indices(stmt,tile_indices) then
- error('One of the indices in the first parameter were not '..
- 'found in the current set of indices.')
- end
- if not tile_method then tile_method = counted end
- tile_idx_names = {}
- for i,s in ipairs(tile_indices) do tile_idx_names[i]=s end --shallow copy
- --print("tile_index_names: ['"..list_to_string(tile_indices).."']")
-
- --print("index_names: ")
- --for k,v in pairs(index_names) do print(k,v) end
-
- --io.flush()
-
- ctrl_idx_names = {}
- tile_idx_map = {}
- for k,v in pairs(index_names) do
- valid = false
- if(string.sub(k,1,1) == "l") then
- if string.sub(k,-8) == "_control" then
- i = tonumber(string.sub(k,2,-9))
- if i and i >= 1 and i <= (# tile_indices) then
- ctrl_idx_names[i] = v
- --print(string.format("Handling control %s for loop level %d",v,i))
- --print("control "..k.." name "..v.." ")
- valid = true
- end
- elseif string.sub(k,-5) == "_tile" then
- i = tonumber(string.sub(k,2,-6))
- if i and i >= 1 and i <= (# tile_indices) then
- --print(string.format("tile %s -> %s",tile_indices[i], v))
- tile_idx_names[i] = v
- tile_idx_map[v] = tile_indices[i]
- --print(string.format("tile %s -> %s",tile_indices[i], v))
- valid = true
- end
- end
- end
- if not valid then error(string.format("%s is not a proper key for specifying "..
- "tile or control loop indices\n", k)) end
- end
-
- --filter out control indices (and do name substitution of unprocessed tile indices) for a given level
- cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, -1)
- permute(stmt, cur_order)
-
- for i,cur_idx in ipairs(tile_indices) do
- --print(string.format("i %d cur_idx %s calling build order ********", i-1, cur_idx))
- cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
- --Find a offset between tile loop and control loop
- -- 0 = control loop one level above tile loop
- -- -1 = control loop two levels above tile loop
- -- > 0 = tile loop above control loop
- -- In the last case, we do two extra tile commands to get the control
- -- above the tile and then rely on the final permute to handle the
- -- rest
- level = find_cur_level(stmt,cur_idx)
- offset = find_offset(cur_order, tile_idx_names[i], ctrl_idx_names[i])
- --print(string.format("offset %d", offset))
-
- if (offset <= 0) then
- --print(string.format("[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %s)",stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method))
- tile(stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method)
- else
- --print(string.format("2tile(%d, %d, %d, %d, %s, %s, %s)", stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method))
- tile(stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method);--regular level
- --flip tile and control loop
- --print(string.format("3tile(%d, %d, %d)",stmt, level+1, level+1))
- tile(stmt, level+1, level+1);
- --print(string.format("4tile(%d, %d, %d)",stmt, level+1, level))
- tile(stmt, level+1, level);
- --print(string.format("\n[offset>0]tile(%d, %d, %d, %d,%s,%s,%s)",stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method))
- --print_code()
-
- end
-
- --Do permutation based on cur_order
- --print "permute based on build order calling build_order()"
- --print "cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)"
- cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)
- --print "permute(stmt, cur_order);"
- permute(stmt, cur_order);
- --print "\nafter permute(), code is:"
- --print_code()
- end
- --print "ENDING TILE BY INDEX"
- --print_code()
-end
-
-function normalize_index(index)
- stmt = 0 --assume stmt 0cur = cur_indices(stmt)
- --print("Cur indices "..list_to_string(cur))
- l = find_cur_level(stmt, index)
- tile(stmt, l, l)
- --print(string.format("\n[Normalize]tile(%d, %d, %d)",stmt, l,l))
-end
-
-function is_in_indices(stmt, idx)
- cur = cur_indices(stmt)
- for i=0,#cur,1 do
- if(cur[i]==idx) then
- return true
- end
- end
- return false
-
-end
-
-
-function copy_to_registers(start_loop, array_name)
-
- --print("\n\n****** starting copy to registers")
- io.flush()
-
- stmt = 0 --assume stmt 0
-
- -- [Malik] first we make sure that tx and ty are consecutive loops in the 2D thread setup, otherwise all levels for subsequent operations are messed up. Start logic.
- cur = cur_indices(stmt)
- table_Size = table.getn(cur)
-
- --print(string.format("Cur indices %s,",list_to_string(cur)))
- --print(string.format("The table size is %d", table_Size))
- --table.foreach(cur, print)
- --print_code()
-
- level_tx = -1
- level_ty = -1
- if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end
- if is_in_indices(stmt,"ty") then level_ty = find_cur_level(stmt,"ty") end
- --print(string.format("level_tx %d level_ty %d", level_tx, level_ty))
-
- ty_lookup_idx = ""
- org_level_ty = level_ty
-
- --if(cur[level_tx+1]~=nil and cur[level_tx+1]~="") then ty_lookup = ty_lookup+1 end
- if(cur[level_ty+1]~=nil and cur[level_ty+1]~="") then
- --print(string.format("IF cur[%d] = %s", level_ty+1, cur[level_ty+1]))
- ty_lookup_idx = cur[level_ty+1]
- else
- --if cur[level_ty] ~= nil then print(string.format("ELSE ty_lookup_idx = cur[%d] = %s", level_ty, cur[level_ty])) -- TODO
- --else print "ELSE (dangerous)" end
- ty_lookup_idx = cur[level_ty] -- may assign nil !?
- end
- --if ty_lookup_idx ~= nil then print(string.format("ty_lookup_idx '%s'", ty_lookup_idx)) -- TODO
- --else print "ty_lookup_idx is NIL"
- --end
-
- if level_ty > 0 then
- --print(string.format("\ntile3(%d,%d,%d)",stmt,level_ty,level_tx+1))
- tile(stmt,level_ty,level_tx+1)
- end
- --print_code()
-
- --print("\ntylookup is %d",ty_lookup)
- --exit(0)
- --
- cur = cur_indices(stmt)
- table_Size = table.getn(cur)
- --print(string.format("Cur indices %s,",list_to_string(cur)))
- --print("The table size is "..table.getn(cur))
- --table.foreach(cur, print)
-
- if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end
- if ty_lookup_idx then
- if is_in_indices(stmt,ty_lookup_idx) then level_ty = find_cur_level(stmt,ty_lookup_idx) end
- end
-
- ty_lookup = 1
- idx_flag = -1
- -- find the level of the next valid index after ty+1
- --print(string.format("\nlevel_ty %d", level_ty))
- if level_ty > 0 then
- --print(string.format("table_Size %d", table_Size))
- for num= level_ty+ty_lookup,table_Size do
- --print(string.format("num=%d cur[num] = '%s'",num, cur[num]))
- if(cur[num] ~= "") then
- idx_flag = find_cur_level(stmt,cur[num])
- --print (string.format("idx_flag = %d", idx_flag))
- break
- end
- end
- end
-
- --print(string.format("\n(first) I am checking all indexes after ty+1 %s",idx_flag))
- --print_code()
- --print ""
-
- how_many_levels = 1
- startat = idx_flag + 1
- if startat == 0 then startat = 1 end -- avoid attempt to examine an illegal array offset
- --print(string.format("idx_flag = %d I will check levels starting with %d", idx_flag, idx_flag+1))
-
- for ch_lev = startat,table_Size,1 do -- was for ch_lev = idx_flag+1,table_Size,1 do
- --print(string.format("ch_lev %d", ch_lev))
- if(cur[ch_lev] ~= nil and cur[ch_lev] ~= "") then
- --print(string.format("cur[%d] = '%s'", ch_lev, cur[ch_lev]))
- how_many_levels = how_many_levels+1
- end
- end
- --print("\nHow Many Levels",how_many_levels)
-
- -- change this all to reflect the real logic which is to normalize all loops inside the thread loops.
- if(how_many_levels <2) then
- while( idx_flag >= 0) do
- for num = level_ty+ty_lookup,(table_Size) do
- --print(string.format("at top of loop, num is %d", num))
- --print(string.format("num %d", num))
- --print(string.format("cur[num] = '%s'", cur[num]))
- if(cur[num] ~= "") then
- idx=cur[num]
- --print(string.format("idx '%s'", idx))
-
- curlev = find_cur_level(stmt,idx)
- --print(string.format("curlev %d", curlev))
-
- --print_code()
- --print(string.format("\n[COPYTOREG]tile(%d,%d,%d)",stmt,find_cur_level(stmt,idx),level_tx))
- tile(stmt,find_cur_level(stmt,idx),find_cur_level(stmt,idx))
- curlev = find_cur_level(stmt,idx)
- --print(string.format("curlev %d", curlev))
- tile(stmt,find_cur_level(stmt,idx),level_tx)
- --print(string.format("hehe '%s'",cur[num]))
-
- cur = cur_indices(stmt)
- --print("Cur indices INSIDE"..list_to_string(cur))
- table_Size = table.getn(cur)
- --print(string.format("Table Size is: %d",table_Size))
- level_tx = find_cur_level(stmt,"tx")
- --print(string.format("\n level TX is: %d",level_tx))
- level_ty = find_cur_level(stmt,ty_lookup_idx)
- --print(string.format("\n level TY is: %d",level_ty))
- idx_flag = -1
- --print "idx_flag = -1"
-
- -- find the level of the next valid index after ty+1
-
- -- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?)
- for num= level_ty+ty_lookup,table_Size do
- --print(string.format("num mucking num = %d", num))
- if(cur[num] ~= nil and cur[num] ~= "") then
- idx_flag = find_cur_level(stmt,cur[num])
- --print("\n(second) I am checking all indexes after ty+1 %s",cur[num])
- break
- end
- end
- --print(string.format("num mucked to %d idx_flag = %d", num, idx_flag))
-
- end
- --print(string.format("at bottom of loop, num is %d", num))
- end
- end
- end
- --print "done with levels"
-
-
-
-
- --print "ARE WE SYNCED HERE?"
- --print_code()
- --print("\ntile(%d,%d,%d)",stmt,level_k,level_k)
- --tile(stmt,level_k,level_k)
-
- -- [Malik] end logic
- --print_code()
- start_level = find_cur_level(stmt, start_loop)
- --We should hold contant any block or tile loop
- block_idxs = block_indices()
- thread_idxs = thread_indices()
- --print("\nblock indices are")
- --table.foreach(block_idxs, print)
- --print("\nthread indices are")
- --table.foreach(thread_idxs, print)
- --print(string.format("\nStart Level: %d",start_level))
-
- hold_constant = {}
- --print("\n Now in Blocks")
- for i,idx in ipairs(block_idxs) do
- --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx)))
- if find_cur_level(stmt,idx) >= start_level then
- table.insert(hold_constant, idx)
- --print(string.format("\nJust inserted block %s in hold_constant",idx))
- end
- end
-
-
- --print("\n Now in Threads")
- for i,idx in ipairs(thread_idxs) do
- --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx)))
- if find_cur_level(stmt,idx) >= start_level then
- table.insert(hold_constant, idx)
- --print(string.format("\nJust inserted thread %s in hold_constant",idx))
- end
- end
-
- --print "\nhold constant table is: "
- --table.foreach(hold_constant, print)
-
- --print("\nbefore datacopy pvt")
- old_num_stmts = num_statements()
- --print_code()
- --print(string.format("\n[DataCopy]datacopy_privatized(%d, %s, %s, vector having privatized levels)",stmt, start_loop, array_name))
- --table.foreach(hold_constant, print)
- datacopy_privatized(stmt, start_loop, array_name, hold_constant)
-
- --print(hold_constant)
- new_num_stmts = num_statements()
- --print("\nthe num of statements:%d\n",new_num_stmt)
- --print_code()
- --exit(0)
- -- [Malik] normalize the copy loops created.
- cur = cur_indices(old_num_stmts)
- --print("Cur indices "..list_to_string(cur))
- for cidx,i in ipairs(cur) do
- if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
- --tile(old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i))
- --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i))
- end
- end
- --print_code()
- --print("\nthe num of statements OLD+1 :",(old_num_stmts+1))
-
-
---[[
- is this commented out? why yes, yes it is block comment
- if( (old_num_stmts+1) <= new_num_stmts) then
- cur = cur_indices(old_num_stmts+1)
- --print("Cur indices+1 "..list_to_string(cur))
- for cidx,i in ipairs(cur) do
- if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then
- tile(old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
- --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i))
- end
- end
- end
---]]
-
-
- --Unroll to the last thread level
- --for stmt=old_num_stmts,new_num_stmts-1 do
- -- level = find_cur_level(stmt,thread_idxs[#thread_idxs])--get last thread level
- --if level < #cur_indices(stmt) then
- -- unroll(stmt,level+1,0)
- --print(string.format("\n[Unroll]unroll(%d, %d, 0)",stmt, level+1))
- ----print_code()
- --end
- --end
- io.flush()
- --print("****** ending copy to registers\n\n")
- --io.flush()
-end
-
-function copy_to_shared(start_loop, array_name, alignment)
- --print(string.format("\nstarting copy to shared(%s, %s, %d )",start_loop,array_name,alignment))
- stmt = 0 --assume stmt 0
- cur = cur_indices(stmt)
- --print("Cur indices "..list_to_string(cur))
-
- start_level = find_cur_level(stmt, start_loop)
- --print(string.format("start_level %d", start_level))
-
- old_num_stmts = num_statements()
- --print(string.format("old_num_statements %d", old_num_stmts))
-
- --Now, we give it indices for up to two dimentions for copy loop
- copy_loop_idxs = {"tmp1","tmp2"}
- --print(string.format("\n[DataCopy]datacopy(%d, %d, %s, {\"tmp1\",\"tmp2\"},false,0,1,%d,true)",stmt, start_level, array_name, alignment))
- datacopy(stmt, start_level, array_name, copy_loop_idxs, false, 0, 1, alignment,true)
-
- add_sync(stmt,start_loop)
- new_num_stmts = num_statements()
-
- --This is fairly CUBLAS2 specific, not sure how well it generalizes,
- --but for a 2D copy, what we want to do is "normalize" the first loop
- --"tmp1" then get its hard upper bound. We then want to tile it to
- --make the control loop of that tile "ty". We then tile "tmp2" with a
- --size of 1 and make it "tx".
- --print(string.format("fairly CUBLAS2 specific, OLD %d NEW %d", old_num_stmts, new_num_stmts ))
-
- for stmt=old_num_stmts,new_num_stmts-1 do
- --print(string.format("for stmt = %d", stmt))
- was_no_error, level = pcall(find_cur_level, stmt, "tmp2")
-
- if was_no_error then
- --print_code()
- --print("\nCopy to shared: [If was no error]\n")
- find_cur_level(stmt,"tmp2")
- tile(stmt, level, level)
-
- lower,upper = hard_loop_bounds(stmt, level)
- upper = upper + 1
- --print(string.format("lower %d upper %d", lower, upper))
-
- tx,ty = thread_dims()
- --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx)
-
- level = find_cur_level(stmt,"tmp1")
- --print(string.format("level %d", level))
-
- if tx == upper and ty == 1 then
- --print(string.format("tx = %d upper = %d ty = %d", tx, upper, ty))
- --print "Don't need"
-
- --Don't need an extra tile level, just move this loop up
- second_level = find_cur_level(stmt,"tmp2")
- --print(string.format("\n[Tile0]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx"))
- tile(stmt, second_level, 1, level, "tx", "tx", counted)
- else
- --print "DO need?"
- --print_code()
- if(ty == 1) then new_ctrl = "tmp3" else new_ctrl = "ty" end
-
-
---[[ Commenting out a block of Gabe's code in this control flow
- -- level = find_cur_level(stmt,"tmp1")
- tile(stmt, level, level)
-
- lower,upper = hard_loop_bounds(stmt, level)
- upper = upper + 1
- --print_code()
- --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx..", level: "..level)
- if(math.ceil(upper/ty) > 1)then
- tile(stmt, level, math.ceil(upper/ty), level, "tmp", new_ctrl, counted)
- --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level, math.ceil(upper/ty),upper,ty, level, "tmp", new_ctrl))
- else
- tile(stmt, level, math.ceil(upper/ty), level, "ty", new_ctrl, counted)
- --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level, math.ceil(upper/ty),upper,ty, level, "tx", new_ctrl))
- end
-
- --print_code()
- -- [Malik] If here we have the loop upper bound > tx, then we should tile once more after the next tile, to carve out the correct tx.
- lower1,upper1 = hard_loop_bounds(stmt,level)
- level1 = level
- stmt1 = stmt
- -- [Malik] Do the tile after the second level tile with if condition. Just to keep the original order, the tile is being pushed to the end.
-
- --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1)
-
- --print_code()
- --level = find_cur_level(stmt,"tmp")
- --tile(stmt,level,level)
- --print_code()
-
- --[Malik] if you are moving the loop above the level1, you need to update level1 with new position which would be level1+2 or second_level
- if(level <= level1) then level1 = level1+2 end
- --print(string.format("\n[Tile2]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx"))
- --print("\n----------------------------------")
- --print_code()
- --print("\n**********************************")
- --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1)
- -- [Malik] If the upper bound > tx, we do another tile to carve out the correct tx from a bigger loop. Else just normalize the bounds.
- if( upper1 > ty) then
- third_level = find_cur_level(stmt1,"tmp")
- --print("\n\n\n\t\t\t\tthirdlevel:"..third_level)
- tile(stmt1, third_level, ty, third_level, "ty", "tmp", counted)
- --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt1, third_level, ty,third_level, "ty", "tmp"))
- tile(stmt1,third_level+1,third_level+1)
- --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level+1))
- tile(stmt1,third_level+1,third_level)
- --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level))
- else
- tile(stmt1,level1,level1)
- --print(string.format("\n[Tile3ELSE]tile(%d, %d, %d)",stmt1,level1,level1))
- end
-
- --print("\nStarting tmp2\n");--print_code();
- second_level = find_cur_level(stmt,"tmp2")
- lower,upper = hard_loop_bounds(stmt,second_level)
- level = second_level
- --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..", level:"..level)
-
- if(math.ceil(upper/tx) > 1)then
- tile(stmt, second_level,math.ceil(upper/tx), level, "tmp", "tx", counted)
- --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tmp", "tx"))
- else
- tile(stmt, second_level,math.ceil(upper/tx), level, "tx", "tx", counted)
- --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tx", "tx"))
- end
- --print_code()
- lower2,upper2 = hard_loop_bounds(stmt,level)
- level2 = level
- stmt2 = stmt
- --print("[Malik]-loop cleanup@tmp2: lower2, upper2: "..lower2..", "..upper2..", tx: "..tx..", level:"..level2)
- -- now for the second level.
- if( upper2 > tx) then
- forth_level = find_cur_level(stmt2,"tmp")
- --print("\n\n\n\t\t\t\tforthlevel:"..forth_level)
- --print_code()
- tile(stmt2, forth_level, 1, forth_level, "tx", "tmp", counted)
- --print(string.format("\n[Tile3B]tile(%d, %d, %d,%d,%s,%s,counted)",stmt2, forth_level, tx,forth_level, "ty", "tmp"))
- --print_code()
- --tile(stmt2,forth_level+1,forth_level+1)
- --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level+1))
- --tile(stmt2,forth_level+1,forth_level)
- --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level))
- else
- new_level = find_cur_level(stmt2,"ty")
- tile(stmt2,level2,1,new_level,"tx","tx",counted)
- --print(string.format("\n[Tile3BELSE]tile(%d, %d, %d)",stmt2,level2,level2))
- tmp_level = find_cur_level(stmt2,"tmp")
- tile(stmt2,tmp_level,tmp_level)
- end
-
- --print_code()
- --print("\n----------------------------------")
---]]
-
- --print_code()
- --print("\nStarting tmp2\n");--print_code();
- first_level = find_cur_level(stmt,"tmp1")
- second_level = find_cur_level(stmt,"tmp2")
- lower,upper = hard_loop_bounds(stmt,second_level)
-
- --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..",first level:"..first_level..",second_level:"..second_level)
-
- -- Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
- --print(string.format("\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,1,first_level, "tx", "tx"))
- tile(stmt,second_level,1,first_level,"tx","tx",counted)
- --print_code()
-
- first_level = find_cur_level(stmt,"tmp1")
- lower_1,upper_1 = hard_loop_bounds(stmt,first_level)
- tx_level = find_cur_level(stmt,"tx")
- lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level)
- --print(string.format("UL_1 %d %d UL_tx %d %d", lower_1, upper_1, lower_tx, upper_tx))
-
- if(math.ceil(upper_tx/tx) > 1)then
- --print "ceil I say"
- --print(string.format("\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,tx,tx_level, "tx", "tmp1"))
- tile(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)
- --print_code()
-
- peat = find_cur_level(stmt,"tx")
- --print(string.format("\n[Tile1]tile(%d, %d, %d)",stmt, peat, peat))
- tile(stmt, peat, peat ) --find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))
- --print_code()
-
- if (find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx")) then
- --print(string.format("\nagain [Tile1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")))
- tile(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
- --print_code()
- end
- --else
- --tile(stmt, tx_level,1, tx_level, "tx", "tx", counted)
- --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,1,tx_level, "tx", "tx"))
- end
- --print_code()
- --]] -- this apparently is NOT the end of a block comment
-
- --print("\nStarting tmp1\n")
- -- Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
- tile(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))
- --print_code()
-
- ty_level = find_cur_level(stmt,"tmp1")
- lower_ty,upper_ty = hard_loop_bounds(stmt,ty_level)
-
- tx_level = find_cur_level(stmt,"tx")
- lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level)
- --print("[Malik]-loop cleanup@tmp1: lowerty, upperty: "..lower_ty..", "..upper_ty..", ty: "..ty..",ty level:"..ty_level..",tx_level:"..tx_level..", stmt: "..stmt)
-
- --print "before ceil"
- if(math.ceil(upper_ty/ty) > 1)then
- --print "CEIL IF"
- --print("\n Inside upper_ty/ty > 1\n");
-
- --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,ty,ty_level, "ty", "tmp_ty"))
- tile(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
- --print_code()
-
- --print(string.format("\n[Tile2-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt ,"ty"),find_cur_level(stmt,"ty")))
- tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))
- --print_code()
-
- -----------------------------------------------------------------------
- ----------------------------------------------------------------------
- cur_idxs = cur_indices(stmt)
- --print("\n cur indexes are "..list_to_string(cur_idxs))
-
- -- Putting ty before any tmp_tx
- idx_flag = -1
- for num= 0,table.getn(cur_idxs) do
- if(cur[num] == "tmp_tx") then
- idx_flag = find_cur_level(stmt,cur[num])
- break
- end
- end
- --print(string.format("\n (1) so i have found out the value of idx flag as %d",idx_flag) )
-
- if(idx_flag >=0 ) then
- if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then
- --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
- tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- --print_code()
- end
- end
-
- -- Now Putting ty before any tmp_ty
- idx_flag = -1
- for num= 0,table.getn(cur_idxs) do
- if(cur[num] == "tmp_ty") then
- idx_flag = find_cur_level(stmt,cur[num])
- break
- end
- end
- --print(string.format("\n IF so i have found out the value of idx flag as %d",idx_flag) )
- if(idx_flag >=0 ) then
- --print "one more test"
- if ((find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"))) then
- --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
- tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- --print_code()
- end
- end
- else
- --print "CEIL ELSE"
- --cur_idxs = cur_indices(stmt)
- --print("\n Inside upper_ty/ty <= 1\n");
-
- --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,1,ty_level, "ty", "ty"))
- tile(stmt, ty_level,1, ty_level, "ty", "ty", counted)
- --print_code()
-
- --print(string.format("\n[Tile3-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1))
- tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
- --print_code()
-
- idx_flag = -1
- if(cur_idxs) then
- --print "CAN NEVER GET HERE? cur_idxs"
- for num= 0,table.getn(cur_idxs) do
- if(cur[num] == "tmp_ty") then
- idx_flag = find_cur_level(stmt,cur[num])
- break
- end
- end
- end
- --print(string.format("\n ELSE so i have found out the value of idx flag as %d",idx_flag) )
- if(idx_flag >=0 ) then
- if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then
- --print(string.format("tile( stmt %d, level ty %d, level ty %d",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
- tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- --print(string.format("\n[Tile3-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")))
- end
- end
- end
-
- --print_code()
- end
-
-
- --print "\n\n *** at bottom of if in copy to shared, "
- --print_code()
- --print "end of if"
-
- else
- --copy to shared only created one level, not two, so we use a different approach (MV & TMV)
- --print("\nCopy to shared: [If was error]\n")
- level = find_cur_level(stmt,"tmp1")
- tile(stmt, level, level)
-
- --print(string.format("\n[Tile]tile(%d, %d, %d)",stmt, level, level))
- tx,ty = thread_dims()
- lower,upper = hard_loop_bounds(stmt, level)
- upper = upper+1 --upper bound given as <=, compare to dimensions tx which is <
- --print("upper "..upper.." tx "..tx)
- if upper == tx then
- rename_index(stmt, "tmp1", "tx")
- else
- --print("upper is not tx")
- --TODO: Don't know, maybe do some tileing etc
- --print_code()
- --print("upper "..upper.." tx "..tx.." stmt: "..stmt.." level: "..level)
- tile(stmt, level,tx,level, "tx", "tmp_tx", counted)
- --print_code()
-
- --print("stmt:"..stmt.." level+1: "..level+1)
- --print("TILE 7")
- tile(stmt, level+1,1,level+1,"tx", "tx",counted)
- --print("TILE 3")
- tile(stmt,level+1,level)
- --print_code()
-
- if(ty > 1) then
- --print_code()
- --print("GOING IN")
- lower,upper = hard_loop_bounds(stmt, level+1)
- --print(string.format("ty %d lower %d upper %d", ty, lower, upper))
- --upper=125
- --print("NOW FOR Y: upper "..upper.." ty "..ty.." stmt: "..stmt.." level: "..(level+1).." bound:"..math.ceil(upper/ty))
- tile(stmt, level+1,math.ceil(upper/ty),level+1, "tmp_ty", "ty", counted)
- --tile(stmt, level+2,math.ceil(upper/ty),level+2, "tmp_ty", "ty", counted)
- end
- --print_code()
- --rename_index(stmt, "tmp1", "tx")
- --print("Warning: Need to implement some logic here to tile the single level shared copy loop to match thread dimensions")
- end
- end
- --Always add sync
- add_sync(stmt,start_loop)
-
- end
- --print("ending copy to shared\n")
- --print_code()
-end
-
-function unroll_to_depth(max_depth)
- --print(string.format("\n\nunroll_to_depth(%d)", max_depth ))
- --print "SYNC UP"
-
- cur = cur_indices(0)
- thread_idxs = thread_indices()
- guard_idx = thread_idxs[#thread_idxs]
-
- --print(string.format("cur indices %s",list_to_string(cur)))
- --print(string.format("thread indices %s",list_to_string(thread_idxs)))
- --print(string.format("#thread_idxs = %d", #thread_idxs))
- --print(string.format("guard_idx = %s", guard_idx))
-
- ---- HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS
- common_loops = {}
- comm_loops_cnt = 0
- num_stmts = num_statements()
- --print(string.format("num statements %d", num_stmts))
-
- for stmt=0,num_stmts-1 do
- cur_idxs = cur_indices(stmt)
-
- --print(string.format("\nSTMT %d Current Indices: %s",stmt,list_to_string(cur_idxs)))
-
- if(chk_cur_level(stmt,"tx")>0) then
- for ii=1,find_cur_level(stmt,"tx")-1 do -- started at 0
- --print(string.format("ii = %d", ii)) -- index starts at 1, what does index 0 do?
- --if cur_idxs[ii] == nil then print "cur_idxs[i]] is NIL"
- --else print(string.format("cur_idxs[%d] = '%s'", ii, cur_idxs[ii])) -- index starts at 1, what does index 0 do?
- --end
-
- if(cur_idxs[ii] ~= "bx" and cur_idxs[ii] ~= "by" and cur_idxs[ii] ~= nil and cur_idxs[ii] ~= "tx" and cur_idxs[ii] ~= "ty" and cur_idxs[ii] ~= "") then
-
- --print(string.format("id %s is not in the list", cur_idxs[ii] ))
-
- for stmt1=stmt+1,num_stmts-1 do
- --print(string.format("\nii %d stmt1 is %d", ii, stmt1))
- cur_idxs1 = cur_indices(stmt1)
- --print("\nstmt1 cur_idxs1 is "..list_to_string(cur_idxs1))
-
- --print(string.format("cur level(%d, %s) = %d", stmt, "tx", find_cur_level(stmt,"tx")))
-
- endrange = find_cur_level(stmt,"tx")-1
- --print(string.format("for iii=1, %d do", endrange))
-
- for iii=1,find_cur_level(stmt,"tx")-1 do -- started at 0
- --print(string.format("stmt %d ii %d iii %d ", stmt, ii, iii))
- --if(cur_idxs1[iii] ~= nil) then
- -- print(string.format("stmt %d ii %d iii %d cur_idxs1[%d] = '%s'", stmt, ii, iii, iii, cur_idxs1[iii]))
- --else
- -- print(string.format("stmt %d ii %d iii %d cur_idxs1[%d] = NIL", stmt, ii, iii, iii))
- --end
-
- if(cur_idxs1[iii] ~= "bx" and cur_idxs1[iii] ~= "by" and cur_idxs1[iii] ~= nil and cur_idxs1[iii] ~= "tx" and cur_idxs1[iii] ~= "ty" and cur_idxs1[iii] ~= "") then
- if(cur_idxs[ii] == cur_idxs1[iii]) then
- --print("\nfound idx:"..cur_idxs[ii])
- --if(comm_loops_cnt == 0) then print "\n\n*** WARNING *** assigning to array index ZERO in Lua" end
- common_loops[comm_loops_cnt] = cur_idxs[ii]
- --print(string.format("cl[%d] = '%s'", comm_loops_cnt, common_loops[comm_loops_cnt]))
- comm_loops_cnt = comm_loops_cnt + 1
- end
- end
- end
- end
- end
- end
- end
- end
- ----
- --if(comm_loops_cnt>0) then
- -- print("\n COMM LOOPS :TOTAL "..comm_loops_cnt..", and are "..list_to_string(common_loops).." this loop :"..common_loops[0])
- --else
- -- print "UNROLL can't unroll any loops?"
- --end
-
-
-
-
- repeat
- old_num_stmts = num_statements()
- --print(string.format("old_num_statements %d", old_num_stmts))
-
- for stmt=0,old_num_stmts-1 do
- cur_idxs = cur_indices(stmt)
- --print(string.format("stmt %d cur_idxs = %s", stmt, list_to_string(cur_idxs)))
- if(#cur_idxs > 0) then
- gaurd_level = -1
- if(chk_cur_level(stmt,guard_idx)>0) then
- gaurd_level = find_cur_level(stmt,guard_idx)
- end
- --print(string.format("guard_level(sp) = %d", gaurd_level))
-
- if(gaurd_level>-1) then
- level = next_clean_level(cur_idxs,gaurd_level)
- --print(string.format("next clean level %d", level))
-
- --need to handle max_depth
- num_unrolled = 0
- level_unroll_comm = level
- level_arr = {}
- while level >= 0 do
- --print(string.format("while: level = %d", level))
-
- if num_unrolled == max_depth then break end
- --print("Unrolling "..stmt.." at level "..(level).." index ".. cur_idxs[gaurd_level+1])
-
- level_arr[num_unrolled] = level
- num_unrolled = num_unrolled + 1
-
- guard_level = find_cur_level(stmt,guard_idx)
- level = next_clean_level(cur_idxs,level+1)
- end
- --dies print("How many levels for unroll commands"..table.getn(level_arr).." which is "..level_arr[0].." and "..level_arr[#level_arr])
- --if(table.getn(level_arr) ~= nil) then
-
- --print "OK, NOW WE UNROLL"
-
- if(level_unroll_comm >= 0)then
- for i = table.getn(level_arr),0,-1 do
- --print(string.format("\ni=%d", i))
- --print(string.format("[Unroll]unroll(%d, %d, 0)",stmt, level_arr[i]))
-
- unroll(stmt,level_arr[i],0)
- --print("finished unroll]]\n")
- --print_code()
- end
- end
-------
- end
---[[
-
-THERE WAS A BIG BLOCK OF COMMENTED OUT CODE HERE
-
-
---]]
-------
- end
- end
- new_num_stmts = num_statements()
-
- until old_num_stmts == new_num_stmts
-
-end
-
-
diff --git a/examples/cuda-chill/cudaize.py b/examples/cuda-chill/cudaize.py
deleted file mode 100755
index ffef009..0000000
--- a/examples/cuda-chill/cudaize.py
+++ /dev/null
@@ -1,1047 +0,0 @@
-#! /usr/bin/python
-
-# THIS IS CUDAIZE.PY
-
-import chill
-import sys
-import math
-
-strided = 0
-counted = 1
-
def print_code():
    """Print the current code via ``chill.print_code()``, then a blank line.

    Standard output is flushed afterwards.
    """
    chill.print_code()
    # Parenthesised form: identical output under the Python 2 print statement.
    print("")
    sys.stdout.flush()
-
-
def table_contains_key( table, key ):
    """Return True when `key` is present in `table`, False otherwise.

    `table` is normally a dict standing in for a Lua table. The membership
    operator is used instead of ``dict.has_key()``, which was removed in
    Python 3 and only ever worked on dicts.
    """
    return key in table
-
def print_array( arr ):
    """Print the elements of `arr` on one line, separated by ", ".

    Each element is formatted with ``"%s"``. An empty sequence prints an
    empty line; the original indexed ``arr[-1]`` and raised IndexError.
    Standard output is flushed afterwards.
    """
    # Parenthesised single-argument print behaves the same under the
    # Python 2 print statement, including after a `print x,` by the caller.
    print(", ".join("%s" % a for a in arr))
    sys.stdout.flush()
-
def valid_indices( statement, indices ):
    """Tell whether every name in `indices` is a current index of `statement`.

    The current index names come from ``chill.cur_indices(statement)``.
    An empty `indices` is considered valid.
    """
    current = chill.cur_indices(statement)  # calls C
    return all(name in current for name in indices)
-
def next_clean_level( indices_at_each_level, level):
    """Return the first level after `level` whose index name is non-empty.

    Levels are 1-based to match the Lua version: level ``n`` is
    ``indices_at_each_level[n-1]``. Returns -1 when every later level has
    an empty name. The index list is printed with print_array() on entry.
    """
    print_array( indices_at_each_level )

    deepest = len(indices_at_each_level)
    candidate = level + 1
    while candidate <= deepest:
        sys.stdout.flush()
        # convert the 1-based level to a 0-based list position
        if len(indices_at_each_level[candidate - 1]) != 0:
            return candidate
        candidate += 1
    return -1   # no non-dummy indices
-
-
-
-
def build_order( final_order, tile_index_names, control_index_names, tile_index_map, current_level):
    """Filter `final_order` down to the loop order valid at `current_level`.

    final_order         -- sequence of index names in the desired final order
    tile_index_names    -- tile index names, indexable by 0-based position
    control_index_names -- control loop names, indexable by 0-based position
                           (callers in this file pass a dict keyed by int)
    tile_index_map      -- dict mapping an index name to its substitute name
    current_level       -- 0-based position of the tile being processed,
                           or -1 before any tile has been processed

    Returns a new list. Names of control loops at positions after
    `current_level` are dropped. A name found in `tile_index_map` is
    replaced by its mapped value when that value is a tile index at a
    position after `current_level`.

    The original also built `count`, `keys` and `k`, which nothing read;
    they are removed. `keys.sort()` on `.keys()` and `dict.has_key()`
    both fail on Python 3.
    """
    order = []
    first_deeper = current_level + 1

    for cur in final_order:
        # control loops below our current level should not be in the current order
        skip = any(control_index_names[j] == cur
                   for j in range(first_deeper, len(control_index_names)))

        # possibly substitute tile indices if necessary
        if cur in tile_index_map:
            sub_string = tile_index_map[cur]
            approved_sub = any(tile_index_names[j] == sub_string
                               for j in range(first_deeper, len(tile_index_names)))
            if approved_sub:
                cur = sub_string

        if not skip:
            order.append(cur)
    return order
-
def find_cur_level( stmt, idx ):
    """Return the 1-based level of index `idx` in statement `stmt`.

    The index names come from ``chill.cur_indices(stmt)``. Levels start at
    1 to match the Lua version. Returns -1 when `idx` is not present.
    """
    for position, name in enumerate(chill.cur_indices(stmt), 1):
        if name == idx:
            return position
    return -1   # special meaning "it's not there"
-
def chk_cur_level( stmt, idx ):
    """Return the 1-based position of `idx` among the indices of `stmt`.

    Searches ``chill.cur_indices(stmt)`` and returns -1 when `idx` is absent.
    """
    names = chill.cur_indices(stmt)
    try:
        return names.index(idx) + 1   # lua index starts at 1
    except ValueError:
        return -1
-
def find_offset( cur_order, tile, control):
    """Return the signed distance from the tile loop to its control loop.

    Both names are looked up in `cur_order`. With ``gap`` being the control
    position minus the tile position, the result is ``gap`` when the control
    loop is at or after the tile loop, and ``gap + 1`` when it is before.
    So 0 means the control loop sits directly above the tile loop.

    Prints a message and calls ``sys.exit(-1)`` when either name is missing.
    """
    if tile not in cur_order:
        print("find_offset(), unable to find tile %s in current list of indices" % tile)
        sys.exit(-1)

    if control not in cur_order:
        print("find_offset(), unable to find control %s in current list of indices" % control)
        sys.exit(-1)

    # Only the difference matters, so 0-based positions give the same answer
    # as the 1-based ones used by the Lua code.
    gap = cur_order.index(control) - cur_order.index(tile)
    if gap < 0:
        return gap + 1   # control above tile
    return gap
-
-
-
-def tile_by_index( tile_indices, sizes, index_names, final_order, tile_method):
- # Tile the loops of statement 0 named in tile_indices; sizes[i] is the tile size used for tile_indices[i].
- # index_names: iterable of (key, name) pairs. Key "l<N>_control" names the control loop and key
- # "l<N>_tile" renames the tile loop of the N-th (1-based) entry of tile_indices.
- # final_order: index names in the desired loop order; build_order() filters it for each level.
- # tile_method: passed to chill.tile7(); None is replaced by 1 ("counted"). Calls sys.exit(-1) if tile_indices are not valid.
-
- stmt = 0 # assume statement 0
- if not valid_indices( stmt, tile_indices):
- print "python tile_by_index() one or more of ",
- print tile_indices,
- print " is not valid"
- sys.exit(-1)
-
- if tile_method == None:
- #print "CREATING tile_method = 1"
- tile_method = 1 # "counted"
-
- tile_index_names = []
- for ti in tile_indices:
- tile_index_names.append( ti ) # start from a copy of tile_indices; "l<N>_tile" keys overwrite entries below
- #print "tile_index_names:",
- #print tile_index_names
-
- control_index_names = {} # dict: 0-based tile position -> control loop name
- tile_index_map = {} # dict: new tile loop name -> original index name
-
- #print "index_names: "
- #print index_names
-
- for pair in index_names:
- valid = False
- control = pair[0]
- name = pair[1]
- #print "control %s name %s" % ( control, name )
- # a key is accepted only if it looks like "l<digit>..._control" or "l<digit>..._tile"
- if control[0] == "l" and control[1].isdigit():
- if control.endswith("_control"):
- index = int(control[1: -8]) # the digits between "l" and "_control"
- control_index_names[index-1] = name
- valid = True
-
- elif control.endswith("_tile"):
- index = int(control[1: -5]) # the digits between "l" and "_tile"
- #print "index %d" % index
- tile_index_names[index-1] = name # ??
- tile_index_map[name] = tile_indices[index-1]
- valid = True
- if not valid:
- print "%s is not a proper key for specifying tile or control loop indices\n" % control
- # NOTE: an invalid key is only reported; processing continues
- #print "control_index_names = ",
- #print control_index_names
-
- #print "tile_index_names = ",
- #print tile_index_names
-
- #print "before call to build_order(), tile_index_map = ",
- #print tile_index_map
-
-
- # filter out control indices (and do name substitution of unprocessed tile indices) for a given level
- cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1)
-
- #print "returned from build_order python\n\n"
-
- # print("permute("..stmt..", {"..list_to_string(cur_order).."})")
- #print "permute(%d, {" % stmt,
- #print "cur_order = ",
- #print cur_order,
- #print "})"
-
- cur_order.insert(0, stmt) # chill.permute() receives one tuple: the statement number followed by the order
- #print cur_order
- chill.permute( tuple( cur_order))
- #print "in cudaize.py, returned from C code chill.permute()\n"
- # one tiling step per entry of tile_indices
- for i in range(len(tile_indices)):
- cur_idx = tile_indices[i]
- #print "i %d cur_idx %s calling build order ********" % (i, cur_idx)
- cur_order = build_order( final_order, tile_indices, control_index_names, tile_index_map, i)
- #print "cur_idx %s return from build order" % cur_idx
-
- # Find an offset between tile loop and control loop
- # 0 = control loop one level above tile loop
- # -1 = control loop two levels above tile loop
- # > 0 = tile loop above control loop
- # In the last case, we do two extra tile commands to get the control
- # above the tile and then rely on the final permute to handle the
- # rest
- level = find_cur_level(stmt,cur_idx)
- #print "level %d\n" % level
- # NOTE(review): control_index_names[i] raises KeyError when no "l<i+1>_control" key was supplied
- offset = find_offset(cur_order, tile_index_names[i], control_index_names[i])
- #print "offset %d" % offset
-
- if offset <= 0:
- #print "[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method )
- chill.tile7( stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method )
- #print "in cudaize.py, returned from C code chill.tile7\n"
-
- else:
- #print "2tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method )
- chill.tile7( stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method ) # regular level
-
- # flip and tile control loop
- #print "3tile(%d, %d, %d)" % ( stmt, level+1, level+1)
- chill.tile3( stmt, level+1, level+1)
-
- #print "4tile(%d, %d, %d)" % ( stmt, level+1, level)
- chill.tile3( stmt, level+1, level)
-
- #print_code()
-
- # Do permutation based on cur_order
- #print("permute based on build order calling build_order()")
- cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i)
-
- #print("permute based on build order return from build_order()")
-
- # print("permute("..stmt..", {"..list_to_string(cur_order).."})")
- topermute = cur_order
- topermute.insert(0, stmt)
- chill.permute( tuple(topermute) )
- #print "\nafter permute(), code is:"
- #print_code()
-
def normalize_index( index ):
    """Call ``chill.tile3`` on the loop named `index` of statement 0.

    The loop's level, as reported by find_cur_level(), is passed as both
    level arguments.
    """
    # statement 0 is assumed
    level = find_cur_level(0, index)
    chill.tile3(0, level, level)
-
def is_in_indices( stmt, idx):
    """Return True when `idx` is one of the current indices of `stmt`.

    The index names come from ``chill.cur_indices(stmt)``.
    """
    return idx in chill.cur_indices(stmt)
-
-def copy_to_registers( start_loop, array_name ):
- # Re-tile the loops after the "ty" level of statement 0, then call chill.datacopy_privatized() for array_name at start_loop.
- # Block and thread indices whose level is >= the level of start_loop are passed along as the "hold constant" list.
- # Returns nothing. The memory space of the copy is decided inside chill (not visible here).
- stmt = 0 # assume stmt 0
- cur = chill.cur_indices(stmt) # calls C
- table_Size = len(cur)
-
- #print "Cur indices",
- #print_array(cur)
- #print "\nThe table size is %d" % table_Size
- #count=1
- #for c in cur:
- # print "%d\t%s" % (count,c)
- # count += 1
-
- #print_code()
-
- # would be much cleaner if not translating this code from lua!
- level_tx = -1
- level_ty = -1
- if is_in_indices(stmt,"tx"):
- level_tx = find_cur_level(stmt,"tx")
- if is_in_indices(stmt,"ty"):
- level_ty = find_cur_level(stmt,"ty")
- #print "level_tx %d level_ty %d" % ( level_tx, level_ty )
- #sys.stdout.flush()
-
- ty_lookup_idx = ""
- org_level_ty = level_ty # NOTE(review): never read again in this function
-
- # UGLY logic. Lua index starts at 1, so all tests etc here are off by 1 from the lua code
- # level_ty initializes to -1 , which is not a valid index, and so there is added code to
- # make it not try to access offset -1. -1 IS a valid python array index
- # to top it off, the else below can assign a NIL to ty_lookup_idx!
- if level_ty != -1 and cur[level_ty] != "":
- #print "IF cur[%d] = %s" % ( level_ty, cur[level_ty] )
- ty_lookup_idx = cur[level_ty] # level_ty is 1-based, so this is the entry AFTER "ty"; IndexError if "ty" is last
- else:
- #print "ELSE ty_lookup_idx = cur[%d] = %s" % ( level_ty, cur[level_ty-1])
- ty_lookup_idx = cur[level_ty-1] # "ty" itself when found; cur[-2] when level_ty is still -1
- #print "ty_lookup_idx '%s'" % ty_lookup_idx
-
- if level_ty > -1:
- #print "\ntile3(%d,%d,%d)" % (stmt,level_ty,level_tx+1)
- chill.tile3(stmt,level_ty,level_tx+1)
- #print_code()
- # the tile3 call above may have changed the index list, so re-read it
- cur = chill.cur_indices(stmt) # calls C
- table_Size = len(cur)
- #print "Cur indices ",
- #for c in cur:
- # print "%s," % c,
- #print "\nThe table size is %d" % len(cur)
- #count=1
- #for c in cur:
- # print "%d\t%s" % (count,c)
- # count += 1
- #sys.stdout.flush()
-
- if is_in_indices(stmt,"tx"):
- level_tx = find_cur_level(stmt,"tx")
- if ty_lookup_idx != "": # perhaps incorrect test
- if is_in_indices(stmt,ty_lookup_idx):
- level_ty = find_cur_level(stmt,ty_lookup_idx)
-
- ty_lookup = 1
- idx_flag = -1
- # find the level of the next valid index after ty+1
- #print "\nlevel_ty %d" % level_ty
- if level_ty > -1:
- #print "table_Size %d" % table_Size
- for num in range(-1 + level_ty+ty_lookup,table_Size): # ?? off by one?
- #print "num=%d cur[num] = '%s'" % (num+1, cur[num]) # num+1 is lua index ????
- sys.stdout.flush()
- if cur[num] != "":
- idx_flag = find_cur_level(stmt,cur[num])
- #print "idx_flag = %d" % idx_flag
- break
-
- #print "\n(first) I am checking all indexes after ty+1 %s" % idx_flag
- #print_code()
- #print ""
- # how_many_levels starts at 1 and counts the non-empty index names from level startat onwards
- how_many_levels = 1
-
- #print "idx_flag = %d I will check levels starting with %d" % (idx_flag, idx_flag+1)
- # lua arrays start at index 1. the next loop in lua starts at offset 0, since idx_flag can be -1
- # thus the check for "not equal nil" in lua (bad idea)
- # python arrays start at 0, so will check for things that lua doesn't (?)
- startat = idx_flag + 1
- if idx_flag == -1:
- startat = 1 # pretend we're lua for now. TODO: fix the logic
-
- for ch_lev in range(startat,table_Size+1): # logic may be wrong (off by one)
- #print "ch_lev %d" % ch_lev
- if ch_lev <= table_Size and cur[ch_lev-1] != "":
- #print "cur[%d] = '%s'" % ( ch_lev, cur[ch_lev-1] )
- how_many_levels += 1
-
- #print "\nHow Many Levels %d" % how_many_levels
- sys.stdout.flush()
- sys.stdout.flush()
- # the re-tiling loop below runs only when no non-empty index was counted above
- if how_many_levels< 2:
- while( idx_flag >= 0):
- for num in range(level_ty+ty_lookup,table_Size+1):
- #print "at top of loop, num is %d" % num
- #print "cur[num] = '%s'" % cur[num-1]
- if cur[num-1] != "":
- idx = cur[num-1]
- #print "idx '%s'" % idx
- sys.stdout.flush()
- curlev = find_cur_level(stmt,idx)
- #print "curlev %d" % curlev
-
- #print "\n[COPYTOREG]tile(%d,%d,%d)"%(stmt,curlev,level_tx)
-
- chill.tile3(stmt, curlev, curlev)
- curlev = find_cur_level(stmt,idx)
- #print "curlev %d" % curlev
- chill.tile3(stmt,curlev,level_tx)
- #print "hehe '%s'" % cur[num-1]
-
- cur = chill.cur_indices(stmt)
- #print "Cur indices INSIDE",
- #for c in cur:
- # print "%s," % c,
- table_Size = len(cur)
- #print "\nTable Size is: %d" % len(cur)
-
- level_tx = find_cur_level(stmt,"tx")
- #print "\n level TX is: %d" % level_tx
- level_ty = find_cur_level(stmt,ty_lookup_idx)
- #print "\n level TY is: %d" %level_ty
- idx_flag = -1
- #print "idx_flag = -1"
-
-
- #- find the level of the next valid index after ty+1
- #- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?)
- for num2 in range( -1 + level_ty+ty_lookup ,table_Size): # lua starts index at one
- #print "num mucking num = %d" % num2
- if(cur[num2] != ""):
- #print "cur[%d] = '%s'" % ( num2, cur[num2] )
- idx_flag = find_cur_level(stmt,cur[num2])
- #print("\n(second) I am checking all indexes after ty+1 %s",cur[num2])
- break
-
- #print "num mucked to %d idx_flag = %d" % (num, idx_flag)
-
- #print "at bottom of loop, num is %d" % num
-
- #print "done with levels"
-
- # this was a block comment ???
-
-# for num in range(level_ty+1, table_Size+1):
-# print "num %d" % num
-# if cur[num-1] != "":
-# idx_flag = find_cur_level(stmt,cur[num-1]) ## ugly
-# print "idx_flag = %d" % idx_flag
-
- # change this all to reflect the real logic which is to normalize all loops inside the thread loops.
-# print "change this all ...\n"
-# print "level_ty+1 %d table_Size-1 %d idx_flag %d" %( level_ty+1, table_Size-1, idx_flag)
-# sys.stdout.flush()
-# sys.stdout.flush()
-
-# while level_ty+1 < (table_Size-1) and idx_flag >= 0:
-# print "*** level_ty %d" % level_ty
-# for num in range(level_ty+2,table_Size+1): # lua for includes second value
-# print "num %d cur[num] %s" % (num, cur[num])
-# if cur[num] != "":
-# idx = cur[num]
-# print "idx='%s'" % idx
-# #print_code()
-
-
-
-
- #print "ARE WE SYNCED HERE?"
- #print_code()
-
- # [Malik] end logic
- start_level = find_cur_level(stmt, start_loop) # start_loop was passed parameter!
-
- # We should hold constant any block or tile loop
- block_idxs = chill.block_indices()
- thread_idxs = chill.thread_indices()
- #print"\nblock indices are"
- #for index, val in enumerate(block_idxs):
- # print "%d\t%s" % ( int(index)+1 , val )
- #print"\nthread indices are"
- #for index, val in enumerate(thread_idxs):
- # print "%d\t%s" % ( int(index)+1 , val )
- #print "\nStart Level: %d" % start_level
-
- hold_constant = []
- #print("\n Now in Blocks")
- for idx in block_idxs:
- blocklevel = find_cur_level(stmt,idx)
- if blocklevel >= start_level:
- hold_constant.append(idx)
- #print "\nJust inserted block %s in hold_constant" %idx
-
- #print("\n Now in Threads")
- for idx in thread_idxs:
- blocklevel = find_cur_level(stmt,idx)
- if blocklevel >= start_level:
- hold_constant.append(idx)
- #print "\nJust inserted thread %s in hold_constant" %idx
- #print "\nhold constant table is: "
- #for index, val in enumerate(hold_constant):
- # print "%d\t%s" % ( int(index)+1 , val )
-
- #print("\nbefore datacopy pvt")
- old_num_stmts = chill.num_statements() # NOTE(review): only read by the commented-out unroll code at the end, under a different name
- #sys.stdout.flush()
-
- #print "\n[DataCopy]datacopy_privatized(%d, %s, %s, " % (stmt, start_loop, array_name),
- #print hold_constant,
- #print ")"
- passtoC = [stmt, start_loop, array_name ] # a list
- passtoC.append( len(hold_constant ) )
- for h in hold_constant:
- passtoC.append( h )
- chill.datacopy_privatized( tuple( passtoC )) # one flat tuple: stmt, start_loop, array_name, count, then the held names
- sys.stdout.flush()
- sys.stdout.flush()
-
- new_num_statements = chill.num_statements()
- #print "new num statements %d" % new_num_statements
-
- # Unroll to the last thread level
-# for stmt in range(old_num_statements, new_num_statements):
-# print "unrolling statement %d" % stmt
-# level = find_cur_level(stmt,thread_idxs[-1]) #get last thread level
-# print "level is %d" % level
-# idxs = chill.cur_indices(stmt)
-# if level < len(idxs):
-# chill.unroll(stmt,level+1,0)
-
-
-
-def copy_to_shared( start_loop, array_name, alignment ):
- # Call chill.datacopy_9arg() for array_name at the level of start_loop (statement 0), add a sync, then re-tile the copy
- # loops "tmp1"/"tmp2" of each statement the data copy created. The memory space is decided inside chill (not visible here).
- stmt = 0 # assume statement 0
-
- cur = chill.cur_indices(stmt)
- #print "Cur indices ",
- #print_array( cur )
-
- start_level = find_cur_level( stmt, start_loop )
- #print "start_level %d" % start_level
-
- old_num_statements = chill.num_statements()
- #print "old_num_statements %d" % old_num_statements
-
-
- # Now, we give it indices for up to two dimensions for copy loop
- copy_loop_idxs = ["tmp1","tmp2"]
- #chill.datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs, False, 0, 1, alignment,True)
- passtoC = [stmt, start_level, array_name] # a list
- passtoC.append( len(copy_loop_idxs))
- for i in copy_loop_idxs:
- passtoC.append(i)
- passtoC.append( 0 ) # False
- passtoC.append( 0 )
- passtoC.append( 1 )
- passtoC.append( alignment )
- passtoC.append( 1 ) # True
- #print "\n[DataCopy]datacopy( ",
- #print passtoC,
- #print ")"
-
- #if array_name == "b":
- # chill.cheat(1)
- #if array_name == "c":
- # chill.cheat(2)
- # the arguments are flattened into one tuple, with the copy-loop name list preceded by its length
- chill.datacopy_9arg( tuple( passtoC ))
-
- #print "back from datacopy_9arg\n\n\n"
- #sys.stdout.flush()
-
-
- #print "calling add_sync( %d, %s )" % ( stmt, start_loop )
- chill.add_sync( stmt, start_loop )
- #print "back from add_sync()\n\n"
-
- new_num_statements = chill.num_statements()
-
- # This is fairly CUBLAS2 specific, not sure how well it generalizes,
- # but for a 2D copy, what we want to do is "normalize" the first loop
- # "tmp1" then get its hard upper bound. We then want to tile it to
- # make the control loop of that tile "ty". We then tile "tmp2" with a
- # size of 1 and make it "tx".
-
- #print "fairly CUBLAS2 specific, OLD %d NEW %d" % ( old_num_statements, new_num_statements)
- sys.stdout.flush()
- sys.stdout.flush()
- # visit only the statements that were added since old_num_statements was read
- for stmt in range(old_num_statements, new_num_statements):
- #print "for stmt = %d" % stmt
- level = find_cur_level( stmt, "tmp2")
- #print "FOUND CUR LEVEL? level '",
- #print level,
- #print "'"
-
- #print "in loop, stmt %d level %d" % ( stmt, level )
- if level != -1:
- #print "\nCopy to shared: [If was no error]\n"
- find_cur_level(stmt,"tmp2") # NOTE(review): result is discarded
- chill.tile3( stmt, level, level )
-
- #print "hard_loop_bounds( %d, %d )" % (stmt, level)
- bounds = chill.hard_loop_bounds(stmt, level)
- lower = bounds[0]
- upper = 1+ bounds[1]
- #print "lower %d upper %d" % ( lower, upper )
-
- dims = chill.thread_dims()
- #print "in cudaize.py copy_to_shared, dims =",
- #print dims
- tx = dims[0]
- ty = dims[1]
- #print "2-loop cleanup: lower, upper: %d, %d, tx: %d" % ( lower, upper, tx)
-
- level = find_cur_level(stmt,"tmp1")
- #print "level %d" % level
- if tx == upper and ty == 1:
- #print "tx = %d upper = %d ty = %d"% (tx, upper, ty)
- #print "Don't need"
-
- # Don't need an extra tile level, just move this loop up
- second_level = find_cur_level(stmt,"tmp2")
- chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted)
-
- else:
- #print "DO need?"
- if ty == 1:
- new_ctrl = "tmp3"
- else:
- new_ctrl = "ty"
-
- # LOTS of commented out code here in cudaize.lua
-
- #print_code()
- #print "\nStarting tmp2\n"
- first_level = find_cur_level(stmt,"tmp1")
- second_level = find_cur_level(stmt,"tmp2")
- bounds = chill.hard_loop_bounds(stmt, second_level)
- lower = bounds[0]
- upper = 1 + bounds[1] # BROKEN?
-
- #print "[Malik]-loop cleanup@tmp2: lower, upper: %d, %d, tx: %d,first level:%d,second_level:%d" % ( lower, upper-1, tx, first_level, second_level)
-
- # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
- #print "\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, second_level,1,first_level, "tx", "tx")
- chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted)
- #print_code()
-
- first_level = find_cur_level(stmt,"tmp1")
- bounds = chill.hard_loop_bounds(stmt, first_level)
- lower_1 = bounds[0]
- upper_1 = 1 + bounds[1]
- tx_level = find_cur_level(stmt,"tx")
- bounds = chill.hard_loop_bounds(stmt,tx_level)
- lower_tx = bounds[0]
- upper_tx = 1+bounds[1]
- #print "UL_1 %d %d UL_tx %d %d" % ( lower_1, upper_1-1, lower_tx, upper_tx-1)
- # tile "tx" only when its trip count (upper_tx) exceeds the thread dimension tx
- if int(math.ceil( float(upper_tx)/float(tx))) > 1:
- #print "ceil I say"
- #print "\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, tx_level,tx,tx_level, "tx", "tmp1")
- chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)
- #print_code()
-
- repeat = find_cur_level(stmt,"tx")
- #print "\n[Tile1]tile(%d, %d, %d)" % (stmt, repeat, repeat)
- chill.tile3(stmt, repeat, repeat) #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))
- #print_code()
-
- if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"):
- #print "\nagain [Tile1]tile(%d, %d, %d)" % (stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
- chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))
- #print_code()
-
- #print_code()
-
- #print "\nStarting tmp1\n"
- # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
- chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))
- #print_code()
-
- ty_level = find_cur_level(stmt,"tmp1")
- bounds = chill.hard_loop_bounds(stmt,ty_level)
- lower_ty = bounds[0]
- upper_ty = 1 + bounds[1]
-
- tx_level = find_cur_level(stmt,"tx")
- bounds = chill.hard_loop_bounds(stmt,tx_level)
- lower_tx = bounds[0]
- upper_tx = 1 + bounds[1]
-
- #print "[Malik]-loop cleanup@tmp1: lowerty, upperty: %d, %d, ty: %d,ty level:%d,tx_level:%d, stmt: %d" % ( lower_ty, upper_ty-1, ty, ty_level, tx_level, stmt)
-
- #print "before ceil"
- #sys.stdout.flush()
- # same test for "tmp1" against the thread dimension ty
- if(math.ceil(float(upper_ty)/float(ty)) > 1):
- #print "CEIL IF"
- #print "\n Inside upper_ty/ty > 1\n"
-
- #print "\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, ty_level,ty,ty_level, "ty", "tmp_ty")
- chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)
- #print_code()
-
- #print "\n[Tile2-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt ,"ty"),find_cur_level(stmt,"ty"))
- chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))
- #print_code()
-
- cur_idxs = chill.cur_indices(stmt)
- #print "\n cur indexes are ",
- #print_array( cur_idxs)
- #sys.stdout.flush()
-
- # Putting ty before any tmp_tx
- idx_flag = -1
- if "tmp_tx" in cur_idxs:
- idx_flag = 1 + cur_idxs.index("tmp_tx") # lua index starts at 1
- #print "\n (1) so i have found out the value of idx flag as %d" % idx_flag
- #sys.stdout.flush()
-
- if idx_flag >= 0:
- if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"):
- #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- #print_code()
-
-
- # Now Putting ty before any tmp_ty
- sys.stdout.flush()
- idx_flag = -1
- if "tmp_ty" in cur_idxs:
- idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1
- #print "\n IF so i have found out the value of idx flag as %d" % idx_flag
- #sys.stdout.flush()
-
- if idx_flag >= 0:
- #print "one more test"
- sys.stdout.flush()
- if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"):
- #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- #sys.stdout.flush()
- chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- #print_code()
-
-
-
- else:
- #print "CEIL ELSE"
- #print "\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, ty_level,1,ty_level, "ty", "ty")
- #sys.stdout.flush()
- chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted )
- #print_code()
-
- #print "\n[Tile3-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
- sys.stdout.flush()
-
- chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)
- #print_code()
-
-
- idx_flag = -1
- # LUA code checks to see if cur_idxs exists? it is unused except in the other clause of this is
- #if(cur_idxs) then
- #print "CAN NEVER GET HERE? cur_idxs"
- #for num= 0,table.getn(cur_idxs) do
- #if(cur[num] == "tmp_ty") then
- #idx_flag = find_cur_level(stmt,cur[num])
- #break
- #end
- #end
- print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag
- if idx_flag >= 0: # can't happen
- print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
- #chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
-
-
-
-
-
- #print "\n\n *** at bottom of if in copy to shared, "
- #print_code()
- #print "end of if"
-
- else:
- # copy to shared only created one level, not two, so we use a different approach (MV & TMV)
- #print "\nCopy to shared: [If was error]\n"
- level = find_cur_level(stmt,"tmp1")
- chill.tile3(stmt, level, level)
-
- dims = chill.thread_dims()
- #print dims
- tx = dims[0]
- ty = dims[1]
-
- bounds = chill.hard_loop_bounds(stmt, level)
- lower = bounds[0]
- upper = bounds[1]
-
- #print "bounds lower %d upper %d" % (lower, upper)
- upper = upper+1 # upper bound given as <=, compare to dimensions tx which is <
- if upper == tx:
- #print "upper == tx"
- chill.rename_index( stmt, "tmp1", "tx")
- else:
- #print "upper is not tx"
- #print "upper %d tx %d stmt: %d level: %d" % ( upper, tx, stmt, level)
- chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted)
- #print_code()
-
- #print "stmt:%d level+1: %d" % ( stmt, level+1)
- #print("TILE 7")
- chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted)
- #print("TILE 3")
- chill.tile3( stmt, level+1, level)
- #print_code()
-
-
- if ty > 1:
- #print "GOING IN"
- bounds = chill.hard_loop_bounds(stmt, level+1)
- lower = bounds[0]
- upper = bounds[1]
- #print "ty %d lower %d upper %d" % ( ty, lower, upper )
- floatdiv = float(upper)/float(ty) # NOTE(review): never read; the next line recomputes the quotient
- bound = int(math.ceil(float(upper)/float(ty)))
- #print "NOW FOR Y: upper %d ty %d stmt: %d level: %d bound: %d" % ( upper, ty, stmt, level+1, bound)
- chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted)
-
- # Always add sync
- chill.add_sync( stmt, start_loop )
- #print "ending copy to shared\n"
- #sys.stdout.flush()
- #print_code()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def unroll_to_depth( max_depth ):
- print "\n\nunroll_to_depth(%d)" % max_depth
- print "SYNC UP"
- sys.stdout.flush()
-
- cur = chill.cur_indices(0)
- thread_idxs = chill.thread_indices()
- guard_idx = thread_idxs[-1] # last one
-
- print "cur indices",
- print_array(cur)
- print "thread indices",
- print_array(thread_idxs)
- print "guard_idx = %s" % guard_idx
-
- #print "thread_idxs = ",
- #print thread_idxs
- guard_idx = thread_idxs[-1]
- #print "guard_idx = %s" % guard_idx
-
- # HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS
- common_loops = []
- comm_loops_cnt = 0
- num_stmts = chill.num_statements()
- print "num statements %d" % num_stmts
-
- for stmt in range(num_stmts):
- sys.stdout.flush()
- print "\nSTMT %d" % stmt,
- cur_idxs = chill.cur_indices(stmt)
- print "Current Indices:",
- for c in cur_idxs[:-1]:
- print "%s," % c,
- print "%s" % cur_idxs[-1] # last one
- sys.stdout.flush()
- #print_code()
-
- if chk_cur_level(stmt, "tx") > 0:
-
- for ii in range(find_cur_level(stmt,"tx")-1):
- print "ii = %d\ncur_idxs[%d] = '%s'" % (ii+1, ii+1, cur_idxs[ii]) # print to match lua
- id = cur_idxs[ii]
- if id not in ["bx", "by", "", "tx", "ty"]:
-
- print "id %s is not in the list" % id
-
- for stmt1 in range(stmt+1, num_stmts):
- print "\nii %d stmt1 is %d" % (ii+1, stmt1) # print to match lua
- cur_idxs1 = chill.cur_indices(stmt1)
- print "\nstmt1 cur_idxs1 is ",
- for ind in cur_idxs1[:-1]:
- print "%s," % ind,
- print "%s" % cur_idxs1[-1]
-
- print "cur level(%d, %s) = %d" % (stmt, "tx", find_cur_level(stmt,"tx") )
- sys.stdout.flush()
-
- endrange = find_cur_level(stmt,"tx")-1
- print "for iii=1, %d do" % endrange
- sys.stdout.flush()
- for iii in range(endrange): # off by one? TODO
- print "stmt %d ii %d iii %d\n" % (stmt, ii+1, iii+1),
- sys.stdout.flush()
-
- if iii >= len(cur_idxs1):
- print "stmt %d ii %d iii %d cur_idxs1[%d] = NIL" % (stmt, ii+1, iii+1, iii+1, ) # print to match lua
- else:
- print "stmt %d ii %d iii %d cur_idxs1[%d] = '%s'" % (stmt, ii+1, iii+1, iii+1, cur_idxs1[iii]) # print to match lua
- sys.stdout.flush()
-
- # this will still probably die
- if iii < len(cur_idxs1) and [iii] not in ["bx", "by", "tx", "ty", ""]:
- if cur_idxs[ii] == cur_idxs1[iii]:
- print "\nfound idx:%s" % cur_idxs[ii]
- common_loops.append(cur_idxs[ii])
- print "cl[%d] = '%s'" % ( comm_loops_cnt, cur_idxs[ii] )
- comm_loops_cnt = len(common_loops)
-
- if len(common_loops) > 0:
- print "\n COMM LOOPS :TOTAL %d, and are " % comm_loops_cnt,
- print common_loops,
- print " this loop : %s" % common_loops[0]
- else:
- print "UNROLL can't unroll any loops?"
-
-
- while True: # break at bottom of loop (repeat in lua)
- old_num_statements = chill.num_statements()
- print "old_num_statements %d" % old_num_statements
-
- for stmt in range(old_num_statements):
- cur_idxs = chill.cur_indices(stmt)
- print "stmt %d cur_idxs =" % stmt,
- index = 0
- for i in cur_idxs:
- index +=1
- if index == len(cur_idxs):
- print "%s" %i
- else:
- print "%s," % i,
-
- if len(cur_idxs) > 0:
- guard_level = -1
- if chk_cur_level(stmt, guard_idx) > 0:
- guard_level = find_cur_level(stmt,guard_idx)
- print "guard_level(sp) = %d" % guard_level
- if guard_level > -1:
- level = next_clean_level(cur_idxs,guard_level)
- print "next clean level %d" % level
-
-
- #print "looking at %d" % stmt
- #print "comparing %d and %d in" % (guard_level, level),
- #index = 0
- #for i in cur_idxs:
- #index +=1
- #if index == len(cur_idxs):
- # print "%s" %i
- #else:
- # print "%s," % i,
-
- # need to handle max_depth
- num_unrolled = 0
- level_unroll_comm = level
- level_arr = []
-
- #print "before while, level = %d" % level
- while level >= 0:
- print "while: level = %d" % level
- if num_unrolled == max_depth:
- break
-
- print "Unrolling %d at level %d index %s" % ( stmt, level, cur_idxs[guard_level]) # ???
- level_arr.append(level)
-
- guard_level = find_cur_level(stmt,guard_idx)
- level = next_clean_level(cur_idxs,level+1)
-
- print "OK, NOW WE UNROLL"
- if level_unroll_comm >= 0:
- level_arr.reverse()
- for i,lev in enumerate(level_arr):
- print "\ni=%d" % i
- print "[Unroll]unroll(%d, %d, 0)" % (stmt, lev)
- chill.unroll(stmt, lev, 0)
-
-
- new_num_statements = chill.num_statements()
- if old_num_statements == new_num_statements:
- break # exit infinite loop
-
-
-# all other calls to C have a routine in this file (?)
def unroll( statement, level, unroll_amount ):
    """Forward an unroll request unchanged to ``chill.unroll``.

    Returns nothing.
    """
    chill.unroll(statement, level, unroll_amount)
-
diff --git a/examples/cuda-chill/mm.c b/examples/cuda-chill/mm.c
deleted file mode 100644
index 0efbeeb..0000000
--- a/examples/cuda-chill/mm.c
+++ /dev/null
@@ -1,10 +0,0 @@
-#define N 1024
-
-void normalMM(float c[N][N], float a[N][N], float b[N][N]) {
- int i, j, k;
-
- for (i = 0; i < N; i++)
- for (j = 0; j < N; j++)
- for (k = 0; k < N; k++)
- c[j][i] = c[j][i] + a[k][i] * b[j][k];
-}
diff --git a/examples/cuda-chill/mm.lua b/examples/cuda-chill/mm.lua
deleted file mode 100644
index 5bde1b0..0000000
--- a/examples/cuda-chill/mm.lua
+++ /dev/null
@@ -1,38 +0,0 @@
-init("mm.c", "normalMM", 0)
-dofile("cudaize.lua")
-N=1024
-Ti=128
-Tj=64
-Tk=16
-Tii=16
-Tjj=16
-
-
-
-
-N=1024
-
-
-
-
-
-
-
-
-
-
-
-
-
-tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k"})CU=1
-
-tile_by_index({"k"},{Tk},{l1_control="kk"},{"ii","jj","kk","i","j","k"})CU=3
-
-tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","kk","i","iii","j","jjj","k"},1)CU=2
-
-cudaize("mm_GPU",{a=1048576,b=1048576,c=1048576},{block={"ii","jj"}, thread={"i","j"}})CU=2
-copy_to_shared("tx","a",-16)
-copy_to_shared("tx","b",-16)
-copy_to_registers("kk","c")
---print_code()
-unroll_to_depth(2)
diff --git a/examples/cuda-chill/mpeg4.c b/examples/cuda-chill/mpeg4.c
deleted file mode 100755
index 7f83bf7..0000000
--- a/examples/cuda-chill/mpeg4.c
+++ /dev/null
@@ -1,23 +0,0 @@
-#define N1 4096
-#define N2 4096
-#define WINDOW_SIZE 16
-
-void mpeg4_cpu(float result[N1][N2], float prev[N2+WINDOW_SIZE][N2+WINDOW_SIZE], float curr[WINDOW_SIZE*WINDOW_SIZE])
-{
- unsigned int i;
- unsigned int j;
- unsigned int k;
- unsigned int l;
-
- for ( i = 0; i < N1; ++i)
- for ( j = 0; j < N2; ++j)
- for ( k = 0; k < WINDOW_SIZE; ++k)
- for ( l = 0; l < WINDOW_SIZE; ++l)
- result[i][j] += prev[i+k][j+l] * curr[k*WINDOW_SIZE+l];
-
-
-
-
-
-}
-
diff --git a/examples/cuda-chill/mpeg4.lua b/examples/cuda-chill/mpeg4.lua
deleted file mode 100644
index f025dc0..0000000
--- a/examples/cuda-chill/mpeg4.lua
+++ /dev/null
@@ -1,45 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("mpeg4.c", "mpeg4_cpu", 0)
-
---dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods
-
-N=4096
-M=4096
-W=16
-
---TI 4ust be <= M
---TJ must be <=TI
-Ti=32
-Tj=32
-Tii=16
-Tjj=16
-Tk=4
---permute(0,{"j","i","k","l"})
-tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k","l"})
---tile_by_index({"k","l"},{Tk*2,Tk*2},{l1_control="kk",l2_control="ll"},{"ii","jj","kk","ll","i","j","k","l"})
---print_code()
---tile_by_index({"k","l"},{Tk,Tk},{l1_control="kk",l2_control="ll"},{"ii","jj","i","j","kk","k","ll","l"})
-tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","iii","i","jjj","j","k","l"})
---print_code()
---normalize_index("j")
---normalize_index("i")
---print_code()
-cudaize("kernel_GPU",{curr=W*W,prev=(N+W)*(M+W),result=N*M},{block={"ii","jj"}, thread={"i","j"}})
---print_code()
-copy_to_shared("iii","prev",16)
-
-copy_to_registers("jjj","result")
-
---print_code()
---copy_to_constant_no_tile("curr")
-unroll_to_depth(2)
-print_code()
-print_space()
-
-
diff --git a/examples/cuda-chill/mriq-fh.c b/examples/cuda-chill/mriq-fh.c
deleted file mode 100755
index 1e924b7..0000000
--- a/examples/cuda-chill/mriq-fh.c
+++ /dev/null
@@ -1,38 +0,0 @@
-#define X 32768
-#define K 256
-struct kValues {
- float Kx;
- float Ky;
- float Kz;
- float PhiMag;
-};
-extern float sin(float);
-extern float cos(float);
-
-void mriFH_cpu(float *rPhi,float *rRho,float *iRho, float *iPhi, float *rD, float *iD, float *kx, float *ky, float *kz, float *dx, float *dy, float *dz, float *rFHref, float *iFHref)
-{
-
- float rfh;
- float ifh;
- float exp;
- float cArg;
- float sArg;
- //float rRho[K];
- //float iRho[K];
- unsigned int k;
- unsigned int x;
-
-
- for (x = 0; x < X; ++x) {
- for (k = 0; k < K; ++k) {
-
- exp = 2 * 3.14159 * (kx[k]* dx[x] + ky[k]* dy[x] + kz[k]* dz[x]);
- cArg = cos(exp);
- sArg = sin(exp);
- rFHref[x] += rRho[k]* cArg - iRho[k]* sArg;
- iFHref[x] += iRho[k]*cArg + rRho[k]*sArg;
- }
-
- }
-}
-
diff --git a/examples/cuda-chill/mriq-fh.lua b/examples/cuda-chill/mriq-fh.lua
deleted file mode 100755
index 3277bac..0000000
--- a/examples/cuda-chill/mriq-fh.lua
+++ /dev/null
@@ -1,73 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("mriq-fh.c", "mriFH_cpu", 0)
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-N=32768
-M=256
-Tx=256
-
-
-print_code()
---permute(0,{"j","i"})
---tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
-tile_by_index({"x"},{Tx},{l1_control="xx"},{"xx","x","k"})
---tile_by_index({"x"},{16},{l1_control="xx1"},{"xx","x","xx1","k"})
---tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
-print_code()
-
-normalize_index("x")
---normalize_index("i")
-print_code()
---tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
---print_code()
---cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
-cudaize("kernel_GPU",{dx=N,dy=N,dz=N,iRho=M,kx=M,ky=M,kz=M,rFHref=N,iFHref=N,rRho=M},{block={"xx"}, thread={"x"}})
---copy_to_shared("tx","iRho",-16)
---copy_to_shared("tx","dz",1)
---copy_to_shared("tx","rRho",-16)
---copy_to_registers("tx","rFHref")
---copy_to_registers("tx","rRho")
---copy_to_registers("tx","iRho")
---copy_to_registers("tx","kx")
---copy_to_registers("tx","dx")
---copy_to_registers("tx","ky")
---copy_to_registers("tx","dy")
---copy_to_registers("tx","kz")
---copy_to_registers("tx","dz")
---copy_to_registers("tx","iFHref")
---copy_to_texture("rRho")
---copy_to_texture("kx")
---copy_to_texture("dx")
---copy_to_texture("ky")
---copy_to_texture("dy")
---copy_to_texture("kz")
---copy_to_texture("dz")
---copy_to_texture("iRho")
---print_code()--]]
---unroll(0,4,0)
---copy_to_constant_no_tile("kx")
---copy_to_constant_no_tile("ky")
---copy_to_constant_no_tile("kz")
---copy_to_constant_no_tile("rRho")
---copy_to_constant_no_tile("iRho")
-
---unroll_to_depth(1)
-print_code()
---[[
-copy_to_Texture("rRho")
-copy_to_Texture("kx")
-copy_to_Texture("dx")
-copy_to_Texture("ky")
-copy_to_Texture("dy")
-copy_to_Texture("kz")
-copy_to_Texture("dz")
-copy_to_Texture("iRho")
---unroll_to_depth(2)
---]]
diff --git a/examples/cuda-chill/mriq.c b/examples/cuda-chill/mriq.c
deleted file mode 100644
index ba4b87c..0000000
--- a/examples/cuda-chill/mriq.c
+++ /dev/null
@@ -1,33 +0,0 @@
-#define N 32768
-#define M 3072
-struct kValues {
- float Kx;
- float Ky;
- float Kz;
- float PhiMag;
-};
-extern float sinf(float);
-extern float cosf(float);
-
-void
-ComputeQCPU(int numK, int numX,struct kValues kVals[M],float x[N], float y[N], float z[N],float Qr[N], float Qi[N]) {
- float expArg;
- float cosArg;
- float sinArg;
- float phi;
- int i;
- int j;
- numK = M;
- numX = N;
- for ( i = 0; i < M; i++) {
- for ( j = 0; j < N; j++) {
- expArg = 6.2831853071795864769252867665590058f * (kVals[i].Kx * x[j] +kVals[i].Ky * y[j] +kVals[i].Kz * z[j]);
- cosArg = cosf(expArg);
- sinArg = sinf(expArg);
- phi = kVals[i].PhiMag;
- Qr[j] += phi * cosArg;
- Qi[j] += phi * sinArg;
- }
- }
-}
-
diff --git a/examples/cuda-chill/mriq.lua b/examples/cuda-chill/mriq.lua
deleted file mode 100644
index 1170111..0000000
--- a/examples/cuda-chill/mriq.lua
+++ /dev/null
@@ -1,55 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("mriq.c", "ComputeQCPU", 0)
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-N=32768
-M=3072
-TI=128
-TJ=128
-
-permute(0,{"j","i"})
---tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
-tile_by_index({"i"}, {TJ}, {l1_control="ii",l1_tile="i"}, {"ii", "j","i"})
-tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
-
-normalize_index("j")
-normalize_index("i")
---print_code()
---tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
---print_code()
-cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
-
-copy_to_shared("tx","kVals",1)
---copy_to_shared("tx","x",1)
---copy_to_shared("tx","y",1)
---copy_to_shared("tx","z",1)
-
---copy_to_texture("kVals")
---datacopy(0, 3, "kVals", {"tt","t"},false,0,1,-16,true)
---print_code()
---datacopy_privatized(0,"tx","kVals",{"tx"})
---copy_to_registers("tx","kVals")
-copy_to_registers("ii","x")
-copy_to_registers("ii","y")
-copy_to_registers("ii","z")
-copy_to_registers("ii","Qi")
-copy_to_registers("ii","Qr")
---[[datacopy_privatized(0,"tx","x",{"tx"})
-datacopy_privatized(0,"tx","y",{"tx"})
-datacopy_privatized(0,"tx","z",{"tx"})
-datacopy_privatized(0,"tx","Qi",{"tx"})
-datacopy_privatized(0,"tx","Qr",{"tx"})
-
-
-]]--
---unroll(0,5,64)
-print_code()
---unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
diff --git a/examples/cuda-chill/mv-shadow.c b/examples/cuda-chill/mv-shadow.c
deleted file mode 100644
index 582b187..0000000
--- a/examples/cuda-chill/mv-shadow.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
- int i, j;
-
- for (i = 0; i < N; i++)
- for (j = 0; j < N; j++)
- a[i] = a[i] + c[j][i] * b[j];
-}
diff --git a/examples/cuda-chill/mv-shadow.lua b/examples/cuda-chill/mv-shadow.lua
deleted file mode 100644
index 43e8491..0000000
--- a/examples/cuda-chill/mv-shadow.lua
+++ /dev/null
@@ -1,65 +0,0 @@
-init("mv-shadow.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-N=129
-TI=32
-TJ=64
-
-N=1024
-TI=16
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
---Tile the i and j loop, introducing "ii" as the control loop for the "i"
---tile, "k" for the control loop fo the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("ii")
-normalize_index("i")
-print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-
---copy_to_shared("tx", "b", 1)
---copy_to_shared("tx", "c", -16)
---print_code()
---copy_to_texture("b")
---copy_to_texture("c")
-copy_to_registers("k", "a")
---print_code()
-
-unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
---copy_to_texture("b")
---print_code()
---unroll(0,5,0)
---print_code()
diff --git a/examples/cuda-chill/mv.c b/examples/cuda-chill/mv.c
deleted file mode 100644
index 582b187..0000000
--- a/examples/cuda-chill/mv.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
- int i, j;
-
- for (i = 0; i < N; i++)
- for (j = 0; j < N; j++)
- a[i] = a[i] + c[j][i] * b[j];
-}
diff --git a/examples/cuda-chill/mv.lua b/examples/cuda-chill/mv.lua
deleted file mode 100644
index ca54501..0000000
--- a/examples/cuda-chill/mv.lua
+++ /dev/null
@@ -1,65 +0,0 @@
-init("mv.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-N=129
-TI=32
-TJ=64
-
-N=1024
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
---Tile the i and j loop, introducing "ii" as the control loop for the "i"
---tile, "k" for the control loop fo the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("ii")
-normalize_index("i")
-print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
-
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-
---copy_to_shared("tx", "b", 1)
---copy_to_shared("tx", "c", -16)
---print_code()
---copy_to_texture("b")
---copy_to_texture("c")
-copy_to_registers("k", "a")
---print_code()
-
-unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
---copy_to_texture("b")
---print_code()
---unroll(0,5,0)
---print_code()
diff --git a/examples/cuda-chill/mv_try.c b/examples/cuda-chill/mv_try.c
deleted file mode 100644
index 7781f3b..0000000
--- a/examples/cuda-chill/mv_try.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 4096
-
-void normalMV(int n, float c[N][N], float a[N], float b[N]) {
- int i, j;
-
- for (i = 0; i < n; i++)
- for (j = 0; j < n; j++)
- a[i] = a[i] + c[i][j] * b[j];
-}
diff --git a/examples/cuda-chill/mv_try.lua b/examples/cuda-chill/mv_try.lua
deleted file mode 100644
index db4d9ad..0000000
--- a/examples/cuda-chill/mv_try.lua
+++ /dev/null
@@ -1,14 +0,0 @@
-init("mv_try.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-TI=96
-
-N=4096
-
-
-tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
-cudaize("mv_GPU", {a=N, b=N, c=N*N},
- {block={"ii"}, thread={"i"}})
-
-print_code()
diff --git a/examples/cuda-chill/nbody.c b/examples/cuda-chill/nbody.c
deleted file mode 100644
index 57899b6..0000000
--- a/examples/cuda-chill/nbody.c
+++ /dev/null
@@ -1,66 +0,0 @@
-#define NBODIES 16384
-#define SOFTENINGSQUARED 0.01f
-#define DELTATIME 0.001f
-#define DAMPING 1.0f
-
-#define NBLOCKSY 1
-#define NBLOCKSX (NBODIES/NTHREADSX)
-#define NTHREADSY 1
-#define NTHREADSX 64
-
-#define BLOCKSIZE 128
-
-#define SHARED 1
-#define TIMER 1
-#define VERIFY 1
-
-extern float sqrtf(float);
-
-void nbody_cpu(float* oldpos,float* oldpos1, float *newpos, float *oldvel, float *newvel, float *force)
-{
- float r0,r1,r2;
- float invDist, invDistCube, mass, invMass;
- unsigned int i,j;
- for(i = 0; i < NBODIES; ++i) {
- //force[i*4 ] = 0;
- //force[i*4+1] = 0;
- //force[i*4+2] = 0;
- //force[i*4+3] = 0;
- for(j = 0; j < NBODIES; ++j) {
- r0 = oldpos[j*4]-oldpos1[i*4];
- r1 = oldpos[j*4+1]-oldpos1[i*4+1];
- r2 = oldpos[j*4+2]-oldpos1[i*4+2];
-
- invDist = 1.0/sqrtf(r0 * r0 + r1 * r1 + r2 * r2 + SOFTENINGSQUARED);
- invDistCube = invDist * invDist * invDist;
- mass = oldpos1[i*4+3];
-
- force[i*4] = force[i*4] + r0 * mass * invDistCube;
- force[i*4+1] = force[i*4+1] + r1 * mass * invDistCube;
- force[i*4+2] = force[i*4+2] + r2 * mass * invDistCube;
-
- }
- }
-
-/* for (i = 0; i < NBODIES; ++i) {
- invMass = oldvel[4*i+3];
-
- oldvel[4*i] += (force[4*i] * invMass) * DELTATIME * DAMPING;
- oldvel[4*i+1] += (force[4*i+1] * invMass) * DELTATIME * DAMPING;
- oldvel[4*i+2] += (force[4*i+2] * invMass) * DELTATIME * DAMPING;
-
- oldpos[4*i] += oldvel[4*i] * DELTATIME;
- oldpos[4*i+1] += oldvel[4*i+1] * DELTATIME;
- oldpos[4*i+2] += oldvel[4*i+2] * DELTATIME;
-
- newpos[4*i+0] = oldpos[4*i];
- newpos[4*i+1] = oldpos[4*i+1];
- newpos[4*i+2] = oldpos[4*i+2];
- newpos[4*i+3] = oldpos[4*i+3];
-
- newvel[4*i+0] = oldvel[4*i];
- newvel[4*i+1] = oldvel[4*i+1];
- newvel[4*i+2] = oldvel[4*i+2];
- newvel[4*i+3] = oldvel[4*i+3];
- }*/
-}
diff --git a/examples/cuda-chill/nbody.lua b/examples/cuda-chill/nbody.lua
deleted file mode 100644
index 08f88a9..0000000
--- a/examples/cuda-chill/nbody.lua
+++ /dev/null
@@ -1,53 +0,0 @@
---CUBLAS 2 MM Multiply
-
---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
---call init() and use global variables to specify procedure and loop
-
---Second parameter is procedure # and third is loop #
-init("nbody.c", "nbody_cpu" , 0)
-
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-NBODIES=16384
-
-
---Tj=128 CHANGE FOR BEST..... BEST IS 64BLOCKS 128THREADS
---Ti=256
-Tj=64
-Ti=32
-Tjjj=1
-Tiii=1
-Tn=0.1
---normalize_index("j")
---
---print_code()
---normalize_index("n")
--- TILE COMMANDS ZEROOOOOOOOOOO:3
---tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j"})--CU=-1
-tile_by_index({"i"},{Ti},{l1_control="ii"},{"ii","i","j"})--CU=-1
---normalize_index("i")
---tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1
-
---tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3
---tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"})
---tile_by_index({"i"}, {Ti/2}, {l1_control="iii"}, {"ii","iii", "jj","i","j"})
---print_code()
-cudaize("kernel_GPU",{oldpos=4*NBODIES,oldpos1=4*NBODIES,oldvel=4*NBODIES,force=4*NBODIES,newpos=4*NBODIES,newvel=4*NBODIES},{block={"ii"}, thread={"i"}})--CU=3
-print_code()
---tile(0,6,6)
---copy_to_shared("tx","oldpos",-16)
---copy_to_registers("j","oldpos")
---copy_to_registers("j","oldpos1")
---copy_to_registers("j","force")
-
---copy_to_texture("oldpos")
---tile(1,3,3)
---tile(2,3,3)
-
-print_code()
---unroll_to_depth(1)
---
---tile(2,3,3)
---unroll(2,3,0)
---unroll(0,5,0)
---print_code()
diff --git a/examples/cuda-chill/tmv-shadow.c b/examples/cuda-chill/tmv-shadow.c
deleted file mode 100644
index cb9ea8d..0000000
--- a/examples/cuda-chill/tmv-shadow.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
- int i, j;
-
- for (i = 0; i < N; i++)
- for (j = 0; j < N; j++)
- a[i] = a[i] + c[i][j] * b[j];
-}
diff --git a/examples/cuda-chill/tmv-shadow.lua b/examples/cuda-chill/tmv-shadow.lua
deleted file mode 100644
index 196b939..0000000
--- a/examples/cuda-chill/tmv-shadow.lua
+++ /dev/null
@@ -1,50 +0,0 @@
-init("tmv-shadow.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-N=1024
---N= 8209
---N=129
-TI=64
-N=1024
-TI=32
---tile, "k" for the control loop for the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
-
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("i")
---print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}})
-
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-copy_to_shared("tx", "b", 1)
---copy_to_texture("b")
---print_code()
-
-copy_to_shared("tx", "c", -16)
---copy_to_texture("c")
---print_code()
-
-copy_to_registers("k", "a")
-print_code()
---unroll(0,5,0)
---unroll(0,4,0)
---unroll(2,4,16)
-unroll_to_depth(1)
---print_code()
diff --git a/examples/cuda-chill/tmv.c b/examples/cuda-chill/tmv.c
deleted file mode 100644
index cb9ea8d..0000000
--- a/examples/cuda-chill/tmv.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#define N 1024
-
-void normalMV(float c[N][N], float a[N], float b[N]) {
- int i, j;
-
- for (i = 0; i < N; i++)
- for (j = 0; j < N; j++)
- a[i] = a[i] + c[i][j] * b[j];
-}
diff --git a/examples/cuda-chill/tmv.lua b/examples/cuda-chill/tmv.lua
deleted file mode 100644
index 5071108..0000000
--- a/examples/cuda-chill/tmv.lua
+++ /dev/null
@@ -1,50 +0,0 @@
-init("tmv.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-N=1024
---N= 8209
---N=129
-TI=64
-N=1024
-TI=32
---tile, "k" for the control loop for the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
-
---print_code()
---Normalize indx will do a tile size of one over the loop level specified
---by the input index. This is useful to get a zero lower bound and hard
---upper bound on a loop instead of it being relative to previous loop
---levels.
---normalize_index("i")
---print_code()
-
---Cudaize now determines the grid dimentions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops's indexes to the approviate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---size of the arrays to be copied in the CUDA scaffolding.
-cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}})
-
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-copy_to_shared("tx", "b", 1)
---copy_to_texture("b")
---print_code()
-
-copy_to_shared("tx", "c", -16)
---copy_to_texture("c")
---print_code()
-
-copy_to_registers("k", "a")
-print_code()
---unroll(0,5,0)
---unroll(0,4,0)
---unroll(2,4,16)
-unroll_to_depth(1)
---print_code()
diff --git a/examples/fortran/README b/examples/fortran/README
deleted file mode 100644
index 4f23bee..0000000
--- a/examples/fortran/README
+++ /dev/null
@@ -1,10 +0,0 @@
-// Manu
-
-1) Fortran support added to permute, tile, unroll and datacopy. Tested these w.r.t gemm.c using gemm.script.
- There might be other issues (like fusion due to unroll, ...) that have not been tested.
-
-2) To incorporate Fortran support I had to modify certain values in omega (include/omega/omega_core/oc.h).
- To solve for large number of unknowns, these values have to be reverted back.
-
-3) Tested the existing chill scripts using Derick's python script.
- At least the existing chill scripts are not affected by the fortran related changes.
diff --git a/examples/fortran/ccd.f b/examples/fortran/ccd.f
deleted file mode 100644
index 12d834d..0000000
--- a/examples/fortran/ccd.f
+++ /dev/null
@@ -1,32 +0,0 @@
-c
-c These have been separated out from ccsd_t_singles_l.F and ccsd_t_doubles_l.F
-c
- subroutine clean_sd_t_s1_1(h3d,h2d,h1d,p6d,p5d,p4d,
- 2 triplesx,t1sub,v2sub)
- IMPLICIT NONE
- integer h3d,h2d,h1d,p6d,p5d,p4d
- integer h3,h2,h1,p6,p5,p4
- integer N
- double precision triplesx(16,16,16,16,16,16)
- double precision t1sub(16,16)
- double precision v2sub(16,16,16,16)
-
- N = 16
-
- do p4=1,10
- do p5=1,10
- do p6=1,10
- do h1=1,10
- do h2=1,10
- do h3=1,10
- triplesx(h3,h2,h1,p6,p5,p4)=triplesx(h3,h2,h1,p6,p5,p4)
- 1 + t1sub(p4,h1)*v2sub(h3,h2,p6,p5)
- enddo
- enddo
- enddo
- enddo
- enddo
- enddo
- return
- end
-
diff --git a/examples/fortran/ccd.script b/examples/fortran/ccd.script
deleted file mode 100644
index c2af500..0000000
--- a/examples/fortran/ccd.script
+++ /dev/null
@@ -1,18 +0,0 @@
-source: ccd.f
-procedure: clean_sd_t_s1_1
-format : rose
-loop: 0
-
-
-
-original()
-
-UN=4
-
-unroll(0,5,4)
-unroll(0,4,4)
-unroll(0,3,4)
-unroll(0,2,4)
-unroll(0,1,4)
-
-print
diff --git a/examples/fortran/gemm.f90 b/examples/fortran/gemm.f90
deleted file mode 100644
index b65bb58..0000000
--- a/examples/fortran/gemm.f90
+++ /dev/null
@@ -1,58 +0,0 @@
-program matmul
-
- integer N,i,j,k
- real*8 a(10,10), b(10,10), c(10,10), ct(10,10),mysum
-
- do i=1,10,1
- do j=1,10,1
- a(i,j) = i+j
- b(i,j) = i-j
- c(i,j) = 0.0
- ct(i,j) = 0.0
- end do
- b(i,i) = 1.0;
- end do
-
-
- DO j=1,10,1
- DO k=1,10,1
- DO i=1,10,1
- c(i,j) = c(i,j)+a(i,k)*b(k,j)
- end do
- end do
- end do
-
-
-
- call gemm(10,a,b,ct)
-
- mysum = 0.0
- do i=1,10,1
- do j=1,10,1
- mysum = c(i,j) - ct(i,j)
- end do
- end do
-
- if (abs(mysum) >= 0.00001) then
- write (*,*) "Something wrong"
- else
- write (*,*) "Output matches"
- end if
-
-end program matmul
-
- SUBROUTINE gemm(N,A,B,C)
- INTEGER N
- REAL*8 A(N,N), B(N,N), C(N,N)
-
- INTEGER I,J,K
-
- DO J=1,N,1
- DO K=1,N,1
- DO I=1,N,1
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
- end do
- end do
- end do
-
- END subroutine
diff --git a/examples/fortran/gemm.script b/examples/fortran/gemm.script
deleted file mode 100644
index 01eb859..0000000
--- a/examples/fortran/gemm.script
+++ /dev/null
@@ -1,30 +0,0 @@
-#matrix multiply large array size for intel machine
-source: gemm.f90
-procedure: gemm
-format: rose
-loop: 0
-
-TI = 128
-#TI = 4
-TJ = 8
-#TK = 3
-TK = 512
-UI = 2
-UJ = 2
-
-permute([3,1,2])
-tile(0,2,TJ)
-#print space
-tile(0,2,TI)
-#print space
-tile(0,5,TK)
-#print space
-
-
-datacopy(0,3,A,false,-1)
-#print space
-
-datacopy(0,4,B)
-unroll(0,4,UI)
-unroll(0,5,UJ)
-
diff --git a/examples/fortran/rose_gemm.f90 b/examples/fortran/rose_gemm.f90
deleted file mode 100644
index d150922..0000000
--- a/examples/fortran/rose_gemm.f90
+++ /dev/null
@@ -1,155 +0,0 @@
-PROGRAM matmul
-INTEGER :: N, i, j, k
-REAL(kind=8) :: a(10,10), b(10,10), c(10,10), ct(10,10), mysum
-DO i = 1, 10, 1
-DO j = 1, 10, 1
-a(i,j) = i + j
-b(i,j) = i - j
-c(i,j) = 0.0
-ct(i,j) = 0.0
-END DO
-b(i,i) = 1.0
-END DO
-DO j = 1, 10, 1
-DO k = 1, 10, 1
-DO i = 1, 10, 1
-c(i,j) = c(i,j) + a(i,k) * b(k,j)
-END DO
-END DO
-END DO
-CALL gemm(10,a,b,ct)
-mysum = 0.0
-DO i = 1, 10, 1
-DO j = 1, 10, 1
-mysum = c(i,j) - ct(i,j)
-END DO
-END DO
-IF (abs(mysum) >= 0.00001) THEN
-WRITE (*, FMT=*) "Something wrong"
-ELSE
-WRITE (*, FMT=*) "Output matches"
-END IF
-END PROGRAM matmul
-
-SUBROUTINE gemm(N,A,B,C)
-INTEGER :: t12
-INTEGER :: t10
-INTEGER :: t8
-INTEGER :: t6
-INTEGER :: t4
-INTEGER :: t2
-INTEGER :: chill_t64
-INTEGER :: chill_t63
-INTEGER :: chill_t62
-INTEGER :: chill_t61
-INTEGER :: chill_t60
-INTEGER :: chill_t59
-INTEGER :: chill_t58
-INTEGER :: chill_t57
-INTEGER :: chill_t56
-INTEGER :: chill_t55
-INTEGER :: chill_t54
-INTEGER :: chill_t53
-INTEGER :: chill_t52
-INTEGER :: chill_t51
-INTEGER :: chill_t50
-INTEGER :: chill_t49
-INTEGER :: chill_t48
-INTEGER :: chill_t47
-INTEGER :: over2
-INTEGER :: chill_t46
-INTEGER :: chill_t45
-INTEGER :: chill_t44
-INTEGER :: chill_t43
-INTEGER :: chill_t42
-INTEGER :: chill_t41
-INTEGER :: chill_t40
-INTEGER :: chill_t39
-INTEGER :: chill_t38
-INTEGER :: chill_t37
-INTEGER :: chill_t36
-INTEGER :: chill_t35
-INTEGER :: chill_t34
-INTEGER :: chill_t33
-INTEGER :: chill_t32
-INTEGER :: chill_t31
-INTEGER :: chill_t30
-INTEGER :: chill_t29
-INTEGER :: chill_t28
-INTEGER :: chill_t27
-INTEGER :: chill_t26
-INTEGER :: chill_t25
-INTEGER :: chill_t24
-INTEGER :: chill_t23
-INTEGER :: over1
-INTEGER :: chill_t22
-INTEGER :: chill_t21
-INTEGER :: chill_t20
-INTEGER :: chill_t19
-INTEGER :: chill_t18
-INTEGER :: chill_t17
-INTEGER :: chill_t16
-INTEGER :: chill_t15
-REAL(kind=8), DIMENSION(8,512) :: f_P2
-INTEGER :: chill_t14
-INTEGER :: chill_t13
-INTEGER :: chill_t12
-INTEGER :: chill_t11
-INTEGER :: chill_t10
-INTEGER :: chill_t9
-INTEGER :: chill_t8
-INTEGER :: chill_t7
-REAL(kind=8), DIMENSION(512,128) :: f_P1
-INTEGER :: chill_t1
-INTEGER :: chill_t2
-INTEGER :: chill_t4
-INTEGER :: chill_t6
-INTEGER :: chill_t5
-INTEGER :: N
-REAL(kind=8) :: A(N,N), B(N,N), C(N,N)
-INTEGER :: I, J, K
-over1 = 0
-over2 = 0
-DO t2 = 1, N, 512
-DO t4 = 1, N, 128
-DO t6 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
-DO t8 = t4, merge(t4 + 127,N,t4 + 127 <= N), 1
-f_P1(t8 - t4 + 1,t6 - t2 + 1) = A(t8,t6)
-END DO
-END DO
-DO t6 = 1, N, 8
-DO t8 = t6, merge(N,t6 + 7,N <= t6 + 7), 1
-DO t10 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
-f_P2(t10 - t2 + 1,t8 - t6 + 1) = B(t10,t8)
-END DO
-END DO
-over1 = MOD(N,2)
-DO t8 = t4, merge(-over1 + N,t4 + 126,-over1 + N <= t4 + 126), 2
-over2 = MOD(N,2)
-DO t10 = t6, merge(t6 + 6,N - over2,t6 + 6 <= N - over2), 2
-DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1
-C(t8,t10) = C(t8,t10) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
-C(t8 + 1,t10) = C(t8 + 1,t10) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
-C(t8,t10 + 1) = C(t8,t10 + 1) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1)
-C(t8 + 1,t10 + 1) = C(t8 + 1,t10 + 1) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1)
-END DO
-END DO
-IF (N - 7 <= t6 .AND. 1 <= over2) THEN
-DO t12 = t2, merge(N,t2 + 511,N <= t2 + 511), 1
-C(t8,N) = C(t8,N) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1)
-C(t8 + 1,N) = C(t8 + 1,N) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1)
-END DO
-END IF
-END DO
-IF (N - 127 <= t4 .AND. 1 <= over1) THEN
-DO t10 = t6, merge(t6 + 7,N,t6 + 7 <= N), 1
-DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1
-C(N,t10) = C(N,t10) + f_P1(N - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1)
-END DO
-END DO
-END IF
-END DO
-END DO
-END DO
-END SUBROUTINE
-
diff --git a/graph-test.cc b/graph-test.cc
deleted file mode 100644
index 3cdcbee..0000000
--- a/graph-test.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-#include "graph.hh"
-
-using std::cout;
-using std::endl;
-template<typename T>
-struct A {
-};
-
-template struct Graph<Empty,Empty>;
-
-int main() {
- Graph<> g;
-
- for (int i = 0; i < 8; i++)
- g.insert();
-
- std::vector<Empty> t;
- t.push_back(Empty());
- t.push_back(Empty());
-
- g.connect(0,1);
- g.connect(1,4);
- g.connect(4,0);
- g.connect(4,5);
- g.connect(1,5);
- g.connect(1,2);
- g.connect(2,3);
- g.connect(3,2);
- g.connect(2,6);
- g.connect(5,6);
- g.connect(6,5);
- g.connect(6,7);
- g.connect(3,7);
- g.connect(7,7,t);
-
- g.insert();
- g.insert();
- g.connect(9,8);
- g.connect(8,0);
-
- cout << "Graph #1:" << endl;
- cout << g;
-
- std::vector<std::set<int> > r = g.topoSort();
-
- cout << "topological order: ";
- int num_scc = 0;
- for (int i = 0; i < r.size(); i++) {
- if (i != 0)
- cout << ' ';
- if (r[i].size() > 1) {
- cout << '(';
- num_scc++;
- }
- for (std::set<int>::iterator j = r[i].begin(); j != r[i].end(); j++) {
- if (j != r[i].begin())
- cout << ' ';
- cout << (*j+1);
- }
- if (r[i].size() > 1)
- cout << ')';
- }
- cout << endl;
- cout << "total number of SCC: " << num_scc << endl;
-
- Graph<> g2;
-
- for (int i = 0; i < 6; i++)
- g2.insert();
-
- g2.connect(0,1);
- g2.connect(0,2);
- g2.connect(3,4);
- g2.connect(3,5);
- g2.connect(3,2);
- g2.connect(5,0);
-
- cout << endl << "Graph #2:" << endl;
- cout << g2;
-
- std::vector<std::set<int> > r2 = g2.packed_topoSort();
-
- cout << "packed topological order: ";
- for (int i = 0; i < r2.size(); i++) {
- if (i != 0)
- cout << ' ';
- if (r2[i].size() > 1)
- cout << '(';
- for (std::set<int>::iterator j = r2[i].begin(); j != r2[i].end(); j++) {
- if (j != r2[i].begin())
- cout << ' ';
- cout << (*j+1);
- }
- if (r2[i].size() > 1)
- cout << ')';
- }
- cout << endl;
-
- Graph<> g3;
-
- for (int i = 0; i < 6; i++)
- g3.insert();
-
- g3.connect(5,2);
- g3.connect(5,3);
- g3.connect(5,4);
- g3.connect(3,1);
- g3.connect(1,0);
-
- cout << endl << "Graph #3:" << endl;
- cout << g3;
-
- std::vector<std::set<int> > r3 = g3.topoSort();
-
- cout << "topological order: ";
- for (int i = 0; i < r3.size(); i++) {
- if (i != 0)
- cout << ' ';
- if (r3[i].size() > 1)
- cout << '(';
- for (std::set<int>::iterator j = r3[i].begin(); j != r3[i].end(); j++) {
- if (j != r3[i].begin())
- cout << ' ';
- cout << (*j+1);
- }
- if (r3[i].size() > 1)
- cout << ')';
- }
- cout << endl;
-
- r3 = g3.packed_topoSort();
-
- cout << "packed topological order: ";
- for (int i = 0; i < r3.size(); i++) {
- if (i != 0)
- cout << ' ';
- if (r3[i].size() > 1)
- cout << '(';
- for (std::set<int>::iterator j = r3[i].begin(); j != r3[i].end(); j++) {
- if (j != r3[i].begin())
- cout << ' ';
- cout << (*j+1);
- }
- if (r3[i].size() > 1)
- cout << ')';
- }
- cout << endl;
-}
diff --git a/graph.hh b/graph.hh
index 5d0ff66..f8471df 100644
--- a/graph.hh
+++ b/graph.hh
@@ -76,7 +76,8 @@ template<typename VertexType, typename EdgeType>
std::ostream& operator<<(std::ostream &os, const Graph<VertexType, EdgeType> &g) {
for (int i = 0; i < g.vertex.size(); i++)
for (typename Graph<VertexType,EdgeType>::EdgeList::const_iterator j = g.vertex[i].second.begin(); j != g.vertex[i].second.end(); j++) {
- os << "s" << i << "->" << "s" << j->first << ":";
+ // os << i+1 << "->" << j->first+1 << ":";
+ os << "s" << i << "->" << "s" << j->first << ":";
for (typename std::vector<EdgeType>::const_iterator k = j->second.begin(); k != j->second.end(); k++)
os << " " << *k;
os << std::endl;
diff --git a/include/ir_suif.hh b/include/ir_suif.hh
deleted file mode 120000
index 37f4ae8..0000000
--- a/include/ir_suif.hh
+++ /dev/null
@@ -1 +0,0 @@
-../ir_suif.hh \ No newline at end of file
diff --git a/include/ir_suif_utils.hh b/include/ir_suif_utils.hh
deleted file mode 120000
index 327320d..0000000
--- a/include/ir_suif_utils.hh
+++ /dev/null
@@ -1 +0,0 @@
-../ir_suif_utils.hh \ No newline at end of file
diff --git a/ir_cuda_rose_utils.cc b/ir_cuda_rose_utils.cc
deleted file mode 100644
index e7b4c37..0000000
--- a/ir_cuda_rose_utils.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2008 University of Southern California
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
- SUIF interface utilities.
-
- Notes:
-
- Update history:
- 01/2006 created by Chun Chen
-*****************************************************************************/
-
-//#include <suif1.h>
-#include "ir_rose_utils.hh"
-
-
-/**
- * Returns the body of the for loop found by finding the first loop in
- * code, and if level > 1 recursively calling on the body of the found
- * loop and (level-1)
- */
-SgNode* loop_body_at_level(SgNode* tnl, int level) {
- SgNode *inner_nl = 0;
- //Now strip out the tnl on the inner level of the for loop
- //tree_node_list_iter tnli(tnl);
-
- if (isSgBasicBlock(tnl)) {
-
- SgStatementPtrList& tnli = isSgBasicBlock(tnl)->get_statements();
-
- for (SgStatementPtrList::iterator it = tnli.begin(); it != tnli.end();
- it++) {
- if (isSgForStatement(*it)) {
- inner_nl = loop_body_at_level(isSgForStatement(*it), level);
- break;
- }
-
- }
-
- }
-
- return inner_nl;
-}
-
-SgNode* loop_body_at_level(SgForStatement* loop, int level) {
- if (level > 1)
- return loop_body_at_level(loop->get_loop_body(), level - 1);
- return loop->get_loop_body();
-}
-
-void swap_node_for_node_list(SgNode* tn, SgNode* new_tnl) {
- SgStatement *s = isSgStatement(tn);
-
- SgStatement* p;
- if (s != 0) {
- p = isSgStatement(tn->get_parent());
-
- if (p != 0) {
-
- if (isSgBasicBlock(new_tnl)) {
-
- /*SgStatementPtrList & list_ =
- isSgBasicBlock(new_tnl)->get_statements();
-
- if (isSgForStatement(p)) {
- if (!isSgBasicBlock(isSgForStatement(p)->get_loop_body()))
- p->replace_statement(s, isSgStatement(new_tnl));
- else {
- p->insert_statement(s, list_, true);
- p->remove(s);
- }
- } else {
- p->insert_statement(s, list_, true);
- p->remove(s);
- }
- */
- if (isSgForStatement(p)) {
- if (!isSgBasicBlock(isSgForStatement(p)->get_loop_body()))
- p->replace_statement(s, isSgStatement(new_tnl));
- else {
-
- SgStatementPtrList& list_ =
- isSgBasicBlock(new_tnl)->get_statements();
-
- //std::vector<SgStatement*> list;
-
- SgStatementPtrList::iterator it = list_.begin();
- SgStatement* begin = *it;
- begin->set_parent(p);
-
- p->replace_statement(s, begin);
- it++;
- //SgStatement* stmt = first;
- SgStatement* temp = begin;
- for (; it != list_.end(); it++) {
- (*it)->set_parent(p);
- p->insert_statement(temp, *it, false);
- temp = *it;
- }
-
- }
-
- } else {
-
-
- SgStatementPtrList& list_ =
- isSgBasicBlock(new_tnl)->get_statements();
-
- //std::vector<SgStatement*> list;
-
- SgStatementPtrList::iterator it = list_.begin();
- SgStatement* begin = *it;
- begin->set_parent(p);
-
- p->replace_statement(s, begin);
- it++;
- //SgStatement* stmt = first;
- SgStatement* temp = begin;
- for (; it != list_.end(); it++) {
- (*it)->set_parent(p);
- p->insert_statement(temp, *it, false);
- temp = *it;
- }
-
- }
-
- /* SgStatement* temp = s;
-
- SgStatementPtrList::iterator it = list_.begin();
- p->insert_statement(temp, *it, true);
- temp = *it;
- p->remove_statement(s);
- it++;
- for (; it != list_.end(); it++) {
- p->insert_statement(temp, *it, false);
- temp = *it;
- }
-
- // new_tnl->set_parent(p);
- //new_tnl->get_statements();
- SgStatementPtrList& list =
- isSgBasicBlock(new_tnl)->get_statements();
-
- //std::vector<SgStatement*> list;
-
- SgStatementPtrList::iterator it = list.begin();
- SgStatement* begin = *it;
- begin->set_parent(p);
-
- p->replace_statement(s, begin);
- it++;
- //SgStatement* stmt = first;
- SgStatement* temp = begin;
- for (; it != list.end(); it++) {
- (*it)->set_parent(p);
- p->insert_statement(temp, *it, false);
- temp = *it;
- }
- */
- /* SgStatementPtrList& stmt_list = isSgBasicBlock(new_tnl)->get_statements();
- SgStatement* target = s;
-
- for(SgStatementPtrList::iterator it = stmt_list.begin() ; it != stmt_list.end(); it++)
- {
- isSgNode(*it)->set_parent(p);
- p->insert_statement(isSgStateme, *it, false);
- target = *it;
- }
-
- p->remove_statement(s);
-
- */
- }else if(isSgIfStmt(p)) {
-
- if(isSgIfStmt(p)->get_true_body() == s)
- isSgIfStmt(p)->set_true_body(isSgStatement(new_tnl));
- else if(isSgIfStmt(p)->get_false_body() == s)
- isSgIfStmt(p)->set_false_body(isSgStatement(new_tnl));
- new_tnl->set_parent(p);
- }
- else {
- p->replace_statement(s, isSgStatement(new_tnl));
- new_tnl->set_parent(p);
- }
- }
-
- }
- // return isSgNode(p);
-}
diff --git a/ir_cuda_suif_utils.cc b/ir_cuda_suif_utils.cc
deleted file mode 100644
index f15c190..0000000
--- a/ir_cuda_suif_utils.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2008 University of Southern California
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
- SUIF interface utilities.
-
- Notes:
-
- Update history:
- 01/2006 created by Chun Chen
-*****************************************************************************/
-
-#include <suif1.h>
-#include "ir_suif_utils.hh"
-
-
-/**
- * Returns the body of the for loop found by finding the first loop in
- * code, and if level > 1 recursively calling on the body of the found
- * loop and (level-1)
- */
-tree_node_list* loop_body_at_level(tree_node_list* tnl, int level)
-{
- tree_node_list *inner_nl = 0;
- //Now strip out the tnl on the inner level of the for loop
- tree_node_list_iter tnli(tnl);
- while (!tnli.is_empty()) {
- tree_node *node = tnli.step();
- if(node->kind() == TREE_FOR)
- {
- //Found the first tree_for, call sibling function
- inner_nl = loop_body_at_level((tree_for*)node, level);
- break;
- }
- }
- return inner_nl;
-}
-
-tree_node_list* loop_body_at_level(tree_for* loop, int level)
-{
- if(level > 1)
- return loop_body_at_level(loop->body(), level-1);
- return loop->body();
-}
-
-tree_node_list* swap_node_for_node_list(tree_node* tn, tree_node_list* new_tnl)
-{
- tree_node_list* tnl = tn->parent();
- tnl->insert_after(new_tnl, tn->list_e());
- delete tnl->remove(tn->list_e());
- return tnl;
-}
diff --git a/ir_cudarose.cc b/ir_cudarose.cc
deleted file mode 100644
index 6b31bdd..0000000
--- a/ir_cudarose.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
- CHiLL's SUIF interface.
-
- Notes:
- Array supports mixed pointer and array type in a single declaration.
-
- History:
- 2/2/2011 Created by Protonu Basu.
-*****************************************************************************/
-
-#include <typeinfo>
-#include "ir_cudarose.hh"
-#include "loop.hh"
-#include "loop_cuda_rose.hh"
-//#include "ir_suif_utils.hh"
-
-using namespace SageBuilder;
-using namespace SageInterface;
-
-IR_cudaroseCode::IR_cudaroseCode(const char *filename, const char* proc_name) :
- IR_roseCode(filename, proc_name) {
-
- //std::string file_suffix = StringUtility::fileNameSuffix(filename);
-
- //if (CommandlineProcessing::isCFileNameSuffix(file_suffix))
- //{
- std::string orig_name = StringUtility::stripPathFromFileName(filename);
- std::string naked_name = StringUtility::stripFileSuffixFromFileName(
- orig_name);
- file->set_unparse_output_filename("rose_" + naked_name + ".cu");
-
- //}
-
- gsym_ = root;
- first_scope = firstScope;
- parameter = symtab2_;
- body = symtab3_;
- defn = func->get_definition()->get_body();
- func_defn = func->get_definition();
-}
-
-
-
-IR_ArraySymbol *IR_cudaroseCode::CreateArraySymbol(const IR_Symbol *sym,
- std::vector<omega::CG_outputRepr *> &size, int sharedAnnotation) {
- SgType *tn;
- SgVariableSymbol* vs;
- if (typeid(*sym) == typeid(IR_roseScalarSymbol)) {
- tn = static_cast<const IR_roseScalarSymbol *>(sym)->vs_->get_type();
- } else if (typeid(*sym) == typeid(IR_roseArraySymbol)) {
- tn = static_cast<const IR_roseArraySymbol *>(sym)->vs_->get_type();
- while (isSgArrayType(tn) || isSgPointerType(tn)) {
- if (isSgArrayType(tn))
- tn = isSgArrayType(tn)->get_base_type();
- else if (isSgPointerType(tn))
- tn = isSgPointerType(tn)->get_base_type();
- else
- throw ir_error(
- "in CreateScalarSymbol: symbol not an array nor a pointer!");
- }
- } else
- throw std::bad_typeid();
-
- for (int i = size.size() - 1; i >= 0; i--)
- tn = buildArrayType(tn,
- static_cast<omega::CG_roseRepr *>(size[i])->GetExpression());
-
- static int rose_array_counter = 1;
- std::string s = std::string("_P") + omega::to_string(rose_array_counter++);
- SgVariableDeclaration* defn2 = buildVariableDeclaration(
- const_cast<char *>(s.c_str()), tn);
- SgInitializedNamePtrList& variables2 = defn2->get_variables();
-
- SgInitializedNamePtrList::const_iterator i2 = variables2.begin();
- SgInitializedName* initializedName2 = *i2;
- vs = new SgVariableSymbol(initializedName2);
-
- prependStatement(defn2,
- isSgScopeStatement(func->get_definition()->get_body()));
-
- vs->set_parent(symtab_);
- symtab_->insert(SgName(s.c_str()), vs);
-
- SgStatementPtrList* tnl5 = new SgStatementPtrList;
-
- (*tnl5).push_back(isSgStatement(defn2));
-
- omega::CG_roseRepr* stmt = new omega::CG_roseRepr(tnl5);
-
- init_code_ = ocg_->StmtListAppend(init_code_,
- static_cast<omega::CG_outputRepr *>(stmt));
-
- if (sharedAnnotation == 1)
- isSgNode(defn2)->setAttribute("__shared__",
- new AstTextAttribute("__shared__"));
-
- return new IR_roseArraySymbol(this, vs);
-}
-
-bool IR_cudaroseCode::commit_loop(Loop *loop, int loop_num) {
- if (loop == NULL)
- return true;
-
- LoopCuda *cu_loop = (LoopCuda *) loop;
- SgNode *tnl = cu_loop->codegen();
- if (!tnl)
- return false;
-
- SgStatementPtrList* new_list = NULL;
- if (isSgBasicBlock(tnl)) {
- new_list = new SgStatementPtrList;
- for (SgStatementPtrList::iterator it =
- isSgBasicBlock(tnl)->get_statements().begin();
- it != isSgBasicBlock(tnl)->get_statements().end(); it++)
- (*new_list).push_back(*it);
- }
-
- //Only thing that should be left will be the inserting of the tnl* into the loop
- omega::CG_outputRepr *repr;
- if (new_list == NULL)
- repr = new omega::CG_roseRepr(tnl);
- else
- repr = new omega::CG_roseRepr(new_list);
- if (cu_loop->init_code != NULL)
- repr = ocg_->StmtListAppend(cu_loop->init_code->clone(), repr);
-
- std::vector<SgForStatement *> loops = find_loops(
- func->get_definition()->get_body());
- tnl = isSgNode(loops[loop_num])->get_parent();
-
- if (cu_loop->setup_code != NULL) {
- SgStatementPtrList* setup_tnl =
- static_cast<omega::CG_roseRepr *>(cu_loop->setup_code)->GetList();
-
- SgStatement* target = isSgStatement(loops[loop_num]);
-
- for (SgStatementPtrList::iterator it = (*setup_tnl).begin();
- it != (*setup_tnl).end(); it++) {
-
- isSgStatement(tnl)->insert_statement(target, *it, false);
- isSgNode(*it)->set_parent(tnl);
- target = *it;
- }
-
- //SgStatementPtrList
- // for SgStatementPtrList::it
- //TODO: I think this is a hack we can undo if we have loop->codegen()
- //loo->getCode(), maybe also get rid of setup and teardown...
- //fix_unfinished_comment(setup_tnl, indexes_string);
- //isSgStatement(tnl)->replace_statement(isSgStatement(loops[loop_num]), *setup_tnl);
- isSgStatement(tnl)->remove_statement(isSgStatement(loops[loop_num]));
- }
-
- delete repr;
-
- return true;
-}
-
-IR_cudaroseCode::~IR_cudaroseCode() {
-}
-
diff --git a/ir_cudarose.hh b/ir_cudarose.hh
deleted file mode 100644
index 34e0404..0000000
--- a/ir_cudarose.hh
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef IR_CUDA_ROSE
-#define IR_CUDA_ROSE
-
-#include <code_gen/CG_roseRepr.h>
-#include <code_gen/CG_roseBuilder.h>
-#include "ir_rose.hh"
-#include "loop.hh"
-#include "loop_cuda_rose.hh"
-#include "ir_rose_utils.hh"
-
-
-
-class IR_cudaroseCode : public IR_roseCode{
-
-public:
-
-
- IR_cudaroseCode(const char *filename, const char* proc_name);
-
-
-
- SgGlobal *gsym_;
- SgScopeStatement* defn;
- SgGlobal* first_scope;
- SgSymbolTable* parameter;
- SgSymbolTable* body;
- SgFunctionDefinition* func_defn;
- std::vector<SgSymbolTable*> write_procs;//procs to write
-
-
- IR_ArraySymbol *CreateArraySymbol(const IR_Symbol *sym, std::vector<omega::CG_outputRepr *> &size,int sharedAnnotation = 1);
- omega::CG_outputRepr* init_code(){ return init_code_; }
- bool commit_loop(Loop *loop, int loop_num);
- std::vector<SgForStatement *> get_loops()
- {
- std::vector<SgForStatement *> loops = find_loops(func->get_definition()->get_body());
- return loops;
- }
-
- ~IR_cudaroseCode();
-
-};
-
-
-#endif
-
diff --git a/ir_cudasuif.cc b/ir_cudasuif.cc
deleted file mode 100644
index c646e13..0000000
--- a/ir_cudasuif.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
- CHiLL's SUIF interface.
-
- Notes:
- Array supports mixed pointer and array type in a single declaration.
-
- History:
- 2/2/2011 Created by Protonu Basu.
-*****************************************************************************/
-
-#include <typeinfo>
-#include "ir_cudasuif.hh"
-#include "loop.hh"
-#include "loop_cuda.hh"
-#include "ir_suif_utils.hh"
-
-
-IR_cudasuifCode::IR_cudasuifCode(const char *filename, int proc_num)
- :IR_suifCode(filename, proc_num)
-{
- //setting up gsym_ here
- fileset->reset_iter();
- gsym_ = fileset->globals();
-
-}
-
-
-
-IR_ArraySymbol *IR_cudasuifCode::CreateArraySymbol(const IR_Symbol *sym,
- std::vector<omega::CG_outputRepr *> &size,
- int sharedAnnotation)
-{
- type_node *tn;
-
- if (typeid(*sym) == typeid(IR_suifScalarSymbol)) {
- tn = static_cast<const IR_suifScalarSymbol *>(sym)->vs_->type();
- }
- else if (typeid(*sym) == typeid(IR_suifArraySymbol)) {
- tn = static_cast<const IR_suifArraySymbol *>(sym)->vs_->type();
- if (tn->is_modifier())
- tn = static_cast<modifier_type *>(tn)->base();
- while (tn->is_array() || tn->is_ptr()) {
- if (tn->is_array())
- tn = static_cast<array_type *>(tn)->elem_type();
- else if (tn->is_ptr())
- tn = static_cast<ptr_type *>(tn)->ref_type();
- }
- }
- else
- throw std::bad_typeid();
-
- if (is_fortran_)
- for (int i = 0; i < size.size(); i++) {
- var_sym *temporary = symtab_->new_unique_var(type_s32);
- init_code_ = ocg_->StmtListAppend(init_code_, ocg_->StmtListAppend(ocg_->CreateAssignment(0, new omega::CG_suifRepr(operand(temporary)), size[i]),NULL));
-
- tn = new array_type(tn, array_bound(1), array_bound(temporary));
- symtab_->add_type(tn);
- }
- else
- for (int i = size.size()-1; i >= 0; i--) {
- var_sym *temporary = symtab_->new_unique_var(type_s32);
- //init_code_ = ocg_->StmtListAppend(init_code_, ocg_->CreateStmtList(ocg_->CreateAssignment(0, new omega::CG_suifRepr(operand(temporary)), size[i])));
- init_code_ = ocg_->StmtListAppend(init_code_, ocg_->StmtListAppend(ocg_->CreateAssignment(0, new omega::CG_suifRepr(operand(temporary)), size[i]), NULL));
-
- tn = new array_type(tn, array_bound(1), array_bound(temporary));
- symtab_->add_type(tn);
- if(i == 0 && sharedAnnotation == 1){
- tn = static_cast<omega::CG_suifBuilder*>(ocg_)->ModifyType(tn, "__shared__");
- symtab_->add_type(tn);
- }
- }
-
- static int suif_array_counter = 1;
- std::string s = std::string("_P") + omega::to_string(suif_array_counter++);
- var_sym *vs = new var_sym(tn, const_cast<char *>(s.c_str()));
- vs->add_to_table(symtab_);
-
- return new IR_suifArraySymbol(this, vs);
-}
-
-
-bool IR_cudasuifCode::commit_loop(Loop *loop, int loop_num) {
- if (loop == NULL)
- return true;
-
- //Call code-gen part of any scripting routines that were run.
- // internally call GetCode
- // Add stuff before and after (setup, teardown
- // return a tnl
- LoopCuda *cu_loop = (LoopCuda *)loop;
- tree_node_list *tnl = cu_loop->codegen();
- if(!tnl)
- return false;
-
- //set up our new procs
- for(int i=0; i<cu_loop->new_procs.size(); i++)
- {
- printf("setting proc fse\n");
- cu_loop->new_procs[i]->set_fse(fse_);
- write_procs.push_back(cu_loop->new_procs[i]);
- }
-
- //Only thing that should be left will be the inserting of the tnl* into the loop
-
- omega::CG_outputRepr *repr = new omega::CG_suifRepr(tnl);
- if (cu_loop->init_code != NULL)
- repr = ocg_->StmtListAppend(cu_loop->init_code->clone(), repr);
-
- std::vector<tree_for *> loops = find_loops(psym_->block()->body());
- tnl = loops[loop_num]->parent();
-
- if (cu_loop->setup_code != NULL) {
- tree_node_list *setup_tnl = static_cast<omega::CG_suifRepr *>(cu_loop->setup_code->clone())->GetCode();
- //TODO: I think this is a hack we can undo if we have loop->codegen()
- //loo->getCode(), maybe also get rid of setup and teardown...
- //fix_unfinished_comment(setup_tnl, indexes_string);
- tnl->insert_before(setup_tnl, loops[loop_num]->list_e());
- }
- tnl->insert_before(static_cast<omega::CG_suifRepr *>(repr)->GetCode(), loops[loop_num]->list_e());
- if (cu_loop->teardown_code != NULL) {
- tree_node_list *setup_tnl = static_cast<omega::CG_suifRepr *>(cu_loop->teardown_code->clone())->GetCode();
- tnl->insert_before(setup_tnl, loops[loop_num]->list_e());
- }
-
- tnl->remove(loops[loop_num]->list_e());
-
- delete repr;
- return true;
-}
-
-IR_cudasuifCode::~IR_cudasuifCode()
-{
- for(int i=0; i<write_procs.size(); i++)
- {
- if (!write_procs[i]->is_written())
- write_procs[i]->write_proc(fse_);
- write_procs[i]->flush_proc();
- }
-}
diff --git a/ir_cudasuif.hh b/ir_cudasuif.hh
deleted file mode 100644
index 834778e..0000000
--- a/ir_cudasuif.hh
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef IR_CUDA_SUIF
-#define IR_CUDA_SUIF
-
-#include <code_gen/CG_suifRepr.h>
-#include <code_gen/CG_suifBuilder.h>
-#include "ir_suif.hh"
-#include "loop.hh"
-#include "loop_cuda.hh"
-#include "ir_suif_utils.hh"
-
-
-
-class IR_cudasuifCode : public IR_suifCode{
-
-public:
- global_symtab *gsym_;
- std::vector<proc_sym*> write_procs;//procs to write
-
-
- IR_cudasuifCode(const char *filename, int proc_num);
- IR_ArraySymbol *CreateArraySymbol(const IR_Symbol *sym,
- std::vector<omega::CG_outputRepr *> &size,
- int sharedAnnotation = 1);
- omega::CG_outputRepr* init_code(){ return init_code_; }
- bool commit_loop(Loop *loop, int loop_num);
- std::vector<tree_for *> get_loops()
- {
- std::vector<tree_for *> loops = find_loops(psym_->block()->body());
- return loops;
- }
- ~IR_cudasuifCode();
-
-};
-
-
-#endif
diff --git a/loop.cc b/loop.cc
index ce83006..0a82f7a 100644
--- a/loop.cc
+++ b/loop.cc
@@ -53,6 +53,7 @@ bool Loop::isInitialized() const {
bool Loop::init_loop(std::vector<ir_tree_node *> &ir_tree,
std::vector<ir_tree_node *> &ir_stmt) {
+
ir_stmt = extract_ir_stmts(ir_tree);
stmt_nesting_level_.resize(ir_stmt.size());
std::vector<int> stmt_nesting_level(ir_stmt.size());
diff --git a/loop_backup.cc b/loop_backup.cc
deleted file mode 100644
index b361ed4..0000000
--- a/loop_backup.cc
+++ /dev/null
@@ -1,3311 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2008 University of Southern California
- Copyright (C) 2009-2010 University of Utah
- All Rights Reserved.
-
- Purpose:
- Core loop transformation functionality.
-
- Notes:
- "level" (starting from 1) means loop level and it corresponds to "dim"
- (starting from 0) in transformed iteration space [c_1,l_1,c_2,l_2,....,
- c_n,l_n,c_(n+1)], e.g., l_2 is loop level 2 in generated code, dim 3
- in transformed iteration space, and variable 4 in Omega relation.
- All c's are constant numbers only and they will not show up as actual loops.
- Formula:
- dim = 2*level - 1
- var = dim + 1
-
- History:
- 10/2005 Created by Chun Chen.
- 09/2009 Expand tile functionality, -chun
- 10/2009 Initialize unfusible loop nest without bailing out, -chun
-*****************************************************************************/
-
-#include <limits.h>
-#include <math.h>
-#include <code_gen/code_gen.h>
-#include <code_gen/CG_outputBuilder.h>
-#include <code_gen/output_repr.h>
-#include <iostream>
-#include <map>
-#include "loop.hh"
-#include "omegatools.hh"
-#include "irtools.hh"
-#include "chill_error.hh"
-
-using namespace omega;
-
-const std::string Loop::tmp_loop_var_name_prefix = std::string("_t");
-const std::string Loop::overflow_var_name_prefix = std::string("over");
-
-//-----------------------------------------------------------------------------
-// Class Loop
-//-----------------------------------------------------------------------------
-
-bool Loop::init_loop(std::vector<ir_tree_node *> &ir_tree, std::vector<ir_tree_node *> &ir_stmt) {
- ir_stmt = extract_ir_stmts(ir_tree);
- std::vector<int> stmt_nesting_level(ir_stmt.size());
- for (int i = 0; i < ir_stmt.size(); i++) {
- ir_stmt[i]->payload = i;
- int t = 0;
- ir_tree_node *itn = ir_stmt[i];
- while (itn->parent != NULL) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP)
- t++;
- }
- stmt_nesting_level[i] = t;
- }
-
- stmt = std::vector<Statement>(ir_stmt.size());
- int n_dim = -1;
- int max_loc;
- std::vector<std::string> index;
- for (int i = 0; i < ir_stmt.size(); i++) {
- int max_nesting_level = -1;
- int loc;
- for (int j = 0; j < ir_stmt.size(); j++)
- if (stmt_nesting_level[j] > max_nesting_level) {
- max_nesting_level = stmt_nesting_level[j];
- loc = j;
- }
-
- // most deeply nested statement acting as a reference point
- if (n_dim == -1) {
- n_dim = max_nesting_level;
- max_loc = loc;
-
- index = std::vector<std::string>(n_dim);
-
- ir_tree_node *itn = ir_stmt[loc];
- int cur_dim = n_dim-1;
- while (itn->parent != NULL) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP) {
- index[cur_dim] = static_cast<IR_Loop *>(itn->content)->index()->name();
- itn->payload = cur_dim--;
- }
- }
- }
-
- // align loops by names, temporary solution
- ir_tree_node *itn = ir_stmt[loc];
- while (itn->parent != NULL) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == -1) {
- std::string name = static_cast<IR_Loop *>(itn->content)->index()->name();
- for (int j = 0; j < n_dim; j++)
- if (index[j] == name) {
- itn->payload = j;
- break;
- }
- if (itn->payload == -1)
- throw loop_error("no complex alignment yet");
- }
- }
-
- // set relation variable names
- Relation r(n_dim);
- F_And *f_root = r.add_and();
- itn = ir_stmt[loc];
- while (itn->parent != NULL) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP)
- r.name_set_var(itn->payload+1, static_cast<IR_Loop *>(itn->content)->index()->name());
- }
-
- // extract information from loop/if structures
- std::vector<bool> processed(n_dim, false);
- Tuple<std::string> vars_to_be_reversed;
- itn = ir_stmt[loc];
- while (itn->parent != NULL) {
- itn = itn->parent;
-
- switch (itn->content->type()) {
- case IR_CONTROL_LOOP: {
- IR_Loop *lp = static_cast<IR_Loop *>(itn->content);
- Variable_ID v = r.set_var(itn->payload+1);
- int c;
-
- try {
- c = lp->step_size();
- if (c > 0) {
- CG_outputRepr *lb = lp->lower_bound();
- exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_GE, true);
- CG_outputRepr *ub = lp->upper_bound();
- IR_CONDITION_TYPE cond = lp->stop_cond();
- if (cond == IR_COND_LT || cond == IR_COND_LE)
- exp2formula(ir, r, f_root, freevar, ub, v, 's', cond, true);
- else
- throw ir_error("loop condition not supported");
-
- }
- else if (c < 0) {
- CG_outputBuilder *ocg = ir->builder();
- CG_outputRepr *lb = lp->lower_bound();
- lb = ocg->CreateMinus(NULL, lb);
- exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_GE, true);
- CG_outputRepr *ub = lp->upper_bound();
- ub = ocg->CreateMinus(NULL, ub);
- IR_CONDITION_TYPE cond = lp->stop_cond();
- if (cond == IR_COND_GE)
- exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_LE, true);
- else if (cond == IR_COND_GT)
- exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_LT, true);
- else
- throw ir_error("loop condition not supported");
-
- vars_to_be_reversed.append(lp->index()->name());
- }
- else
- throw ir_error("loop step size zero");
- }
- catch (const ir_error &e) {
- for (int i = 0; i < itn->children.size(); i++)
- delete itn->children[i];
- itn->children = std::vector<ir_tree_node *>();
- itn->content = itn->content->convert();
- return false;
- }
-
- if (abs(c) != 1) {
- F_Exists *f_exists = f_root->add_exists();
- Variable_ID e = f_exists->declare();
- F_And *f_and = f_exists->add_and();
- Stride_Handle h = f_and->add_stride(abs(c));
- if (c > 0)
- h.update_coef(e, 1);
- else
- h.update_coef(e, -1);
- h.update_coef(v, -1);
- CG_outputRepr *lb = lp->lower_bound();
- exp2formula(ir, r, f_and, freevar, lb, e, 's', IR_COND_EQ, true);
- }
-
- processed[itn->payload] = true;
- break;
- }
- case IR_CONTROL_IF: {
- CG_outputRepr *cond = static_cast<IR_If *>(itn->content)->condition();
- try {
- if (itn->payload % 2 == 1)
- exp2constraint(ir, r, f_root, freevar, cond, true);
- else {
- F_Not *f_not = f_root->add_not();
- F_And *f_and = f_not->add_and();
- exp2constraint(ir, r, f_and, freevar, cond, true);
- }
- }
- catch (const ir_error &e) {
- std::vector<ir_tree_node *> *t;
- if (itn->parent == NULL)
- t = &ir_tree;
- else
- t = &(itn->parent->children);
- int id = itn->payload;
- int i = t->size() - 1;
- while (i >= 0) {
- if ((*t)[i] == itn) {
- for (int j = 0; j < itn->children.size(); j++)
- delete itn->children[j];
- itn->children = std::vector<ir_tree_node *>();
- itn->content = itn->content->convert();
- }
- else if ((*t)[i]->payload >> 1 == id >> 1) {
- delete (*t)[i];
- t->erase(t->begin()+i);
- }
- i--;
- }
- return false;
- }
-
- break;
- }
- default:
- for (int i = 0; i < itn->children.size(); i++)
- delete itn->children[i];
- itn->children = std::vector<ir_tree_node *>();
- itn->content = itn->content->convert();
- return false;
- }
- }
-
- // add information for missing loops
- for (int j = 0; j < n_dim; j++)
- if (!processed[j]) {
- ir_tree_node *itn = ir_stmt[max_loc];
- while (itn->parent != NULL) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == j)
- break;
- }
-
- Variable_ID v = r.set_var(j+1);
- if (loc < max_loc) {
- CG_outputRepr *lb = static_cast<IR_Loop *>(itn->content)->lower_bound();
- exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_EQ, true);
- }
- else { // loc > max_loc
- CG_outputRepr *ub = static_cast<IR_Loop *>(itn->content)->upper_bound();
- exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_EQ, true);
- }
- }
-
- r.setup_names();
- r.simplify();
-
- // insert the statement
- CG_outputBuilder *ocg = ir->builder();
- Tuple<CG_outputRepr *> reverse_expr;
- for (int j = 1; j <= vars_to_be_reversed.size(); j++) {
- CG_outputRepr *repl = ocg->CreateIdent(vars_to_be_reversed[j]);
- repl = ocg->CreateMinus(NULL, repl);
- reverse_expr.append(repl);
- }
- CG_outputRepr *code = static_cast<IR_Block *>(ir_stmt[loc]->content)->extract();
- code = ocg->CreatePlaceHolder(0, code, reverse_expr, vars_to_be_reversed);
- stmt[loc].code = code;
- stmt[loc].IS = r;
- stmt[loc].loop_level = std::vector<LoopLevel>(n_dim);
- for (int i = 0; i < n_dim; i++) {
- stmt[loc].loop_level[i].type = LoopLevelOriginal;
- stmt[loc].loop_level[i].payload = i;
- stmt[loc].loop_level[i].parallel_level = 0;
- }
-
- stmt_nesting_level[loc] = -1;
- }
-
- return true;
-}
-
-
-
-// Builds the loop representation for the code under `control`: extracts the
-// statements through init_loop(), records the data dependences between every
-// pair of statements, and gives each statement the default transformation
-// relation, e.g. [i, j] -> [0, i, 0, j, 0].
-Loop::Loop(const IR_Control *control) {
-  ir = const_cast<IR_Code *>(control->ir_);
-  init_code = NULL;
-  cleanup_code = NULL;
-  tmp_loop_var_name_counter = 1;
-  overflow_var_name_counter = 1;
-  known = Relation::True(0);
-
-  std::vector<ir_tree_node *> ir_tree = build_ir_tree(control->clone(), NULL);
-  std::vector<ir_tree_node *> ir_stmt;
-
-  // keep calling init_loop() until it reports success
-  bool initialized = false;
-  while (!initialized)
-    initialized = init_loop(ir_tree, ir_stmt);
-
-  // one dependence graph vertex per statement
-  const int num_stmt = stmt.size();
-  for (int s = 0; s < num_stmt; s++)
-    dep.insert();
-
-  // test each statement against itself and every later statement
-  for (int a = 0; a < num_stmt; a++) {
-    for (int b = a; b < num_stmt; b++) {
-      std::pair<std::vector<DependenceVector>, std::vector<DependenceVector> > dv = test_data_dependences(ir, stmt[a].code, stmt[a].IS, stmt[b].code, stmt[b].IS, freevar);
-
-      // first list: a -> b candidates; one that is not valid in that
-      // direction is stored reversed on the b -> a edge
-      std::vector<DependenceVector> &a_to_b = dv.first;
-      for (int k = 0; k < a_to_b.size(); k++) {
-        if (!is_dependence_valid(ir_stmt[a], ir_stmt[b], a_to_b[k], true))
-          dep.connect(b, a, a_to_b[k].reverse());
-        else
-          dep.connect(a, b, a_to_b[k]);
-      }
-
-      // second list: b -> a candidates, handled symmetrically
-      std::vector<DependenceVector> &b_to_a = dv.second;
-      for (int k = 0; k < b_to_a.size(); k++) {
-        if (!is_dependence_valid(ir_stmt[b], ir_stmt[a], b_to_a[k], false))
-          dep.connect(a, b, b_to_a[k].reverse());
-        else
-          dep.connect(b, a, b_to_a[k]);
-      }
-    }
-  }
-
-  // the IR tree is no longer needed
-  for (int t = 0; t < ir_tree.size(); t++)
-    delete ir_tree[t];
-
-  // default transformation relations, e.g. [i, j] -> [0, i, 0, j, 0]
-  for (int s = 0; s < num_stmt; s++) {
-    const int n = stmt[s].IS.n_set();
-    stmt[s].xform = Relation(n, 2*n+1);
-    Relation &xf = stmt[s].xform;
-    F_And *f_root = xf.add_and();
-
-    // even output positions carry the loop indices: out[2j] = in[j]
-    for (int j = 1; j <= n; j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(xf.output_var(2*j), 1);
-      h.update_coef(xf.input_var(j), -1);
-    }
-
-    // odd output positions are fixed to 0: out[2k+1] = 0
-    for (int k = 0; k <= n; k++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(xf.output_var(2*k+1), 1);
-    }
-    xf.simplify();
-  }
-
-  num_dep_dim = (num_stmt != 0) ? stmt[0].IS.n_set() : 0;
-}
-
-
-// Clears and deletes the code representation held by every statement, then
-// the optional init and cleanup code.
-Loop::~Loop() {
-  for (int s = 0; s < stmt.size(); s++) {
-    if (stmt[s].code == NULL)
-      continue;
-    stmt[s].code->clear();
-    delete stmt[s].code;
-  }
-
-  if (init_code != NULL) {
-    init_code->clear();
-    delete init_code;
-  }
-
-  if (cleanup_code != NULL) {
-    cleanup_code->clear();
-    delete cleanup_code;
-  }
-}
-
-
-// Returns the dependence dimension recorded for loop `level` (1-based) of
-// statement `stmt_num`. A tile level is followed to the level its payload
-// refers to until an original level is reached.
-// Returns -1 when `level` is outside the statement's loop levels or when a
-// tile level refers to a level below 1.
-// Throws std::invalid_argument for an unknown statement number and
-// loop_error when the loop level records are inconsistent.
-int Loop::get_dep_dim_of(int stmt_num, int level) const {
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-
-  if (level < 1 || level > stmt[stmt_num].loop_level.size())
-    return -1;
-
-  // trip_count bounds the number of tile hops so that a cycle among tile
-  // levels ends in an error instead of looping forever
-  int trip_count = 0;
-  while (true) {
-    switch (stmt[stmt_num].loop_level[level-1].type) {
-    case LoopLevelOriginal:
-      return stmt[stmt_num].loop_level[level-1].payload;
-    case LoopLevelTile:
-      // for a tile level the payload is the loop level it refers to
-      level = stmt[stmt_num].loop_level[level-1].payload;
-      if (level < 1)
-        return -1;
-      if (level > stmt[stmt_num].loop_level.size())
-        throw loop_error("incorrect loop level information for statement " + to_string(stmt_num));
-      break;
-    default:
-      throw loop_error("unknown loop level information for statement " + to_string(stmt_num));
-    }
-    trip_count++;
-    if (trip_count >= stmt[stmt_num].loop_level.size())
-      throw loop_error("incorrect loop level information for statement " + to_string(stmt_num));
-  }
-}
-
-
-// Returns the dependence dimension of the innermost original loop level that
-// lies strictly outside `level` (1-based) for statement `stmt_num`, or -1
-// when there is none. A `level` beyond the statement's depth is treated as
-// one past its last level, so every level of the statement is considered.
-// Throws std::invalid_argument for an unknown statement number.
-int Loop::get_last_dep_dim_before(int stmt_num, int level) const {
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-
-  if (level < 1)
-    return -1;
-  if (level > stmt[stmt_num].loop_level.size())
-    level = stmt[stmt_num].loop_level.size() + 1;
-
-  // walk outward from the level just outside `level`
-  for (int i = level-1; i >= 1; i--)
-    if (stmt[stmt_num].loop_level[i-1].type == LoopLevelOriginal)
-      return stmt[stmt_num].loop_level[i-1].payload;
-
-  return -1;
-}
-
-
-// Writes one line per statement to std::cout: "s<k>: " followed, for each
-// loop level, by the lexical order value at that level (when present) and the
-// level kind with its payload, then any remaining lexical order values.
-void Loop::print_internal_loop_structure() const {
-  for (int s = 0; s < stmt.size(); s++) {
-    const std::vector<int> lex = getLexicalOrder(s);
-    const std::vector<LoopLevel> &levels = stmt[s].loop_level;
-
-    std::cout << "s" << s+1 << ": ";
-
-    for (int j = 0; j < levels.size(); j++) {
-      if (2*j < lex.size())
-        std::cout << lex[2*j];
-
-      if (levels[j].type == LoopLevelOriginal)
-        std::cout << "(dim:" << levels[j].payload << ")";
-      else if (levels[j].type == LoopLevelTile)
-        std::cout << "(tile:" << levels[j].payload << ")";
-      else
-        std::cout << "(unknown)";
-
-      std::cout << ' ';
-    }
-
-    // lexical order entries past the last loop level, space separated
-    for (int j = 2*levels.size(); j < lex.size(); j+=2) {
-      std::cout << lex[j];
-      if (j != lex.size()-1)
-        std::cout << ' ';
-    }
-
-    std::cout << std::endl;
-  }
-}
-
-
-// Generates code for all statements with MMGenerateCode, using each
-// statement's code, iteration space and transformation relation together
-// with the `known` constraints extended to the transformed dimensionality.
-// Clones of init_code and cleanup_code, when set, are placed before and after
-// the generated code. Returns NULL when there are no statements.
-CG_outputRepr *Loop::getCode(int effort) const {
-  const int num_stmt = stmt.size();
-  if (num_stmt == 0)
-    return NULL;
-
-  // Tuple is indexed from 1, stmt from 0
-  Tuple<CG_outputRepr *> stmt_code(num_stmt);
-  Tuple<Relation> spaces(num_stmt);
-  Tuple<Relation> xforms(num_stmt);
-  for (int k = 1; k <= num_stmt; k++) {
-    stmt_code[k] = stmt[k-1].code;
-    spaces[k] = stmt[k-1].IS;
-    xforms[k] = stmt[k-1].xform;
-  }
-
-  const int n_out = stmt[0].xform.n_out();
-  Relation extended_known = Extend_Set(copy(this->known), n_out - this->known.n_set());
-
-  CG_outputBuilder *ocg = ir->builder();
-  CG_outputRepr *repr = MMGenerateCode(ocg, xforms, spaces, stmt_code, extended_known, effort);
-
-  if (init_code != NULL)
-    repr = ocg->StmtListAppend(init_code->clone(), repr);
-  if (cleanup_code != NULL)
-    repr = ocg->StmtListAppend(repr, cleanup_code->clone());
-
-  return repr;
-}
-
-
-// Streams to std::cout the result of MMGenerateCode for the statements'
-// iteration spaces and transformation relations, under the `known`
-// constraints extended to the transformed dimensionality. Prints nothing
-// when there are no statements.
-void Loop::printCode(int effort) const {
-  const int num_stmt = stmt.size();
-  if (num_stmt == 0)
-    return;
-
-  // Tuple is indexed from 1, stmt from 0
-  Tuple<Relation> spaces(num_stmt);
-  Tuple<Relation> xforms(num_stmt);
-  for (int k = 1; k <= num_stmt; k++) {
-    spaces[k] = stmt[k-1].IS;
-    xforms[k] = stmt[k-1].xform;
-  }
-
-  const int n_out = stmt[0].xform.n_out();
-  Relation extended_known = Extend_Set(copy(this->known), n_out - this->known.n_set());
-  std::cout << MMGenerateCode(xforms, spaces, extended_known, effort);
-}
-
-
-// Returns the iteration space of statement `stmt_num` intersected with the
-// `known` constraints. When the statement has a transformation relation, the
-// space is the range of that relation restricted to the statement's
-// iteration space; otherwise the untransformed iteration space is used.
-// Throws std::invalid_argument for an unknown statement number.
-Relation Loop::getNewIS(int stmt_num) const {
-  // validate like the other per-statement queries instead of indexing stmt
-  // out of range
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-
-  Relation result;
-
-  if (stmt[stmt_num].xform.is_null()) {
-    Relation known = Extend_Set(copy(this->known), stmt[stmt_num].IS.n_set() - this->known.n_set());
-    result = Intersection(copy(stmt[stmt_num].IS), known);
-  }
-  else {
-    Relation known = Extend_Set(copy(this->known), stmt[stmt_num].xform.n_out() - this->known.n_set());
-    result = Intersection(Range(Restrict_Domain(copy(stmt[stmt_num].xform), copy(stmt[stmt_num].IS))), known);
-  }
-
-  result.simplify(2, 4);
-
-  return result;
-}
-
-// Returns getNewIS(s) for every statement s, in statement order.
-std::vector<Relation> Loop::getNewIS() const {
-  const int num_stmt = stmt.size();
-  std::vector<Relation> spaces(num_stmt);
-
-  int s = 0;
-  while (s < num_stmt) {
-    spaces[s] = getNewIS(s);
-    s++;
-  }
-
-  return spaces;
-}
-
-
-// Applies the permutation `pi` to all statements by delegating to
-// permute(active, pi) with every statement number in the active set.
-void Loop::permute(const std::vector<int> &pi) {
-  std::set<int> all_stmts;
-  const int num_stmt = stmt.size();
-  for (int s = 0; s != num_stmt; ++s)
-    all_stmts.insert(s);
-
-  permute(all_stmts, pi);
-}
-
-
-// Calls setLexicalOrder(0, ...) with the set of all statement numbers.
-void Loop::original() {
-  std::set<int> all_stmts;
-  const int num_stmt = stmt.size();
-  for (int s = 0; s != num_stmt; ++s)
-    all_stmts.insert(s);
-
-  setLexicalOrder(0, all_stmts);
-}
-
-
-// Permutes a contiguous band of loop levels for the statements in `active`.
-// `level` is the smallest entry of `pi`; `pi` must be a permutation of
-// level .. level+pi.size()-1, and the loop currently at level pi[j] ends up
-// at level level+j. Transformation relations, the dependence graph and the
-// loop level records are updated, and the lexical order is reset from
-// position 2*level-2 on. Does nothing when `active` or `pi` is empty.
-// Throws std::invalid_argument when `pi` is not such a permutation, when a
-// statement number is unknown, when a statement lacks the permuted levels,
-// when the statements are not in the same subloop, or when their permuted
-// loop levels differ in type or payload. Throws loop_error when the
-// dependence graph cannot be updated or unknown level/dependence types are
-// found.
-void Loop::permute(const std::set<int> &active, const std::vector<int> &pi) {
-  if (active.size() == 0 || pi.size() == 0)
-    return;
-
-  // check for sanity of parameters
-  int level = pi[0];
-  for (int i = 1; i < pi.size(); i++)
-    if (pi[i] < level)
-      level = pi[i];
-  if (level < 1)
-    throw std::invalid_argument("invalid permutation");
-  // reverse_pi[old level - level] = new level; an entry left at 0 means some
-  // level in the band is missing from pi (e.g. pi has a duplicate)
-  std::vector<int> reverse_pi(pi.size(), 0);
-  for (int i = 0; i < pi.size(); i++)
-    if (pi[i] >= level+pi.size())
-      throw std::invalid_argument("invalid permutation");
-    else
-      reverse_pi[pi[i]-level] = i+level;
-  for (int i = 0; i < reverse_pi.size(); i++)
-    if (reverse_pi[i] == 0)
-      throw std::invalid_argument("invalid permutation");
-  int ref_stmt_num = -1;
-  std::vector<int> lex;
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    if (*i < 0 || *i >= stmt.size())
-      throw std::invalid_argument("invalid statement " + to_string(*i));
-    // every active statement, including the reference one, must own all of
-    // the permuted levels; loop_level is indexed with them below
-    if (level+pi.size()-1 > stmt[*i].loop_level.size())
-      throw std::invalid_argument("invalid permutation");
-    if (i == active.begin()) {
-      ref_stmt_num = *i;
-      lex = getLexicalOrder(*i);
-    }
-    else {
-      std::vector<int> lex2 = getLexicalOrder(*i);
-      for (int j = 0; j < 2*level-3; j+=2)
-        if (lex[j] != lex2[j])
-          throw std::invalid_argument("statements to permute must be in the same subloop");
-      for (int j = 0; j < pi.size(); j++)
-        if (!(stmt[*i].loop_level[level+j-1].type == stmt[ref_stmt_num].loop_level[level+j-1].type &&
-              stmt[*i].loop_level[level+j-1].payload == stmt[ref_stmt_num].loop_level[level+j-1].payload))
-          throw std::invalid_argument("permuted loops must have the same loop level types");
-    }
-  }
-
-  // Update transformation relations
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    int n = stmt[*i].xform.n_out();
-    Relation mapping(n, n);
-    F_And *f_root = mapping.add_and();
-    // odd positions are passed through unchanged
-    for (int j = 1; j <= n; j+= 2) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(j), 1);
-      h.update_coef(mapping.input_var(j), -1);
-    }
-    // permuted band: new level level+j takes old level pi[j]
-    for (int j = 0; j < pi.size(); j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(2*(level+j)), 1);
-      h.update_coef(mapping.input_var(2*pi[j]), -1);
-    }
-    // levels outside the band, before and after, are passed through
-    for (int j = 1; j < level; j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(2*j), 1);
-      h.update_coef(mapping.input_var(2*j), -1);
-    }
-    for (int j = level+pi.size(); j <= stmt[*i].loop_level.size(); j++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(2*j), 1);
-      h.update_coef(mapping.input_var(2*j), -1);
-    }
-
-    stmt[*i].xform = Composition(mapping, stmt[*i].xform);
-    stmt[*i].xform.simplify();
-  }
-
-  // get the permutation for dependence vectors
-  std::vector<int> t;
-  for (int i = 0; i < pi.size(); i++)
-    if (stmt[ref_stmt_num].loop_level[pi[i]-1].type == LoopLevelOriginal)
-      t.push_back(stmt[ref_stmt_num].loop_level[pi[i]-1].payload);
-  int max_dep_dim = -1;
-  int min_dep_dim = num_dep_dim;
-  for (int i = 0; i < t.size(); i++) {
-    if (t[i] > max_dep_dim)
-      max_dep_dim = t[i];
-    if (t[i] < min_dep_dim)
-      min_dep_dim = t[i];
-  }
-  // no original loop level in the band: nothing else to update
-  if (min_dep_dim > max_dep_dim)
-    return;
-  // the affected dependence dimensions must form a contiguous range
-  if (max_dep_dim - min_dep_dim + 1 != t.size())
-    throw loop_error("cannot update the dependence graph after permutation");
-  std::vector<int> dep_pi(num_dep_dim);
-  for (int i = 0; i < min_dep_dim; i++)
-    dep_pi[i] = i;
-  for (int i = min_dep_dim; i <= max_dep_dim; i++)
-    dep_pi[i] = t[i-min_dep_dim];
-  for (int i = max_dep_dim+1; i < num_dep_dim; i++)
-    dep_pi[i] = i;
-
-  // update the dependence graph
-  DependenceGraph g;
-  for (int i = 0; i < dep.vertex.size(); i++)
-    g.insert();
-  for (int i = 0; i < dep.vertex.size(); i++)
-    for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) {
-      if ((active.find(i) != active.end() && active.find(j->first) != active.end())) {
-        // both endpoints permuted: reorder the distance bounds
-        std::vector<DependenceVector> dv = j->second;
-        for (int k = 0; k < dv.size(); k++) {
-          switch (dv[k].type) {
-          case DEP_W2R:
-          case DEP_R2W:
-          case DEP_W2W:
-          case DEP_R2R: {
-            std::vector<coef_t> lbounds(num_dep_dim);
-            std::vector<coef_t> ubounds(num_dep_dim);
-            for (int d = 0; d < num_dep_dim; d++) {
-              lbounds[d] = dv[k].lbounds[dep_pi[d]];
-              ubounds[d] = dv[k].ubounds[dep_pi[d]];
-            }
-            dv[k].lbounds = lbounds;
-            dv[k].ubounds = ubounds;
-            break;
-          }
-          case DEP_CONTROL: {
-            break;
-          }
-          default:
-            throw loop_error("unknown dependence type");
-          }
-        }
-        g.connect(i, j->first, dv);
-      }
-      else if (active.find(i) == active.end() && active.find(j->first) == active.end()) {
-        // neither endpoint permuted: edge is copied unchanged
-        std::vector<DependenceVector> dv = j->second;
-        g.connect(i, j->first, dv);
-      }
-      else {
-        // only one endpoint permuted: every moved dimension becomes unbounded
-        std::vector<DependenceVector> dv = j->second;
-        for (int k = 0; k < dv.size(); k++)
-          switch (dv[k].type) {
-          case DEP_W2R:
-          case DEP_R2W:
-          case DEP_W2W:
-          case DEP_R2R: {
-            for (int d = 0; d < num_dep_dim; d++)
-              if (dep_pi[d] != d) {
-                dv[k].lbounds[d] = -posInfinity;
-                dv[k].ubounds[d] = posInfinity;
-              }
-            break;
-          }
-          case DEP_CONTROL:
-            break;
-          default:
-            throw loop_error("unknown dependence type");
-          }
-        g.connect(i, j->first, dv);
-      }
-    }
-  dep = g;
-
-  // update loop level information
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    int cur_dep_dim = min_dep_dim;
-    std::vector<LoopLevel> new_loop_level(stmt[*i].loop_level.size());
-    for (int j = 1; j <= stmt[*i].loop_level.size(); j++)
-      if (j >= level && j < level+pi.size()) {
-        switch (stmt[*i].loop_level[reverse_pi[j-level]-1].type) {
-        case LoopLevelOriginal:
-          new_loop_level[j-1].type = LoopLevelOriginal;
-          new_loop_level[j-1].payload = cur_dep_dim++;
-          new_loop_level[j-1].parallel_level = stmt[*i].loop_level[reverse_pi[j-level]-1].parallel_level;
-          break;
-        case LoopLevelTile: {
-          new_loop_level[j-1].type = LoopLevelTile;
-          int ref_level = stmt[*i].loop_level[reverse_pi[j-level]-1].payload;
-          if (ref_level >= level && ref_level < level+pi.size())
-            new_loop_level[j-1].payload = reverse_pi[ref_level-level];
-          else
-            new_loop_level[j-1].payload = ref_level;
-          new_loop_level[j-1].parallel_level = stmt[*i].loop_level[reverse_pi[j-level]-1].parallel_level;
-          break;
-        }
-        default:
-          throw loop_error("unknown loop level information for statement " + to_string(*i));
-        }
-      }
-      else {
-        switch (stmt[*i].loop_level[j-1].type) {
-        case LoopLevelOriginal:
-          new_loop_level[j-1].type = LoopLevelOriginal;
-          new_loop_level[j-1].payload = stmt[*i].loop_level[j-1].payload;
-          new_loop_level[j-1].parallel_level = stmt[*i].loop_level[j-1].parallel_level;
-          break;
-        case LoopLevelTile: {
-          new_loop_level[j-1].type = LoopLevelTile;
-          int ref_level = stmt[*i].loop_level[j-1].payload;
-          if (ref_level >= level && ref_level < level+pi.size())
-            new_loop_level[j-1].payload = reverse_pi[ref_level-level];
-          else
-            new_loop_level[j-1].payload = ref_level;
-          new_loop_level[j-1].parallel_level = stmt[*i].loop_level[j-1].parallel_level;
-          break;
-        }
-        default:
-          throw loop_error("unknown loop level information for statement " + to_string(*i));
-        }
-      }
-    stmt[*i].loop_level = new_loop_level;
-  }
-
-  setLexicalOrder(2*level-2, active);
-}
-
-std::set<int> Loop::split(int stmt_num, int level, const Relation &cond) { // splits the statements sharing loop `level` with stmt_num, once per inequality of cond; returns the numbers of the new statements cloned from stmt_num
- // check for sanity of parameters
- if (stmt_num < 0 || stmt_num >= stmt.size())
- throw std::invalid_argument("invalid statement " + to_string(stmt_num));
- if (level <= 0 || level > stmt[stmt_num].loop_level.size())
- throw std::invalid_argument("invalid loop level " + to_string(level));
-
- std::set<int> result;
- int dim = 2*level-1;
- std::vector<int> lex = getLexicalOrder(stmt_num);
- std::set<int> same_loop = getStatements(lex, dim-1);
-
- Relation cond2 = copy(cond); // work on a copy; the caller's relation is left untouched
- cond2.simplify();
- cond2 = EQs_to_GEQs(cond2); // presumably rewrites equalities as inequalities so the GEQ loop below sees them -- TODO confirm
- Conjunct *c = cond2.single_conjunct();
- int cur_lex = lex[dim-1]; // lexical order value at position dim-1 before any shifting
- for (GEQ_Iterator gi(c->GEQs()); gi; gi++) { // one split per inequality of the condition
- int max_level = (*gi).max_tuple_pos();
- Relation single_cond(max_level);
- single_cond.and_with_GEQ(*gi);
-
- // TODO: should decide where to place newly created statements with
- // complementary split condition from dependence graph.
- bool place_after;
- if (max_level == 0)
- place_after = true;
- else if ((*gi).get_coef(cond2.set_var(max_level)) < 0)
- place_after = true;
- else
- place_after = false;
-
- // make adjacent lexical number available for new statements
- if (place_after) {
- lex[dim-1] = cur_lex+1;
- shiftLexicalOrder(lex, dim-1, 1);
- }
- else {
- lex[dim-1] = cur_lex-1;
- shiftLexicalOrder(lex, dim-1, -1);
- }
-
- // original statements with split condition,
- // new statements with complement of split condition
- int old_num_stmt = stmt.size();
- std::map<int, int> what_stmt_num; // original statement number -> number of its new complement statement
- apply_xform(same_loop);
- for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
- int n = stmt[*i].IS.n_set();
- Relation part1, part2;
- if (max_level > n) { // the inequality uses a dimension this statement lacks: keep its space whole, complement is empty
- part1 = copy(stmt[*i].IS);
- part2 = Relation::False(0);
- }
- else {
- part1 = Intersection(copy(stmt[*i].IS), Extend_Set(copy(single_cond), n-max_level));
- part2 = Intersection(copy(stmt[*i].IS), Extend_Set(Complement(copy(single_cond)), n-max_level));
- }
-
- stmt[*i].IS = part1;
-
- if (Intersection(copy(part2), Extend_Set(copy(this->known), n-this->known.n_set())).is_upper_bound_satisfiable()) { // only create a new statement when the complement part can be non-empty under `known`
- Statement new_stmt;
- new_stmt.code = stmt[*i].code->clone();
- new_stmt.IS = part2;
- new_stmt.xform = copy(stmt[*i].xform);
- if (place_after)
- assign_const(new_stmt.xform, dim-1, cur_lex+1);
- else
- assign_const(new_stmt.xform, dim-1, cur_lex-1);
- new_stmt.loop_level = stmt[*i].loop_level;
- stmt.push_back(new_stmt);
- dep.insert();
- what_stmt_num[*i] = stmt.size() - 1;
- if (*i == stmt_num)
- result.insert(stmt.size() - 1);
- }
- }
-
- // update dependence graph
- int dep_dim = get_dep_dim_of(stmt_num, level);
- for (int i = 0; i < old_num_stmt; i++) {
- std::vector<std::pair<int, std::vector<DependenceVector> > > D; // edges leaving i, connected after the edge list loop; presumably to avoid changing the list while it is iterated
-
- for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) {
- if (same_loop.find(i) != same_loop.end()) {
- if (same_loop.find(j->first) != same_loop.end()) {
- if (what_stmt_num.find(i) != what_stmt_num.end() && what_stmt_num.find(j->first) != what_stmt_num.end())
- dep.connect(what_stmt_num[i], what_stmt_num[j->first], j->second); // both endpoints were split: copy the edge between the two new statements
- if (place_after && what_stmt_num.find(j->first) != what_stmt_num.end()) {
- std::vector<DependenceVector> dvs;
- for (int k = 0; k < j->second.size(); k++) {
- DependenceVector dv = j->second[k];
- if (dv.is_data_dependence() && dep_dim != -1) { // distance in the split dimension becomes unbounded
- dv.lbounds[dep_dim] = -posInfinity;
- dv.ubounds[dep_dim] = posInfinity;
- }
- dvs.push_back(dv);
- }
- if (dvs.size() > 0)
- D.push_back(std::make_pair(what_stmt_num[j->first], dvs));
- }
- else if (!place_after && what_stmt_num.find(i) != what_stmt_num.end()) {
- std::vector<DependenceVector> dvs;
- for (int k = 0; k < j->second.size(); k++) {
- DependenceVector dv = j->second[k];
- if (dv.is_data_dependence() && dep_dim != -1) { // distance in the split dimension becomes unbounded
- dv.lbounds[dep_dim] = -posInfinity;
- dv.ubounds[dep_dim] = posInfinity;
- }
- dvs.push_back(dv);
- }
- if (dvs.size() > 0)
- dep.connect(what_stmt_num[i], j->first, dvs);
-
- }
- }
- else {
- if (what_stmt_num.find(i) != what_stmt_num.end())
- dep.connect(what_stmt_num[i], j->first, j->second); // target outside the split loop: copy the edge for the new source statement
- }
- }
- else if (same_loop.find(j->first) != same_loop.end()) {
- if (what_stmt_num.find(j->first) != what_stmt_num.end())
- D.push_back(std::make_pair(what_stmt_num[j->first], j->second)); // source outside the split loop: copy the edge to the new target statement
- }
- }
-
- for (int j = 0; j < D.size(); j++)
- dep.connect(i, D[j].first, D[j].second);
- }
- }
-
- return result;
-}
-
-
-
-void Loop::tile(int stmt_num, int level, int tile_size, int outer_level, TilingMethodType method, int alignment_offset, int alignment_multiple) { // tiles loop `level` of stmt_num by tile_size, inserting the tile controlling loop at outer_level; tile_size 0 only inserts the extra level without tiling constraints
- // check for sanity of parameters
- if (tile_size < 0)
- throw std::invalid_argument("invalid tile size");
- if (alignment_multiple < 1 || alignment_offset < 0)
- throw std::invalid_argument("invalid alignment for tile");
- if (stmt_num < 0 || stmt_num >= stmt.size())
- throw std::invalid_argument("invalid statement " + to_string(stmt_num));
- if (level <= 0)
- throw std::invalid_argument("invalid loop level " + to_string(level));
- if (level > stmt[stmt_num].loop_level.size())
- throw std::invalid_argument("there is no loop level " + to_string(level) + " for statement " + to_string(stmt_num));
- if (outer_level <= 0 || outer_level > level)
- throw std::invalid_argument("invalid tile controlling loop level " + to_string(outer_level));
-
- int dim = 2*level-1; // the tiled loop index is set variable dim+1 of the transformed iteration space
- int outer_dim = 2*outer_level-1;
- std::vector<int> lex = getLexicalOrder(stmt_num);
- std::set<int> same_tiled_loop = getStatements(lex, dim-1);
- std::set<int> same_tile_controlling_loop = getStatements(lex, outer_dim-1);
-
- // special case for no tiling
- if (tile_size == 0) {
- for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) {
- Relation r(stmt[*i].xform.n_out(),stmt[*i].xform.n_out()+2); // mapping that widens the transformation by two output dimensions
- F_And *f_root = r.add_and();
- for (int j = 1; j <= 2*outer_level-1; j++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(r.input_var(j), 1);
- h.update_coef(r.output_var(j), -1);
- }
- EQ_Handle h1 = f_root->add_EQ();
- h1.update_coef(r.output_var(2*outer_level), 1); // the two inserted output dimensions are fixed to 0
- EQ_Handle h2 = f_root->add_EQ();
- h2.update_coef(r.output_var(2*outer_level+1), 1);
- for (int j = 2*outer_level; j <= stmt[*i].xform.n_out(); j++) { // remaining dimensions shift right by two
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(r.input_var(j), 1);
- h.update_coef(r.output_var(j+2), -1);
- }
-
- stmt[*i].xform = Composition(copy(r), stmt[*i].xform);
- }
- }
- // normal tiling
- else {
- std::set<int> private_stmt; // statements under the controlling loop, outside the tiled loop, that are recorded in `overflow`
- for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) {
-// if (same_tiled_loop.find(*i) == same_tiled_loop.end() && !is_single_iteration(getNewIS(*i), dim))
-// same_tiled_loop.insert(*i);
-
- // should test dim's value directly but it is ok for now
-// if (same_tiled_loop.find(*i) == same_tiled_loop.end() && get_const(stmt[*i].xform, dim+1, Output_Var) == posInfinity)
- if (same_tiled_loop.find(*i) == same_tiled_loop.end() && overflow.find(*i) != overflow.end())
- private_stmt.insert(*i);
- }
-
-
- // extract the union of the iteration space to be considered
- Relation hull;
- {
- Tuple<Relation> r_list;
- Tuple<int> r_mask;
-
- for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++)
- if (private_stmt.find(*i) == private_stmt.end()) { // private statements do not contribute to the hull
- Relation r = project_onto_levels(getNewIS(*i), dim+1, true);
- for (int j = outer_dim; j < dim; j++)
- r = Project(r, j+1, Set_Var);
- for (int j = 0; j < outer_dim; j += 2)
- r = Project(r, j+1, Set_Var);
- r_list.append(r);
- r_mask.append(1);
- }
-
- hull = Hull(r_list, r_mask, 1, true);
- }
-
- // extract the bound of the dimension to be tiled
- Relation bound = get_loop_bound(hull, dim);
- if (!bound.has_single_conjunct()) {
- // further simplify the bound
- hull = Approximate(hull);
- bound = get_loop_bound(hull, dim);
-
- int i = outer_dim - 2;
- while (!bound.has_single_conjunct() && i >= 0) { // project away outer dimensions, innermost first, until a single conjunct remains
- hull = Project(hull, i+1, Set_Var);
- bound = get_loop_bound(hull, dim);
- i -= 2;
- }
-
- if (!bound.has_single_conjunct())
- throw loop_error("cannot handle tile bounds");
- }
-
- // separate lower and upper bounds
- std::vector<GEQ_Handle> lb_list, ub_list;
- {
- Conjunct *c = bound.query_DNF()->single_conjunct();
- for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
- int coef = (*gi).get_coef(bound.set_var(dim+1));
- if (coef < 0) // negative coefficient on the tiled index: the inequality bounds it from above
- ub_list.push_back(*gi);
- else if (coef > 0)
- lb_list.push_back(*gi);
- }
- }
- if (lb_list.size() == 0)
- throw loop_error("unable to calculate tile controlling loop lower bound");
- if (ub_list.size() == 0)
- throw loop_error("unable to calculate tile controlling loop upper bound");
-
- // find the simplest lower bound for StridedTile or simplest iteration count for CountedTile
- int simplest_lb = 0, simplest_ub = 0;
- if (method == StridedTile) {
- int best_cost = INT_MAX;
- for (int i = 0; i < lb_list.size(); i++) {
- int cost = 0; // heuristic cost: 5 per input variable, 2 per global variable, 15 for anything else
- for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
- switch ((*ci).var->kind()) {
- case Input_Var: {
- cost += 5;
- break;
- }
- case Global_Var: {
- cost += 2;
- break;
- }
- default:
- cost += 15;
- break;
- }
- }
-
- if (cost < best_cost) {
- best_cost = cost;
- simplest_lb = i;
- }
- }
- }
- else if (method == CountedTile) {
- std::map<Variable_ID, coef_t> s1, s2, s3; // coefficient sums per variable for input, global and existential/wildcard variables; NOTE(review): declared outside the pair loops, so sums carry over between (i, j) pairs -- confirm intended
- int best_cost = INT_MAX;
- for (int i = 0; i < lb_list.size(); i++)
- for (int j = 0; j < ub_list.size(); j++) {
- int cost = 0;
-
- for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
- switch ((*ci).var->kind()) {
- case Input_Var: {
- s1[(*ci).var] += (*ci).coef;
- break;
- }
- case Global_Var: {
- s2[(*ci).var] += (*ci).coef;
- break;
- }
- case Exists_Var:
- case Wildcard_Var: {
- s3[(*ci).var] += (*ci).coef;
- break;
- }
- default:
- cost = INT_MAX-2;
- break;
- }
- }
-
- for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) {
- switch ((*ci).var->kind()) {
- case Input_Var: {
- s1[(*ci).var] += (*ci).coef;
- break;
- }
- case Global_Var: {
- s2[(*ci).var] += (*ci).coef;
- break;
- }
- case Exists_Var:
- case Wildcard_Var: {
- s3[(*ci).var] += (*ci).coef;
- break;
- }
- default:
- if (cost == INT_MAX-2)
- cost = INT_MAX-1;
- else
- cost = INT_MAX-3;
- break;
- }
- }
-
- if (cost == 0) { // only variables of known kinds were seen: charge each one whose summed coefficient is non-zero
- for (std::map<Variable_ID, coef_t>::iterator k = s1.begin(); k != s1.end(); k++)
- if ((*k).second != 0)
- cost += 5;
- for (std::map<Variable_ID, coef_t>::iterator k = s2.begin(); k != s2.end(); k++)
- if ((*k).second != 0)
- cost += 2;
- for (std::map<Variable_ID, coef_t>::iterator k = s3.begin(); k != s3.end(); k++)
- if ((*k).second != 0)
- cost += 15;
- }
-
- if (cost < best_cost) {
- best_cost = cost;
- simplest_lb = i;
- simplest_ub = j;
- }
- }
- }
-
- // prepare the new transformation relations
- for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) {
- Relation r(stmt[*i].xform.n_out(), stmt[*i].xform.n_out()+2); // mapping that widens the transformation by two output dimensions
- F_And *f_root = r.add_and();
- for (int j = 0; j < outer_dim-1; j++) { // dimensions before the insertion point are passed through
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(r.output_var(j+1), 1);
- h.update_coef(r.input_var(j+1), -1);
- }
-
- for (int j = outer_dim-1; j < stmt[*i].xform.n_out(); j++) { // dimensions from the insertion point on shift right by two
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(r.output_var(j+3), 1);
- h.update_coef(r.input_var(j+1), -1);
- }
-
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(r.output_var(outer_dim), 1);
- h.update_const(-lex[outer_dim-1]); // output outer_dim is fixed to stmt_num's lexical order value at that position
-
- stmt[*i].xform = Composition(r, stmt[*i].xform);
- }
-
- // add tiling constraints.
- for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) {
- F_And *f_super_root = stmt[*i].xform.and_with_and();
- F_Exists *f_exists = f_super_root->add_exists();
- F_And *f_root = f_exists->add_and();
-
- // create a lower bound variable for easy formula creation later
- Variable_ID aligned_lb;
- {
- Variable_ID lb = f_exists->declare();
- coef_t coef = lb_list[simplest_lb].get_coef(bound.set_var(dim+1));
- if (coef == 1) { // e.g. if i >= m+5, then LB = m+5
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(lb, 1);
- for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) {
- switch ((*ci).var->kind()) {
- case Input_Var: {
- int pos = (*ci).var->get_position();
- if (pos != dim + 1) // the tiled index itself is replaced by lb
- h.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef);
- break;
- }
- case Global_Var: {
- Global_Var_ID g = (*ci).var->get_global_var();
- Variable_ID v;
- if (g->arity() == 0)
- v = stmt[*i].xform.get_local(g);
- else
- v = stmt[*i].xform.get_local(g, (*ci).var->function_of());
- h.update_coef(v, (*ci).coef);
- break;
- }
- default:
- throw loop_error("cannot handle tile bounds");
- }
- }
- h.update_const(lb_list[simplest_lb].get_const());
- }
- else { // e.g. if 2i >= m+5, then m+5 <= 2*LB < m+5+2
- GEQ_Handle h1 = f_root->add_GEQ();
- GEQ_Handle h2 = f_root->add_GEQ();
- for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) {
- switch ((*ci).var->kind()) {
- case Input_Var: {
- int pos = (*ci).var->get_position();
- if (pos == dim + 1) {
- h1.update_coef(lb, (*ci).coef);
- h2.update_coef(lb, -(*ci).coef);
- }
- else {
- h1.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef);
- h2.update_coef(stmt[*i].xform.output_var(pos), -(*ci).coef);
- }
- break;
- }
- case Global_Var: {
- Global_Var_ID g = (*ci).var->get_global_var();
- Variable_ID v;
- if (g->arity() == 0)
- v = stmt[*i].xform.get_local(g);
- else
- v = stmt[*i].xform.get_local(g, (*ci).var->function_of());
- h1.update_coef(v, (*ci).coef);
- h2.update_coef(v, -(*ci).coef);
- break;
- }
- default:
- throw loop_error("cannot handle tile bounds");
- }
- }
- h1.update_const(lb_list[simplest_lb].get_const());
- h2.update_const(-lb_list[simplest_lb].get_const());
- h2.update_const(coef-1);
- }
-
- Variable_ID offset_lb;
- if (alignment_offset == 0)
- offset_lb = lb;
- else {
- EQ_Handle h = f_root->add_EQ();
- offset_lb = f_exists->declare();
- h.update_coef(offset_lb, 1);
- h.update_coef(lb, -1);
- h.update_const(alignment_offset); // offset_lb = lb - alignment_offset
- }
-
- if (alignment_multiple == 1) { // trivial
- aligned_lb = offset_lb;
- }
- else { // e.g. to align at 4, aligned_lb = 4*alpha && LB-4 < 4*alpha <= LB
- aligned_lb = f_exists->declare();
- Variable_ID e = f_exists->declare();
-
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(aligned_lb, 1);
- h.update_coef(e, -alignment_multiple);
-
- GEQ_Handle h1 = f_root->add_GEQ();
- GEQ_Handle h2 = f_root->add_GEQ();
- h1.update_coef(e, alignment_multiple);
- h2.update_coef(e, -alignment_multiple);
- h1.update_coef(offset_lb, -1);
- h2.update_coef(offset_lb, 1);
- h1.update_const(alignment_multiple-1);
- }
- }
-
- // create an upper bound variable for easy formula creation later
- Variable_ID ub = f_exists->declare();
- {
- coef_t coef = -ub_list[simplest_ub].get_coef(bound.set_var(dim+1)); // negated so that coef is positive for an upper bound
- if (coef == 1) { // e.g. if i <= m+5, then UB = m+5
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(ub, -1);
- for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) {
- switch ((*ci).var->kind()) {
- case Input_Var: {
- int pos = (*ci).var->get_position();
- if (pos != dim + 1) // the tiled index itself is replaced by ub
- h.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef);
- break;
- }
- case Global_Var: {
- Global_Var_ID g = (*ci).var->get_global_var();
- Variable_ID v;
- if (g->arity() == 0)
- v = stmt[*i].xform.get_local(g);
- else
- v = stmt[*i].xform.get_local(g, (*ci).var->function_of());
- h.update_coef(v, (*ci).coef);
- break;
- }
- default:
- throw loop_error("cannot handle tile bounds");
- }
- }
- h.update_const(ub_list[simplest_ub].get_const());
- }
- else { // e.g. if 2i <= m+5, then m+5-2 < 2*UB <= m+5
- GEQ_Handle h1 = f_root->add_GEQ();
- GEQ_Handle h2 = f_root->add_GEQ();
- for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) {
- switch ((*ci).var->kind()) {
- case Input_Var: {
- int pos = (*ci).var->get_position();
- if (pos == dim + 1) {
- h1.update_coef(ub, -(*ci).coef);
- h2.update_coef(ub, (*ci).coef);
- }
- else {
- h1.update_coef(stmt[*i].xform.output_var(pos), -(*ci).coef);
- h2.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef);
- }
- break;
- }
- case Global_Var: {
- Global_Var_ID g = (*ci).var->get_global_var();
- Variable_ID v;
- if (g->arity() == 0)
- v = stmt[*i].xform.get_local(g);
- else
- v = stmt[*i].xform.get_local(g, (*ci).var->function_of());
- h1.update_coef(v, -(*ci).coef);
- h2.update_coef(v, (*ci).coef);
- break;
- }
- default:
- throw loop_error("cannot handle tile bounds");
- }
- }
- h1.update_const(-ub_list[simplest_ub].get_const());
- h2.update_const(ub_list[simplest_ub].get_const());
- h1.update_const(coef-1);
- }
- }
-
- // insert tile controlling loop constraints
- if (method == StridedTile) { // e.g. ii = LB + 32 * alpha && alpha >= 0
- Variable_ID e = f_exists->declare();
- GEQ_Handle h1 = f_root->add_GEQ();
- h1.update_coef(e, 1);
-
- EQ_Handle h2 = f_root->add_EQ();
- h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), 1);
- h2.update_coef(e, -tile_size);
- h2.update_coef(aligned_lb, -1);
- }
- else if (method == CountedTile) { // e.g. 0 <= ii < ceiling((UB-LB+1)/32)
- GEQ_Handle h1 = f_root->add_GEQ();
- h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), 1);
-
- GEQ_Handle h2 = f_root->add_GEQ();
- h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), -tile_size);
- h2.update_coef(aligned_lb, -1);
- h2.update_coef(ub, 1);
- }
-
- // special care for private statements like overflow assignment
- if (private_stmt.find(*i) != private_stmt.end()) { // e.g. ii <= UB
- GEQ_Handle h = f_root->add_GEQ();
- h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
- h.update_coef(ub, 1);
- }
- // if (private_stmt.find(*i) != private_stmt.end()) {
- // if (stmt[*i].xform.n_out() > dim+3) { // e.g. ii <= UB && i = ii
- // GEQ_Handle h = f_root->add_GEQ();
- // h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
- // h.update_coef(ub, 1);
-
- // stmt[*i].xform = Project(stmt[*i].xform, dim+3, Output_Var);
- // f_root = stmt[*i].xform.and_with_and();
- // EQ_Handle h1 = f_root->add_EQ();
- // h1.update_coef(stmt[*i].xform.output_var(dim+3), 1);
- // h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
- // }
- // else if (method == StridedTile) { // e.g. ii <= UB since i does not exist
- // GEQ_Handle h = f_root->add_GEQ();
- // h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
- // h.update_coef(ub, 1);
- // }
- // }
-
- // restrict original loop index inside the tile
- else {
- if (method == StridedTile) { // e.g. ii <= i < ii + tile_size
- GEQ_Handle h1 = f_root->add_GEQ();
- h1.update_coef(stmt[*i].xform.output_var(dim+3), 1); // dim+3: the tiled index after the two inserted dimensions
- h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
-
- GEQ_Handle h2 = f_root->add_GEQ();
- h2.update_coef(stmt[*i].xform.output_var(dim+3), -1);
- h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), 1);
- h2.update_const(tile_size-1);
- }
- else if (method == CountedTile) { // e.g. LB+32*ii <= i < LB+32*ii+tile_size
- GEQ_Handle h1 = f_root->add_GEQ();
- h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -tile_size);
- h1.update_coef(stmt[*i].xform.output_var(dim+3), 1);
- h1.update_coef(aligned_lb, -1);
-
- GEQ_Handle h2 = f_root->add_GEQ();
- h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), tile_size);
- h2.update_coef(stmt[*i].xform.output_var(dim+3), -1);
- h2.update_const(tile_size-1);
- h2.update_coef(aligned_lb, 1);
- }
- }
- }
- }
-
- // update loop level information
- for (std::set<int>::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) {
- for (int j = 1; j <= stmt[*i].loop_level.size(); j++)
- switch (stmt[*i].loop_level[j-1].type) {
- case LoopLevelOriginal:
- break;
- case LoopLevelTile:
- if (stmt[*i].loop_level[j-1].payload >= outer_level) // referenced levels at or inside the insertion point move one level inward
- stmt[*i].loop_level[j-1].payload++;
- break;
- default:
- throw loop_error("unknown loop level type for statement " + to_string(*i));
- }
-
- LoopLevel ll;
- ll.type = LoopLevelTile;
- ll.payload = level+1; // the tiled loop sits at level+1 once the new level is inserted at outer_level
- ll.parallel_level = 0;
- stmt[*i].loop_level.insert(stmt[*i].loop_level.begin()+(outer_level-1), ll);
- }
-}
-
-
-
-// Unroll the loop at depth `level` that encloses statement `stmt_num`.
-//
-// Parameters:
-//   stmt_num      - index into `stmt`; must satisfy 0 <= stmt_num < stmt.size().
-//   level         - 1-based loop level; must satisfy
-//                   1 <= level <= stmt[stmt_num].loop_level.size().
-//   unroll_amount - requested unroll factor, must be >= 0.  1 is a no-op.
-//                   0 lets the routine take the factor from the constant part
-//                   of the trip-count expression; a request larger than a
-//                   constant trip count is reduced to that trip count.  If the
-//                   factor is still 0 afterwards the routine returns early.
-//
-// Returns: the union of the statement sets returned by the two split() calls
-//          (upper and lower splitting condition), or an empty set when there
-//          is nothing to do (factor 1, single-iteration loop, or a factor
-//          that cannot be determined).
-//
-// Throws:  std::invalid_argument for out-of-range parameters; loop_error when
-//          the loop bound cannot be extracted, when more than one stride is
-//          found, or when a bound mentions a variable kind other than
-//          Input_Var/Global_Var.
-//
-// Side effects visible in this function: may append statements to `stmt`,
-// add vertices/edges to `dep`, extend `init_code`, `overflow` and `known`,
-// and advance `overflow_var_name_counter`.
-std::set<int> Loop::unroll(int stmt_num, int level, int unroll_amount) {
-  // check for sanity of parameters
-  if (unroll_amount < 0)
-    throw std::invalid_argument("invalid unroll amount " + to_string(unroll_amount));
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-
-  // position of this loop's variable inside the 2*level+1 style schedule
-  int dim = 2*level - 1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_loop = getStatements(lex, dim-1);
-
-  // nothing to do
-  if (unroll_amount == 1)
-    return std::set<int>();
-
-  // extract the intersection of the iteration space to be considered
-  Relation hull = Relation::True(level);
-  apply_xform(same_loop);
-  for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-    if (stmt[*i].IS.is_upper_bound_satisfiable()) {
-      // identity mapping on the first `level` dimensions, dropping the rest
-      Relation mapping(stmt[*i].IS.n_set(), level);
-      F_And *f_root = mapping.add_and();
-      for (int j = 1; j <= level; j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(mapping.input_var(j), 1);
-        h.update_coef(mapping.output_var(j), -1);
-      }
-      hull = Intersection(hull, Range(Restrict_Domain(mapping, copy(stmt[*i].IS))));
-      hull.simplify(2, 4);
-    }
-  }
-  for (int i = 1; i <= level; i++) {
-    std::string name = tmp_loop_var_name_prefix + to_string(i);
-    hull.name_set_var(i, name);
-  }
-  hull.setup_names();
-
-  // extract the exact loop bound of the dimension to be unrolled
-  if (is_single_loop_iteration(hull, level, this->known))
-    return std::set<int>();
-  Relation bound = get_loop_bound(hull, level, this->known);
-  if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology())
-    throw loop_error("unable to extract loop bound for unrolling");
-
-  // extract the loop stride
-  EQ_Handle stride_eq;
-  int stride = 1;
-  {
-    bool simple_stride = true;
-    int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(level), stride_eq, simple_stride);
-    if (strides > 1)
-      throw loop_error("too many strides");
-    else if (strides == 1) {
-      int sign = stride_eq.get_coef(bound.set_var(level));
-      Constr_Vars_Iter it(stride_eq, true);
-      stride = abs((*it).coef/sign);
-    }
-  }
-
-  // separate lower and upper bounds by the sign of the loop variable's coefficient
-  std::vector<GEQ_Handle> lb_list, ub_list;
-  {
-    Conjunct *c = bound.query_DNF()->single_conjunct();
-    for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-      int coef = (*gi).get_coef(bound.set_var(level));
-      if (coef < 0)
-        ub_list.push_back(*gi);
-      else if (coef > 0)
-        lb_list.push_back(*gi);
-    }
-  }
-
-  // simplify overflow expression for each pair of upper and lower bounds.
-  // overflow_table[i][j] is indexed [lower bound][upper bound]; each entry maps
-  // a variable to its coefficient, with the NULL key holding the constant term.
-  std::vector<std::vector<std::map<Variable_ID, int> > > overflow_table(lb_list.size(), std::vector<std::map<Variable_ID, int> >(ub_list.size(), std::map<Variable_ID, int>()));
-  bool is_overflow_simplifiable = true;
-  for (int i = 0; i < lb_list.size(); i++) {
-    if (!is_overflow_simplifiable)
-      break;
-
-    for (int j = 0; j < ub_list.size(); j++) {
-      // lower bound or upper bound has non-unit coefficient, can't simplify
-      if (ub_list[j].get_coef(bound.set_var(level)) != -1 || lb_list[i].get_coef(bound.set_var(level)) != 1) {
-        is_overflow_simplifiable = false;
-        break;
-      }
-
-      // accumulate the upper bound's terms (except the loop variable itself)
-      for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) {
-        switch((*ci).var->kind()) {
-        case Input_Var:
-          {
-            if ((*ci).var != bound.set_var(level))
-              overflow_table[i][j][(*ci).var] += (*ci).coef;
-
-            break;
-          }
-        case Global_Var:
-          {
-            // NOTE(review): v is never read; the get_local() call is kept
-            // in case it has an effect on `bound` - confirm.
-            Global_Var_ID g = (*ci).var->get_global_var();
-            Variable_ID v;
-            if (g->arity() == 0)
-              v = bound.get_local(g);
-            else
-              v = bound.get_local(g, (*ci).var->function_of());
-            overflow_table[i][j][(*ci).var] += (*ci).coef;
-            break;
-          }
-        default:
-          throw loop_error("failed to calculate overflow amount");
-        }
-      }
-      overflow_table[i][j][NULL] += ub_list[j].get_const();
-
-      // add the lower bound's terms, dropping entries that cancel to zero
-      for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
-        switch((*ci).var->kind()) {
-        case Input_Var:
-          {
-            if ((*ci).var != bound.set_var(level)) {
-              overflow_table[i][j][(*ci).var] += (*ci).coef;
-              if (overflow_table[i][j][(*ci).var] == 0)
-                overflow_table[i][j].erase(overflow_table[i][j].find((*ci).var));
-            }
-            break;
-          }
-        case Global_Var:
-          {
-            // NOTE(review): v is never read; see the note in the loop above.
-            Global_Var_ID g = (*ci).var->get_global_var();
-            Variable_ID v;
-            if (g->arity() == 0)
-              v = bound.get_local(g);
-            else
-              v = bound.get_local(g, (*ci).var->function_of());
-            overflow_table[i][j][(*ci).var] += (*ci).coef;
-            if (overflow_table[i][j][(*ci).var] == 0)
-              overflow_table[i][j].erase(overflow_table[i][j].find((*ci).var));
-            break;
-          }
-        default:
-          throw loop_error("failed to calculate overflow amount");
-        }
-      }
-      overflow_table[i][j][NULL] += lb_list[i].get_const();
-
-      overflow_table[i][j][NULL] += stride;
-      // size() == 1 means only the constant (NULL) entry is left, i.e. the
-      // expression is a compile-time constant
-      if (unroll_amount == 0 || (overflow_table[i][j].size() == 1 && overflow_table[i][j][NULL]/stride < unroll_amount))
-        unroll_amount = overflow_table[i][j][NULL]/stride;
-    }
-  }
-
-  // loop iteration count can't be determined, bail out gracefully
-  if (unroll_amount == 0)
-    return std::set<int>();
-
-  // further simplify overflow calculation using the coefficients' moduli
-  if (is_overflow_simplifiable) {
-    for (int i = 0; i < lb_list.size(); i++)
-      for (int j = 0; j < ub_list.size(); j++)
-        if (stride == 1) {
-          // erase(k++) advances the iterator before the node is removed
-          for (std::map<Variable_ID, int>::iterator k = overflow_table[i][j].begin(); k != overflow_table[i][j].end(); )
-            if ((*k).first != NULL) {
-              int t = int_mod_hat((*k).second, unroll_amount);
-              if (t == 0) {
-                overflow_table[i][j].erase(k++);
-              }
-              else {
-                int t2 = hull.query_variable_mod((*k).first, unroll_amount);
-                if (t2 != INT_MAX) {
-                  overflow_table[i][j][NULL] += t * t2;
-                  overflow_table[i][j].erase(k++);
-                }
-                else {
-                  (*k).second = t;
-                  k++;
-                }
-              }
-            }
-            else
-              k++;
-
-          overflow_table[i][j][NULL] = int_mod_hat(overflow_table[i][j][NULL], unroll_amount);
-
-          // Since we don't have MODULO instruction in SUIF yet (only MOD), make all coef positive in the final formula
-          for (std::map<Variable_ID, int>::iterator k = overflow_table[i][j].begin(); k != overflow_table[i][j].end(); k++)
-            if ((*k).second < 0)
-              (*k).second += unroll_amount;
-        }
-  }
-
-
-  // build overflow statement
-  CG_outputBuilder *ocg = ir->builder();
-  CG_outputRepr *overflow_code = NULL;
-  Relation cond_upper(level), cond_lower(level);
-  Relation overflow_constraint(0);
-  F_And *overflow_constraint_root = overflow_constraint.add_and();
-  std::vector<Free_Var_Decl *> over_var_list;
-  if (is_overflow_simplifiable && lb_list.size() == 1) {
-    // single lower bound: fold the overflow into each upper bound
-    for (int i = 0; i < ub_list.size(); i++) {
-      if (overflow_table[0][i].size() == 1) {
-        // upper splitting condition
-        GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-        h.update_const(((overflow_table[0][i][NULL]/stride)%unroll_amount) * -stride);
-      }
-      else {
-        // upper splitting condition
-        std::string over_name = overflow_var_name_prefix + to_string(overflow_var_name_counter++);
-        Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-        over_var_list.push_back(over_free_var);
-        GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-        h.update_coef(cond_upper.get_local(over_free_var), -stride);
-
-        // insert constraint 0 <= overflow < unroll_amount
-        Variable_ID v = overflow_constraint.get_local(over_free_var);
-        GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-        h1.update_coef(v, 1);
-        GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-        h2.update_coef(v, -1);
-        h2.update_const(unroll_amount-1);
-
-        // create overflow assignment
-        bound.setup_names();
-        CG_outputRepr *rhs = NULL;
-        for (std::map<Variable_ID, int>::iterator j = overflow_table[0][i].begin(); j != overflow_table[0][i].end(); j++)
-          if ((*j).first != NULL) {
-            CG_outputRepr *t = ocg->CreateIdent((*j).first->name());
-            if ((*j).second != 1)
-              t = ocg->CreateTimes(ocg->CreateInt((*j).second), t);
-            rhs = ocg->CreatePlus(rhs, t);
-          }
-          else
-            if ((*j).second != 0)
-              rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second));
-
-        if (stride != 1)
-          rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride));
-        rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-
-        // the variable is zero-initialised in init_code and assigned its
-        // real value by the overflow statement
-        CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-        init_code = ocg->StmtListAppend(init_code, ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-        lhs = ocg->CreateIdent(over_name);
-        overflow_code = ocg->StmtListAppend(overflow_code, ocg->CreateAssignment(0, lhs, rhs));
-      }
-    }
-
-    // lower splitting condition
-    cond_lower.and_with_GEQ(lb_list[0]);
-  }
-  else if (is_overflow_simplifiable && ub_list.size() == 1) {
-    // single upper bound: fold the overflow into each lower bound
-    for (int i = 0; i < lb_list.size(); i++) {
-
-      if (overflow_table[i][0].size() == 1) {
-        // lower splitting condition
-        GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]);
-        h.update_const(overflow_table[i][0][NULL] * -stride);
-      }
-      else {
-        // lower splitting condition
-        std::string over_name = overflow_var_name_prefix + to_string(overflow_var_name_counter++);
-        Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-        over_var_list.push_back(over_free_var);
-        GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]);
-        h.update_coef(cond_lower.get_local(over_free_var), -stride);
-
-        // insert constraint 0 <= overflow < unroll_amount
-        Variable_ID v = overflow_constraint.get_local(over_free_var);
-        GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-        h1.update_coef(v, 1);
-        GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-        h2.update_coef(v, -1);
-        h2.update_const(unroll_amount-1);
-
-        // create overflow assignment.  The table is indexed [lower][upper] and
-        // there is exactly one upper bound here, so the entry for lower bound
-        // i is overflow_table[i][0] (indexing it as [0][i] would run past the
-        // single-element inner vector for i >= 1).
-        bound.setup_names();
-        CG_outputRepr *rhs = NULL;
-        for (std::map<Variable_ID, int>::iterator j = overflow_table[i][0].begin(); j != overflow_table[i][0].end(); j++)
-          if ((*j).first != NULL) {
-            CG_outputRepr *t = ocg->CreateIdent((*j).first->name());
-            if ((*j).second != 1)
-              t = ocg->CreateTimes(ocg->CreateInt((*j).second), t);
-            rhs = ocg->CreatePlus(rhs, t);
-          }
-          else
-            if ((*j).second != 0)
-              rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second));
-
-        if (stride != 1)
-          rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride));
-        rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-
-        CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-        init_code = ocg->StmtListAppend(init_code, ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-        lhs = ocg->CreateIdent(over_name);
-        overflow_code = ocg->StmtListAppend(overflow_code, ocg->CreateAssignment(0, lhs, rhs));
-      }
-    }
-
-    // upper splitting condition
-    cond_upper.and_with_GEQ(ub_list[0]);
-  }
-  else {
-    // general case: one overflow variable computed from max(lower bounds)
-    // and min(upper bounds)
-    std::string over_name = overflow_var_name_prefix + to_string(overflow_var_name_counter++);
-    Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-    over_var_list.push_back(over_free_var);
-
-    // NOTE(review): hull is created with `level` set variables, yet
-    // set_var(dim+1) (= 2*level) is passed below - confirm this index is
-    // what outputLBasRepr/outputUBasRepr expect.
-    Tuple<CG_outputRepr *> lb_repr_list, ub_repr_list;
-    for (int i = 0; i < lb_list.size(); i++) {
-      lb_repr_list.append(outputLBasRepr(ocg, lb_list[i], bound, bound.set_var(dim+1), stride, stride_eq, Relation::True(bound.n_set()), std::vector<CG_outputRepr *>(bound.n_set())));
-      cond_lower.and_with_GEQ(lb_list[i]);
-    }
-    for (int i = 0; i < ub_list.size(); i++) {
-      ub_repr_list.append(outputUBasRepr(ocg, ub_list[i], bound, bound.set_var(dim+1), stride, stride_eq, std::vector<CG_outputRepr *>(bound.n_set())));
-      GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-      h.update_coef(cond_upper.get_local(over_free_var), -stride);
-    }
-
-    // Start from NULL so an empty bound list does not leave these pointers
-    // indeterminate when they are handed to CreateMinus below.
-    // (Tuple indexing below uses 1 for the first element.)
-    CG_outputRepr *lbRepr = NULL, *ubRepr = NULL;
-    if (lb_repr_list.size() > 1)
-      lbRepr = ocg->CreateInvoke("max", lb_repr_list);
-    else if (lb_repr_list.size() == 1)
-      lbRepr = lb_repr_list[1];
-
-    if (ub_repr_list.size() > 1)
-      ubRepr = ocg->CreateInvoke("min", ub_repr_list);
-    else if (ub_repr_list.size() == 1)
-      ubRepr = ub_repr_list[1];
-
-    // create overflow assignment
-    bound.setup_names();
-    CG_outputRepr *rhs = ocg->CreatePlus(ocg->CreateMinus(ubRepr, lbRepr), ocg->CreateInt(1));
-    if (stride != 1)
-      rhs = ocg->CreateIntegerDivide(rhs, ocg->CreateInt(stride));
-    rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-    CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-    init_code = ocg->StmtListAppend(init_code, ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-    lhs = ocg->CreateIdent(over_name);
-    overflow_code = ocg->CreateAssignment(0, lhs, rhs);
-
-    // insert constraint 0 <= overflow < unroll_amount
-    Variable_ID v = overflow_constraint.get_local(over_free_var);
-    GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-    h1.update_coef(v, 1);
-    GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-    h2.update_coef(v, -1);
-    h2.update_const(unroll_amount-1);
-  }
-
-  // insert overflow statement
-  int overflow_stmt_num = -1;
-  if (overflow_code != NULL) {
-    // build iteration space for overflow statement: the hull projected onto
-    // the loops outside `level`
-    Relation mapping(level, level-1);
-    F_And *f_root = mapping.add_and();
-    for (int i = 1; i < level; i++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(mapping.output_var(i), 1);
-      h.update_coef(mapping.input_var(i), -1);
-    }
-    Relation overflow_IS = Range(Restrict_Domain(mapping, copy(hull)));
-    for (int i = 1; i < level; i++)
-      overflow_IS.name_set_var(i, hull.set_var(i)->name());
-    overflow_IS.setup_names();
-
-    // build dumb transformation relation for overflow statement
-    Relation overflow_xform(level-1, 2*(level-1)+1);
-    f_root = overflow_xform.add_and();
-    for (int i = 1; i <= level-1; i++) {
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(overflow_xform.output_var(2*i), 1);
-      h.update_coef(overflow_xform.input_var(i), -1);
-
-      h = f_root->add_EQ();
-      h.update_coef(overflow_xform.output_var(2*i-1), 1);
-      h.update_const(-lex[2*i-2]);
-    }
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(overflow_xform.output_var(2*(level-1)+1), 1);
-    h.update_const(-lex[2*(level-1)]);
-
-    // the overflow statement takes the loop's old lexical slot; statements
-    // at or after that slot are shifted by one
-    shiftLexicalOrder(lex, dim-1, 1);
-    Statement overflow_stmt;
-    overflow_stmt.code = overflow_code;
-    overflow_stmt.IS = overflow_IS;
-    overflow_stmt.xform = overflow_xform;
-    overflow_stmt.loop_level = std::vector<LoopLevel>(level-1);
-    for (int i = 0; i < level-1; i++) {
-      overflow_stmt.loop_level[i].type = stmt[stmt_num].loop_level[i].type;
-      if (stmt[stmt_num].loop_level[i].type == LoopLevelTile &&
-          stmt[stmt_num].loop_level[i].payload >= level)
-        overflow_stmt.loop_level[i].payload = -1;
-      else
-        overflow_stmt.loop_level[i].payload = stmt[stmt_num].loop_level[i].payload;
-      overflow_stmt.loop_level[i].parallel_level = stmt[stmt_num].loop_level[i].parallel_level;
-    }
-    stmt.push_back(overflow_stmt);
-    dep.insert();
-    overflow_stmt_num = stmt.size() - 1;
-    overflow[overflow_stmt_num] = over_var_list;
-
-    // update the global known information on overflow variable
-    this->known = Intersection(this->known, Extend_Set(copy(overflow_constraint), this->known.n_set()-overflow_constraint.n_set()));
-
-    // update dependence graph
-    DependenceVector dv;
-    dv.type = DEP_CONTROL;
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      dep.connect(overflow_stmt_num, *i, dv);
-    dv.type = DEP_W2W;
-    {
-      // pick the last written scalar reference in the overflow code as the
-      // symbol of the self write-write dependence
-      IR_ScalarSymbol *overflow_sym = NULL;
-      std::vector<IR_ScalarRef *> scalars = ir->FindScalarRef(overflow_code);
-      for (int i = scalars.size()-1; i >=0; i--)
-        if (scalars[i]->is_write()) {
-          overflow_sym = scalars[i]->symbol();
-          break;
-        }
-      for (int i = scalars.size()-1; i >=0; i--)
-        delete scalars[i];
-      dv.sym = overflow_sym;
-    }
-    dv.lbounds = std::vector<coef_t>(num_dep_dim, 0);
-    dv.ubounds = std::vector<coef_t>(num_dep_dim, 0);
-    int dep_dim = get_last_dep_dim_before(stmt_num, level);
-    for (int i = dep_dim + 1; i < num_dep_dim; i++) {
-      dv.lbounds[i] = -posInfinity;
-      dv.ubounds[i] = posInfinity;
-    }
-    // one self edge per outer dimension: zero distance on the dimensions
-    // before i, [1, +inf) on dimension i
-    for (int i = 0; i <= dep_dim; i++) {
-      if (i != 0) {
-        dv.lbounds[i-1] = 0;
-        dv.ubounds[i-1] = 0;
-      }
-      dv.lbounds[i] = 1;
-      dv.ubounds[i] = posInfinity;
-      dep.connect(overflow_stmt_num, overflow_stmt_num, dv);
-    }
-  }
-
-  // split the loop so it can be fully unrolled
-  std::set<int> result = split(stmt_num, level, cond_upper);
-  std::set<int> result2 = split(stmt_num, level, cond_lower);
-  for (std::set<int>::iterator i = result2.begin(); i != result2.end(); i++)
-    result.insert(*i);
-
-  // check if unrolled statements can be trivially lumped together as one statement
-  bool can_be_lumped = true;
-  if (can_be_lumped) {
-    // all statements must share loop level types/payloads and inner lexical order
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      if (*i != stmt_num) {
-        if (stmt[*i].loop_level.size() != stmt[stmt_num].loop_level.size()) {
-          can_be_lumped = false;
-          break;
-        }
-        for (int j = 0; j < stmt[stmt_num].loop_level.size(); j++)
-          if (!(stmt[*i].loop_level[j].type == stmt[stmt_num].loop_level[j].type &&
-                stmt[*i].loop_level[j].payload == stmt[stmt_num].loop_level[j].payload)) {
-            can_be_lumped = false;
-            break;
-          }
-        if (!can_be_lumped)
-          break;
-        std::vector<int> lex2 = getLexicalOrder(*i);
-        for (int j = 2*level; j < lex.size()-1; j+=2)
-          if (lex[j] != lex2[j]) {
-            can_be_lumped = false;
-            break;
-          }
-        if (!can_be_lumped)
-          break;
-      }
-  }
-  if (can_be_lumped) {
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      if (is_inner_loop_depend_on_level(stmt[*i].IS, level, known)) {
-        can_be_lumped = false;
-        break;
-      }
-  }
-  if (can_be_lumped) {
-    // iteration spaces must be equal (mutual subset)
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      if (*i != stmt_num) {
-        if (!(Must_Be_Subset(copy(stmt[*i].IS), copy(stmt[stmt_num].IS)) && Must_Be_Subset(copy(stmt[stmt_num].IS), copy(stmt[*i].IS)))) {
-          can_be_lumped = false;
-          break;
-        }
-      }
-  }
-  if (can_be_lumped) {
-    // no control/unknown dependence may exist between statements of the loop
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-      for (DependenceGraph::EdgeList::iterator j = dep.vertex[*i].second.begin(); j != dep.vertex[*i].second.end(); j++)
-        if (same_loop.find(j->first) != same_loop.end()) {
-          for (int k = 0; k < j->second.size(); k++)
-            if (j->second[k].type == DEP_CONTROL || j->second[k].type == DEP_UNKNOWN) {
-              can_be_lumped = false;
-              break;
-            }
-          if (!can_be_lumped)
-            break;
-        }
-      if (!can_be_lumped)
-        break;
-    }
-  }
-
-  // NOTE: earlier, disabled variants of this step built the unrolled copies
-  // per statement and adjusted their bounds for the overflow variables they
-  // depend on.  TO DO carried over from that code: check whether an overflow
-  // variable depends on this loop index and by how much; this is important
-  // if loops are to be unrolled in arbitrary order.
-
-  // insert unrolled statements
-  int old_num_stmt = stmt.size();
-  if (!can_be_lumped) {
-    // what_stmt_num[s][k] is the statement number of copy k+1 of statement s
-    std::map<int, std::vector<int> > what_stmt_num;
-
-    for (int j = 1; j < unroll_amount; j++) {
-      for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-        Statement new_stmt;
-
-        // copy j substitutes loop_var + j*stride for the loop variable
-        Tuple<CG_outputRepr *> funcList;
-        Tuple<std::string> loop_vars;
-        loop_vars.append(stmt[*i].IS.set_var(level)->name());
-        funcList.append(ocg->CreatePlus(ocg->CreateIdent(stmt[*i].IS.set_var(level)->name()), ocg->CreateInt(j*stride)));
-        new_stmt.code = ocg->CreatePlaceHolder(0, stmt[*i].code->clone(), funcList, loop_vars);
-
-        new_stmt.IS = adjust_loop_bound(stmt[*i].IS, level, j * stride);
-        add_loop_stride(new_stmt.IS, bound, level-1, unroll_amount * stride);
-
-        new_stmt.xform = copy(stmt[*i].xform);
-        new_stmt.loop_level = stmt[*i].loop_level;
-        stmt.push_back(new_stmt);
-        dep.insert();
-        what_stmt_num[*i].push_back(stmt.size() - 1);
-      }
-    }
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride);
-
-
-    // update dependence graph
-    if (stmt[stmt_num].loop_level[level-1].type == LoopLevelOriginal) {
-      int dep_dim = stmt[stmt_num].loop_level[level-1].payload;
-      int new_stride = unroll_amount * stride;
-      for (int i = 0; i < old_num_stmt; i++) {
-        // edges to (re)attach to vertex i once its edge list has been walked
-        std::vector<std::pair<int, DependenceVector> > D;
-
-        for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); ) {
-          if (same_loop.find(i) != same_loop.end()) {
-            if (same_loop.find(j->first) != same_loop.end()) {
-              // both endpoints are in the unrolled loop: rebuild the edge
-              // for every (source copy, target copy) pair
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.type == DEP_CONTROL || dv.type == DEP_UNKNOWN) {
-                  D.push_back(std::make_pair(j->first, dv));
-                  for (int kk = 0; kk < unroll_amount - 1; kk++)
-                    if (what_stmt_num[i][kk] != -1 && what_stmt_num[j->first][kk] != -1)
-                      dep.connect(what_stmt_num[i][kk], what_stmt_num[j->first][kk], dv);
-                }
-                else {
-                  coef_t lb = dv.lbounds[dep_dim];
-                  coef_t ub = dv.ubounds[dep_dim];
-                  if (ub == lb && int_mod(lb, static_cast<coef_t>(new_stride)) == 0) {
-                    // constant distance that is a multiple of the new stride:
-                    // copy k depends only on copy k
-                    D.push_back(std::make_pair(j->first, dv));
-                    for (int kk = 0; kk < unroll_amount - 1; kk++)
-                      if (what_stmt_num[i][kk] != -1 && what_stmt_num[j->first][kk] != -1)
-                        dep.connect(what_stmt_num[i][kk], what_stmt_num[j->first][kk], dv);
-                  }
-                  else if (lb == -posInfinity && ub == posInfinity) {
-                    // unbounded distance: every copy depends on every copy
-                    D.push_back(std::make_pair(j->first, dv));
-                    for (int kk = 0; kk < unroll_amount; kk++)
-                      if (kk == 0)
-                        D.push_back(std::make_pair(j->first, dv));
-                      else if (what_stmt_num[j->first][kk-1] != -1)
-                        D.push_back(std::make_pair(what_stmt_num[j->first][kk-1], dv));
-                    for (int t = 0; t < unroll_amount - 1; t++)
-                      if (what_stmt_num[i][t] != -1)
-                        for (int kk = 0; kk < unroll_amount; kk++)
-                          if (kk == 0)
-                            dep.connect(what_stmt_num[i][t], j->first, dv);
-                          else if (what_stmt_num[j->first][kk-1] != -1)
-                            dep.connect(what_stmt_num[i][t], what_stmt_num[j->first][kk-1], dv);
-                  }
-                  else {
-                    // bounded distance: round the bounds to multiples of the
-                    // new stride per target copy kk and keep non-empty ranges
-                    for (int kk = 0; kk < unroll_amount; kk++) {
-                      if (lb != -posInfinity) {
-                        if (kk * stride < int_mod(lb, static_cast<coef_t>(new_stride)))
-                          dv.lbounds[dep_dim] = floor(static_cast<double>(lb)/new_stride) * new_stride + new_stride;
-                        else
-                          dv.lbounds[dep_dim] = floor(static_cast<double>(lb)/new_stride) * new_stride;
-                      }
-                      if (ub != posInfinity) {
-                        if (kk * stride > int_mod(ub, static_cast<coef_t>(new_stride)))
-                          dv.ubounds[dep_dim] = floor(static_cast<double>(ub)/new_stride) * new_stride - new_stride;
-                        else
-                          dv.ubounds[dep_dim] = floor(static_cast<double>(ub)/new_stride) * new_stride;
-                      }
-                      if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) {
-                        if (kk == 0)
-                          D.push_back(std::make_pair(j->first, dv));
-                        else if (what_stmt_num[j->first][kk-1] != -1)
-                          D.push_back(std::make_pair(what_stmt_num[j->first][kk-1], dv));
-                      }
-                    }
-                    // NOTE(review): the modulus tests below use lb+t+1 and
-                    // ub+t+1 while the floor() terms use (t+1)*stride; these
-                    // differ when stride != 1 - confirm which is intended.
-                    for (int t = 0; t < unroll_amount-1; t++)
-                      if (what_stmt_num[i][t] != -1)
-                        for (int kk = 0; kk < unroll_amount; kk++) {
-                          if (lb != -posInfinity) {
-                            if (kk * stride < int_mod(lb+t+1, static_cast<coef_t>(new_stride)))
-                              dv.lbounds[dep_dim] = floor(static_cast<double>(lb+(t+1)*stride)/new_stride) * new_stride + new_stride;
-                            else
-                              dv.lbounds[dep_dim] = floor(static_cast<double>(lb+(t+1)*stride)/new_stride) * new_stride;
-                          }
-                          if (ub != posInfinity) {
-                            if (kk * stride > int_mod(ub+t+1, static_cast<coef_t>(new_stride)))
-                              dv.ubounds[dep_dim] = floor(static_cast<double>(ub+(t+1)*stride)/new_stride) * new_stride - new_stride;
-                            else
-                              dv.ubounds[dep_dim] = floor(static_cast<double>(ub+(t+1)*stride)/new_stride) * new_stride;
-                          }
-                          if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) {
-                            if (kk == 0)
-                              dep.connect(what_stmt_num[i][t], j->first, dv);
-                            else if (what_stmt_num[j->first][kk-1] != -1)
-                              dep.connect(what_stmt_num[i][t], what_stmt_num[j->first][kk-1], dv);
-                          }
-                        }
-                  }
-                }
-              }
-
-              // the old edge is replaced by the entries collected in D
-              dep.vertex[i].second.erase(j++);
-            }
-            else {
-              // target outside the loop: every copy of i inherits the edge
-              for (int kk = 0; kk < unroll_amount - 1; kk++)
-                if (what_stmt_num[i][kk] != -1)
-                  dep.connect(what_stmt_num[i][kk], j->first, j->second);
-
-              j++;
-            }
-          }
-          else {
-            // source outside the loop: add an edge to every copy of the target
-            if (same_loop.find(j->first) != same_loop.end())
-              for (int k = 0; k < j->second.size(); k++)
-                for (int kk = 0; kk < unroll_amount - 1; kk++)
-                  if (what_stmt_num[j->first][kk] != -1)
-                    D.push_back(std::make_pair(what_stmt_num[j->first][kk], j->second[k]));
-            j++;
-          }
-        }
-
-        for (int j = 0; j < D.size(); j++)
-          dep.connect(i, D[j].first, D[j].second);
-      }
-    }
-
-    // reset lexical order for the unrolled loop body
-    std::set<int> new_same_loop;
-    for (std::map<int, std::vector<int> >::iterator i = what_stmt_num.begin(); i != what_stmt_num.end(); i++) {
-      new_same_loop.insert(i->first);
-      for (int j = 0; j < i->second.size(); j++)
-        new_same_loop.insert(i->second[j]);
-    }
-    setLexicalOrder(dim+1, new_same_loop);
-  }
-  else {
-    // lumped case: all unrolled copies become a single new statement
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride);
-
-    // order the statements by the constant at their innermost lexical position
-    int max_level = stmt[stmt_num].loop_level.size();
-    std::vector<std::pair<int, int> > stmt_order;
-    for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-      stmt_order.push_back(std::make_pair(get_const(stmt[*i].xform, 2*max_level, Output_Var), *i));
-    sort(stmt_order.begin(), stmt_order.end());
-
-    Statement new_stmt;
-    new_stmt.code = NULL;
-    for (int j = 1; j < unroll_amount; j++)
-      for (int i = 0; i < stmt_order.size(); i++) {
-        Tuple<CG_outputRepr *> funcList;
-        Tuple<std::string> loop_vars;
-        loop_vars.append(stmt[stmt_order[i].second].IS.set_var(level)->name());
-        funcList.append(ocg->CreatePlus(ocg->CreateIdent(stmt[stmt_order[i].second].IS.set_var(level)->name()), ocg->CreateInt(j*stride)));
-        CG_outputRepr *code = ocg->CreatePlaceHolder(0, stmt[stmt_order[i].second].code->clone(), funcList, loop_vars);
-        new_stmt.code = ocg->StmtListAppend(new_stmt.code, code);
-      }
-
-    // the lumped statement is placed right after the last original statement
-    new_stmt.IS = copy(stmt[stmt_num].IS);
-    new_stmt.xform = copy(stmt[stmt_num].xform);
-    assign_const(new_stmt.xform, 2*max_level, stmt_order[stmt_order.size()-1].first+1);
-    new_stmt.loop_level = stmt[stmt_num].loop_level;
-    stmt.push_back(new_stmt);
-    dep.insert();
-
-    // update dependence graph; old_num_stmt is the index of the lumped statement
-    if (stmt[stmt_num].loop_level[level-1].type == LoopLevelOriginal) {
-      int dep_dim = stmt[stmt_num].loop_level[level-1].payload;
-      int new_stride = unroll_amount * stride;
-      for (int i = 0; i < old_num_stmt; i++) {
-        std::vector<std::pair<int, std::vector<DependenceVector> > > D;
-
-        for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); ) {
-          if (same_loop.find(i) != same_loop.end()) {
-            if (same_loop.find(j->first) != same_loop.end()) {
-              // suffix digits name source/target: 1 = original statement,
-              // 2 = lumped statement
-              std::vector<DependenceVector> dvs11, dvs12, dvs22, dvs21;
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.type == DEP_CONTROL || dv.type == DEP_UNKNOWN) {
-                  if (i == j->first) {
-                    dvs11.push_back(dv);
-                    dvs22.push_back(dv);
-                  }
-                  else
-                    throw loop_error("unrolled statements lumped together illegally");
-                }
-                else {
-                  coef_t lb = dv.lbounds[dep_dim];
-                  coef_t ub = dv.ubounds[dep_dim];
-                  if (ub == lb && int_mod(lb, static_cast<coef_t>(new_stride)) == 0) {
-                    dvs11.push_back(dv);
-                    dvs22.push_back(dv);
-                  }
-                  else {
-                    // round the distance range to multiples of the new
-                    // stride, once per source/target combination
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = ceil(static_cast<double>(lb)/new_stride) * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = floor(static_cast<double>(ub)/new_stride) * new_stride;
-                    if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                      dvs11.push_back(dv);
-
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = ceil(static_cast<double>(lb)/new_stride) * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = ceil(static_cast<double>(ub)/new_stride) * new_stride;
-                    if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                      dvs21.push_back(dv);
-
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = floor(static_cast<double>(lb)/new_stride) * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = floor(static_cast<double>(ub-stride)/new_stride) * new_stride;
-                    if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                      dvs12.push_back(dv);
-
-                    if (lb != -posInfinity)
-                      dv.lbounds[dep_dim] = floor(static_cast<double>(lb)/new_stride) * new_stride;
-                    if (ub != posInfinity)
-                      dv.ubounds[dep_dim] = ceil(static_cast<double>(ub-stride)/new_stride) * new_stride;
-                    if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                      dvs22.push_back(dv);
-                  }
-                }
-              }
-              // NOTE(review): the rebuilt dvs11/dvs21 edges target i rather
-              // than j->first - confirm this is intended when i != j->first.
-              if (dvs11.size() > 0)
-                D.push_back(std::make_pair(i, dvs11));
-              if (dvs22.size() > 0)
-                dep.connect(old_num_stmt, old_num_stmt, dvs22);
-              if (dvs12.size() > 0)
-                D.push_back(std::make_pair(old_num_stmt, dvs12));
-              if (dvs21.size() > 0)
-                dep.connect(old_num_stmt, i, dvs21);
-
-              dep.vertex[i].second.erase(j++);
-            }
-            else {
-              // target outside the loop: the lumped statement inherits the edge
-              dep.connect(old_num_stmt, j->first, j->second);
-              j++;
-            }
-          }
-          else {
-            // source outside the loop: add an edge to the lumped statement
-            if (same_loop.find(j->first) != same_loop.end())
-              D.push_back(std::make_pair(old_num_stmt, j->second));
-            j++;
-          }
-        }
-
-        for (int j = 0; j < D.size(); j++)
-          dep.connect(i, D[j].first, D[j].second);
-      }
-    }
-  }
-
-  return result;
-}
-
-
-// Return the lexical-order vector of statement `stmt_num`: one slot per
-// output dimension of its xform, where every even slot holds the constant
-// read via get_const() and every odd slot is left at 0.
-std::vector<int> Loop::getLexicalOrder(int stmt_num) const {
-  assert(stmt_num < stmt.size());
-
-  const int n = stmt[stmt_num].xform.n_out();
-  std::vector<int> order(n, 0);
-
-  // only the even positions carry lexical constants
-  int pos = 0;
-  while (pos < n) {
-    order[pos] = get_const(stmt[stmt_num].xform, pos, Output_Var);
-    pos += 2;
-  }
-
-  return order;
-}
-
-// Collect the statements whose lexical order agrees with `lex` on every even
-// position from 0 through `dim`.  A negative `dim` selects every statement.
-std::set<int> Loop::getStatements(const std::vector<int> &lex, int dim) const {
-  std::set<int> matched;
-  const int total = stmt.size();
-
-  for (int s = 0; s < total; s++) {
-    bool same_prefix = true;
-
-    if (dim >= 0) {
-      std::vector<int> other = getLexicalOrder(s);
-      // stop at the first even position where the two orders disagree
-      for (int pos = 0; pos <= dim && same_prefix; pos += 2)
-        same_prefix = (lex[pos] == other[pos]);
-    }
-
-    if (same_prefix)
-      matched.insert(s);
-  }
-
-  return matched;
-}
-
-
-void Loop::shiftLexicalOrder(const std::vector<int> &lex, int dim, int amount) {
- const int m = stmt.size();
-
- if (amount == 0)
- return;
-
- for (int i = 0; i < m; i++) {
- std::vector<int> lex2 = getLexicalOrder(i);
-
- bool need_shift = true;
-
- for (int j = 0; j < dim; j++)
- if (lex2[j] != lex[j]) {
- need_shift = false;
- break;
- }
-
- if (!need_shift)
- continue;
-
- if (amount > 0) {
- if (lex2[dim] < lex[dim])
- continue;
- }
- else if (amount < 0) {
- if (lex2[dim] > lex[dim])
- continue;
- }
-
- assign_const(stmt[i].xform, dim, lex2[dim] + amount);
- }
-}
-
-
-void Loop::setLexicalOrder(int dim, const std::set<int> &active, int starting_order) {
- if (active.size() == 0)
- return;
-
- // check for sanity of parameters
- if (dim < 0 || dim % 2 != 0)
- throw std::invalid_argument("invalid constant loop level to set lexicographical order");
- std::vector<int> lex;
- int ref_stmt_num;
- for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
- if ((*i) < 0 || (*i) >= stmt.size())
- throw std::invalid_argument("invalid statement number " + to_string(*i));
- if (dim >= stmt[*i].xform.n_out())
- throw std::invalid_argument("invalid constant loop level to set lexicographical order");
- if (i == active.begin()) {
- lex = getLexicalOrder(*i);
- ref_stmt_num = *i;
- }
- else {
- std::vector<int> lex2 = getLexicalOrder(*i);
- for (int j = 0; j < dim; j+=2)
- if (lex[j] != lex2[j])
- throw std::invalid_argument("statements are not in the same sub loop nest");
- }
- }
-
- // sepearate statements by current loop level types
- int level = (dim+2)/2;
- std::map<std::pair<LoopLevelType, int>, std::set<int> > active_by_level_type;
- std::set<int> active_by_no_level;
- for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
- if (level > stmt[*i].loop_level.size())
- active_by_no_level.insert(*i);
- else
- active_by_level_type[std::make_pair(stmt[*i].loop_level[level-1].type, stmt[*i].loop_level[level-1].payload)].insert(*i);
- }
-
- // further separate statements due to control dependences
- std::vector<std::set<int> > active_by_level_type_splitted;
- for (std::map<std::pair<LoopLevelType, int>, std::set<int> >::iterator i = active_by_level_type.begin(); i != active_by_level_type.end(); i++)
- active_by_level_type_splitted.push_back(i->second);
- for (std::set<int>::iterator i = active_by_no_level.begin(); i != active_by_no_level.end(); i++)
- for (int j = active_by_level_type_splitted.size() - 1; j >= 0; j--) {
- std::set<int> controlled, not_controlled;
- for (std::set<int>::iterator k = active_by_level_type_splitted[j].begin(); k != active_by_level_type_splitted[j].end(); k++) {
- std::vector<DependenceVector> dvs = dep.getEdge(*i, *k);
- bool is_controlled = false;
- for (int kk = 0; kk < dvs.size(); kk++)
- if (dvs[kk].type = DEP_CONTROL) {
- is_controlled = true;
- break;
- }
- if (is_controlled)
- controlled.insert(*k);
- else
- not_controlled.insert(*k);
- }
- if (controlled.size() != 0 && not_controlled.size() != 0) {
- active_by_level_type_splitted.erase(active_by_level_type_splitted.begin() + j);
- active_by_level_type_splitted.push_back(controlled);
- active_by_level_type_splitted.push_back(not_controlled);
- }
- }
-
- // set lexical order separating loops with different loop types first
- if (active_by_level_type_splitted.size() + active_by_no_level.size() > 1) {
- int dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1;
-
- Graph<std::set<int>, Empty> g;
- for (std::vector<std::set<int> >::iterator i = active_by_level_type_splitted.begin(); i != active_by_level_type_splitted.end(); i++)
- g.insert(*i);
- for (std::set<int>::iterator i = active_by_no_level.begin(); i != active_by_no_level.end(); i++) {
- std::set<int> t;
- t.insert(*i);
- g.insert(t);
- }
- for (int i = 0; i < g.vertex.size(); i++)
- for (int j = i+1; j < g.vertex.size(); j++) {
- bool connected = false;
- for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) {
- for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
- std::vector<DependenceVector> dvs = dep.getEdge(*ii, *jj);
- for (int k = 0; k < dvs.size(); k++)
- if (dvs[k].is_control_dependence() ||
- (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) {
- g.connect(i, j);
- connected = true;
- break;
- }
- if (connected)
- break;
- }
- if (connected)
- break;
- }
- connected = false;
- for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) {
- for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
- std::vector<DependenceVector> dvs = dep.getEdge(*jj, *ii);
- for (int k = 0; k < dvs.size(); k++)
- if (dvs[k].is_control_dependence() ||
- (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) {
- g.connect(j, i);
- connected = true;
- break;
- }
- if (connected)
- break;
- }
- if (connected)
- break;
- }
- }
-
- std::vector<std::set<int> > s = g.topoSort();
- if (s.size() != g.vertex.size())
- throw loop_error("cannot separate statements with different loop types at loop level " + to_string(level));
-
- // assign lexical order
- int order = starting_order;
- for (int i = 0; i < s.size(); i++) {
- std::set<int> &cur_scc = g.vertex[*(s[i].begin())].first;
- int sz = cur_scc.size();
- if (sz == 1) {
- int cur_stmt = *(cur_scc.begin());
- assign_const(stmt[cur_stmt].xform, dim, order);
- for (int j = dim+2; j < stmt[cur_stmt].xform.n_out(); j+=2)
- assign_const(stmt[cur_stmt].xform, j, 0);
- order++;
- }
- else {
- setLexicalOrder(dim, cur_scc, order);
- order += sz;
- }
- }
- }
- // set lexical order seperating single iteration statements and loops
- else {
- std::set<int> true_singles;
- std::set<int> nonsingles;
- std::map<coef_t, std::set<int> > fake_singles;
-
- // sort out statements that do not require loops
- for(std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
- Relation cur_IS = getNewIS(*i);
- if (is_single_iteration(cur_IS, dim+1)) {
- bool is_all_single = true;
- for (int j = dim+3; j < stmt[*i].xform.n_out(); j+=2)
- if (!is_single_iteration(cur_IS, j)) {
- is_all_single = false;
- break;
- }
- if (is_all_single)
- true_singles.insert(*i);
- else {
- try {
- fake_singles[get_const(cur_IS, dim+1, Set_Var)].insert(*i);
- }
- catch (const std::exception &e) {
- fake_singles[posInfinity].insert(*i);
- }
- }
- }
- else
- nonsingles.insert(*i);
- }
-
- // split nonsingles forcibly according to negative dependences present (loop unfusible)
- int dep_dim = get_dep_dim_of(ref_stmt_num, level);
- Graph<int, Empty> g2;
- for (std::set<int>::iterator i = nonsingles.begin(); i != nonsingles.end(); i++)
- g2.insert(*i);
- for (int i = 0; i < g2.vertex.size(); i++)
- for (int j = i+1; j < g2.vertex.size(); j++) {
- std::vector<DependenceVector> dvs = dep.getEdge(g2.vertex[i].first, g2.vertex[j].first);
- for (int k = 0; k < dvs.size(); k++)
- if (dvs[k].is_control_dependence() ||
- (dvs[k].is_data_dependence() && dvs[k].has_negative_been_carried_at(dep_dim))) {
- g2.connect(i, j);
- break;
- }
- dvs = dep.getEdge(g2.vertex[j].first, g2.vertex[i].first);
- for (int k = 0; k < dvs.size(); k++)
- if (dvs[k].is_control_dependence() ||
- (dvs[k].is_data_dependence() && dvs[k].has_negative_been_carried_at(dep_dim))) {
- g2.connect(j, i);
- break;
- }
- }
-
- std::vector<std::set<int> > s2 = g2.packed_topoSort();
-
- std::vector<std::set<int> > splitted_nonsingles;
- for (int i = 0; i < s2.size(); i++) {
- std::set<int> cur_scc;
- for (std::set<int>::iterator j = s2[i].begin(); j != s2[i].end(); j++)
- cur_scc.insert(g2.vertex[*j].first);
- splitted_nonsingles.push_back(cur_scc);
- }
-
- // convert to dependence graph for grouped statements
- dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1;
- Graph<std::set<int>, Empty> g;
- for (std::set<int>::iterator i = true_singles.begin(); i != true_singles.end(); i++) {
- std::set<int> t;
- t.insert(*i);
- g.insert(t);
- }
- for (int i = 0; i < splitted_nonsingles.size(); i++) {
- g.insert(splitted_nonsingles[i]);
- }
- for (std::map<coef_t, std::set<int> >::iterator i = fake_singles.begin(); i != fake_singles.end(); i++)
- g.insert((*i).second);
-
- for (int i = 0; i < g.vertex.size(); i++)
- for (int j = i + 1; j < g.vertex.size(); j++) {
- bool connected = false;
- for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) {
- for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
- std::vector<DependenceVector> dvs = dep.getEdge(*ii, *jj);
- for (int k = 0; k < dvs.size(); k++)
- if (dvs[k].is_control_dependence() ||
- (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) {
- g.connect(i, j);
- connected = true;
- break;
- }
- if (connected)
- break;
- }
- if (connected)
- break;
- }
- connected = false;
- for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) {
- for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
- std::vector<DependenceVector> dvs = dep.getEdge(*jj, *ii);
- for (int k = 0; k < dvs.size(); k++)
- if (dvs[k].is_control_dependence() ||
- (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) {
- g.connect(j, i);
- connected = true;
- break;
- }
- if (connected)
- break;
- }
- if (connected)
- break;
- }
- }
-
- // topological sort according to chun's permute algorithm
- std::vector<std::set<int> > s = g.topoSort();
-
- // assign lexical order
- int order = starting_order;
- for (int i = 0; i < s.size(); i++) {
- // translate each SCC into original statements
- std::set<int> cur_scc;
- for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
- copy(g.vertex[*j].first.begin(), g.vertex[*j].first.end(), inserter(cur_scc, cur_scc.begin()));
-
- // now assign the constant
- for(std::set<int>::iterator j = cur_scc.begin(); j != cur_scc.end(); j++)
- assign_const(stmt[*j].xform, dim, order);
-
- if (cur_scc.size() > 1)
- setLexicalOrder(dim+2, cur_scc);
- else if (cur_scc.size() == 1) {
- int cur_stmt =*(cur_scc.begin());
- for (int j = dim+2; j < stmt[cur_stmt].xform.n_out(); j+=2)
- assign_const(stmt[cur_stmt].xform, j, 0);
- }
-
- if (cur_scc.size() > 0)
- order++;
- }
- }
-}
-
-
-void Loop::apply_xform() {
- std::set<int> active;
- for (int i = 0; i < stmt.size(); i++)
- active.insert(i);
- apply_xform(active);
-}
-
-
-void Loop::apply_xform(int stmt_num) {
- std::set<int> active;
- active.insert(stmt_num);
- apply_xform(active);
-}
-
-
-void Loop::apply_xform(std::set<int> &active) {
- int max_n = 0;
-
- CG_outputBuilder *ocg = ir->builder();
- for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
- int n = stmt[*i].loop_level.size();
- if (n > max_n)
- max_n = n;
-
- std::vector<int> lex = getLexicalOrder(*i);
-
- Relation mapping(2*n+1, n);
- F_And *f_root = mapping.add_and();
- for (int j = 1; j <= n; j++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(j), 1);
- h.update_coef(mapping.input_var(2*j), -1);
- }
- mapping = Composition(mapping, stmt[*i].xform);
- mapping.simplify();
-
- // match omega input/output variables to variable names in the code
- for (int j = 1; j <= stmt[*i].IS.n_set(); j++)
- mapping.name_input_var(j, stmt[*i].IS.set_var(j)->name());
- for (int j = 1; j <= n; j++)
- mapping.name_output_var(j, tmp_loop_var_name_prefix + to_string(tmp_loop_var_name_counter+j-1));
- mapping.setup_names();
-
- Relation known = Extend_Set(copy(this->known), mapping.n_out() - this->known.n_set());
- //stmt[*i].code = outputStatement(ocg, stmt[*i].code, 0, mapping, known, std::vector<CG_outputRepr *>(mapping.n_out(), NULL));
- stmt[*i].code = outputStatement(ocg, stmt[*i].code, 0, mapping, known, std::vector<CG_outputRepr *>(mapping.n_out()));
- stmt[*i].IS = Range(Restrict_Domain(mapping, stmt[*i].IS));
- stmt[*i].IS.simplify();
-
- // replace original transformation relation with straight 1-1 mapping
- mapping = Relation(n, 2*n+1);
- f_root = mapping.add_and();
- for (int j = 1; j <= n; j++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(2*j), 1);
- h.update_coef(mapping.input_var(j), -1);
- }
- for (int j = 1; j <= 2*n+1; j+=2) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(j), 1);
- h.update_const(-lex[j-1]);
- }
- stmt[*i].xform = mapping;
- }
-
- tmp_loop_var_name_counter += max_n;
-}
-
-
-void Loop::addKnown(const Relation &cond) {
- int n1 = this->known.n_set();
-
- Relation r = copy(cond);
- int n2 = r.n_set();
-
- if (n1 < n2)
- this->known = Extend_Set(this->known, n2-n1);
- else if (n1 > n2)
- r = Extend_Set(r, n1-n2);
-
- this->known = Intersection(this->known, r);
-}
-
-
-bool Loop::nonsingular(const std::vector<std::vector<int> > &T) {
- if (stmt.size() == 0)
- return true;
-
- // check for sanity of parameters
- for (int i = 0; i < stmt.size(); i++) {
- if (stmt[i].loop_level.size() != num_dep_dim)
- throw std::invalid_argument("nonsingular loop transformations must be applied to original perfect loop nest");
- for (int j = 0; j < stmt[i].loop_level.size(); j++)
- if (stmt[i].loop_level[j].type != LoopLevelOriginal)
- throw std::invalid_argument("nonsingular loop transformations must be applied to original perfect loop nest");
- }
- if (T.size() != num_dep_dim)
- throw std::invalid_argument("invalid transformation matrix");
- for (int i = 0; i < stmt.size(); i++)
- if (T[i].size() != num_dep_dim + 1 && T[i].size() != num_dep_dim)
- throw std::invalid_argument("invalid transformation matrix");
-
- // build relation from matrix
- Relation mapping(2*num_dep_dim+1, 2*num_dep_dim+1);
- F_And *f_root = mapping.add_and();
- for (int i = 0; i < num_dep_dim; i++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(2*(i+1)), -1);
- for (int j = 0; j < num_dep_dim; j++)
- if (T[i][j] != 0)
- h.update_coef(mapping.input_var(2*(j+1)), T[i][j]);
- if (T[i].size() == num_dep_dim+1)
- h.update_const(T[i][num_dep_dim]);
- }
- for (int i = 1; i <= 2*num_dep_dim+1; i+=2) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(i), -1);
- h.update_coef(mapping.input_var(i), 1);
- }
-
- // update transformation relations
- for (int i = 0; i < stmt.size(); i++)
- stmt[i].xform = Composition(copy(mapping), stmt[i].xform);
-
- // update dependence graph
- for (int i = 0; i < dep.vertex.size(); i++)
- for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) {
- std::vector<DependenceVector> dvs = j->second;
- for (int k = 0; k < dvs.size(); k++) {
- DependenceVector &dv = dvs[k];
- switch (dv.type) {
- case DEP_W2R:
- case DEP_R2W:
- case DEP_W2W:
- case DEP_R2R: {
- std::vector<coef_t> lbounds(num_dep_dim), ubounds(num_dep_dim);
- for (int p = 0; p < num_dep_dim; p++) {
- coef_t lb = 0;
- coef_t ub = 0;
- for (int q = 0; q < num_dep_dim; q++) {
- if (T[p][q] > 0) {
- if (lb == -posInfinity || dv.lbounds[q] == -posInfinity)
- lb = -posInfinity;
- else
- lb += T[p][q] * dv.lbounds[q];
- if (ub == posInfinity || dv.ubounds[q] == posInfinity)
- ub = posInfinity;
- else
- ub += T[p][q] * dv.ubounds[q];
- }
- else if (T[p][q] < 0) {
- if (lb == -posInfinity || dv.ubounds[q] == posInfinity)
- lb = -posInfinity;
- else
- lb += T[p][q] * dv.ubounds[q];
- if (ub == posInfinity || dv.lbounds[q] == -posInfinity)
- ub = posInfinity;
- else
- ub += T[p][q] * dv.lbounds[q];
- }
- }
- if (T[p].size() == num_dep_dim+1) {
- if (lb != -posInfinity)
- lb += T[p][num_dep_dim];
- if (ub != posInfinity)
- ub += T[p][num_dep_dim];
- }
- lbounds[p] = lb;
- ubounds[p] = ub;
- }
- dv.lbounds = lbounds;
- dv.ubounds = ubounds;
-
- break;
- }
- default:
- ;
- }
- }
- j->second = dvs;
- }
-
- // set constant loop values
- std::set<int> active;
- for (int i = 0; i < stmt.size(); i++)
- active.insert(i);
- setLexicalOrder(0, active);
-
- return true;
-}
-
-
-void Loop::skew(const std::set<int> &stmt_nums, int level, const std::vector<int> &skew_amount) {
- if (stmt_nums.size() == 0)
- return;
-
- // check for sanity of parameters
- int ref_stmt_num = *(stmt_nums.begin());
- for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
- if (*i < 0 || *i >= stmt.size())
- throw std::invalid_argument("invalid statement number " + to_string(*i));
- if (level < 1 || level > stmt[*i].loop_level.size())
- throw std::invalid_argument("invalid loop level " + to_string(level));
- for (int j = stmt[*i].loop_level.size(); j < skew_amount.size(); j++)
- if (skew_amount[j] != 0)
- throw std::invalid_argument("invalid skewing formula");
- }
-
- // set trasformation relations
- for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
- int n = stmt[*i].xform.n_out();
- Relation r(n,n);
- F_And *f_root = r.add_and();
- for (int j = 1; j <= n; j++)
- if (j != 2*level) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(r.input_var(j), 1);
- h.update_coef(r.output_var(j), -1);
- }
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(r.output_var(2*level), -1);
- for (int j = 0; j < skew_amount.size(); j++)
- if (skew_amount[j] != 0)
- h.update_coef(r.input_var(2*(j+1)), skew_amount[j]);
-
- stmt[*i].xform = Composition(r, stmt[*i].xform);
- stmt[*i].xform.simplify();
- }
-
- // update dependence graph
- if (stmt[ref_stmt_num].loop_level[level-1].type == LoopLevelOriginal) {
- int dep_dim = stmt[ref_stmt_num].loop_level[level-1].payload;
- for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++)
- for (DependenceGraph::EdgeList::iterator j = dep.vertex[*i].second.begin(); j != dep.vertex[*i].second.end(); j++)
- if (stmt_nums.find(j->first) != stmt_nums.end()) {
- // dependence between skewed statements
- std::vector<DependenceVector> dvs = j->second;
- for (int k = 0; k < dvs.size(); k++) {
- DependenceVector &dv = dvs[k];
- if (dv.is_data_dependence()) {
- coef_t lb = 0;
- coef_t ub = 0;
- for (int kk = 0; kk < skew_amount.size(); kk++) {
- int cur_dep_dim = get_dep_dim_of(*i, kk+1);
- if (skew_amount[kk] > 0) {
- if (lb != -posInfinity &&
- stmt[*i].loop_level[kk].type == LoopLevelOriginal &&
- dv.lbounds[cur_dep_dim] != -posInfinity)
- lb += skew_amount[kk] * dv.lbounds[cur_dep_dim];
- else {
- if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0))
- lb = -posInfinity;
- }
- if (ub != posInfinity &&
- stmt[*i].loop_level[kk].type == LoopLevelOriginal &&
- dv.ubounds[cur_dep_dim] != posInfinity)
- ub += skew_amount[kk] * dv.ubounds[cur_dep_dim];
- else {
- if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0))
- ub = posInfinity;
- }
- }
- else if (skew_amount[kk] < 0) {
- if (lb != -posInfinity &&
- stmt[*i].loop_level[kk].type == LoopLevelOriginal &&
- dv.ubounds[cur_dep_dim] != posInfinity)
- lb += skew_amount[kk] * dv.ubounds[cur_dep_dim];
- else {
- if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0))
- lb = -posInfinity;
- }
- if (ub != posInfinity &&
- stmt[*i].loop_level[kk].type == LoopLevelOriginal &&
- dv.lbounds[cur_dep_dim] != -posInfinity)
- ub += skew_amount[kk] * dv.lbounds[cur_dep_dim];
- else {
- if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0))
- ub = posInfinity;
- }
- }
- }
- dv.lbounds[dep_dim] = lb;
- dv.ubounds[dep_dim] = ub;
- }
- }
- j->second = dvs;
- }
- else {
- // dependence from skewed statement to unskewed statement becomes jumbled,
- // put distance value at skewed dimension to unknown
- std::vector<DependenceVector> dvs = j->second;
- for (int k = 0; k < dvs.size(); k++) {
- DependenceVector &dv = dvs[k];
- if (dv.is_data_dependence()) {
- dv.lbounds[dep_dim] = -posInfinity;
- dv.ubounds[dep_dim] = posInfinity;
- }
- }
- j->second = dvs;
- }
- for (int i = 0; i < dep.vertex.size(); i++)
- if (stmt_nums.find(i) == stmt_nums.end())
- for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++)
- if (stmt_nums.find(j->first) != stmt_nums.end()) {
- // dependence from unskewed statement to skewed statement becomes jumbled,
- // put distance value at skewed dimension to unknown
- std::vector<DependenceVector> dvs = j->second;
- for (int k = 0; k < dvs.size(); k++) {
- DependenceVector &dv = dvs[k];
- if (dv.is_data_dependence()) {
- dv.lbounds[dep_dim] = -posInfinity;
- dv.ubounds[dep_dim] = posInfinity;
- }
- }
- j->second = dvs;
- }
- }
-}
-
-
-void Loop::shift(const std::set<int> &stmt_nums, int level, int shift_amount) {
- if (stmt_nums.size() == 0)
- return;
-
- // check for sanity of parameters
- int ref_stmt_num = *(stmt_nums.begin());
- for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
- if (*i < 0 || *i >= stmt.size())
- throw std::invalid_argument("invalid statement number " + to_string(*i));
- if (level < 1 || level > stmt[*i].loop_level.size())
- throw std::invalid_argument("invalid loop level " + to_string(level));
- }
-
- // do nothing
- if (shift_amount == 0)
- return;
-
- // set trasformation relations
- for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
- int n = stmt[*i].xform.n_out();
-
- Relation r(n, n);
- F_And *f_root = r.add_and();
- for (int j = 1; j <= n; j++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(r.input_var(j), 1);
- h.update_coef(r.output_var(j), -1);
- if (j == 2*level)
- h.update_const(shift_amount);
- }
-
- stmt[*i].xform = Composition(r, stmt[*i].xform);
- stmt[*i].xform.simplify();
- }
-
- // update dependence graph
- if (stmt[ref_stmt_num].loop_level[level-1].type == LoopLevelOriginal) {
- int dep_dim = stmt[ref_stmt_num].loop_level[level-1].payload;
- for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++)
- for (DependenceGraph::EdgeList::iterator j = dep.vertex[*i].second.begin(); j != dep.vertex[*i].second.end(); j++)
- if (stmt_nums.find(j->first) == stmt_nums.end()) {
- // dependence from shifted statement to unshifted statement
- std::vector<DependenceVector> dvs = j->second;
- for (int k = 0; k < dvs.size(); k++) {
- DependenceVector &dv = dvs[k];
- if (dv.is_data_dependence()) {
- if (dv.lbounds[dep_dim] != -posInfinity)
- dv.lbounds[dep_dim] -= shift_amount;
- if (dv.ubounds[dep_dim] != posInfinity)
- dv.ubounds[dep_dim] -= shift_amount;
- }
- }
- j->second = dvs;
- }
- for (int i = 0; i < dep.vertex.size(); i++)
- if (stmt_nums.find(i) == stmt_nums.end())
- for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++)
- if (stmt_nums.find(j->first) != stmt_nums.end()) {
- // dependence from unshifted statement to shifted statement
- std::vector<DependenceVector> dvs = j->second;
- for (int k = 0; k < dvs.size(); k++) {
- DependenceVector &dv = dvs[k];
- if (dv.is_data_dependence()) {
- if (dv.lbounds[dep_dim] != -posInfinity)
- dv.lbounds[dep_dim] += shift_amount;
- if (dv.ubounds[dep_dim] != posInfinity)
- dv.ubounds[dep_dim] += shift_amount;
- }
- }
- j->second = dvs;
- }
- }
-}
-
-
-
-// bool Loop::fuse(const std::set<int> &stmt_nums, int level) {
-// if (stmt_nums.size() == 0 || stmt_nums.size() == 1)
-// return true;
-// int dim = 2*level-1;
-
-// // check for sanity of parameters
-// std::vector<int> ref_lex;
-// for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-// if (*i < 0 || *i >= stmt.size())
-// throw std::invalid_argument("invalid statement number " + to_string(*i));
-// if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2)
-// throw std::invalid_argument("invalid loop level " + to_string(level));
-// if (ref_lex.size() == 0)
-// ref_lex = getLexicalOrder(*i);
-// else {
-// std::vector<int> lex = getLexicalOrder(*i);
-// for (int j = 0; j < dim-1; j+=2)
-// if (lex[j] != ref_lex[j])
-// throw std::invalid_argument("statements for fusion must be in the same level-" + to_string(level-1) + " subloop");
-// }
-// }
-
-// // collect lexicographical order values from to-be-fused statements
-// std::set<int> lex_values;
-// for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-// std::vector<int> lex = getLexicalOrder(*i);
-// lex_values.insert(lex[dim-1]);
-// }
-// if (lex_values.size() == 1)
-// return true;
-
-// // negative dependence would prevent fusion
-// int dep_dim = xform_index[dim].first;
-// for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++) {
-// ref_lex[dim-1] = *i;
-// std::set<int> a = getStatements(ref_lex, dim-1);
-// std::set<int>::iterator j = i;
-// j++;
-// for (; j != lex_values.end(); j++) {
-// ref_lex[dim-1] = *j;
-// std::set<int> b = getStatements(ref_lex, dim-1);
-// for (std::set<int>::iterator ii = a.begin(); ii != a.end(); ii++)
-// for (std::set<int>::iterator jj = b.begin(); jj != b.end(); jj++) {
-// std::vector<DependenceVector> dvs;
-// dvs = dep.getEdge(*ii, *jj);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim))
-// throw loop_error("loop error: statements " + to_string(*ii) + " and " + to_string(*jj) + " cannot be fused together due to negative dependence");
-// dvs = dep.getEdge(*jj, *ii);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim))
-// throw loop_error("loop error: statements " + to_string(*jj) + " and " + to_string(*ii) + " cannot be fused together due to negative dependence");
-// }
-// }
-// }
-
-// // collect all other lexicographical order values from the subloop
-// // enclosing these to-be-fused loops
-// std::set<int> same_loop = getStatements(ref_lex, dim-3);
-// std::set<int> other_lex_values;
-// for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-// std::vector<int> lex = getLexicalOrder(*i);
-// if (lex_values.find(lex[dim-1]) == lex_values.end())
-// other_lex_values.insert(lex[dim-1]);
-// }
-
-// // update to-be-fused loops due to dependence cycle
-// Graph<std::set<int>, Empty> g;
-// {
-// std::set<int> t;
-// for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++) {
-// ref_lex[dim-1] = *i;
-// std::set<int> t2 = getStatements(ref_lex, dim-1);
-// std::set_union(t.begin(), t.end(), t2.begin(), t2.end(), inserter(t, t.begin()));
-// }
-// g.insert(t);
-// }
-// for (std::set<int>::iterator i = other_lex_values.begin(); i != other_lex_values.end(); i++) {
-// ref_lex[dim-1] = *i;
-// std::set<int> t = getStatements(ref_lex, dim-1);
-// g.insert(t);
-// }
-// for (int i = 0; i < g.vertex.size(); i++)
-// for (int j = i+1; j < g.vertex.size(); j++)
-// for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++)
-// for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
-// std::vector<DependenceVector> dvs;
-// dvs = dep.getEdge(*ii, *jj);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g.connect(i, j);
-// break;
-// }
-// dvs = dep.getEdge(*jj, *ii);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g.connect(j, i);
-// break;
-// }
-// }
-// std::vector<std::set<int> > s = g.topoSort();
-// int fused_lex_value = 0;
-// for (int i = 0; i < s.size(); i++)
-// if (s[i].find(0) != s[i].end()) {
-// // now add additional lexicographical order values
-// for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-// if (*j != 0) {
-// int stmt = *(g.vertex[*j].first.begin());
-// std::vector<int> lex = getLexicalOrder(stmt);
-// lex_values.insert(lex[dim-1]);
-// }
-
-// if (s.size() > 1) {
-// if (i == 0) {
-// int min_lex_value;
-// for (std::set<int>::iterator j = s[i+1].begin(); j != s[i+1].end(); j++) {
-// int stmt = *(g.vertex[*j].first.begin());
-// std::vector<int> lex = getLexicalOrder(stmt);
-// if (j == s[i+1].begin())
-// min_lex_value = lex[dim-1];
-// else if (lex[dim-1] < min_lex_value)
-// min_lex_value = lex[dim-1];
-// }
-// fused_lex_value = min_lex_value - 1;
-// }
-// else {
-// int max_lex_value;
-// for (std::set<int>::iterator j = s[i-1].begin(); j != s[i-1].end(); j++) {
-// int stmt = *(g.vertex[*j].first.begin());
-// std::vector<int> lex = getLexicalOrder(stmt);
-// if (j == s[i-1].begin())
-// max_lex_value = lex[dim-1];
-// else if (lex[dim-1] > max_lex_value)
-// max_lex_value = lex[dim-1];
-// }
-// fused_lex_value = max_lex_value + 1;
-// }
-// }
-
-// break;
-// }
-
-// // sort the newly updated to-be-fused lexicographical order values
-// std::vector<int> ordered_lex_values;
-// for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++)
-// ordered_lex_values.push_back(*i);
-// std::sort(ordered_lex_values.begin(), ordered_lex_values.end());
-
-// // make sure internal loops inside to-be-fused loops have the same
-// // lexicographical order before and after fusion
-// std::vector<std::pair<int, int> > inside_lex_range(ordered_lex_values.size());
-// for (int i = 0; i < ordered_lex_values.size(); i++) {
-// ref_lex[dim-1] = ordered_lex_values[i];
-// std::set<int> the_stmts = getStatements(ref_lex, dim-1);
-// std::set<int>::iterator j = the_stmts.begin();
-// std::vector<int> lex = getLexicalOrder(*j);
-// int min_inside_lex_value = lex[dim+1];
-// int max_inside_lex_value = lex[dim+1];
-// j++;
-// for (; j != the_stmts.end(); j++) {
-// std::vector<int> lex = getLexicalOrder(*j);
-// if (lex[dim+1] < min_inside_lex_value)
-// min_inside_lex_value = lex[dim+1];
-// if (lex[dim+1] > max_inside_lex_value)
-// max_inside_lex_value = lex[dim+1];
-// }
-// inside_lex_range[i].first = min_inside_lex_value;
-// inside_lex_range[i].second = max_inside_lex_value;
-// }
-// for (int i = 1; i < ordered_lex_values.size(); i++)
-// if (inside_lex_range[i].first <= inside_lex_range[i-1].second) {
-// int shift_lex_value = inside_lex_range[i-1].second - inside_lex_range[i].first + 1;
-// ref_lex[dim-1] = ordered_lex_values[i];
-// ref_lex[dim+1] = inside_lex_range[i].first;
-// shiftLexicalOrder(ref_lex, dim+1, shift_lex_value);
-// inside_lex_range[i].first += shift_lex_value;
-// inside_lex_range[i].second += shift_lex_value;
-// }
-
-// // set lexicographical order for fused loops
-// for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-// std::vector<int> lex = getLexicalOrder(*i);
-// if (lex_values.find(lex[dim-1]) != lex_values.end())
-// assign_const(stmt[*i].xform, dim-1, fused_lex_value);
-// }
-
-// // no need to update dependence graph
-// ;
-
-// return true;
-// }
-
-
-// bool Loop::distribute(const std::set<int> &stmt_nums, int level) {
-// if (stmt_nums.size() == 0 || stmt_nums.size() == 1)
-// return true;
-// int dim = 2*level-1;
-
-// // check for sanity of parameters
-// std::vector<int> ref_lex;
-// for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-// if (*i < 0 || *i >= stmt.size())
-// throw std::invalid_argument("invalid statement number " + to_string(*i));
-// if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2)
-// throw std::invalid_argument("invalid loop level " + to_string(level));
-// if (ref_lex.size() == 0)
-// ref_lex = getLexicalOrder(*i);
-// else {
-// std::vector<int> lex = getLexicalOrder(*i);
-// for (int j = 0; j <= dim-1; j+=2)
-// if (lex[j] != ref_lex[j])
-// throw std::invalid_argument("statements for distribution must be in the same level-" + to_string(level) + " subloop");
-// }
-// }
-
-// // find SCC in the to-be-distributed loop
-// int dep_dim = xform_index[dim].first;
-// std::set<int> same_loop = getStatements(ref_lex, dim-1);
-// Graph<int, Empty> g;
-// for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-// g.insert(*i);
-// for (int i = 0; i < g.vertex.size(); i++)
-// for (int j = i+1; j < g.vertex.size(); j++) {
-// std::vector<DependenceVector> dvs;
-// dvs = dep.getEdge(g.vertex[i].first, g.vertex[j].first);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g.connect(i, j);
-// break;
-// }
-// dvs = dep.getEdge(g.vertex[j].first, g.vertex[i].first);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g.connect(j, i);
-// break;
-// }
-// }
-// std::vector<std::set<int> > s = g.topoSort();
-
-// // find statements that cannot be distributed due to dependence cycle
-// Graph<std::set<int>, Empty> g2;
-// for (int i = 0; i < s.size(); i++) {
-// std::set<int> t;
-// for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-// if (stmt_nums.find(g.vertex[*j].first) != stmt_nums.end())
-// t.insert(g.vertex[*j].first);
-// if (!t.empty())
-// g2.insert(t);
-// }
-// for (int i = 0; i < g2.vertex.size(); i++)
-// for (int j = i+1; j < g2.vertex.size(); j++)
-// for (std::set<int>::iterator ii = g2.vertex[i].first.begin(); ii != g2.vertex[i].first.end(); ii++)
-// for (std::set<int>::iterator jj = g2.vertex[j].first.begin(); jj != g2.vertex[j].first.end(); jj++) {
-// std::vector<DependenceVector> dvs;
-// dvs = dep.getEdge(*ii, *jj);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g2.connect(i, j);
-// break;
-// }
-// dvs = dep.getEdge(*jj, *ii);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g2.connect(j, i);
-// break;
-// }
-// }
-// std::vector<std::set<int> > s2 = g2.topoSort();
-
-// // nothing to distribute
-// if (s2.size() == 1)
-// throw loop_error("loop error: no statement can be distributed due to dependence cycle");
-
-// std::vector<std::set<int> > s3;
-// for (int i = 0; i < s2.size(); i++) {
-// std::set<int> t;
-// for (std::set<int>::iterator j = s2[i].begin(); j != s2[i].end(); j++)
-// std::set_union(t.begin(), t.end(), g2.vertex[*j].first.begin(), g2.vertex[*j].first.end(), inserter(t, t.begin()));
-// s3.push_back(t);
-// }
-
-// // associate other affected statements with the right distributed statements
-// for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-// if (stmt_nums.find(*i) == stmt_nums.end()) {
-// bool is_inserted = false;
-// int potential_insertion_point = 0;
-// for (int j = 0; j < s3.size(); j++) {
-// for (std::set<int>::iterator k = s3[j].begin(); k != s3[j].end(); k++) {
-// std::vector<DependenceVector> dvs;
-// dvs = dep.getEdge(*i, *k);
-// for (int kk = 0; kk < dvs.size(); kk++)
-// if (dvs[kk].isCarried(dep_dim)) {
-// s3[j].insert(*i);
-// is_inserted = true;
-// break;
-// }
-// dvs = dep.getEdge(*k, *i);
-// for (int kk = 0; kk < dvs.size(); kk++)
-// if (dvs[kk].isCarried(dep_dim))
-// potential_insertion_point = j;
-// }
-// if (is_inserted)
-// break;
-// }
-
-// if (!is_inserted)
-// s3[potential_insertion_point].insert(*i);
-// }
-
-// // set lexicographical order after distribution
-// int order = ref_lex[dim-1];
-// shiftLexicalOrder(ref_lex, dim-1, s3.size()-1);
-// for (std::vector<std::set<int> >::iterator i = s3.begin(); i != s3.end(); i++) {
-// for (std::set<int>::iterator j = (*i).begin(); j != (*i).end(); j++)
-// assign_const(stmt[*j].xform, dim-1, order);
-// order++;
-// }
-
-// // no need to update dependence graph
-// ;
-
-// return true;
-// }
-
-
-
-
-
-
-
-
diff --git a/loop_cuda.cc b/loop_cuda.cc
deleted file mode 100644
index a23990d..0000000
--- a/loop_cuda.cc
+++ /dev/null
@@ -1,2123 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
- Cudaize methods
-
- Notes:
-
- History:
- 1/7/10 Created by Gabe Rudy by migrating code from loop.cc
- 31/1/11 Modified by Protonu Basu
-*****************************************************************************/
-
-#include <code_gen/code_gen.h>
-#include <code_gen/CG_stringBuilder.h>
-#include <code_gen/output_repr.h>
-#include <code_gen/CG_outputRepr.h>
-#include "loop_cuda.hh"
-#include "loop.hh"
-#include <math.h>
-#include <useful.h>
-#include "omegatools.hh"
-#include "ir_cudasuif.hh"
-#include "ir_suif.hh"
-#include "ir_suif_utils.hh"
-#include "chill_error.hh"
-#include <vector>
-
-using namespace omega;
-char *k_cuda_texture_memory; //protonu--annotation key for texture memory types (registered with ANNOTE in cudaize_codegen_v2)
-char *k_cuda_constant_memory; //protonu--annotation key for constant memory types (registered with ANNOTE in cudaize_codegen_v2)
-//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type
-extern char *omega::k_ocg_comment; //annotation key under which the marker comments are read back with peek_annote() in this file
-
-
-// File-local debug switch; the debug printf output in this file is guarded by it.
-static int cudaDebug;
-
-// Exists only so that its constructor assigns cudaDebug during static initialization.
-class CudaStaticInit {
-public:
-    CudaStaticInit() {
-        cudaDebug = 0; //Change this to 1 for debug
-    }
-};
-
-static CudaStaticInit junkInitInstance__;
-
-
-
-/**
- * Upper-cases every character of s in place.
- *
- * @param s string to convert; it is modified
- * @return reference to the same string object that was passed in
- */
-std::string& upcase(std::string& s)
-{
-    for (std::string::size_type i = 0; i < s.size(); i++) {
-        // toupper() is only defined for EOF and values representable as
-        // unsigned char, so a plain (possibly negative) char must be
-        // converted before the call.
-        s[i] = static_cast<char>(toupper(static_cast<unsigned char>(s[i])));
-    }
-    return s;
-}
-
-// Debug helper: prints the entries of curOrder on one line, separated by
-// commas and followed by a newline. Prints nothing unless cudaDebug is set.
-void printVs(const std::vector<std::string>& curOrder){
-    if (cudaDebug) {
-        std::vector<std::string>::const_iterator entry = curOrder.begin();
-        for (; entry != curOrder.end(); ++entry) {
-            if (entry != curOrder.begin())
-                printf(",");
-            printf("%s", entry->c_str());
-        }
-        printf("\n");
-    }
-}
-
-// Prints the entries of curOrder on one line, separated by commas and
-// followed by a newline. Unlike printVs this is not guarded by cudaDebug.
-void printVS(const std::vector<std::string>& curOrder){
-    std::vector<std::string>::const_iterator entry = curOrder.begin();
-    for (; entry != curOrder.end(); ++entry) {
-        if (entry != curOrder.begin())
-            printf(",");
-        printf("%s", entry->c_str());
-    }
-    printf("\n");
-}
-
-// Destructor: calls clear() on the code object of every statement in stmt.
-LoopCuda::~LoopCuda() {
-    for (int s = 0, count = stmt.size(); s < count; ++s) {
-        stmt[s].code->clear();
-    }
-}
-
-// Returns true when the name s is already in use: either as a variable
-// symbol found in symtab or in globals, or as one of the index names
-// recorded in idxNames for any statement.
-bool LoopCuda::symbolExists(std::string s){
-    const char* wanted = s.c_str();
-
-    if (symtab->lookup_sym(wanted, SYM_VAR, false) || globals->lookup_sym(wanted, SYM_VAR, false))
-        return true;
-
-    for (int stmtIdx = 0; stmtIdx < idxNames.size(); ++stmtIdx) {
-        for (int lvl = 0; lvl < idxNames[stmtIdx].size(); ++lvl) {
-            if (strcmp(idxNames[stmtIdx][lvl].c_str(), wanted) == 0)
-                return true;
-        }
-    }
-    return false;
-}
-
-// Remembers a (statement number, index name) pair in syncs.
-void LoopCuda::addSync(int stmt_num, std::string idxName)
-{
-    //Nothing is generated here: the request is only stored, and code-gen
-    //later inserts a sync into the omega comment of the loop that is being
-    //generated for idxName of that stmt.
-    syncs.push_back(std::make_pair(stmt_num, idxName));
-}
-
-/**
- * Renames the loop index idx of statement stmt_num to newName.
- *
- * The loop level of idx is resolved with findCurLevel() and the entry stored
- * for that level in idxNames (position level-1) is overwritten.
- *
- * @param stmt_num statement whose index list is modified
- * @param idx      current name of the index
- * @param newName  name to store in its place
- * @throws std::runtime_error when stmt_num is not a valid position in
- *         idxNames, or when the resolved level does not select an entry of
- *         that statement's index list
- */
-void LoopCuda::renameIndex(int stmt_num, std::string idx, std::string newName)
-{
-    // Reject a bad statement number before it is handed to findCurLevel().
-    if (stmt_num < 0 || idxNames.size() <= static_cast<size_t>(stmt_num))
-        throw std::runtime_error("Invalid statement number or index");
-
-    int level = findCurLevel(stmt_num, idx);
-    // The name for level L lives at position L-1, so a level below 1 would
-    // index in front of the list.
-    if (level < 1 || idxNames[stmt_num].size() < static_cast<size_t>(level))
-        throw std::runtime_error("Invalid statement number or index");
-
-    idxNames[stmt_num][level-1] = newName;
-}
-
-
-
-// Type tag enumeration; Int is its only enumerator.
-enum Type {
-    Int
-};
-
-// Description of one array handled by the cudaize code generator.
-// The default constructor gives every pointer, flag and size a defined
-// value, so a field that a particular code path never assigns (for example
-// var_ref_size) is not read uninitialized.
-struct VarDefs{
-    std::string name;
-    std::string secondName;
-    operand size_expr; //array size as an expression (can be a product of other variables etc)
-    type_node * type;
-    var_sym* in_data; //Variable of array to copy data in from (before kernel call)
-    var_sym* out_data; //Variable of array to copy data out to (after kernel call)
-    int size_2d; //-1 if linearized, the constant size N, of a NxN 2D array otherwise
-    bool tex_mapped; //protonu-- true if this variable will be texture mapped, so no need to pass it as a argument
-    bool cons_mapped; //protonu-- true if this variable will be constant mem mapped, so no need to pass it as a argument
-    std::string original_name; //this is such a hack, to store the original name, to store a table to textures used
-    int var_ref_size ;
-
-    VarDefs()
-        : type(0), in_data(0), out_data(0), size_2d(-1),
-          tex_mapped(false), cons_mapped(false), var_ref_size(0)
-    {}
-};
-
-// Looks through the upper-bound list of 'loop' for an instruction of the form
-//   cpy( _ = min(grab_me, _))
-// and, when one is found, returns a new list holding a single if-node that
-// runs then_part under the condition bound_sym <= grab_me (built with
-// fold_sle). When no such instruction exists, then_part is returned as is.
-tree_node_list* wrapInIfFromMinBound(tree_node_list* then_part, tree_for* loop, base_symtab* symtab, var_sym* bound_sym)
-{
-    tree_node_list_iter bounds(loop->ub_list());
-    while (!bounds.is_empty()) {
-        tree_node* candidate = bounds.step();
-        if (candidate->kind() != TREE_INSTR)
-            continue;
-
-        instruction* top = ((tree_instr*)candidate)->instr();
-        if (top->format() != inf_rrr)
-            continue;
-
-        //the outer instruction has to be the copy ...
-        in_rrr* copy = (in_rrr*)top;
-        if (copy->opcode() != io_cpy || !copy->src1_op().is_instr())
-            continue;
-
-        //... and its source has to be the min
-        in_rrr* minimum = (in_rrr*)copy->src1_op().instr();
-        if (minimum->opcode() != io_min)
-            continue;
-
-        tree_node_list* guarded = new tree_node_list;
-        guarded->append(if_node(symtab, fold_sle(operand(bound_sym), minimum->src1_op().instr()->clone()), then_part));
-        return guarded;
-    }
-    return then_part; //Failed to go to proper loop level
-}
-
-/**
- * This would be better if it was done by a CHiLL xformation instead of at codegen
- *
- * state:
- *   for(...)
- *     for(...)
- *       cur_body
- *     stmt1
- *
- * stmt1 is in-between two loops that are going to be reduced. The
- * solution is to put stmt1 at the end of cur_body but conditionally run
- * it on the last step of the for loop.
- *
- * A CHiLL command that would work better:
- *
- *   for(...)
- *     stmt0
- *     for(i=0; i<n; i++)
- *       cur_body
- *     stmt1
- * =>
- *   for(...)
- *     for(i=0; i<n; i++)
- *       if(i==0) stmt0
- *       cur_body
- *       if(i==n-1) stmt1
- */
-
-// Collects the for-loops in tnl that directly follow a marker instruction
-// whose comment contains "~cuda~" and "preferredIdx: <name>" with <name>
-// equal to index. A loop without such a marker in front of it is searched
-// recursively through its body; if-nodes are searched through their then-part.
-std::vector<tree_for*> findCommentedFors(const char* index, tree_node_list* tnl){
-    std::vector<tree_for*> matches;
-    bool armed = false; //set by a matching marker, consumed by the next loop
-
-    for (tree_node_list_iter walker(tnl); !walker.is_empty(); ) {
-        tree_node* node = walker.step();
-        switch (node->kind()) {
-        case TREE_INSTR: {
-            instruction* marker = ((tree_instr*)node)->instr();
-            if (marker->opcode() != io_mrk)
-                break;
-
-            //Pull the comment text out of the annotation, if there is one
-            std::string text;
-            immed_list* payload = (immed_list*)(marker->peek_annote(k_ocg_comment));
-            if (payload != NULL) {
-                immed_list_iter payloadIter(payload);
-                if (!payloadIter.is_empty()) {
-                    immed head = payloadIter.step();
-                    if (head.kind() == im_string)
-                        text = head.string();
-                }
-            }
-
-            const std::string::size_type tagPos = text.find("preferredIdx: ");
-            if (text.find("~cuda~") == std::string::npos || tagPos == std::string::npos)
-                break;
-
-            //The name runs from the end of the tag up to the next blank
-            std::string name = text.substr(tagPos + 14, std::string::npos);
-            const std::string::size_type blank = name.find(" ");
-            if (blank != std::string::npos)
-                name = name.substr(0, blank);
-            if (strcmp(name.c_str(), index) == 0)
-                armed = true;
-            break;
-        }
-        case TREE_FOR: {
-            tree_for* loop = static_cast<tree_for *>(node);
-            if (armed) {
-                matches.push_back(loop);
-            } else {
-                std::vector<tree_for*> nested = findCommentedFors(index, loop->body());
-                matches.insert(matches.end(), nested.begin(), nested.end());
-            }
-            armed = false;
-            break;
-        }
-        case TREE_IF: {
-            tree_if* branch = static_cast<tree_if *>(node);
-            std::vector<tree_for*> nested = findCommentedFors(index, branch->then_part());
-            matches.insert(matches.end(), nested.begin(), nested.end());
-            break;
-        }
-        default:
-            break;
-        }
-    }
-
-    return matches;
-}
-
-// Returns the body one loop level inside 'loop', wrapped in a conditional on
-// reduceIndex when wrapInIfFromMinBound() finds a min() upper bound.
-tree_node_list* forReduce(tree_for* loop, var_sym* reduceIndex, proc_symtab* proc_syms)
-{
-    //No index replacement is done here, the loop is used as it is passed in
-    //(the replacements were all done at once with recursiveFindPreferedIdxs).
-    tree_node_list* inner = loop_body_at_level(loop, 1);
-    return wrapInIfFromMinBound(inner, loop, proc_syms, reduceIndex);
-}
-
-// Runs find_exposed_refs(proc_syms, r) over 'code': on the block symbol table
-// of the list's scope (when the list has a parent and its scope is a block),
-// on every instruction node and on every for-node. Block nodes are descended
-// into recursively; a for-node is handed over as a whole.
-void recursiveFindRefs(tree_node_list* code, proc_symtab* proc_syms, replacements* r)
-{
-    if (code->parent() && code->scope()->is_block()) {
-        block_symtab* local_scope = (block_symtab*)code->scope();
-        local_scope->find_exposed_refs(proc_syms, r);
-    }
-
-    for (tree_node_list_iter cursor(code); !cursor.is_empty(); ) {
-        tree_node* item = cursor.step();
-        if (item->is_instr()) {
-            ((tree_instr*)item)->find_exposed_refs(proc_syms, r);
-        } else if (item->is_block()) {
-            recursiveFindRefs(static_cast<tree_block *>(item)->body(), proc_syms, r);
-        } else if (item->is_for()) {
-            //Find refs in statements and body
-            static_cast<tree_for *>(item)->find_exposed_refs(proc_syms, r);
-        }
-    }
-}
-
-/**
- * Rebuilds 'code' into a new tree_node_list in which every for-loop that is
- * preceded by a marker comment of the form "~cuda~ ... preferredIdx: NAME"
- * uses the variable NAME as its index.
- *
- * The symbol for NAME is taken from loop_idxs, else looked up in proc_syms,
- * else created (type_s32) and added to proc_syms; in the latter two cases it
- * is also inserted into loop_idxs. If the marker comment additionally
- * contains "sync", a call to cudaSync is appended right after the renamed
- * loop. Blocks, loop bodies and the then-part of if-nodes are processed
- * recursively.
- *
- * @param code        list to process
- * @param proc_syms   symbol table used to look up / create index variables
- * @param cudaSync    procedure symbol called for a requested sync
- * @param unkown_func function type used to build the call to cudaSync
- * @param loop_idxs   in/out map from index name to its symbol
- * @return the newly allocated list
- */
-tree_node_list* recursiveFindReplacePreferedIdxs(tree_node_list* code, proc_symtab* proc_syms,
-                                                 proc_sym* cudaSync, func_type* unkown_func,
-                                                 std::map<std::string, var_sym*>& loop_idxs)
-{
-    tree_node_list* tnl = new tree_node_list;
-    tree_node_list_iter tnli(code);
-    var_sym* idxSym=0;  //index symbol requested by the last marker, used by the next loop
-    bool sync = false;  //whether the last marker also asked for a sync after that loop
-    while (!tnli.is_empty()) {
-        tree_node *node = tnli.step();
-        if(node->is_instr())
-        {
-            if(((tree_instr*)node)->instr()->format() == inf_rrr){
-                in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr();
-                if(inst->opcode() == io_mrk){
-                    std::string comment;
-                    if ((inst->peek_annote(k_ocg_comment) != NULL))
-                    {
-                        immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment));
-                        immed_list_iter data_iter(data);
-                        if(!data_iter.is_empty()){
-                            immed first_immed = data_iter.step();
-                            if(first_immed.kind() == im_string)
-                                comment = first_immed.string();
-                        }
-                    }
-                    if(comment.find("~cuda~") != std::string::npos
-                       && comment.find("preferredIdx: ") != std::string::npos){
-                        //The name runs from the end of the tag up to the next blank
-                        std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos);
-                        if(idx.find(" ") != std::string::npos)
-                            idx = idx.substr(0,idx.find(" "));
-                        if(loop_idxs.find(idx) != loop_idxs.end())
-                            idxSym = loop_idxs.find(idx)->second;
-                        //Get the proc variable symbol for this preferred index
-                        if(idxSym == 0){
-                            idxSym = (var_sym*)proc_syms->lookup_sym(idx.c_str(), SYM_VAR, false);
-                            if(!idxSym){
-                                idxSym = new var_sym(type_s32, (char*)idx.c_str());
-                                proc_syms->add_sym(idxSym);
-                            }
-                            //Now insert into our map for future
-                            loop_idxs.insert(std::make_pair(idx, idxSym));
-                        }
-                        //See if we have a sync as well
-                        if(comment.find("sync") != std::string::npos){
-                            sync = true;
-                        }
-                    }
-                }
-            }
-            tnl->append(node);
-        }
-        else if(node->is_block()){
-            tree_block* b = static_cast<tree_block *>(node);
-            b->set_body(recursiveFindReplacePreferedIdxs(b->body(), proc_syms, cudaSync, unkown_func, loop_idxs));
-            tnl->append(b);
-        }
-        else if(node->is_for()){
-            tree_for* tn_for = static_cast<tree_for *>(node);
-            if(idxSym){
-                //Replace the current tn_for's index variable with idxSym
-                replacements r;
-                r.oldsyms.append(tn_for->index());
-                r.newsyms.append(idxSym);
-                tree_for* new_loop = (tree_for*)tn_for->clone_helper(&r, true);
-                idxSym = 0; //Reset for more loops in this tnl
-                new_loop->set_body(recursiveFindReplacePreferedIdxs(new_loop->body(), proc_syms, cudaSync, unkown_func, loop_idxs));
-                tnl->append(new_loop);
-
-                if(sync){
-                    in_cal *the_call =
-                        new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaSync))), 0);
-                    tnl->append(new tree_instr(the_call));
-                    //The request belongs to this loop only: reset it (like
-                    //idxSym above) so that later loops in this list do not
-                    //get a sync their own marker never asked for.
-                    sync = false;
-                }
-            }else{
-                tn_for->set_body(recursiveFindReplacePreferedIdxs(tn_for->body(), proc_syms, cudaSync, unkown_func, loop_idxs));
-                tnl->append(tn_for);
-            }
-        }else if (node->kind() == TREE_IF) {
-            tree_if *tni = static_cast<tree_if *>(node);
-            tni->set_then_part(recursiveFindReplacePreferedIdxs(tni->then_part(), proc_syms, cudaSync, unkown_func, loop_idxs));
-            tnl->append(tni);
-        }
-    }
-    return tnl;
-}
-
-// Returns code->clone_helper(r) after collecting the exposed references of 'code' into r and filling r->newtypes / r->newsyms; new types and symbols are registered in proc_syms
-// loop_vars -> <name,var_sym> map; an old symbol whose name is in it is replaced by the mapped symbol instead of a fresh copy (ocg is only referenced from commented-out code)
-// dim_vars -> out param, fills with <old,new> var_sym pair for every variable array upper bound that gets a new symbol (messy stuff)
-tree_node_list* swapVarReferences(tree_node_list* code, replacements* r, CG_suifBuilder *ocg,
- std::map<std::string, var_sym*>& loop_vars,
- proc_symtab *proc_syms,
- std::vector< std::pair<var_sym*,var_sym*> >& dim_vars)
-{
- //Iterate over every expression, looking up each variable and type
- //reference used and possibly replacing it or adding it to our symbol
- //table
- //
- //We use the built-in cloning helper methods to do most of the work.
-
- //Recursively collect the exposed references of 'code' into r
- recursiveFindRefs(code, proc_syms, r);
-
-
- //We can't rely on type_node->clone() to do the heavy lifting when the
- //old type is a two dimensional array with variable upper bounds as
- //that requires creating and saving variable references to the upper
- //bounds. So we do one pass over the oldtypes doing this type of
- //conversion, putting results in the fixed_types map for a second pass
- //to pick up.
- std::map<type_node*,type_node*> fixed_types; //old type -> replacement type with its new upper bound variables installed
- type_node_list_iter tlip(&r->oldtypes);
- while(!tlip.is_empty())
- {
- type_node* old_tn = tlip.step();
- type_node* new_tn = 0;
- type_node* base_type = old_tn;
- std::vector< std::pair<var_sym*, type_node*> > variable_upper_bouneds; //<new bound variable, array type it belongs to>, outermost array first
- if(old_tn->is_ptr()){
- while (base_type->is_array() || base_type->is_ptr()) {
- if (base_type->is_array()){
- array_bound ub = ((array_type*)base_type)->upper_bound();
- if(ub.is_variable()){
- var_sym* old_ub = (var_sym*)ub.variable();
- var_sym *new_ub = proc_syms->new_unique_var(type_s32);
- dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub));
- variable_upper_bouneds.push_back( std::pair<var_sym*, type_node*>(new_ub, base_type) );
- }
- base_type = static_cast<array_type *>(base_type)->elem_type();
- }
- else if (base_type->is_ptr())
- base_type = static_cast<ptr_type *>(base_type)->ref_type();
- }
- }
- for (int i = variable_upper_bouneds.size()-1; i >= 0; i--) { //rebuild from the innermost recorded array outwards
- var_sym *var_ub = variable_upper_bouneds[i].first;
- type_node* old_tn = variable_upper_bouneds[i].second; //shadows the outer old_tn inside this loop only
- if(new_tn == 0)
- new_tn = new array_type(base_type, array_bound(1), array_bound(var_ub));
- else
- new_tn = new array_type(new_tn, array_bound(1), array_bound(var_ub));
- proc_syms->add_type(new_tn);
- fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn));
- }
- if(new_tn){
- if(old_tn->is_ptr()){
- new_tn = new ptr_type(new_tn);
- proc_syms->add_type(new_tn);
- }
- fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn));
- }
- }
-
- //Quickly look for modifiers on our array types (__shared__ float [][])
- type_node_list_iter tliq(&r->oldtypes);
- while(!tliq.is_empty())
- {
- type_node* old_tn = tliq.step();
- if(old_tn->is_modifier()){
- type_node* base_type = static_cast<modifier_type *>(old_tn)->base();
- if(fixed_types.find(base_type) != fixed_types.end()){
- type_node* fixed_base = (*fixed_types.find(base_type)).second;
- //printf("Fix modifier with fixed base\n");
- //This should work to copy over the annotations, but apparently doesn't work so well
- type_node* new_tn = new modifier_type(static_cast<modifier_type*>(old_tn)->op(), fixed_base);
- old_tn->copy_annotes(new_tn);
- fixed_types.insert(std::pair<type_node*,type_node*>(old_tn, new_tn));
- }
- }
- }
-
- //Run through the types and append one entry to r->newtypes per entry of r->oldtypes
- type_node_list_iter tli(&r->oldtypes);
- while(!tli.is_empty())
- {
- type_node* old_tn = tli.step();
- type_node* new_tn = 0;
-
- //If we recorded this as fixed by our special case, use that type
- //instead of cloning.
- if(fixed_types.find(old_tn) != fixed_types.end()){
- new_tn = (*fixed_types.find(old_tn)).second;
- //printf("Reusing fixed typ %u: ", new_tn->type_id());
- }else{
- new_tn = old_tn->clone();
- //printf("Cloning type %u: ", old_tn->type_id());
- }
- new_tn = proc_syms->install_type(new_tn);
-
- //Ok, there is a weird case where an array type that has var_sym as
- //their upper bounds can't be covered fully in this loop or the
- //var_sym loop, so we need special code.
- /*
- if(old_tn->op() == TYPE_PTR && ((ptr_type*)old_tn)->ref_type()->op() == TYPE_ARRAY){
- array_type* outer_array = (array_type*)((ptr_type*)old_tn)->ref_type();
- array_bound ub = outer_array->upper_bound();
- if(ub.is_variable()){
- var_sym* old_ub = (var_sym*)ub.variable();
- var_sym* new_ub = (var_sym*)((array_type*)((ptr_type*)new_tn)->ref_type())->upper_bound().variable();
- //r->oldsyms.append(old_ub);
- fix_ub.insert(std::pair<var_sym*,array_type*>(old_ub, (array_type*)((ptr_type*)new_tn)->ref_type()));
- dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub));
- printf("array var_sym: %p\n", new_ub);
- }
- if(outer_array->elem_type()->op() == TYPE_ARRAY)
- {
- array_type* inner_array = (array_type*)outer_array->elem_type();
- array_bound ub = inner_array->upper_bound();
- if(ub.is_variable()){
- var_sym* old_ub = (var_sym*)ub.variable();
- var_sym* new_ub = (var_sym*)((array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())->upper_bound().variable();
- dim_vars.push_back(std::pair<var_sym* , var_sym*>(old_ub, new_ub));
- printf("array var_sym: %p\n", new_ub);
- //r->oldsyms.append(old_ub);
- fix_ub.insert(std::pair<var_sym*,array_type*>(old_ub, (array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type()));
- }
- }
- }
- */
- r->newtypes.append(new_tn);
- }
-
- //printf("proc_syms symbol run through\n");
- //proc_syms->print();
-
- //Run through the syms, appending one entry to r->newsyms per entry of r->oldsyms
- sym_node_list_iter snli(&r->oldsyms);
- while(!snli.is_empty())
- {
- sym_node *old_sn = snli.step();
-
- if(loop_vars.count(std::string(old_sn->name())) > 0)
- {
- r->newsyms.append(loop_vars[std::string(old_sn->name())]); //reuse the symbol the caller supplied for this name
- //printf("def exists: %s\n", old_sn->name());
- }else{
- sym_node *new_sn = old_sn->copy();
- if(new_sn->is_var()){
- var_sym* var = (var_sym*)new_sn;
- type_node* new_type = var->type()->clone_helper(r);
-
- //TODO: Have a tagged list of variables to make shared
- //Local 2D arrays used to be made __shared__ here; that code is disabled, so this branch currently does nothing
- if(new_type->op() == TYPE_ARRAY && ((array_type*)new_type)->elem_type()->op() == TYPE_ARRAY){
- //protonu--changes suggested by Malik
- //printf("Adding __shared__ annotation to : %s\n", new_sn->name());
- //new_type = ocg->ModifyType(new_type, "__shared__");
- //proc_syms->add_type(new_type);
- }
- var->set_type(new_type);
- }
- proc_syms->add_sym(new_sn);
- r->newsyms.append(new_sn);
- //printf("def new: %s\n", new_sn->name());
- }
- }
-
- //printf("proc_syms var runthrough\n");
- //proc_syms->print();
- return code->clone_helper(r);
-}
-
-// Returns true when every name in idxs is one of the index names recorded in
-// idxNames for statement stmt (trivially true for an empty idxs).
-bool LoopCuda::validIndexes(int stmt, const std::vector<std::string>& idxs){
-    for (int want = 0; want < idxs.size(); ++want) {
-        const std::vector<std::string>& known = idxNames[stmt];
-        int pos = 0;
-        while (pos < known.size() && strcmp(known[pos].c_str(), idxs[want].c_str()) != 0)
-            ++pos;
-        if (pos == known.size())
-            return false; //ran off the end: this name is not an index of the statement
-    }
-    return true;
-}
-
-
-/**
- * Maps the loops named in blockIdxs / threadIdxs of statement 0 to CUDA block
- * and thread dimensions and records what code generation needs later.
- *
- * For every listed index the loop bounds are read with extractCudaUB(). A
- * loop whose lower bound is not 0 is tiled once in place and its bounds are
- * read again. The extent (ub+1) is stored in cu_bx/cu_by or cu_tx/cu_ty/cu_tz
- * and the index is renamed to bx/by or tx/ty/tz in idxNames. Block indexes
- * beyond the second and thread indexes beyond the third are bounds-checked
- * but otherwise ignored.
- *
- * @param kernel_name stored in cu_kernel_name
- * @param array_dims  array name -> size map, stored in this->array_dims
- * @param blockIdxs   index names for bx, by; at least one is required
- * @param threadIdxs  index names for tx, ty, tz
- * @return true; every failure is reported by exception
- * @throws std::runtime_error when an index name is not known for the
- *         statement, when blockIdxs is empty, or when a loop has a non-zero
- *         lower bound (even after tiling) or no constant upper bound
- */
-bool LoopCuda::cudaize_v2(std::string kernel_name, std::map<std::string, int> array_dims,
-                          std::vector<std::string> blockIdxs, std::vector<std::string> threadIdxs)
-{
-    int stmt_num = 0;
-    if(cudaDebug){
-        printf("cudaize_v2(%s, {", kernel_name.c_str());
-        //for(
-        printf("}, blocks={"); printVs(blockIdxs); printf("}, thread={"); printVs(threadIdxs); printf("})\n");
-    }
-
-    this->array_dims = array_dims;
-    if(!validIndexes(stmt_num, blockIdxs)){
-        throw std::runtime_error("One of the indexes in the block list was not "
-                                 "found in the current set of indexes.");
-    }
-    if(!validIndexes(stmt_num, threadIdxs)){
-        throw std::runtime_error("One of the indexes in the thread list was not "
-                                 "found in the current set of indexes.");
-    }
-    if(blockIdxs.size() ==0)
-        throw std::runtime_error("Cudaize: Need at least one block dimention");
-    int block_level=0;
-    //Now, we will determine the actual size (if possible, otherwise
-    //complain) for the block dimensions and thread dimensions based on our
-    //indexes and the relations for our stmt;
-    for(int i=0; i<blockIdxs.size(); i++){
-        int level = findCurLevel(stmt_num, blockIdxs[i]);
-        int ub,lb;
-        extractCudaUB(stmt_num,level,ub,lb);
-        if(lb!= 0){
-            //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
-            if(cudaDebug) printf("Cudaize: doing tile at level %d to try and normalize lower bounds\n", level);
-            tile(stmt_num,level,1,level,CountedTile);
-            idxNames[stmt_num].insert(idxNames[stmt_num].begin()+(level),"");//TODO: possibly handle this for all sibling stmts
-            extractCudaUB(stmt_num,level,ub,lb);
-        }
-        if(lb != 0){
-            char buf[1024];
-            snprintf(buf, sizeof(buf), "Cudaize: Loop at level %d does not have 0 as it's lower bound", level);
-            throw std::runtime_error(buf);
-        }
-        if(ub < 0){
-            char buf[1024];
-            snprintf(buf, sizeof(buf), "Cudaize: Loop at level %d does not have a hard upper bound", level);
-            throw std::runtime_error(buf);
-        }
-        if(cudaDebug) printf("block idx %s level %d lb: %d ub %d\n", blockIdxs[i].c_str(), level, lb, ub);
-        if(i == 0){
-            block_level = level;
-            cu_bx = ub+1;
-            idxNames[stmt_num][level-1] = "bx";
-        }
-        else if(i == 1){
-            cu_by = ub+1;
-            idxNames[stmt_num][level-1] = "by";
-        }
-    }
-    //The level of a dimension is only recorded when a further dimension follows it
-    if(!cu_by)
-        block_level=0;
-    int thread_level1 = 0;
-    int thread_level2 = 0;
-    for(int i=0; i<threadIdxs.size(); i++){
-        int level = findCurLevel(stmt_num, threadIdxs[i]);
-        int ub,lb;
-        extractCudaUB(stmt_num,level,ub,lb);
-        if(lb!= 0){
-            //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
-            if(cudaDebug) printf("Cudaize: doing tile at level %d to try and normalize lower bounds\n", level);
-            tile(stmt_num,level,1,level,CountedTile);
-            idxNames[stmt_num].insert(idxNames[stmt_num].begin()+(level),"");
-            extractCudaUB(stmt_num,level,ub,lb);
-        }
-        if(lb != 0){
-            char buf[1024];
-            snprintf(buf, sizeof(buf), "Cudaize: Loop at level %d does not have 0 as it's lower bound", level);
-            throw std::runtime_error(buf);
-        }
-        if(ub < 0){
-            char buf[1024];
-            snprintf(buf, sizeof(buf), "Cudaize: Loop at level %d does not have a hard upper bound", level);
-            throw std::runtime_error(buf);
-        }
-
-        if(cudaDebug) printf("thread idx %s level %d lb: %d ub %d\n", threadIdxs[i].c_str(), level, lb, ub);
-        if(i == 0){
-            thread_level1 = level;
-            cu_tx = ub+1;
-            idxNames[stmt_num][level-1] = "tx";
-        }
-        else if(i == 1){
-            thread_level2 = level;
-            cu_ty = ub+1;
-            idxNames[stmt_num][level-1] = "ty";
-        }
-        else if(i == 2){
-            cu_tz = ub+1;
-            idxNames[stmt_num][level-1] = "tz";
-        }
-    }
-    if(!cu_ty)
-        thread_level1 = 0;
-    if(!cu_tz)
-        thread_level2 = 0;
-
-    //Make changes to nonsplitlevels
-    const int m = stmt.size();
-    for (int i = 0; i < m; i++) {
-        if(block_level){
-            stmt_nonSplitLevels[i].append((block_level)*2);
-        }
-        if(thread_level1){
-            stmt_nonSplitLevels[i].append((thread_level1)*2);
-        }
-        if(thread_level2){
-            //This entry is for the second thread level; appending
-            //thread_level1 here again would leave that level unrecorded.
-            stmt_nonSplitLevels[i].append((thread_level2)*2);
-        }
-    }
-
-    if(cudaDebug) {
-        printf("Codegen: current names: ");
-        printVS(idxNames[stmt_num]);
-    }
-    //Set codegen flag
-    code_gen_flags |= GenCudaizeV2;
-
-    //Save array dimension sizes
-    this->array_dims = array_dims;
-    cu_kernel_name = kernel_name.c_str();
-
-    //Falling off the end of a function that returns bool is undefined behavior
-    return true;
-}
-
-tree_node_list* LoopCuda::cudaize_codegen_v2()
-{
- //printf("cudaize codegen V2\n");
- CG_suifBuilder *ocg = dynamic_cast<CG_suifBuilder*>(ir->builder());
- if(!ocg) return false;
-
- //protonu--adding an annote to track texture memory type
- ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE);
- ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE);
- int tex_mem_on = 0;
- int cons_mem_on = 0;
-
-
-
- CG_outputRepr* repr;
- std::vector<VarDefs> arrayVars;
- std::vector<VarDefs> localScopedVars;
-
- std::vector<IR_ArrayRef *> ro_refs;
- std::vector<IR_ArrayRef *> wo_refs;
- std::set<std::string> uniqueRefs;
- std::set<std::string> uniqueWoRefs;
- //protonu--let's try a much simpler approach of a map instead
- //we also keep a map for constant memories
- std::map<std::string , var_sym *>tex_ref_map;
- std::map<std::string , var_sym *>cons_ref_map;
-
- for(int j=0; j<stmt.size(); j++)
- {
- std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[j].code);
- for (int i = 0; i < refs.size(); i++)
- {
- //printf("ref %s wo %d\n", static_cast<const char*>(refs[i]->name()), refs[i]->is_write());
- var_sym* var = symtab->lookup_var((char*)refs[i]->name().c_str(),false);
- //If the array is not a parameter, then it's a local array and we
- //want to recreate it as a stack variable in the kernel as opposed to
- //passing it in.
- if(!var->is_param())
- continue;
- if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end())
- {
- uniqueRefs.insert(refs[i]->name());
- if(refs[i]->is_write()){
- uniqueWoRefs.insert(refs[i]->name());
- wo_refs.push_back(refs[i]);
- }
- else
- ro_refs.push_back(refs[i]);
- }
- if (refs[i]->is_write() && uniqueWoRefs.find(refs[i]->name()) == uniqueWoRefs.end()){
- uniqueWoRefs.insert(refs[i]->name());
- wo_refs.push_back(refs[i]);
- //printf("adding %s to wo\n", static_cast<const char*>(refs[i]->name()));
- }
- }
- }
-
- // printf("reading from array ");
- // for(int i=0; i<ro_refs.size(); i++)
- // printf("'%s' ", ro_refs[i]->name().c_str());
- // printf("and writting to array ");
- // for(int i=0; i<wo_refs.size(); i++)
- // printf("'%s' ", wo_refs[i]->name().c_str());
- // printf("\n");
-
- const char* gridName = "dimGrid";
- const char* blockName = "dimBlock";
-
- //TODO: Could allow for array_dims_vars to be a mapping from array
- //references to to variable names that define their length.
- var_sym* dim1 = 0;
- var_sym* dim2 = 0;
-
- for(int i=0; i<wo_refs.size(); i++)
- {
- //TODO: Currently assume all arrays are floats of one or two dimentions
- var_sym* outArray = 0;
- std::string name = wo_refs[i]->name();
- outArray = symtab->lookup_var((char*)name.c_str(),false);
-
- VarDefs v;
- v.size_2d = -1;
- char buf[32];
- snprintf(buf, 32, "devO%dPtr", i+1);
- v.name = buf;
- if(outArray->type()->is_ptr())
- if(((ptr_type *)(outArray->type()))->ref_type()->is_array())
- v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type();
- else
- v.type = ((ptr_type *)(outArray->type()))->ref_type();
- else
- v.type = type_f32;
- v.tex_mapped = false;
- v.cons_mapped = false;
- v.original_name = wo_refs[i]->name();
- //Size of the array = dim1 * dim2 * num bytes of our array type
-
- //If our input array is 2D (non-linearized), we want the actual
- //dimentions of the array
- CG_outputRepr* size;
- //Lookup in array_dims
- std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
- if(outArray->type()->is_ptr() && outArray->type()->ref_type(0)->is_array())
- {
- array_type* t = (array_type*)outArray->type()->ref_type(0);
- v.size_2d = t->upper_bound().constant()+1;
- printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)wo_refs[i]->name().c_str());
- size = ocg->CreateInt(v.size_2d * v.size_2d);
- }else if(it != array_dims.end()){
- int ref_size = it->second;
- v.var_ref_size = ref_size;
- size = ocg->CreateInt(ref_size);
- }
- else{
- if(dim1){
- size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)),
- new CG_suifRepr(operand(dim2)));
- }else{
- char buf[1024];
- sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a "
- "detectable size or specififed dimentions", name.c_str());
- throw std::runtime_error(buf);
- }
- }
- v.size_expr = operand(static_cast<CG_suifRepr*>(ocg->CreateTimes(
- size,
- ocg->CreateInt(v.type->size()/8)))->GetExpression());
- v.in_data = 0;
- v.out_data = outArray;
- //Check for in ro_refs and remove it at this point
- std::vector<IR_ArrayRef *>::iterator it_;
- for(it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++)
- {
- if((*it_)->name() == wo_refs[i]->name()){
- break;
- }
- }
- if(it_ != ro_refs.end())
- {
- v.in_data = outArray;
- ro_refs.erase(it_);
- }
-
- arrayVars.push_back(v);
-
- }
-
- //protonu-- assuming that all texture mapped memories were originally read only mems
- //there should be safety checks for that, will implement those later
-
- int cs_ref_size = 0;
-
- for(int i=0; i<ro_refs.size(); i++)
- {
- var_sym* inArray = 0;
- std::string name = ro_refs[i]->name();
- inArray = symtab->lookup_var((char*)name.c_str(),false);
- VarDefs v;
- v.size_2d = -1;
- char buf[32];
- snprintf(buf, 32, "devI%dPtr", i+1);
- v.name = buf;
- if(inArray->type()->is_ptr())
- if(((ptr_type *)(inArray->type()))->ref_type()->is_array())
- v.type = ((array_type *)(((ptr_type *)(inArray->type()))->ref_type()))->elem_type();
- else
- v.type = ((ptr_type *)(inArray->type()))->ref_type();
- else
- v.type = type_f32;
- v.tex_mapped = false;
- v.cons_mapped = false;
- v.original_name = ro_refs[i]->name();
- if ( texture != NULL)
- v.tex_mapped = (texture->is_array_tex_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
- if (v.tex_mapped){
- printf("this variable %s is mapped to texture memory", name.c_str());
- }
- if ( constant_mem != NULL)
- v.cons_mapped = (constant_mem->is_array_cons_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
- if (v.cons_mapped){
- printf("this variable %s is mapped to constant memory", name.c_str());
- }
-
- //Size of the array = dim1 * dim2 * num bytes of our array type
-
- //If our input array is 2D (non-linearized), we want the actual
- //dimentions of the array (as it might be less than cu_n
- CG_outputRepr* size;
- //Lookup in array_dims
- std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
- int ref_size = 0;
- if(inArray->type()->is_ptr() && inArray->type()->ref_type(0)->is_array())
- {
- array_type* t = (array_type*)inArray->type()->ref_type(0);
- v.size_2d = t->upper_bound().constant()+1;
- printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)ro_refs[i]->name().c_str());
- size = ocg->CreateInt(v.size_2d * v.size_2d);
- }else if(it != array_dims.end()){
- ref_size = it->second;
- v.var_ref_size = ref_size;
- size = ocg->CreateInt(ref_size);
- }else{
- if(dim1){
- size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)),
- new CG_suifRepr(operand(dim2)));
- }else{
- char buf[1024];
- sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a "
- "detectable size or specififed dimentions", name.c_str());
- throw std::runtime_error(buf);
- }
- }
-
-
-
- v.size_expr = operand(static_cast<CG_suifRepr*>(ocg->CreateTimes(
- size,
- ocg->CreateInt(v.type->size()/8)))->GetExpression());
-
- v.in_data = inArray;
- v.out_data = 0;
- arrayVars.push_back(v);
- }
-
-
- if(arrayVars.size() < 2)
- {
- fprintf(stderr, "cudaize error: Did not find two arrays being accessed\n");
- return false;
- }
-
- //protonu--debugging tool--the printf statement
- //tex_mem_on signals use of tex mem
- for(int i=0; i<arrayVars.size(); i++)
- {
- //printf("var name %s, tex_mem used %s\n", arrayVars[i].name.c_str(), (arrayVars[i].tex_mapped)?"true":"false");
- if (arrayVars[i].tex_mapped ) tex_mem_on ++;
- if (arrayVars[i].cons_mapped ) cons_mem_on ++;
- }
-
- //Add CUDA function extern prototypes and function types
- func_type* unkown_func = new func_type(type_s32); //function on unkown args that returns a i32
- unkown_func = (func_type*)symtab->install_type(unkown_func);
- func_type* void_func = new func_type(type_void); //function on unkown args that returns a void
- void_func = (func_type*)globals->install_type(void_func);
- func_type* float_func = new func_type(type_f32); //function on unkown args that returns a float
- float_func = (func_type*)globals->install_type(float_func);
-
- type_node* result = ocg->ModifyType(type_void, "__global__");
- result = globals->install_type(result);
- func_type* kernel_type = new func_type(result); //function returns a '__global__ void'
-
- int numArgs = arrayVars.size() + (dim1 ? 2 : 0) + localScopedVars.size();
- //protonu--need to account for texture memory here, reduce the #args
- if( tex_mem_on ) numArgs -= tex_mem_on;
- if( cons_mem_on ) numArgs -= cons_mem_on;
- kernel_type->set_num_args(numArgs);
- int argCount = 0;
- for(int i=0; i<arrayVars.size(); i++)
- {
- type_node* fptr;
- if(arrayVars[i].in_data)
- fptr = arrayVars[i].in_data->type()->clone();
- else
- fptr = arrayVars[i].out_data->type()->clone();
- //protonu--skip this for texture mems
- if( arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true )
- kernel_type->set_arg_type(argCount++, fptr);
- }
- if(dim1){
- kernel_type->set_arg_type(argCount++, type_s32); //width x height dimentions
- kernel_type->set_arg_type(argCount++, type_s32);
- }
- kernel_type = (func_type*)globals->install_type(kernel_type);
-
- proc_sym* cudaMalloc = globals->new_proc(unkown_func, src_c, "cudaMalloc");
- proc_sym* cudaMemcpy = globals->new_proc(unkown_func, src_c, "cudaMemcpy");
- proc_sym* cudaFree = globals->new_proc(unkown_func, src_c, "cudaFree");
- proc_sym* cudaSync = globals->new_proc(void_func, src_c, "__syncthreads");
- proc_sym* cudaBind = globals->new_proc(unkown_func, src_c, "cudaBindTexture");
- proc_sym* cudaMemcpySym = globals->new_proc(unkown_func, src_c, "cudaMemcpyToSymbol");
-
-
- //protonu-removing Gabe's function, introducing mine, this is pretty cosmetic
- //proc_sym* cudaFetch = globals->new_proc(float_func, src_c, "tex1Dfetch");
- proc_sym* tex1D = globals->new_proc(float_func, src_c, "tex1Dfetch");
-
- var_sym *cudaMemcpyHostToDevice = new var_sym(type_s32, "cudaMemcpyHostToDevice");
- var_sym *cudaMemcpyDeviceToHost = new var_sym(type_s32, "cudaMemcpyDeviceToHost");
- cudaMemcpyDeviceToHost->set_param();
- cudaMemcpyHostToDevice->set_param();
- globals->add_sym(cudaMemcpyHostToDevice);
- globals->add_sym(cudaMemcpyDeviceToHost);
-
- //protonu--adding the bool tex_mem to the structure struct_type
- //to bypass the re-naming of struct texture, this is a hack fix
- struct_type* texType = new struct_type(TYPE_GROUP, 0, "texture<float, 1, cudaReadModeElementType>", 0, true);
- immed_list *iml_tex = new immed_list;
- iml_tex->append(immed("texture memory"));
- texType->append_annote(k_cuda_texture_memory, iml_tex);
- //protonu--end my changes
- texType = (struct_type*)globals->install_type(texType);
- //protonu--should register the locals later on
- //when we do the bind operation
- //var_sym* texRef = new var_sym(texType, "texRef");
- //globals->add_sym(texRef);
-
- //Add our mallocs (and input array memcpys)
- for(int i=0; i<arrayVars.size(); i++)
- {
- //protonu--check if the variable is not a tex-mapped variable. If it is tex mapped
- // allow a malloc and memcpy operation, and a bind, but only if it is tex mapped, but dont call
- // the kernel with it as an argument.
-
- //Make a pointer of type a[i].type
- //type_node* fptr = new ptr_type(arrayVars[i].type->clone());
- //protonu--temporary change
- type_node* fptr = new ptr_type(arrayVars[i].type);
- fptr = symtab->install_type(fptr);
- var_sym *dvs = new var_sym(fptr, const_cast<char*>(
- arrayVars[i].name.c_str()));
- dvs->set_addr_taken();
- symtab->add_sym(dvs);
-
- //cudaMalloc args
- //protonu--no cudaMalloc required for constant memory
- tree_node_list* tnl = new tree_node_list;
- if(arrayVars[i].cons_mapped != true )
- {
- in_cal *the_call =
- new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMalloc))), 2);
- the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to()->ptr_to(), operand(), immed(dvs))));
- the_call->set_argument(1, arrayVars[i].size_expr);
-
- tnl->append(new tree_instr(the_call));
- setup_code = ocg->StmtListAppend(setup_code,
- new CG_suifRepr(tnl));
- }
- if(arrayVars[i].in_data)
- {
- //cudaMemcpy args
- //protonu-- no cudaMemcpy required for constant memory
- if ( arrayVars[i].cons_mapped != true )
- {
- in_cal *the_call =
- new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4);
- the_call->set_argument(0, operand(dvs));
- the_call->set_argument(1, operand(arrayVars[i].in_data));
- the_call->set_argument(2, arrayVars[i].size_expr.clone());
- the_call->set_argument(3, operand(cudaMemcpyHostToDevice));
-
- tnl = new tree_node_list;
- tnl->append(new tree_instr(the_call));
- setup_code = ocg->StmtListAppend(setup_code,
- new CG_suifRepr(tnl));
- }
-
- //protonu--check if the arrayvar is tex mapped
- if(arrayVars[i].tex_mapped == true)
- {
- //Need a texture reference variable
- char buf[32];
- snprintf(buf, 32, "tex%dRef", i+1);
- arrayVars[i].secondName = buf;
-
- var_sym* texRef = new var_sym(texType, buf);
- //printf("\n putting in %s\n", arrayVars[i].original_name.c_str());
- tex_ref_map[arrayVars[i].original_name] = texRef;
- globals->add_sym(texRef);
- //protonu--added the above two lines
-
- in_cal *the_call =
- new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaBind))), 4);
- in_ldc *ins = new in_ldc(type_s32, operand(), immed(0));
- the_call->set_argument(0, operand(ins));
- the_call->set_argument(1, operand(texRef));//protonu--change to add the new sym
- the_call->set_argument(2, operand(dvs));
- the_call->set_argument(3, arrayVars[i].size_expr.clone());
-
- tnl = new tree_node_list;
- tnl->append(new tree_instr(the_call));
- setup_code = ocg->StmtListAppend(setup_code,
- new CG_suifRepr(tnl));
- }
-
- //protonu--if arrayvar is mapped to constant memory
- if(arrayVars[i].cons_mapped == true)
- {
- char buf[32];
- snprintf(buf, 32, "cs%dRef", i+1);
- //arrayVars[i].secondName = buf;
- array_bound low (0);
- array_bound high (arrayVars[i].var_ref_size -1);
- array_type *arr = new array_type(arrayVars[i].type,low, high);
- type_node* cons_arr = ocg->ModifyType(arr, "__device__ __constant__");
- cons_arr = globals->install_type(cons_arr);
- var_sym* consRef = new var_sym(cons_arr, buf);
- cons_ref_map[arrayVars[i].original_name] = consRef;
- globals->add_sym(consRef);
-
-
-
- in_cal *the_call =
- new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpySym))), 3);
- the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to(), operand(), immed(consRef))));
- the_call->set_argument(1, operand(arrayVars[i].in_data));
- the_call->set_argument(2, arrayVars[i].size_expr.clone());
-
- tnl = new tree_node_list;
- tnl->append(new tree_instr(the_call));
- setup_code = ocg->StmtListAppend(setup_code,
- new CG_suifRepr(tnl));
-
- }
- }
- }
-
- //Build dimGrid dim3 variables based on loop dimentions and ti/tj
- char blockD1[120];
- char blockD2[120];
- if(dim1){
- snprintf(blockD1, 120, "%s/%d", dim1->name(), cu_tx);
- snprintf(blockD2, 120, "%s/%d", dim2->name(), cu_ty);
- }else{
- snprintf(blockD1, 120, "%d", cu_bx);
- snprintf(blockD2, 120, "%d", cu_by);
- //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx);
- //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty);
- }
- repr = ocg->CreateDim3(immed((char*)gridName),
- immed(blockD1),
- immed(blockD2));
- setup_code = ocg->StmtListAppend(setup_code, repr);
-
- repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx),immed(cu_ty));
-
- if(cu_tz > 1)
- repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty), immed(cu_tz));
- else
- repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty));
- setup_code = ocg->StmtListAppend(setup_code, repr);
-
- //call kernel function with name loop_name
- //like: transpose_k<<<dimGrid,dimBlock>>>(devOPtr, devIPtr , width, height);
- char dims[120];
- snprintf(dims,120,"<<<%s,%s>>>",gridName, blockName);
- immed_list *iml = new immed_list;
- iml->append(immed((char*)cu_kernel_name.c_str()));
- iml->append(immed(dims));
- //printf("%s %s\n", static_cast<const char*>(cu_kernel_name), dims);
- for(int i=0; i<arrayVars.size(); i++)
- //Throw in a type cast if our kernel takes 2D array notation
- //like (float(*) [1024])
- {
- //protonu--throwing in another hack to stop the caller from passing tex mapped
- //vars to the kernel.
- if(arrayVars[i].tex_mapped == true || arrayVars[i].cons_mapped == true )
- continue;
- if(arrayVars[i].size_2d >= 0)
- {
- snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d,
- const_cast<char*>(arrayVars[i].name.c_str()));
- //printf("%d %s\n", i, dims);
- iml->append(immed(dims));
- }else{
- //printf("%d %s\n", i, static_cast<const char*>(arrayVars[i].name));
- iml->append(immed(const_cast<char*>(
- arrayVars[i].name.c_str())));
- }
- }
- if(dim1){
- iml->append(immed(dim1));
- iml->append(immed(dim2));
- }
- repr = ocg->CreateKernel(iml);//kernel call
- setup_code = ocg->StmtListAppend(setup_code, repr);
-
- //cuda free variables
- for(int i=0; i<arrayVars.size(); i++)
- {
- if(arrayVars[i].out_data)
- {
- //cudaMemcpy args
- in_cal *the_call =
- new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4);
- the_call->set_argument(0, operand(arrayVars[i].out_data));
- the_call->set_argument(1, operand(symtab->lookup_var(const_cast<char*>(
- arrayVars[i].name.c_str()))));
- the_call->set_argument(2, arrayVars[i].size_expr.clone());
- the_call->set_argument(3, operand(cudaMemcpyDeviceToHost));
-
- tree_node_list* tnl = new tree_node_list;
- tnl->append(new tree_instr(the_call));
- teardown_code = ocg->StmtListAppend(teardown_code,
- new CG_suifRepr(tnl));
- }
-
- in_cal *the_call =
- new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaFree))), 1);
- the_call->set_argument(0, operand(symtab->lookup_var(const_cast<char*>(
- arrayVars[i].name.c_str()))));
-
- tree_node_list* tnl = new tree_node_list;
- tnl->append(new tree_instr(the_call));
- teardown_code = ocg->StmtListAppend(teardown_code,
- new CG_suifRepr(tnl));
- }
-
- // ---------------
- // BUILD THE KERNEL
- // ---------------
-
- //Extract out kernel body
- tree_node_list* code = getCode();
- //Get rid of wrapper if that original() added
- if(code->head()->contents->kind() == TREE_IF)
- {
- tree_if* ifn = (tree_if*)code->head()->contents;
- code = ifn->then_part();
- }
-
- //Create kernel function body
- proc_sym *new_psym = globals->new_proc(kernel_type, src_c, (char*)cu_kernel_name.c_str());
- proc_symtab *new_proc_syms = new proc_symtab(new_psym->name());
- globals->add_child(new_proc_syms);
-
- //Add Params
- std::map<std::string, var_sym*> loop_vars;
- //In-Out arrays
- type_node* fptr;
- for(int i=0; i<arrayVars.size(); i++)
- {
- if(arrayVars[i].in_data)
- //fptr = arrayVars[i].in_data->type()->clone();
- fptr = arrayVars[i].in_data->type();
- else
- //fptr = arrayVars[i].out_data->type()->clone();
- fptr = arrayVars[i].out_data->type();
- fptr = new_proc_syms->install_type(fptr);
- std::string name = arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name();
- var_sym* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name());
- //protonu--adding a check to ensure that texture memories are not passed in as arguments
- if(arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true )
- {
- sym->set_param();
- new_proc_syms->params()->append(sym);
- new_proc_syms->add_sym(sym);//protonu--added to suppress the addition of the redundant var in the kernel
- }
- if (arrayVars[i].cons_mapped == true)
- {
- sym->set_param();
- new_proc_syms->add_sym(sym);
- }
- //printf("inserting name: %s\n", static_cast<const char*>(name));
- loop_vars.insert(std::pair<std::string, var_sym*>(std::string(name), sym));
- }
-
- if(dim1)
- {
- //Array dimentions
- var_sym* kdim1 = new var_sym(dim1->type(), dim1->name());
- kdim1->set_param();
- new_proc_syms->add_sym(kdim1);
- loop_vars.insert(std::pair<std::string, var_sym*>(std::string(dim1->name()), kdim1));
- var_sym* kdim2 = new var_sym(dim2->type(), dim2->name());
- kdim2->set_param();
- new_proc_syms->add_sym(kdim2);
- loop_vars.insert(std::pair<std::string, var_sym*>(std::string(dim2->name()), kdim2));
- new_proc_syms->params()->append(kdim1);
- new_proc_syms->params()->append(kdim2);
- }
- //Put block and thread implicit variables into scope
- std::vector<var_sym *> index_syms;
- /* Currently we don't use the block dimentions
- var_sym* blockDim_x = new var_sym(type_s32, "blockDim.x");
- blockDim_x->set_param();
- new_proc_syms->add_sym(blockDim_x);
- var_sym* blockDim_y = new var_sym(type_s32, "blockDim.y");
- blockDim_y->set_param();
- new_proc_syms->add_sym(blockDim_y);
- */
- if(cu_bx > 1){
- var_sym* blockIdx_x = new var_sym(type_s32, "blockIdx.x");
- blockIdx_x->set_param();
- new_proc_syms->add_sym(blockIdx_x);
- index_syms.push_back(blockIdx_x);
- }
- if(cu_by > 1){
- var_sym* blockIdx_y = new var_sym(type_s32, "blockIdx.y");
- blockIdx_y->set_param();
- new_proc_syms->add_sym(blockIdx_y);
- index_syms.push_back(blockIdx_y);
- }
- if(cu_tx > 1){
- var_sym* threadIdx_x = new var_sym(type_s32, "threadIdx.x");
- threadIdx_x->set_param();
- new_proc_syms->add_sym(threadIdx_x);
- index_syms.push_back(threadIdx_x);
- }
- if(cu_ty > 1){
- var_sym* threadIdx_y = new var_sym(type_s32, "threadIdx.y");
- threadIdx_y->set_param();
- new_proc_syms->add_sym(threadIdx_y);
- index_syms.push_back(threadIdx_y);
- }
-
- if(cu_tz > 1){
- var_sym* threadIdx_z = new var_sym(type_s32, "threadIdx.z");
- threadIdx_z->set_param();
- new_proc_syms->add_sym(threadIdx_z);
- index_syms.push_back(threadIdx_z);
- }
-
- //Figure out which loop variables will be our thread and block dimention variables
- std::vector<var_sym *> loop_syms;
- //Get our indexes
- std::vector<const char*> indexes;// = get_loop_indexes(code,cu_num_reduce);
- int threadsPos=0;
- if(cu_bx > 1)
- indexes.push_back("bx");
- if(cu_by > 1)
- indexes.push_back("by");
- if(cu_tx > 1){
- threadsPos = indexes.size();
- indexes.push_back("tx");
- }
- if(cu_ty > 1)
- indexes.push_back("ty");
- if(cu_tz > 1)
- indexes.push_back("tz");
- for(int i=0; i<indexes.size(); i++)
- {
- //printf("indexes[%d] = %s\n", i, (char*)indexes[i]);
- loop_syms.push_back(new var_sym(type_s32, (char*)indexes[i]));
- new_proc_syms->add_sym(loop_syms[i]);
- //loop_vars.insert(std::pair<std::string, var_sym*>(std::string(indexes[i]), loop_syms[i]));
- }
-
- //Generate this code
- //int bx = blockIdx.x
- //int by = blockIdx.y
- //int tx = threadIdx.x
- //int ty = threadIdx.y
- CG_outputRepr *body=NULL;
- for(int i=0; i<indexes.size(); i++){
- CG_outputRepr *lhs = new CG_suifRepr(operand(loop_syms[i]));
- //body = ocg->StmtListAppend(body, ocg->CreateStmtList(
- // ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i])))));
- body = ocg->StmtListAppend(body, ocg->StmtListAppend(
- ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))), NULL));
- }
-
- //Get our inital code prepped for loop reduction. First we need to swap
- //out internal SUIF variable references to point to the new local
- //function symbol table.
- std::map<std::string, var_sym*> loop_idxs; //map from idx names to their new syms
- std::vector< std::pair<var_sym*, var_sym*> > dim_vars; //pair is of <old,new> var_sym (for 2D array size initializations)
- replacements r;
- tree_node_list* swapped = swapVarReferences(code, &r, ocg, loop_vars, new_proc_syms, dim_vars);
- //printf("\n code before recursiveFindReplacePreferedIdxs :\n");
- //swapped->print();
- swapped = recursiveFindReplacePreferedIdxs(swapped, new_proc_syms, cudaSync, void_func, loop_idxs);//in-place swapping
- //printf("\n code after recursiveFindReplacePreferedIdxs :\n");
- //swapped->print();
-
- for(int i=0; i<indexes.size(); i++){
- std::vector<tree_for*> tfs = findCommentedFors(indexes[i], swapped);
- for(int k=0; k<tfs.size(); k++){
- //printf("replacing %p tfs for index %s\n", tfs[k], indexes[i]);
- tree_node_list* newBlock = forReduce(tfs[k], loop_idxs[indexes[i]], new_proc_syms);
- //newBlock->print();
- swap_node_for_node_list(tfs[k], newBlock);
- //printf("AFTER SWAP\n"); newBlock->print();
- }
- }
- //printf("AFTER REDUCE\n"); swapped->print();
-
- if(static_cast<const IR_cudasuifCode *>(ir)->init_code()){
- tree_node_list* orig_init_code = static_cast<CG_suifRepr *>(static_cast<const IR_cudasuifCode *>(ir)->init_code())->GetCode();
- for(int i=0; i<dim_vars.size(); i++){
- //We have a map of var_sym from the original function body and we know
- //that these var_syms have initialization statements which define the
- //array size. We need to mimic these initialization statements.
-
- //First find the assignment and pull out the constant initialization
- //value
- int value = -1;
- tree_node_list_iter tnli(orig_init_code);
- while (!tnli.is_empty()) {
- tree_node *node = tnli.step();
- if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr)
- {
- in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr();
- //expect the structure: cpy( _ = min(grab_me, _))
- if(inst->opcode() == io_cpy && inst->dst_op().is_symbol()){
- //printf("looking at instruction: ");
- //inst->print();
- var_sym* dest = inst->dst_op().symbol();
- if(dest == dim_vars[i].first)
- {
- if(inst->src1_op().is_instr() && inst->src1_op().instr()->format() == inf_ldc){
- value = ((in_ldc*)inst->src1_op().instr())->value().integer();
- }
- }
- }
- }
- }
- if(value < 0){
- fprintf(stderr, "ERROR: Could not find initializing statement for variable used in upper_bound of array type");
- }
- CG_outputRepr *lhs = new CG_suifRepr(operand(dim_vars[i].second));
- //body = ocg->StmtListAppend(body, ocg->CreateStmtList(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value))));
- body = ocg->StmtListAppend(body, ocg->StmtListAppend(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)), NULL));
- }
- }
-
-
- body = ocg->StmtListAppend(body, new CG_suifRepr(swapped));
-
- //protonu--lets try creating our function definiton here
- var_sym *tsym = NULL;
-
-
- std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(body);
- for(int i=0; i<refs.size(); i++)
- {
- //check if the array is tex mapped
- if(texture != NULL && texture->is_array_tex_mapped(refs[i]->name().c_str()))
- {
- //protonu--our new tex lookup function
- in_cal *tex_lookup =
- new in_cal(type_f32, operand(), operand(new in_ldc(float_func->ptr_to(), operand(), immed(tex1D))), 2);
-
- //printf("name of the array to be mapped is %s\n", refs[i]->name().c_str());
- tsym = tex_ref_map[refs[i]->name()];
- tex_lookup->set_argument(0, operand(tsym));
-
-
- int array_dims = ((IR_suifArrayRef *)refs[i])->ia_->dims();
-
- if (array_dims == 1){
- tex_lookup->set_argument(1, ((IR_suifArrayRef *)refs[i])->ia_->index(0).clone());
- }else if (array_dims > 2) {
- printf(" \n we don't handle more than 2D arrays mapped to textures yet\n");
- }else if (array_dims == 2) {
-
- IR_ArraySymbol *sym = refs[i]->symbol();
- CG_outputRepr *sz = sym->size(1);
- delete sym; // free the wrapper object only
- // find the builder ocg
- CG_outputRepr *expr = ocg->CreateTimes(sz->clone(),refs[i]->index(0));
- delete sz; // free the wrapper object only
- expr = ocg->CreatePlus(expr, refs[i]->index(1));
- // expr holds the 1D access expression and take it out
- tex_lookup->set_argument(1, ((CG_suifRepr *)expr)->GetExpression());
- }
-
- //using chun's function to replace the array look up with the function call
- ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(tex_lookup)));
- }
-
- }
-
-
- tsym = NULL;
- //protonu--now let's try what we did above for constant memory
- for(int i=0; i<refs.size(); i++)
- {
- //check if the array is tex mapped
- if(constant_mem != NULL && constant_mem->is_array_cons_mapped(refs[i]->name().c_str()))
- {
-
- //printf("name of the array to be cons mapped is %s\n", refs[i]->name().c_str());
- tsym = cons_ref_map[refs[i]->name()];
- //we should create a IR_SuifArray here
- IR_ArraySymbol *ar_sym = new IR_suifArraySymbol(ir,tsym);
- std::vector<CG_outputRepr *> ar_index;
- ar_index.push_back(((IR_suifArrayRef *)refs[i])->index(0));
- IR_ArrayRef *ar_ref = ((IR_suifCode *)ir)->CreateArrayRef(ar_sym, ar_index);
- //using chun's function to replace the array look up with the function call
- ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(((IR_suifArrayRef *)ar_ref)->ia_)));
-
- }
- }
-
-
- tree_proc *new_body = new tree_proc(static_cast<CG_suifRepr*>(body)->GetCode(), new_proc_syms);
- //globals->add_child(new_proc_syms);
- new_psym->set_block(new_body);
- new_procs.push_back(new_psym);
-
- return swapped;
-}
-
//Returns the given index names in their original order with every dummy
//(empty-string) entry left out.
std::vector<std::string> cleanOrder(std::vector<std::string> idxNames){
  std::vector<std::string> kept;
  for(std::vector<std::string>::const_iterator it = idxNames.begin();
      it != idxNames.end(); ++it){
    if(it->empty())
      continue; //dummy level, skip it
    kept.push_back(*it);
  }
  return kept;
}
-
-//First non-dummy level in ascending order
-int LoopCuda::nonDummyLevel(int stmt, int level){
- //level comes in 1-basd and should leave 1-based
- for(int j=level-1; j<idxNames[stmt].size(); j++){
- if(idxNames[stmt][j].length() != 0){
- //printf("found non dummy level of %d with idx: %s when searching for %d\n", j+1, (const char*) idxNames[stmt][j], level);
- return j+1;
- }
- }
- char buf[128]; sprintf(buf, "%d", level);
- throw std::runtime_error(std::string("Unable to find a non-dummy level starting from ") + std::string(buf));
-}
-
-int LoopCuda::findCurLevel(int stmt, std::string idx){
- for(int j=0; j<idxNames[stmt].size(); j++){
- if(strcmp(idxNames[stmt][j].c_str(),idx.c_str()) == 0)
- return j+1;
- }
- throw std::runtime_error(std::string("Unable to find index ") + idx + std::string(" in current list of indexes"));
-}
-
-void LoopCuda::permute_cuda(int stmt, const std::vector<std::string>& curOrder)
-{
- //printf("curOrder: ");
- //printVs(curOrder);
- //printf("idxNames: ");
- //printVS(idxNames[stmt]);
- std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt]);
- bool same=true;
- std::vector<int> pi;
- for(int i=0; i<curOrder.size(); i++){
- bool found = false;
- for(int j=0; j<cIdxNames.size(); j++){
- if(strcmp(cIdxNames[j].c_str(), curOrder[i].c_str()) == 0){
- pi.push_back(j+1);
- found=true;
- if(j!=i)
- same=false;
- }
- }
- if(!found){
- throw std::runtime_error("One of the indexes in the permute order where not "
- "found in the current set of indexes.");
- }
- }
- for(int i=curOrder.size(); i<cIdxNames.size(); i++){
- pi.push_back(i);
- }
- if(same)
- return;
- permute(stmt, pi);
- //Set old indexe names as new
- for(int i=0; i<curOrder.size(); i++){
- idxNames[stmt][i] = curOrder[i].c_str(); //what about sibling stmts?
- }
-}
-
-
-bool LoopCuda::permute(int stmt_num, const std::vector<int> &pi)
-{
-// check for sanity of parameters
- if (stmt_num >= stmt.size() || stmt_num < 0)
- throw std::invalid_argument("invalid statement " + to_string(stmt_num));
- const int n = stmt[stmt_num].xform.n_out();
- if (pi.size() > (n-1)/2)
- throw std::invalid_argument("iteration space dimensionality does not match permute dimensionality");
- int first_level = 0;
- int last_level = 0;
- for (int i = 0; i < pi.size(); i++) {
- if (pi[i] > (n-1)/2 || pi[i] <= 0)
- throw std::invalid_argument("invalid loop level " + to_string(pi[i]) + " in permuation");
-
- if (pi[i] != i+1) {
- if (first_level == 0)
- first_level = i+1;
- last_level = i+1;
- }
- }
- if (first_level == 0)
- return true;
-
- std::vector<int> lex = getLexicalOrder(stmt_num);
- std::set<int> active = getStatements(lex, 2*first_level-2);
- Loop::permute(active, pi);
-}
-
-
-void LoopCuda::tile_cuda(int stmt, int level, int outer_level)
-{
- tile_cuda(stmt,level,1,outer_level,"","",CountedTile);
-}
-void LoopCuda::tile_cuda(int level, int tile_size, int outer_level, std::string idxName,
- std::string ctrlName, TilingMethodType method){
- tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method);
-}
-
-void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level, std::string idxName,
- std::string ctrlName, TilingMethodType method){
- //Do regular tile but then update the index and control loop variable
- //names as well as the idxName to reflect the current state of things.
- //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level);
- //printf("idxNames before: ");
- //printVS(idxNames[stmt]);
-
- tile(stmt, level, tile_size, outer_level, method);
-
- if(idxName.size())
- idxNames[stmt][level-1] = idxName.c_str();
- if(tile_size == 1){
- //potentially rearrange loops
- if(outer_level < level){
- std::string tmp = idxNames[stmt][level-1];
- for(int i=level-1; i>outer_level-1; i--){
- if(i-1 >= 0)
- idxNames[stmt][i] = idxNames[stmt][i-1];
- }
- idxNames[stmt][outer_level-1] = tmp;
- }
- //TODO: even with a tile size of one, you need a insert (of a dummy loop)
- idxNames[stmt].insert(idxNames[stmt].begin()+(level),"");
- }else{
- if(!ctrlName.size())
- throw std::runtime_error("No ctrl loop name for tile");
- //insert
- idxNames[stmt].insert(idxNames[stmt].begin()+(outer_level-1),ctrlName.c_str());
- }
-
- //printf("idxNames after: ");
- //printVS(idxNames[stmt]);
-}
-
-
-bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level, const std::string &array_name, const std::vector<int> &privatized_levels, bool allow_extra_read , int fastest_changing_dimension , int padding_stride , int padding_alignment , bool cuda_shared)
-{
- int old_stmts =stmt.size();
- //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared);
- if(cuda_shared)
- datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 1);
- else
- datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 0);
-
-
- //Adjust idxNames to reflect updated state
- std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
- int new_stmts = stmt.size();
- for(int i=old_stmts; i<new_stmts; i++){
- //printf("fixing up statement %d\n", i);
- std::vector<std::string> idxs;
-
-
- //protonu-making sure the vector of nonSplitLevels grows along with
- //the statement structure
- stmt_nonSplitLevels.push_back(omega::Tuple<int>());
-
- //Indexes up to level will be the same
- for(int j=0; j<level-1; j++)
- idxs.push_back(cIdxNames[j]);
-
- //Expect privatized_levels to match
- for(int j=0; j<privatized_levels.size(); j++)
- idxs.push_back(cIdxNames[privatized_levels[j]-1]);//level is one-based
-
- //all further levels should match order they are in originally
- if(privatized_levels.size()){
- int last_privatized = privatized_levels.back();
- int top_level = last_privatized + (stmt[i].IS.n_set()-idxs.size());
- //printf("last privatized_levels: %d top_level: %d\n", last_privatized, top_level);
- for(int j=last_privatized; j<top_level; j++){
- idxs.push_back(cIdxNames[j]);
- //printf("pushing back: %s\n", (const char*)cIdxNames[j]);
- }
- }
- idxNames.push_back(idxs);
- }
-}
-
-bool LoopCuda::datacopy_cuda(int stmt_num, int level, const std::string &array_name, std::vector<std::string> new_idxs, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, bool cuda_shared)
-{
-
- int old_stmts =stmt.size();
- //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared);
- if(cuda_shared)
- datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 1);
- else
- datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 0);
- //Adjust idxNames to reflect updated state
- std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
- int new_stmts = stmt.size();
- for(int i=old_stmts; i<new_stmts; i++){
- //printf("fixing up statement %d\n", i);
- std::vector<std::string> idxs;
-
- //protonu-making sure the vector of nonSplitLevels grows along with
- //the statement structure
- stmt_nonSplitLevels.push_back(omega::Tuple<int>());
-
- //protonu--lets dump out the code from each statement here
- //printf("\n dumping statement :%d", i);
- //stmt[i].code->Dump();
-
- //Indexes up to level will be the same
- for(int j=0; j<level-1; j++)
- idxs.push_back(cIdxNames[j]);
-
- //all further levels should get names from new_idxs
- int top_level = stmt[i].IS.n_set();
- //printf("top_level: %d level: %d\n", top_level, level);
- if(new_idxs.size() < top_level-level+1)
- throw std::runtime_error("Need more new index names for new datacopy loop levels");
-
- for(int j=level-1; j<top_level; j++){
- idxs.push_back(new_idxs[j-level+1].c_str());
- //printf("pushing back: %s\n", new_idxs[j-level+1].c_str());
- }
- idxNames.push_back(idxs);
- }
-}
-
-bool LoopCuda::unroll_cuda(int stmt_num, int level, int unroll_amount)
-{
- int old_stmts =stmt.size();
- //bool b= unroll(stmt_num, , unroll_amount);
-
-
- int dim = 2*level-1;
- std::vector<int> lex = getLexicalOrder(stmt_num);
- std::set<int> same_loop = getStatements(lex, dim-1);
-
- level = nonDummyLevel(stmt_num,level);
- //printf("unrolling %d at level %d\n", stmt_num,level);
-
- //protonu--using the new version of unroll, which returns
- //a set of ints instead of a bool. To keep Gabe's logic
- //I'll check the size of the set, if it's 0 return true
- //bool b= unroll(stmt_num, level, unroll_amount);
- std::set<int> b_set= unroll(stmt_num, level, unroll_amount);
- bool b = false;
- if (b_set.size() == 0) b = true;
- //end--protonu
-
- //Adjust idxNames to reflect updated state
- std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
- std::vector<std::string> origSource = idxNames[stmt_num];;
- //Drop index names at level
- if(unroll_amount == 0){
- //For all statements that were in this unroll together, drop index name for unrolled level
- idxNames[stmt_num][level-1] = "";
- for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
- //printf("in same loop as %d is %d\n", stmt_num, (*i));
- //idxNames[(*i)][level-1] = "";
- idxNames[(*i)] = idxNames[stmt_num];
- }
- }
-
- lex = getLexicalOrder(stmt_num);
- same_loop = getStatements(lex, dim-1);
-
- bool same_as_source = false;
- int new_stmts = stmt.size();
- for(int i=old_stmts; i<new_stmts; i++){
- //Check whether we had a sync for the statement we are unrolling, if
- //so, propogate that to newly created statements so that if they are
- //in a different loop structure, they will also get a syncthreads
- int size = syncs.size();
- for(int j=0; j<size; j++){
- if(syncs[j].first == stmt_num)
- syncs.push_back(make_pair(i,syncs[j].second));
- }
-
- //protonu-making sure the vector of nonSplitLevels grows along with
- //the statement structure
- stmt_nonSplitLevels.push_back(omega::Tuple<int>());
-
-
- //We expect that new statements have a constant for the variable in
- //stmt[i].IS at level (as seen with print_with_subs), otherwise there
- //will be a for loop at level and idxNames should match stmt's
- //idxNames pre-unrolled
- Relation IS = stmt[i].IS;
- //Ok, if you know how the hell to get anything out of a Relation, you
- //should probably be able to do this more elegantly. But for now, I'm
- //hacking it.
- std::string s = IS.print_with_subs_to_string();
- //s looks looks like
- //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128}
- //where level == 5, you see a integer in the input set
-
- //If that's not an integer and this is the first new statement, then
- //we think codegen will have a loop at that level. It's not perfect,
- //not sure if it can be determined without round-tripping to codegen.
- int sIdx = 0;
- int eIdx = 0;
- for(int j=0; j<level-1; j++){
- sIdx = s.find(",",sIdx+1);
- if(sIdx < 0) break;
- }
- if(sIdx > 0){
- eIdx = s.find("]");
- int tmp = s.find(",",sIdx+1);
- if(tmp > 0 && tmp < eIdx)
- eIdx = tmp; //", before ]"
- if(eIdx > 0){
- sIdx++;
- std::string var = s.substr(sIdx,eIdx-sIdx);
- //printf("%s\n", s.c_str());
- //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str());
- if(atoi(var.c_str()) == 0 && i ==old_stmts){
- //TODO:Maybe do see if this new statement would be in the same
- //group as the original and if it would, don't say
- //same_as_source
- if(same_loop.find(i) == same_loop.end()){
- printf("stmt %d level %d, newly created unroll statement should have same level indexes as source\n", i, level);
- same_as_source = true;
- }
- }
- }
- }
-
-
- //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1);
- if(same_as_source)
- idxNames.push_back(origSource);
- else
- idxNames.push_back(idxNames[stmt_num]);
- }
-
- return b;
-}
-
-/**
- * Records that the named array should be mapped to texture memory.
- *
- * The first request creates the texture_memory_mapping (constructed with
- * `true` and the array name); later requests add the name to the mapping
- * that already exists.
- *
- * @param array_name name of the array to be texture mapped
- */
-void LoopCuda::copy_to_texture(const char *array_name)
-{
-  if (texture) {
-    // A mapping already exists: just register one more array name.
-    texture->add(array_name);
-    return;
-  }
-  texture = new texture_memory_mapping(true, array_name);
-}
-
-
-/**
- * Records that the named array should be mapped to constant memory.
- *
- * The first request creates the constant_memory_mapping (constructed with
- * `true` and the array name); later requests add the name to the mapping
- * that already exists.
- *
- * @param array_name name of the array to be placed in constant memory
- */
-void LoopCuda::copy_to_constant(const char *array_name)
-{
-  if (constant_mem) {
-    // A mapping already exists: just register one more array name.
-    constant_mem->add(array_name);
-    return;
-  }
-  constant_mem = new constant_memory_mapping(true, array_name);
-}
-
-//protonu--moving this from Loop
-/**
- * Entry point for code generation.
- *
- * @return the result of cudaize_codegen_v2() when the GenCudaizeV2 bit is
- *         set in code_gen_flags, otherwise the plain code from getCode().
- */
-tree_node_list* LoopCuda::codegen()
-{
-  // No other flagged code generators exist yet, so everything that is not
-  // cudaize v2 falls back to the unmodified generated code.
-  if (!(code_gen_flags & GenCudaizeV2))
-    return getCode();
-  return cudaize_codegen_v2();
-}
-
-//These four are in Omega code_gen.cc and are used as a massive hack to
-//get out some info from MMGenerateCode. Yea for nasty side-effects.
-namespace omega{
- extern int checkLoopLevel; // set to level*2 by extractCudaUB() before it calls printCode(), reset to 0 afterwards
- extern int stmtForLoopCheck; // set to the statement number given to extractCudaUB()
- extern int upperBoundForLevel; // preset to -1 by extractCudaUB(), read back as the upper bound after printCode()
- extern int lowerBoundForLevel; // preset to -1 by extractCudaUB(), read back as the lower bound after printCode()
-}
-
-
-void LoopCuda::extractCudaUB(int stmt_num, int level, int &outUpperBound, int &outLowerBound){
- /* Reports, through outUpperBound / outLowerBound, the bounds that the
-  * code generator records for the loop at `level` around statement
-  * `stmt_num`. The values are whatever omega::upperBoundForLevel and
-  * omega::lowerBoundForLevel hold after a printCode(1,false) run; both are
-  * preset to -1 before that run.
-  *
-  * Throws std::invalid_argument for an out-of-range stmt_num or level,
-  * std::runtime_error when there is no loop at that level or its stride is
-  * not 1, and loop_error when the bound or stride cannot be extracted.
-  *
-  * First step: check for sanity of parameters. */
- const int m = stmt.size();
- if (stmt_num >= m || stmt_num < 0)
- throw std::invalid_argument("invalid statement " + to_string(stmt_num));
- const int n = stmt[stmt_num].xform.n_out();
- if (level > (n-1)/2 || level <= 0)
- throw std::invalid_argument("invalid loop level " + to_string(level));
-
- int dim = 2*level-1; // dimension index used for this level in the calls below
-
- std::vector<int> lex = getLexicalOrder(stmt_num);
- std::set<int> same_loop = getStatements(lex, dim-1); // statements selected by getStatements() for this lexical order
-
- // extract the intersection of the iteration space to be considered
- Relation hull;
- {
- hull = Relation::True(n);
- for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
- hull = Intersection(hull, project_onto_levels(getNewIS(*i), dim+1, true));
- hull.simplify(2, 4);
- }
-
- for (int i = 2; i <= dim+1; i+=2) { // name every second set variable "_t<counter>"
- //std::string name = std::string("_t") + to_string(t_counter++);
- std::string name = std::string("_t") + to_string(tmp_loop_var_name_counter++);
- hull.name_set_var(i, name);
- }
- hull.setup_names();
- }
-
- // extract the exact loop bound of the dimension whose bounds are requested
- if (is_single_iteration(hull, dim)){
- throw std::runtime_error("No loop availabe at level to extract upper bound.");
- }
- Relation bound = get_loop_bound(hull, dim);
- if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology())
- throw loop_error("loop error: unable to extract loop bound for cudaize");
-
- // extract the loop stride
- EQ_Handle stride_eq;
- int stride = 1; // stays 1 when countStrides() reports no stride constraint
- {
- bool simple_stride = true;
- int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(dim+1), stride_eq, simple_stride);
- if (strides > 1)
- throw loop_error("loop error: too many strides");
- else if (strides == 1) {
- int sign = stride_eq.get_coef(bound.set_var(dim+1));
-// assert(sign == 1 || sign == -1);
- Constr_Vars_Iter it(stride_eq, true);
- stride = abs((*it).coef/sign);
- }
- }
- if(stride != 1){ // only unit-stride loops are accepted here
- char buf[1024];
- sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d", level, stride);
- throw std::runtime_error(buf);
- }
-
- //Use the code generation system to tell us our bound information. We
- //need a hard upper bound and a 0 lower bound.
-
- checkLoopLevel = level*2; // request bound capture for this level (omega global)
- stmtForLoopCheck = stmt_num;
- upperBoundForLevel = -1;
- lowerBoundForLevel = -1;
- printCode(1,false); // run code generation without printing; only the side effects on the globals matter
- checkLoopLevel = 0; // NOTE(review): not reset if printCode() throws
-
- outUpperBound = upperBoundForLevel;
- outLowerBound = lowerBoundForLevel;
- return;
-}
-
-
-/**
- * Runs MMGenerateCode() with a string builder over every statement and,
- * if requested, writes the generated text to std::cout.
- *
- * Statements are named "s1".."sm" in the generated output. Does nothing
- * when there are no statements.
- *
- * @param effort       forwarded unchanged to MMGenerateCode()
- * @param actuallyPrint when false the generator is still run (callers use
- *                     its side effects) but nothing is printed
- */
-void LoopCuda::printCode(int effort, bool actuallyPrint) const {
-  const int numStmts = stmt.size();
-  if (numStmts == 0)
-    return;
-  const int numOut = stmt[0].xform.n_out();
-
-  // Per-statement inputs for the generator; the tuples are filled at
-  // positions 1..numStmts.
-  Tuple<Relation> iterSpaces(numStmts);
-  Tuple<Relation> xforms(numStmts);
-  Tuple<IntTuple > splitLevels(numStmts);
-  for (int k = 0; k < numStmts; k++) {
-    iterSpaces[k+1] = stmt[k].IS;
-    xforms[k+1] = stmt[k].xform;
-    splitLevels[k+1] = stmt_nonSplitLevels[k];
-  }
-
-  // Preferred index names are only handed over when useIdxNames is set;
-  // otherwise the tuple stays empty.
-  Tuple< Tuple<std::string> > preferredNames;
-  if (useIdxNames) {
-    for (int s = 0; s < idxNames.size(); s++) {
-      Tuple<std::string> names;
-      for (int k = 0; k < idxNames[s].size(); k++)
-        names.append(idxNames[s][k]);
-      preferredNames.append(names);
-    }
-  }
-
-  Relation extendedKnown = Extend_Set(copy(this->known), numOut - this->known.n_set());
-  CG_stringBuilder *builder = new CG_stringBuilder();
-
-  Tuple<CG_outputRepr *> stmtLabels;
-  for (int k = 1; k <= numStmts; k++)
-    stmtLabels.append(new CG_stringRepr("s" + to_string(k)));
-
-  CG_outputRepr* generated = MMGenerateCode(builder, xforms, iterSpaces, stmtLabels, extendedKnown, splitLevels, syncs, preferredNames, effort);
-  if (actuallyPrint)
-    std::cout << GetString(generated);
-
-  // As in the original, the label reprs are intentionally not deleted here.
-  delete builder;
-}
-
-
-
-/**
- * Dumps, for every statement, its iteration space and its transformation
- * relation using Relation::print_with_subs().
- */
-void LoopCuda::printRuntimeInfo() const {
-  const int total = stmt.size();
-  for (int k = 0; k < total; ++k) {
-    // Work on copies, exactly as before: this method is const.
-    Relation iterSpace = stmt[k].IS;
-    Relation transform = stmt[k].xform;
-
-    printf("stmt[%d]\nIS\n", k);
-    iterSpace.print_with_subs();
-
-    printf("xform[%d]\n", k);
-    transform.print_with_subs();
-  }
-}
-
-/**
- * Prints one line per statement: its number, the arity of its iteration
- * space (n_set) and its comma-separated preferred index names.
- */
-void LoopCuda::printIndexes() const {
-  const int total = stmt.size();
-  for (int s = 0; s < total; ++s) {
-    printf("stmt %d nset %d ", s, stmt[s].IS.n_set());
-
-    // The separator is empty for the first name and "," for all others.
-    const char *separator = "";
-    for (int k = 0; k < idxNames[s].size(); ++k) {
-      printf("%s%s", separator, idxNames[s][k].c_str());
-      separator = ",";
-    }
-    printf("\n");
-  }
-}
-
-tree_node_list* LoopCuda::getCode(int effort) const { /* Generates the SUIF
-  code for all statements of this loop nest. Returns a new, empty
-  tree_node_list when there are no statements; otherwise the list produced
-  by MMGenerateCode() (using ir->builder()), preceded by one
-  "<name> = 0" assignment for every variable stored in `overflow`.
-  `effort` is forwarded unchanged to MMGenerateCode(). */
- const int m = stmt.size();
- if (m == 0)
- return new tree_node_list;
- const int n = stmt[0].xform.n_out();
-
-
-
- Tuple<CG_outputRepr *> ni(m);
- Tuple<Relation> IS(m);
- Tuple<Relation> xform(m);
- Tuple< IntTuple > nonSplitLevels(m);
- for (int i = 0; i < m; i++) { // tuples are filled at positions 1..m
- ni[i+1] = stmt[i].code;
- IS[i+1] = stmt[i].IS;
- xform[i+1] = stmt[i].xform;
- nonSplitLevels[i+1] = stmt_nonSplitLevels[i];
- //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
- }
-
-
- Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-#ifdef DEBUG
-// std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, known, effort));
-#endif
- Tuple< Tuple<std::string> > idxTupleNames; // stays empty unless useIdxNames is set
- if(useIdxNames){
- for(int i=0; i<idxNames.size(); i++){
- Tuple<std::string> idxs;
- for(int j=0; j<idxNames[i].size(); j++)
- idxs.append(idxNames[i][j]);
- idxTupleNames.append( idxs );
- }
- }
-
- CG_outputBuilder *ocg = ir->builder();
- CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, nonSplitLevels, syncs, idxTupleNames, effort);
-
- //CG_outputRepr *overflow_initialization = ocg->CreateStmtList();
- //protonu--using the new function CG_suifBuilder::StmtListAppend
- CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL);
- for (std::map<int, std::vector<Free_Var_Decl *> >::const_iterator i = overflow.begin(); i != overflow.end(); i++)
- for (std::vector<Free_Var_Decl *>::const_iterator j = i->second.begin(); j != i->second.end(); j++)
- //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0))));
- overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->StmtListAppend(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)), NULL));
-
- repr = ocg->StmtListAppend(overflow_initialization, repr); // initializations first, generated loop code after
- tree_node_list *tnl = static_cast<CG_suifRepr *>(repr)->GetCode();
-
- delete repr; // tnl was fetched first; NOTE(review): presumably deleting the wrapper leaves tnl valid - confirm
- /*
- for (int i = 1; i <= m; i++)
- delete ni[i];
- */
-
- return tnl;
-}
-
-
-//protonu--adding constructors for the new derived class
-/**
- * Default constructor.
- *
- * Besides setting code_gen_flags to GenInit, this now gives the members
- * that LoopCuda(IR_Control*, int) also sets the same starting values.
- * Previously they were left indeterminate, so e.g. copy_to_texture() would
- * test an uninitialized `texture` pointer on a default-constructed object.
- */
-LoopCuda::LoopCuda()
-  : Loop(), code_gen_flags(GenInit)
-{
-  setup_code = NULL;
-  teardown_code = NULL;
-  cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1;
-  cu_num_reduce = 0;
-  cu_mode = GlobalMem;
-  texture = NULL;
-  constant_mem = NULL;
-  useIdxNames = false;
-}
-
-/**
- * Builds a LoopCuda for loop number `loop_num` of the code behind `irc`.
- *
- * Resets all CUDA-related state, creates one empty non-split-level tuple
- * per statement, picks up the global and procedure symbol tables from the
- * IR, and seeds the preferred index names of every statement with the
- * index names of the deepest loops found under the selected loop.
- *
- * @param irc      control structure handed to the Loop base class
- * @param loop_num index into the loops reported by the IR
- * @throws std::invalid_argument if loop_num is not a valid loop index
- *         (previously this indexed the loop vector out of range)
- */
-LoopCuda::LoopCuda(IR_Control *irc, int loop_num)
-  :Loop(irc)
-{
-  setup_code = NULL;
-  teardown_code = NULL;
-  code_gen_flags = 0;
-  cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1;
-  cu_num_reduce = 0;
-  cu_mode = GlobalMem;
-  texture = NULL;
-  constant_mem = NULL;
-
-  // stmt_nonSplitLevels must stay parallel to stmt.
-  const int m = stmt.size();
-  for (int i = 0; i < m; i++)
-    stmt_nonSplitLevels.push_back(omega::Tuple<int>());
-
-  //protonu--setting up
-  //proc_symtab *symtab
-  //global_symtab *globals
-  IR_cudasuifCode *cuda_ir = (IR_cudasuifCode *)ir;
-  globals = cuda_ir->gsym_;
-  std::vector<tree_for *> tf = cuda_ir->get_loops();
-
-  // Reject a bad loop number before it is used as a vector index.
-  if (loop_num < 0 || loop_num >= (int)tf.size())
-    throw std::invalid_argument("invalid loop number " + to_string(loop_num));
-
-  symtab = tf[loop_num]->proc()->block()->proc_syms();
-
-  std::vector<tree_for *> deepest = find_deepest_loops(tf[loop_num]);
-  for (int i = 0; i < (int)deepest.size(); i++)
-    index.push_back(deepest[i]->index()->name()); //reflects original code index names
-
-  for (int i = 0; i < (int)stmt.size(); i++)
-    idxNames.push_back(index); //reflects preferred index names (used as handles in cudaize v2)
-  useIdxNames = false;
-}
-
diff --git a/loop_cuda_rose.cc b/loop_cuda_rose.cc
deleted file mode 100644
index c5633ee..0000000
--- a/loop_cuda_rose.cc
+++ /dev/null
@@ -1,3734 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2009 University of Utah
- All Rights Reserved.
-
- Purpose:
- Cudaize methods
-
- Notes:
-
- History:
- 1/7/10 Created by Gabe Rudy by migrating code from loop.cc
- 31/1/11 Modified by Protonu Basu
-*****************************************************************************/
-#define TRANSFORMATION_FILE_INFO Sg_File_Info::generateDefaultFileInfoForTransformationNode()
-#include <code_gen/CG_stringBuilder.h>
-#include <codegen.h>
-#include <code_gen/CG_utils.h>
-#include <code_gen/CG_outputRepr.h>
-#include "loop_cuda_rose.hh"
-#include "loop.hh"
-#include <math.h>
-//#include <useful.h>
-#include "omegatools.hh"
-#include "ir_cudarose.hh"
-#include "ir_rose.hh"
-#include "ir_rose_utils.hh"
-#include "chill_error.hh"
-#include <vector>
-#include "Outliner.hh"
-//#define DEBUG
-using namespace omega;
-using namespace SageBuilder;
-using namespace SageInterface;
-//using namespace Outliner;
-//using namespace ASTtools;
-char *k_cuda_texture_memory; //protonu--added to track texture memory type
-//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type
-extern char *omega::k_ocg_comment;
-
-static int cudaDebug; // when non-zero, printVs()/printVS() print their argument; other debug output in this file is gated on it too
-class CudaStaticInit { // helper whose constructor assigns cudaDebug
-public:
- CudaStaticInit() {
- cudaDebug = 0; //Change this to 1 for debug
- }
-};
-static CudaStaticInit junkInitInstance__; // file-scope instance; constructing it runs the assignment above
-
-/**
- * Converts `s` to upper case in place.
- *
- * Each character is passed to toupper() as an unsigned char: handing it a
- * negative plain char (any byte >= 0x80 where char is signed) is undefined
- * behaviour.
- *
- * @param s string to modify
- * @return reference to the same string, to allow chaining
- */
-std::string& upcase(std::string& s) {
-  for (std::string::size_type i = 0; i < s.size(); ++i)
-    s[i] = static_cast<char>(toupper(static_cast<unsigned char>(s[i])));
-  return s;
-}
-
-/**
- * Debug helper: prints the strings of `curOrder` on one line, separated by
- * commas and followed by a newline. Prints nothing unless cudaDebug is set.
- */
-void printVs(const std::vector<std::string>& curOrder) {
-  if (cudaDebug) {
-    // The separator is empty for the first entry and "," for all others.
-    const char *separator = "";
-    for (std::vector<std::string>::size_type k = 0; k < curOrder.size(); ++k) {
-      printf("%s%s", separator, curOrder[k].c_str());
-      separator = ",";
-    }
-    printf("\n");
-  }
-}
-
-/**
- * Debug helper kept for callers that use the upper-case spelling.
- *
- * This was a line-for-line copy of printVs(); it now forwards to it so the
- * two cannot drift apart. Output is unchanged: a comma-separated line when
- * cudaDebug is set, nothing otherwise.
- */
-void printVS(const std::vector<std::string>& curOrder) {
-  printVs(curOrder);
-}
-
-/**
- * Destructor: calls clear() on the code representation of every statement.
- */
-LoopCuda::~LoopCuda() {
-  for (int k = 0, total = stmt.size(); k < total; ++k)
-    stmt[k].code->clear();
-}
-
-/**
- * Tells whether the name `s` is already taken.
- *
- * A name counts as taken when it is found in the body symbol table, the
- * parameter symbol table or the global scope, or when it equals any of the
- * preferred index names stored in idxNames.
- *
- * @param s candidate symbol name
- * @return true if the name is in use, false otherwise
- */
-bool LoopCuda::symbolExists(std::string s) {
-  const SgName name(s.c_str());
-
-  // Symbol tables are consulted in the same order as before; evaluation
-  // stops at the first hit.
-  if (body_symtab->find_variable(name)
-      || parameter_symtab->find_variable(name)
-      || globals->lookup_variable_symbol(name))
-    return true;
-
-  const char *wanted = s.c_str();
-  for (int stmtIdx = 0; stmtIdx < idxNames.size(); stmtIdx++) {
-    for (int k = 0; k < idxNames[stmtIdx].size(); k++) {
-      if (strcmp(idxNames[stmtIdx][k].c_str(), wanted) == 0)
-        return true;
-    }
-  }
-  return false;
-}
-
-/**
- * Remembers a synchronization request for later code generation.
- *
- * The (statement, index name) pair is appended to `syncs`; code generation
- * later inserts the sync where the loop with that index name is generated
- * for that statement.
- *
- * @param stmt_num statement the sync belongs to
- * @param idxName  index name of the loop to attach the sync to
- */
-void LoopCuda::addSync(int stmt_num, std::string idxName) {
-  std::pair<int, std::string> request(stmt_num, idxName);
-  syncs.push_back(request);
-}
-
-/**
- * Replaces the preferred index name `idx` of statement `stmt_num` with
- * `newName`.
- *
- * The level is looked up with findCurLevel(); levels are 1-based, so the
- * name lives at position level-1.
- *
- * @param stmt_num statement whose index name is changed
- * @param idx      current index name
- * @param newName  replacement name
- * @throws std::runtime_error if the statement number or the level found
- *         for `idx` is out of range
- */
-void LoopCuda::renameIndex(int stmt_num, std::string idx, std::string newName) {
-  const int level = findCurLevel(stmt_num, idx);
-  // level < 1 must be rejected explicitly: the old unsigned comparison let
-  // level == 0 through and then wrote to idxNames[stmt_num][-1].
-  if (stmt_num < 0 || level < 1
-      || idxNames.size() <= stmt_num || idxNames[stmt_num].size() < level)
-    throw std::runtime_error("Invalid statment number of index");
-  idxNames[stmt_num][level - 1] = newName.c_str();
-}
-
-enum Type { // NOTE(review): no use of this enum is visible in this part of the file - confirm before removing
- Int // the only enumerator
-};
-
-/**
- * Guards `then_part` with an if-statement derived from a min-style loop
- * bound.
- *
- * When the right-hand side of the loop's test expression is a call to
- * `__rose_lt`, the first argument of that call is used as the bound and the
- * result is `if (bound_sym <= <first argument>) then_part`. In every other
- * case `then_part` is returned unchanged.
- *
- * Unlike the previous version, unexpected tree shapes (test expression not
- * a binary operator, callee not a variable reference, empty argument list)
- * fall back to returning `then_part` instead of dereferencing a null
- * pointer. The unused locals and the shadowed `ifstmt` are gone. An older
- * SgConditionalExp-based variant that was already commented out has been
- * dropped.
- *
- * @param then_part statement to wrap
- * @param loop      loop whose test expression is inspected
- * @param symtab    unused; kept for interface compatibility
- * @param bound_sym variable compared against the bound
- * @return the new SgIfStmt, or `then_part` if no guard is needed
- */
-SgNode* wrapInIfFromMinBound(SgNode* then_part, SgForStatement* loop,
-    SgScopeStatement* symtab, SgVariableSymbol* bound_sym) {
-  SgBinaryOp* test_expr = isSgBinaryOp(loop->get_test_expr());
-  if (test_expr == NULL)
-    return then_part;
-
-  // Only an upper bound written as a call can be a min expression.
-  SgCallExpression* call = isSgCallExpression(test_expr->get_rhs_operand());
-  if (call == NULL)
-    return then_part;
-
-  SgVarRefExp* callee = isSgVarRefExp(call->get_function());
-  if (callee == NULL
-      || callee->get_symbol()->get_name().getString() != "__rose_lt")
-    return then_part;
-
-  SgExprListExp* arg_list = call->get_args();
-  if (arg_list == NULL || arg_list->get_expressions().empty())
-    return then_part;
-
-  /* This relies on the bound of interest being the first argument of the
-   * min call.
-   */
-  SgExpression* if_bound = *(arg_list->get_expressions().begin());
-  SgIfStmt* guarded = buildIfStmt(
-      buildLessOrEqualOp(buildVarRefExp(bound_sym), if_bound),
-      isSgStatement(then_part), NULL);
-  return isSgNode(guarded);
-}
-
-/**
- * This would be better if it was done by a CHiLL xformation instead of at codegen
- *
- * state:
- * for(...)
- * for(...)
- * cur_body
- * stmt1
- *
- * stmt1 is in-between two loops that are going to be reduced. The
- * solution is to put stmt1 at the end of cur_body but conditionally run
- * it on the last step of the for loop.
- *
- * A CHiLL command that would work better:
- *
- * for(...)
- * stmt0
- * for(for i=0; i<n; i++)
- * cur_body
- * stmt1
- * =>
- * for(...)
- * for(for i=0; i<n; i++)
- * if(i==0) stmt0
- * cur_body
- * if(i==n-1) stmt1
- */
-
-/**
- * Collects the for-loops whose "omega_comment" attribute marks them as
- * cuda loops with the given preferred index.
- *
- * A loop matches when its comment contains "~cuda~" and a
- * "preferredIdx: <name>" entry whose name (up to the next space) equals
- * `index`. The search descends through basic blocks, through the bodies of
- * non-matching for-loops and through the true branch of if-statements; it
- * does not look inside a loop that matched.
- *
- * A for-loop without an "omega_comment" attribute is now treated as
- * non-matching and searched further. Previously the missing attribute was
- * dereferenced.
- *
- * @param index preferred index name to look for
- * @param tnl   root of the tree to search
- * @return matching loops in traversal order
- */
-std::vector<SgForStatement*> findCommentedFors(const char* index, SgNode* tnl) {
-  std::vector<SgForStatement *> result;
-
-  if (isSgBasicBlock(tnl)) {
-    SgStatementPtrList& list = isSgBasicBlock(tnl)->get_statements();
-    for (SgStatementPtrList::iterator it = list.begin(); it != list.end();
-        ++it) {
-      std::vector<SgForStatement*> t = findCommentedFors(index,
-          isSgNode(*it));
-      std::copy(t.begin(), t.end(), back_inserter(result));
-    }
-  } else if (isSgForStatement(tnl)) {
-    AstTextAttribute* att =
-        (AstTextAttribute*) (isSgNode(tnl)->getAttribute("omega_comment"));
-
-    // Same convention as recursiveFindReplacePreferedIdxs(): a missing
-    // attribute simply means an empty comment.
-    std::string comment;
-    if (att != NULL)
-      comment = att->toString();
-
-    bool matches = false;
-    if (comment.find("~cuda~") != std::string::npos
-        && comment.find("preferredIdx: ") != std::string::npos) {
-      // 14 == strlen("preferredIdx: "): skip the tag, keep the name.
-      std::string idx = comment.substr(
-          comment.find("preferredIdx: ") + 14, std::string::npos);
-      if (idx.find(" ") != std::string::npos)
-        idx = idx.substr(0, idx.find(" "));
-      matches = (strcmp(idx.c_str(), index) == 0);
-    }
-
-    if (matches) {
-      result.push_back(isSgForStatement(tnl));
-    } else {
-      std::vector<SgForStatement*> t = findCommentedFors(index,
-          isSgForStatement(tnl)->get_loop_body());
-      std::copy(t.begin(), t.end(), back_inserter(result));
-    }
-  } else if (isSgIfStmt(tnl)) {
-    // Only the true branch is searched, as before.
-    SgIfStmt *tni = isSgIfStmt(tnl);
-    std::vector<SgForStatement*> t = findCommentedFors(index,
-        tni->get_true_body());
-    std::copy(t.begin(), t.end(), back_inserter(result));
-  }
-
-  return result;
-}
-
-/**
- * Reduces a loop to its body.
- *
- * Returns the body one loop level inside `loop`, wrapped by
- * wrapInIfFromMinBound() in a guard on `reduceIndex` when the loop bound
- * requires one. The index replacement itself is not done here; it has
- * already happened in recursiveFindReplacePreferedIdxs().
- *
- * @param loop        loop being removed
- * @param reduceIndex variable that takes the place of the loop index
- * @param body_syms   scope passed through to wrapInIfFromMinBound()
- * @return the (possibly guarded) loop body
- */
-SgNode* forReduce(SgForStatement* loop, SgVariableSymbol* reduceIndex,
-    SgScopeStatement* body_syms) {
-  SgNode* innerBody = loop_body_at_level(loop, 1);
-  return wrapInIfFromMinBound(innerBody, loop, body_syms, reduceIndex);
-}
-
-void recursiveFindRefs(SgNode* code, std::set<const SgVariableSymbol *>& syms,
- SgFunctionDefinition* def) { /* Adds to `syms` the variable symbols that
-  are used inside `code` but not declared inside it, and that are local to
-  (or parameters of) the function `def`: (U - L) intersected with Q in the
-  notation of the comments below. Despite the name, the live code is not
-  recursive; the hand-written recursive traversal is kept only as the
-  commented-out block at the end.
-  NOTE(review): set_difference/set_intersection need sorted ranges;
-  presumably ASTtools::VarSymSet_t is an ordered set - confirm. */
-
- SgStatement* s = isSgStatement(code); // NOTE(review): the result is not checked before use below
- // L = {symbols defined within 's'}, local variables declared within 's'
- ASTtools::VarSymSet_t L;
- ASTtools::collectDefdVarSyms(s, L);
- //dump (L, "L = ");
-
- // U = {symbols used within 's'}
- ASTtools::VarSymSet_t U;
- ASTtools::collectRefdVarSyms(s, U);
- //dump (U, "U = ");
-
- // U - L = {symbols used within 's' but not defined in 's'}
- // variable references to non-local-declared variables
- ASTtools::VarSymSet_t diff_U_L;
- set_difference(U.begin(), U.end(), L.begin(), L.end(),
- inserter(diff_U_L, diff_U_L.begin()));
- //dump (diff_U_L, "U - L = ");
-
- // Q = {symbols defined within the function surrounding 's' that are
- // visible at 's'}, including function parameters
- ASTtools::VarSymSet_t Q;
- ASTtools::collectLocalVisibleVarSyms(def->get_declaration(), s, Q);
-// dump (Q, "Q = ");
-
- // (U - L) \cap Q = {variables that need to be passed as parameters
- // to the outlined function}
- // a sub set of variables that are not globally visible (no need to pass at all)
- // It excludes the variables with a scope between global and the enclosing function
- set_intersection(diff_U_L.begin(), diff_U_L.end(), Q.begin(), Q.end(),
- inserter(syms, syms.begin())); // results are inserted into the caller's set; nothing is removed from it
-
- /* std::vector<SgVariableSymbol *> scalars;
- //SgNode *tnl = static_cast<const omega::CG_roseRepr *>(repr)->GetCode();
- SgStatement* stmt;
- SgExpression* exp;
- if (tnl != NULL) {
- if(stmt = isSgStatement(tnl)){
- if(isSgBasicBlock(stmt)){
- SgStatementPtrList& stmts = isSgBasicBlock(stmt)->get_statements();
- for(int i =0; i < stmts.size(); i++){
- //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgNode(stmts[i]));
- std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(stmts[i]));
- //delete r;
- std::copy(a.begin(), a.end(), back_inserter(scalars));
- }
-
- }
- else if(isSgForStatement(stmt)){
-
- SgForStatement *tnf = isSgForStatement(stmt);
- //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgStatement(tnf->get_loop_body()));
- std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(tnf->get_loop_body()));
- //delete r;
- std::copy(a.begin(), a.end(), back_inserter(scalars));
- }
- else if(isSgFortranDo(stmt)){
- SgFortranDo *tfortran = isSgFortranDo(stmt);
- omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgStatement(tfortran->get_body()));
- std::vector<SgVariableSymbol *> a = recursiveFindRefs(r);
- delete r;
- std::copy(a.begin(), a.end(), back_inserter(scalars));
- }
-
- else if(isSgIfStmt(stmt) ){
- SgIfStmt* tni = isSgIfStmt(stmt);
- //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgNode(tni->get_conditional()));
- std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(tni->get_conditional()));
- //delete r;
- std::copy(a.begin(), a.end(), back_inserter(scalars));
- //r = new omega::CG_roseRepr(isSgNode(tni->get_true_body()));
- a = recursiveFindRefs(isSgNode(tni->get_true_body()));
- //delete r;
- std::copy(a.begin(), a.end(), back_inserter(scalars));
- //r = new omega::CG_roseRepr(isSgNode(tni->get_false_body()));
- a = recursiveFindRefs(isSgNode(tni->get_false_body()));
- //delete r;
- std::copy(a.begin(), a.end(), back_inserter(scalars));
- }
- else if(isSgExprStatement(stmt)) {
- //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgExpression(isSgExprStatement(stmt)->get_expression()));
- std::vector<SgVariableSymbol *> a = recursiveFindRefs(isSgNode(isSgExprStatement(stmt)->get_expression()));
- //delete r;
- std::copy(a.begin(), a.end(), back_inserter(scalars));
-
- }
- }
- }
- else{
- SgExpression* op = isSgExpression(tnl);
- if(isSgVarRefExp(op)){
-
- scalars.push_back(isSgVarRefExp(op)->get_symbol());
-
- }
- else if( isSgAssignOp(op)){
- //omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgAssignOp(op)->get_lhs_operand());
- std::vector<SgVariableSymbol *> a1 = recursiveFindRefs(isSgNode(isSgAssignOp(op)->get_lhs_operand()));
- //delete r1;
- std::copy(a1.begin(), a1.end(), back_inserter(scalars));
- //omega::CG_roseRepr *r2 = new omega::CG_roseRepr(isSgAssignOp(op)->get_rhs_operand());
- std::vector<SgVariableSymbol *> a2 = recursiveFindRefs(isSgNode(isSgAssignOp(op)->get_rhs_operand()));
- //delete r2;
- std::copy(a2.begin(), a2.end(), back_inserter(scalars));
-
- }
- else if(isSgBinaryOp(op)){
- // omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgBinaryOp(op)->get_lhs_operand());
- std::vector<SgVariableSymbol *> a1 = recursiveFindRefs(isSgNode(isSgBinaryOp(op)->get_lhs_operand()));
- //delete r1;
- std::copy(a1.begin(), a1.end(), back_inserter(scalars));
- //omega::CG_roseRepr *r2 = new omega::CG_roseRepr(isSgBinaryOp(op)->get_rhs_operand());
- std::vector<SgVariableSymbol *> a2 = recursiveFindRefs((isSgBinaryOp(op)->get_rhs_operand()));
- //delete r2;
- std::copy(a2.begin(), a2.end(), back_inserter(scalars));
- }
- else if(isSgUnaryOp(op)){
- //omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgUnaryOp(op)->get_operand());
- std::vector<SgVariableSymbol *> a1 = recursiveFindRefs(isSgNode(isSgUnaryOp(op)->get_operand()));
- //delete r1;
- std::copy(a1.begin(), a1.end(), back_inserter(scalars));
- }
-
- }
- return scalars;
-
-
- */
-
-}
-
-SgNode* recursiveFindReplacePreferedIdxs(SgNode* code, SgSymbolTable* body_syms,
- SgSymbolTable* param_syms, SgScopeStatement* body,
- std::map<std::string, SgVariableSymbol*>& loop_idxs,
- SgGlobal* globalscope, bool sync = false) {
- //tree_node_list* tnl = new tree_node_list;
- //tree_node_list_iter tnli(code);
- SgVariableSymbol* idxSym = 0;
- std::vector<SgStatement*> r1;
- std::vector<SgNode*> r2;
- SgNode* tnli;
- SgNode* tnli1;
- SgNode* tnli2;
- SgBasicBlock * clone;
-
- if (isSgForStatement(code)) {
- AstTextAttribute* att =
- (AstTextAttribute*) (isSgNode(code)->getAttribute(
- "omega_comment"));
-
- std::string comment;
- if (att != NULL)
- comment = att->toString();
-
- if (comment.find("~cuda~") != std::string::npos
- && comment.find("preferredIdx: ") != std::string::npos) {
- std::string idx = comment.substr(
- comment.find("preferredIdx: ") + 14, std::string::npos);
- if (idx.find(" ") != std::string::npos)
- idx = idx.substr(0, idx.find(" "));
- if (loop_idxs.find(idx) != loop_idxs.end())
- idxSym = loop_idxs.find(idx)->second;
- //Get the proc variable sybol for this preferred index
- if (idxSym == 0) {
- idxSym = body_syms->find_variable(idx.c_str());
- if (!idxSym)
- idxSym = param_syms->find_variable(idx.c_str());
- //printf("idx not found: lookup %p\n", idxSym);
- if (!idxSym) {
- SgVariableDeclaration* defn = buildVariableDeclaration(
- SgName((char*) idx.c_str()), buildIntType());
- //idxSym = new var_sym(type_s32, (char*)idx.c_str());
- SgInitializedNamePtrList& variables = defn->get_variables();
- SgInitializedNamePtrList::const_iterator i =
- variables.begin();
- SgInitializedName* initializedName = *i;
- SgVariableSymbol* vs = new SgVariableSymbol(
- initializedName);
- prependStatement(defn, body);
- vs->set_parent(body_syms);
- body_syms->insert(SgName((char*) idx.c_str()), vs);
- idxSym = vs;
- //printf("idx created and inserted\n");
- }
- //Now insert into our map for future
- if (cudaDebug)
- std::cout << idx << "\n\n";
- loop_idxs.insert(make_pair(idx, idxSym));
- }
- //See if we have a sync as well
- if (comment.find("sync") != std::string::npos) {
- //printf("Inserting sync after current block\n");
- sync = true;
- }
-
- }
- if (idxSym) {
- SgForInitStatement* list =
- isSgForStatement(code)->get_for_init_stmt();
- SgStatementPtrList& initStatements = list->get_init_stmt();
- SgStatementPtrList::const_iterator j = initStatements.begin();
- const SgVariableSymbol* index;
-
- if (SgExprStatement *expr = isSgExprStatement(*j))
- if (SgAssignOp* op = isSgAssignOp(expr->get_expression()))
- if (SgVarRefExp* var_ref = isSgVarRefExp(
- op->get_lhs_operand()))
- index = var_ref->get_symbol();
-
- std::vector<SgVarRefExp *> array = substitute(code, index, NULL,
- isSgNode(body_syms));
-
- for (int j = 0; j < array.size(); j++)
- array[j]->set_symbol(idxSym);
- }
-
- SgStatement* body_ = isSgStatement(
- recursiveFindReplacePreferedIdxs(
- isSgNode((isSgForStatement(code)->get_loop_body())),
- body_syms, param_syms, body, loop_idxs, globalscope));
-
- omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code);
- omega::CG_outputRepr* block = tnl->clone();
- tnli = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-
- isSgForStatement(tnli)->set_loop_body(body_);
- body_->set_parent(tnli);
-
- if (idxSym) {
- SgForInitStatement* list =
- isSgForStatement(tnli)->get_for_init_stmt();
- SgStatementPtrList& initStatements = list->get_init_stmt();
- SgStatementPtrList::const_iterator j = initStatements.begin();
- const SgVariableSymbol* index;
-
- if (SgExprStatement *expr = isSgExprStatement(*j))
- if (SgAssignOp* op = isSgAssignOp(expr->get_expression()))
- if (SgVarRefExp* var_ref = isSgVarRefExp(
- op->get_lhs_operand()))
- index = var_ref->get_symbol();
-
- std::vector<SgVarRefExp *> array = substitute(tnli, index, NULL,
- isSgNode(body_syms));
-
- for (int j = 0; j < array.size(); j++)
- array[j]->set_symbol(idxSym);
- }
- // std::cout << isSgNode(body_)->unparseToString() << "\n\n";
- if (att != NULL)
- tnli->setAttribute("omega_comment", att);
-
- if (sync) {
- SgName name_syncthreads("__syncthreads");
- SgFunctionSymbol * syncthreads_symbol =
- globalscope->lookup_function_symbol(name_syncthreads);
-
- // Create a call to __syncthreads():
- SgFunctionCallExp * syncthreads_call = buildFunctionCallExp(
- syncthreads_symbol, buildExprListExp());
-
- SgExprStatement* stmt = buildExprStatement(syncthreads_call);
-
- /* if (SgBasicBlock* bb = isSgBasicBlock(
- isSgForStatement(code)->get_loop_body()))
- appendStatement(isSgStatement(stmt), bb);
-
- else if (SgStatement* ss = isSgStatement(
- isSgForStatement(code)->get_loop_body())) {
- SgBasicBlock* bb2 = buildBasicBlock();
-
- isSgNode(ss)->set_parent(bb2);
- appendStatement(ss, bb2);
-
- appendStatement(isSgStatement(stmt), bb2);
- isSgNode(stmt)->set_parent(bb2);
- isSgForStatement(code)->set_loop_body(bb2);
- isSgNode(bb2)->set_parent(code);
- }
- */
-
- SgBasicBlock* bb2 = buildBasicBlock();
-
- bb2->append_statement(isSgStatement(tnli));
- bb2->append_statement(stmt);
- /* SgNode* parent = code->get_parent();
- if(!isSgStatement(parent))
- throw loop_error("Parent not a statement");
-
- if(isSgForStatement(parent)){
- if(SgStatement *ss = isSgForStatement(isSgForStatement(parent)->get_loop_body())){
- omega::CG_roseRepr * tnl = new omega::CG_roseRepr(ss);
- omega::CG_outputRepr* block= tnl->clone();
-
- SgNode *new_ss = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
- SgBasicBlock* bb2 = buildBasicBlock();
-
- isSgNode(new_ss)->set_parent(bb2);
- appendStatement(isSgStatement(new_ss), bb2);
- appendStatement(isSgStatement(stmt), bb2);
- isSgNode(stmt)->set_parent(bb2);
-
- isSgStatement(parent)->replace_statement_from_basicBlock(ss, isSgStatement(bb2));
-
- }else if(isSgBasicBlock(isSgForStatement(parent)->get_loop_body()))
- isSgStatement(isSgForStatement(parent)->get_loop_body())->insert_statement(isSgStatement(code), stmt, false);
- else
- throw loop_error("parent statement type undefined!!");
-
- }
- else if(isSgBasicBlock(parent))
- isSgStatement(parent)->insert_statement(isSgStatement(code), stmt, false);
- else
- throw loop_error("parent statement type undefined!!");
-
- //tnl->print();
- *
- *
- */
- sync = true;
- return isSgNode(bb2);
-
- } else
- return tnli;
- } else if (isSgIfStmt(code)) {
- SgStatement* body_ = isSgStatement(
- recursiveFindReplacePreferedIdxs(
- isSgNode((isSgIfStmt(code)->get_true_body())),
- body_syms, param_syms, body, loop_idxs, globalscope));
-
- omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code);
- omega::CG_outputRepr* block = tnl->clone();
- tnli = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-
- isSgIfStmt(tnli)->set_true_body(body_);
-
- if ((isSgIfStmt(code)->get_false_body()))
- isSgIfStmt(tnli)->set_false_body(
- isSgStatement(
- recursiveFindReplacePreferedIdxs(
- isSgNode(
- (isSgIfStmt(code)->get_false_body())),
- body_syms, param_syms, body, loop_idxs,
- globalscope)));
-
- return tnli;
- } else if (isSgStatement(code) && !isSgBasicBlock(code)) {
- omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code);
- omega::CG_outputRepr* block = tnl->clone();
- tnli = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-
- return tnli;
-
- } else if (isSgBasicBlock(code)) {
- SgStatementPtrList& tnl = isSgBasicBlock(code)->get_statements();
-
- SgStatementPtrList::iterator temp;
- clone = buildBasicBlock();
- bool sync_found = false;
- for (SgStatementPtrList::const_iterator it = tnl.begin();
- it != tnl.end(); it++) {
-
- if (isSgForStatement(*it)) {
- AstTextAttribute* att =
- (AstTextAttribute*) (isSgNode(*it)->getAttribute(
- "omega_comment"));
-
- std::string comment;
- if (att != NULL)
- comment = att->toString();
-
- if (comment.find("~cuda~") != std::string::npos
- && comment.find("preferredIdx: ")
- != std::string::npos) {
- std::string idx = comment.substr(
- comment.find("preferredIdx: ") + 14,
- std::string::npos);
- if (idx.find(" ") != std::string::npos)
- idx = idx.substr(0, idx.find(" "));
- //printf("sym_tab preferred index: %s\n", idx.c_str());
- if (loop_idxs.find(idx) != loop_idxs.end())
- idxSym = loop_idxs.find(idx)->second;
- //Get the proc variable sybol for this preferred index
- if (idxSym == 0) {
- idxSym = body_syms->find_variable(idx.c_str());
- if (!idxSym)
- idxSym = param_syms->find_variable(idx.c_str());
- //printf("idx not found: lookup %p\n", idxSym);
- if (!idxSym) {
- SgVariableDeclaration* defn =
- buildVariableDeclaration(
- SgName((char*) idx.c_str()),
- buildIntType());
- //idxSym = new var_sym(type_s32, (char*)idx.c_str());
- SgInitializedNamePtrList& variables =
- defn->get_variables();
- SgInitializedNamePtrList::const_iterator i =
- variables.begin();
- SgInitializedName* initializedName = *i;
- SgVariableSymbol* vs = new SgVariableSymbol(
- initializedName);
- prependStatement(defn, body);
- vs->set_parent(body_syms);
- body_syms->insert(SgName((char*) idx.c_str()), vs);
- //printf("idx created and inserted\n");
- idxSym = vs;
- }
- //Now insert into our map for future
- if (cudaDebug)
- std::cout << idx << "\n\n";
- loop_idxs.insert(make_pair(idx, idxSym));
-
- }
- //See if we have a sync as well
- if (comment.find("sync") != std::string::npos) {
- //printf("Inserting sync after current block\n");
- sync = true;
- }
-
- }
- if (idxSym) {
- SgForInitStatement* list =
- isSgForStatement(*it)->get_for_init_stmt();
- SgStatementPtrList& initStatements = list->get_init_stmt();
- SgStatementPtrList::const_iterator j =
- initStatements.begin();
- const SgVariableSymbol* index;
-
- if (SgExprStatement *expr = isSgExprStatement(*j))
- if (SgAssignOp* op = isSgAssignOp(
- expr->get_expression()))
- if (SgVarRefExp* var_ref = isSgVarRefExp(
- op->get_lhs_operand()))
- index = var_ref->get_symbol();
-
- std::vector<SgVarRefExp *> array = substitute(*it, index,
- NULL, isSgNode(body_syms));
-
- for (int j = 0; j < array.size(); j++)
- array[j]->set_symbol(idxSym);
-
- }
-
- SgStatement* body_ =
- isSgStatement(
- recursiveFindReplacePreferedIdxs(
- isSgNode(
- (isSgForStatement(*it)->get_loop_body())),
- body_syms, param_syms, body, loop_idxs,
- globalscope));
-
- omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it);
- omega::CG_outputRepr* block = tnl->clone();
- tnli =
- static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-
- isSgForStatement(tnli)->set_loop_body(body_);
- body_->set_parent(tnli);
- if (idxSym) {
- SgForInitStatement* list =
- isSgForStatement(tnli)->get_for_init_stmt();
- SgStatementPtrList& initStatements = list->get_init_stmt();
- SgStatementPtrList::const_iterator j =
- initStatements.begin();
- const SgVariableSymbol* index;
-
- if (SgExprStatement *expr = isSgExprStatement(*j))
- if (SgAssignOp* op = isSgAssignOp(
- expr->get_expression()))
- if (SgVarRefExp* var_ref = isSgVarRefExp(
- op->get_lhs_operand()))
- index = var_ref->get_symbol();
-
- std::vector<SgVarRefExp *> array = substitute(tnli, index,
- NULL, isSgNode(body_syms));
-
- for (int j = 0; j < array.size(); j++)
- array[j]->set_symbol(idxSym);
- }
- idxSym = 0;
- // std::cout << isSgNode(body_)->unparseToString() << "\n\n";
- if (att != NULL)
- tnli->setAttribute("omega_comment", att);
- clone->append_statement(isSgStatement(tnli));
- if (sync) {
- SgName name_syncthreads("__syncthreads");
- SgFunctionSymbol * syncthreads_symbol =
- globalscope->lookup_function_symbol(
- name_syncthreads);
-
- // Create a call to __syncthreads():
- SgFunctionCallExp * syncthreads_call = buildFunctionCallExp(
- syncthreads_symbol, buildExprListExp());
-
- SgExprStatement* stmt = buildExprStatement(
- syncthreads_call);
-
- /* if (SgBasicBlock* bb = isSgBasicBlock(
- isSgForStatement(code)->get_loop_body()))
- appendStatement(isSgStatement(stmt), bb);
-
- else if (SgStatement* ss = isSgStatement(
- isSgForStatement(code)->get_loop_body())) {
- SgBasicBlock* bb2 = buildBasicBlock();
-
- isSgNode(ss)->set_parent(bb2);
- appendStatement(ss, bb2);
-
- appendStatement(isSgStatement(stmt), bb2);
- isSgNode(stmt)->set_parent(bb2);
- isSgForStatement(code)->set_loop_body(bb2);
- isSgNode(bb2)->set_parent(code);
- }
- */
-
- //SgBasicBlock* bb2 = buildBasicBlock();
- clone->append_statement(stmt);
- /* SgNode* parent = code->get_parent();
- if(!isSgStatement(parent))
- throw loop_error("Parent not a statement");
-
- if(isSgForStatement(parent)){
- if(SgStatement *ss = isSgForStatement(isSgForStatement(parent)->get_loop_body())){
- omega::CG_roseRepr * tnl = new omega::CG_roseRepr(ss);
- omega::CG_outputRepr* block= tnl->clone();
-
- SgNode *new_ss = static_cast<const omega::CG_roseRepr *>(block)->GetCode();
- SgBasicBlock* bb2 = buildBasicBlock();
-
- isSgNode(new_ss)->set_parent(bb2);
- appendStatement(isSgStatement(new_ss), bb2);
- appendStatement(isSgStatement(stmt), bb2);
- isSgNode(stmt)->set_parent(bb2);
-
- isSgStatement(parent)->replace_statement_from_basicBlock(ss, isSgStatement(bb2));
-
- }else if(isSgBasicBlock(isSgForStatement(parent)->get_loop_body()))
- isSgStatement(isSgForStatement(parent)->get_loop_body())->insert_statement(isSgStatement(code), stmt, false);
- else
- throw loop_error("parent statement type undefined!!");
-
- }
- else if(isSgBasicBlock(parent))
- isSgStatement(parent)->insert_statement(isSgStatement(code), stmt, false);
- else
- throw loop_error("parent statement type undefined!!");
-
- //tnl->print();
- *
- *
- */
- sync = true;
- // return isSgNode(bb2);
-
- }
-
- // return tnli;
- } else if (isSgIfStmt(*it)) {
- SgStatement* body_ = isSgStatement(
- recursiveFindReplacePreferedIdxs(
- isSgNode((isSgIfStmt(*it)->get_true_body())),
- body_syms, param_syms, body, loop_idxs,
- globalscope));
-
- omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it);
- omega::CG_outputRepr* block = tnl->clone();
- tnli1 =
- static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-
- isSgIfStmt(tnli1)->set_true_body(body_);
-
- if ((isSgIfStmt(*it)->get_false_body()))
- isSgIfStmt(tnli1)->set_false_body(
- isSgStatement(
- recursiveFindReplacePreferedIdxs(
- isSgNode(
- (isSgIfStmt(*it)->get_false_body())),
- body_syms, param_syms, body,
- loop_idxs, globalscope)));
-
- clone->append_statement(isSgStatement(tnli1));
- //return tnli;
- } else if (isSgStatement(*it)) {
- omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it);
- omega::CG_outputRepr* block = tnl->clone();
- tnli2 =
- static_cast<const omega::CG_roseRepr *>(block)->GetCode();
-
- clone->append_statement(isSgStatement(tnli2));
- //return tnli;
-
- }
- }
-
- return isSgNode(clone);
-
- }
-
- /* if (!isSgBasicBlock(
- recursiveFindReplacePreferedIdxs(isSgNode(*it), body_syms,
- param_syms, body, loop_idxs, globalscope))) {
- SgStatement *to_push = isSgStatement(
- recursiveFindReplacePreferedIdxs(isSgNode(*it),
- body_syms, param_syms, body, loop_idxs,
- globalscope, sync));
- clone->append_statement(to_push);
-
- if ((sync_found) && isSgForStatement(to_push)) {
- SgName name_syncthreads("__syncthreads");
- SgFunctionSymbol * syncthreads_symbol =
- globalscope->lookup_function_symbol(
- name_syncthreads);
-
- // Create a call to __syncthreads():
- SgFunctionCallExp * syncthreads_call = buildFunctionCallExp(
- syncthreads_symbol, buildExprListExp());
-
- SgExprStatement* stmt = buildExprStatement(
- syncthreads_call);
-
- clone->append_statement(isSgStatement(stmt));
- }
- // std::cout<<isSgNode(*it)->unparseToString()<<"\n\n";
- } else {
-
- SgStatementPtrList& tnl2 = isSgBasicBlock(
- recursiveFindReplacePreferedIdxs(isSgNode(*it),
- body_syms, param_syms, body, loop_idxs,
- globalscope))->get_statements();
- for (SgStatementPtrList::const_iterator it2 = tnl2.begin();
- it2 != tnl2.end(); it2++) {
- clone->append_statement(*it2);
-
- sync_found = true;
- // std::cout<<isSgNode(*it2)->unparseToString()<<"\n\n";
- }
- }
-
- }
- return isSgNode(clone);
- }
- */
-// return tnl;
-}
-
-// Makes every symbol in `syms` resolvable inside the kernel body and re-points
-// the references to it found under `code`.
-//
-// code      -> subtree whose variable references are rewritten; returned unchanged as a pointer
-// syms      -> symbols whose references are processed
-// param     -> first symbol table searched by name (presumably the kernel parameters -- confirm against caller)
-// body      -> second symbol table searched by name; receives symbols created here
-// body_stmt -> scope that receives the variable declarations created here
-SgNode* swapVarReferences(SgNode* code,
-        std::set<const SgVariableSymbol *>& syms, SgSymbolTable* param,
-        SgSymbolTable* body, SgScopeStatement* body_stmt) {
-    typedef std::set<const SgVariableSymbol *>::iterator SymIter;
-
-    for (SymIter sym_it = syms.begin(); sym_it != syms.end(); ++sym_it) {
-        const SgVariableSymbol* sym = *sym_it;
-        SgName var_name = sym->get_name();
-
-        // Known to neither table: declare the variable in body_stmt and
-        // register a symbol for it in `body`.
-        if ((param->find_variable(var_name) == NULL)
-                && (body->find_variable(var_name) == NULL)) {
-            SgInitializedName* decl = sym->get_declaration();
-
-            SgVariableSymbol* dvs = new SgVariableSymbol(decl);
-            SgVariableDeclaration* var_decl = buildVariableDeclaration(
-                    dvs->get_name(), dvs->get_type());
-
-            // Carry the "__shared__" marker of the original declaration over
-            // to the new one. Only the existence of the attribute matters, so
-            // its value is no longer fetched.
-            if (isSgNode(decl->get_declaration())->attributeExists(
-                    "__shared__"))
-                var_decl->get_declarationModifier().get_storageModifier().setCudaShared();
-
-            appendStatement(var_decl, body_stmt);
-
-            dvs->set_parent(body);
-            body->insert(var_name, dvs);
-        }
-
-        std::vector<SgVarRefExp *> refs = substitute(code, sym, NULL,
-                isSgNode(body));
-
-        // NOTE(review): references are bound to the incoming symbol, not to
-        // the `dvs` symbol created above -- confirm this is intended.
-        SgVariableSymbol* target = const_cast<SgVariableSymbol*>(sym);
-        for (size_t j = 0; j < refs.size(); j++)
-            refs[j]->set_symbol(target);
-    }
-
-    return code;
-}
-
-// Returns true when every name in `idxs` compares equal (strcmp) to one of
-// the index names recorded for statement `stmt`. An empty `idxs` is valid.
-bool LoopCuda::validIndexes(int stmt, const std::vector<std::string>& idxs) {
-    for (size_t want = 0; want < idxs.size(); ++want) {
-        const char* wanted = idxs[want].c_str();
-
-        // Linear scan that stops at the first matching name.
-        size_t have = 0;
-        while (have < idxNames[stmt].size()
-                && strcmp(idxNames[stmt][have].c_str(), wanted) != 0)
-            ++have;
-
-        // Ran off the end: this requested index is unknown.
-        if (have == idxNames[stmt].size())
-            return false;
-    }
-    return true;
-}
-
-/*
- * cudaize_v2
- * records which loop indexes become CUDA block and thread dimensions
- * parameters:
- *   kernel_name - name stored in cu_kernel_name
- *   array_dims  - array dimension sizes, saved on the object
- *   blockIdxs   - index names for the block dimensions; the first two are
- *                 renamed "bx" and "by" (at least one is required)
- *   threadIdxs  - index names for the thread dimensions; the first three are
- *                 renamed "tx", "ty" and "tz"
- * returns:
- *   true once the mapping has been recorded
- * throws:
- *   std::runtime_error when an index name is unknown, when no block index is
- *   given, or when a loop's lower bound is still non-zero after the
- *   normalizing tile
- */
-bool LoopCuda::cudaize_v2(std::string kernel_name,
-        std::map<std::string, int> array_dims,
-        std::vector<std::string> blockIdxs,
-        std::vector<std::string> threadIdxs) {
-    CG_outputBuilder *ocg = ir->builder();
-    int stmt_num = 0;
-    if (cudaDebug) {
-        // array_dims is not printed, hence the empty braces
-        printf("cudaize_v2(%s, {", kernel_name.c_str());
-        printf("}, blocks={");
-        printVs(blockIdxs);
-        printf("}, thread={");
-        printVs(threadIdxs);
-        printf("})\n");
-    }
-
-    //Save array dimention sizes (done before validation, so they are stored
-    //even when one of the checks below throws)
-    this->array_dims = array_dims;
-    if (!validIndexes(stmt_num, blockIdxs)) {
-        throw std::runtime_error("One of the indexes in the block list was not "
-                "found in the current set of indexes.");
-    }
-    if (!validIndexes(stmt_num, threadIdxs)) {
-        throw std::runtime_error(
-                "One of the indexes in the thread list was not "
-                "found in the current set of indexes.");
-    }
-    if (blockIdxs.size() == 0)
-        throw std::runtime_error("Cudaize: Need at least one block dimention");
-    int block_level = 0;
-    //Now, we will determine the actual size (if possible, otherwise
-    //complain) for the block dimentions and thread dimentions based on our
-    //indexes and the relations for our stmt;
-    for (size_t i = 0; i < blockIdxs.size(); i++) {
-        int level = findCurLevel(stmt_num, blockIdxs[i]);
-        int ub, lb;
-        CG_outputRepr* ubrepr = extractCudaUB(stmt_num, level, ub, lb);
-        if (lb != 0) {
-            //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
-            if (cudaDebug)
-                printf(
-                        "Cudaize: doing tile at level %d to try and normalize lower bounds\n",
-                        level);
-            tile(stmt_num, level, 1, level, CountedTile);
-            idxNames[stmt_num].insert(idxNames[stmt_num].begin() + (level), ""); //TODO: possibly handle this for all sibling stmts
-            ubrepr = extractCudaUB(stmt_num, level, ub, lb);
-        }
-        if (lb != 0) {
-            char buf[1024];
-            snprintf(buf, sizeof(buf),
-                    "Cudaize: Loop at level %d does not have 0 as it's lower bound",
-                    level);
-            throw std::runtime_error(buf);
-        }
-        //Anand: a missing constant upper bound (ub < 0) is deliberately not
-        //reported as an error, so no message is built for it.
-        if (cudaDebug)
-            printf("block idx %s level %d lb: %d ub %d\n", blockIdxs[i].c_str(),
-                    level, lb, ub);
-        if (i == 0) {
-            block_level = level;
-            if (ubrepr == NULL) {
-                cu_bx = ub + 1;
-                cu_bx_repr = NULL;
-            } else {
-                cu_bx = 0;
-                cu_bx_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
-            }
-            idxNames[stmt_num][level - 1] = "bx";
-        } else if (i == 1) {
-            if (ubrepr == NULL) {
-                cu_by = ub + 1;
-                cu_by_repr = NULL;
-            } else {
-                cu_by = 0;
-                cu_by_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
-            }
-            idxNames[stmt_num][level - 1] = "by";
-        }
-    }
-    //The bx level is only kept as a non-split level when a by dimension exists
-    if (!cu_by && !cu_by_repr)
-        block_level = 0;
-    int thread_level1 = 0;
-    int thread_level2 = 0;
-    for (size_t i = 0; i < threadIdxs.size(); i++) {
-        int level = findCurLevel(stmt_num, threadIdxs[i]);
-        int ub, lb;
-        CG_outputRepr* ubrepr = extractCudaUB(stmt_num, level, ub, lb);
-        if (lb != 0) {
-            //attempt to "normalize" the loop with an in-place tile and then re-check our bounds
-            if (cudaDebug)
-                printf(
-                        "Cudaize: doing tile at level %d to try and normalize lower bounds\n",
-                        level);
-            tile(stmt_num, level, 1, level, CountedTile);
-            idxNames[stmt_num].insert(idxNames[stmt_num].begin() + (level), "");
-            ubrepr = extractCudaUB(stmt_num, level, ub, lb);
-        }
-        if (lb != 0) {
-            char buf[1024];
-            snprintf(buf, sizeof(buf),
-                    "Cudaize: Loop at level %d does not have 0 as it's lower bound",
-                    level);
-            throw std::runtime_error(buf);
-        }
-        //Anand: as for the block indexes, ub < 0 is tolerated here.
-
-        if (cudaDebug)
-            printf("thread idx %s level %d lb: %d ub %d\n",
-                    threadIdxs[i].c_str(), level, lb, ub);
-        if (i == 0) {
-            thread_level1 = level;
-            if (ubrepr == NULL) {
-                cu_tx = ub + 1;
-                cu_tx_repr = NULL;
-            } else {
-                cu_tx = 0;
-                cu_tx_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
-            }
-            idxNames[stmt_num][level - 1] = "tx";
-        } else if (i == 1) {
-            thread_level2 = level;
-            if (ubrepr == NULL) {
-                cu_ty = ub + 1;
-                cu_ty_repr = NULL;
-            } else {
-                cu_ty = 0;
-                cu_ty_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
-            }
-            idxNames[stmt_num][level - 1] = "ty";
-        } else if (i == 2) {
-            if (ubrepr == NULL) {
-                cu_tz = ub + 1;
-                cu_tz_repr = NULL;
-            } else {
-                cu_tz = 0;
-                cu_tz_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1));
-            }
-            idxNames[stmt_num][level - 1] = "tz";
-        }
-    }
-    //Same rule as for blocks: a level is only non-split when the next
-    //dimension (ty for tx, tz for ty) is present
-    if (!cu_ty && !cu_ty_repr)
-        thread_level1 = 0;
-    if (!cu_tz && !cu_tz_repr)
-        thread_level2 = 0;
-
-    //Make changes to nonsplitlevels
-    const int m = stmt.size();
-    for (int i = 0; i < m; i++) {
-        if (block_level)
-            stmt_nonSplitLevels[i].push_back(block_level * 2);
-        if (thread_level1)
-            stmt_nonSplitLevels[i].push_back(thread_level1 * 2);
-        //This branch used to push thread_level1 a second time (copy/paste
-        //slip), so the second thread level was never recorded.
-        if (thread_level2)
-            stmt_nonSplitLevels[i].push_back(thread_level2 * 2);
-    }
-
-    if (cudaDebug) {
-        printf("Codegen: current names: ");
-        printVS(idxNames[stmt_num]);
-    }
-    //Set codegen flag
-    code_gen_flags |= GenCudaizeV2;
-
-    //NOTE(review): kernel_name is a by-value parameter; if cu_kernel_name is
-    //a raw const char* rather than a std::string this pointer dangles after
-    //return -- confirm the member's type.
-    cu_kernel_name = kernel_name.c_str();
-
-    //The function is declared bool; falling off the end was undefined behavior
-    return true;
-}
-
-/*
- * setupConstantVar
- *   declares a global array "cs<i+1>Ref" of opaque type
- *   "__device__ __constant__ float", sized by arr_def->size_expr, and
- *   registers it with the constant memory mapping
- * parameters:
- *   constant - the constant_memory_mapping object for this loop
- *   arr_def  - the VarDefs object for the mapped variable; its secondName is
- *              set to the generated name
- *   globals  - Rose global scope that receives the declaration
- *   i        - an index to keep new variable names unique
- *   symtab   - symbol table that receives the new symbol
- */
-static void setupConstantVar(constant_memory_mapping* constant, VarDefs* arr_def, SgGlobal* globals, int i, SgSymbolTable* symtab) {
-    // NOTE(review): this heap buffer is never released here; whether that is
-    // a leak depends on how VarDefs::secondName stores the value -- confirm.
-    char* ref_name = new char[32];
-    snprintf(ref_name, 32, "cs%dRef", i+1);
-    arr_def->secondName = ref_name;
-
-    // The element type is always float here, spelled as an opaque type name.
-    char elem_type[64];
-    snprintf(elem_type, 64, "__device__ __constant__ float");
-
-    const SgName sym_name = SgName(std::string(ref_name));
-    SgType* array_type = buildArrayType(
-            buildOpaqueType(SgName(elem_type), globals),
-            arr_def->size_expr);
-    SgVariableDeclaration* decl = buildVariableDeclaration(sym_name, array_type);
-
-    // The declaration holds a single variable; wrap it in a symbol.
-    SgInitializedNamePtrList& declared = decl->get_variables();
-    SgVariableSymbol* sym = new SgVariableSymbol(*declared.begin());
-    prependStatement(decl, globals);
-
-    sym->set_parent(symtab);
-    symtab->insert(sym_name, sym);
-
-    const char* mapped_name = arr_def->original_name.c_str();
-    constant->set_mapped_symbol(mapped_name, sym);
-    constant->set_vardef(mapped_name, arr_def);
-}
-
-/*
- * cudaBindConstantVar
- *   appends a cudaMemcpyToSymbol(...) call statement that copies a variable
- *   into its constant-memory symbol
- * parameters:
- *   constant  - the constant mapping object
- *   arr_def   - the VarDefs object of the variable being copied
- *   globals   - Rose global scope in which cudaMemcpyToSymbol is declared
- *   stmt_list - statement list that receives the call
- */
-static void cudaBindConstantVar(constant_memory_mapping* constant, VarDefs* arr_def, SgGlobal* globals, SgStatementPtrList* stmt_list) {
-    SgFunctionDeclaration* memcpy_decl = buildNondefiningFunctionDeclaration(
-            SgName("cudaMemcpyToSymbol"), buildVoidType(),
-            buildFunctionParameterList(), globals);
-
-    // Arguments in call order: (void*) mapped symbol, source data, size.
-    SgExpression* dest = buildCastExp(
-            constant->get_mapped_symbol_exp(arr_def->original_name.c_str()),
-            buildPointerType(buildVoidType()));
-    SgExprListExp* call_args = buildExprListExp();
-    call_args->append_expression(dest);
-    call_args->append_expression(buildVarRefExp(arr_def->in_data));
-    call_args->append_expression(arr_def->size_expr);
-
-    SgFunctionCallExp* call = buildFunctionCallExp(
-            buildFunctionRefExp(memcpy_decl), call_args);
-    stmt_list->push_back(buildExprStatement(call));
-}
-
-/*
- * consmapArrayRefs
- *   rewrites references to constant-mapped arrays so they index the mapped
- *   constant symbol instead; 2D references are linearized to one index
- * parameters:
- *   constant - the constant mapping object, NULL when the feature is unused
- *   refs     - array references to inspect
- *   globals  - unused here
- *   ir       - performs the expression replacement
- *   ocg      - builds the linearized index expression
- * notes:
- *   processing stops at the first mapped reference with more than 2
- *   dimensions; a reference reporting fewer than 1 dimension is skipped
- */
-static void consmapArrayRefs(constant_memory_mapping* constant, std::vector<IR_ArrayRef*>* refs, SgGlobal* globals, IR_Code* ir, CG_roseBuilder* ocg) {
-    // if constant mapping is not being used, ignore this function
-    if(constant == NULL) return;
-    for(size_t i = 0; i < refs->size(); i++) {
-        IR_ArrayRef* aref = (*refs)[i];
-        if(!constant->is_array_mapped(aref->name().c_str()))
-            continue;
-
-        // get array reference dimensions
-        int dims = aref->symbol()->n_dim();
-        if(dims > 2) {
-            printf(" \n CHiLL does not handle constant memory mapping for more than 2D arrays.\n");
-            return;
-        }
-        // Previously index_exp was read uninitialized when dims was neither
-        // 1 nor 2; such a reference has no index to build, so leave it alone.
-        if(dims < 1)
-            continue;
-
-        SgExpression* varexp = constant->get_mapped_symbol_exp(aref->name().c_str());
-        SgExpression* index_exp = NULL;
-        // build index expression
-        if(dims == 1) {
-            index_exp = static_cast<omega::CG_roseRepr*>(aref->index(0)->clone())->GetExpression();
-        }
-        else {
-            // dims == 2: linearize as size_multi_dim[0] * index0 + index1
-            // NOTE(review): assumes size_multi_dim[0] is the row stride -- confirm
-            VarDefs* arr_def = constant->get_vardef(aref->name().c_str());
-            CG_outputRepr* i0 = aref->index(0)->clone();
-            CG_outputRepr* i1 = aref->index(1)->clone();
-            CG_outputRepr* sz = new CG_roseRepr(buildIntVal(arr_def->size_multi_dim[0]));
-            CG_outputRepr* exp = ocg->CreatePlus(ocg->CreateTimes(sz->clone(), i0), i1);
-            index_exp = static_cast<omega::CG_roseRepr*>(exp->clone())->GetExpression();
-        }
-        ir->ReplaceExpression(aref, new CG_roseRepr(buildPntrArrRefExp(varexp, index_exp)));
-    }
-}
-
-/*
- * setupTexmappingVar
- *   declares a global texture reference "tex<i+1>Ref" of opaque type
- *   "texture<float, 1, cudaReadModeElementType>" and registers it with the
- *   texture memory mapping
- * parameters:
- *   texture    - the texture_memory_mapping object
- *   arr_def    - the VarDefs object for the mapped variable; its secondName
- *                is set to the generated name
- *   globals    - Rose global scope that receives the declaration
- *   i          - an index to keep the new variable names unique
- *   devptr_sym - the devptr that the original variable is associated with
- *   symtab     - symbol table that receives the new symbol
- */
-static void setupTexmappingVar(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals, int i, SgVariableSymbol* devptr_sym, SgSymbolTable* symtab) {
-    // NOTE(review): this heap buffer is never released here; whether that is
-    // a leak depends on how VarDefs::secondName stores the value -- confirm.
-    char* ref_name = new char[32];
-    snprintf(ref_name, 32, "tex%dRef", i+1);
-    arr_def->secondName = ref_name;
-
-    // Always a one-dimensional float texture; the array's own dimension
-    // count (arr_def->size_multi_dim.size()) is not used.
-    char tex_type[64];
-    snprintf(tex_type, 64, "texture<float, %d, cudaReadModeElementType>", 1);
-
-    SgVariableDeclaration* decl = buildVariableDeclaration(
-            SgName(std::string(ref_name)), buildOpaqueType(tex_type, globals));
-
-    // The declaration holds a single variable; wrap it in a symbol.
-    SgInitializedNamePtrList& declared = decl->get_variables();
-    SgVariableSymbol* sym = new SgVariableSymbol(*declared.begin());
-    prependStatement(decl, globals);
-
-    sym->set_parent(symtab);
-    symtab->insert(SgName(ref_name), sym);
-
-    const char* mapped_name = arr_def->original_name.c_str();
-    texture->set_mapped_symbol(mapped_name, sym);
-    texture->set_devptr_symbol(mapped_name, devptr_sym);
-    texture->set_vardef(mapped_name, arr_def);
-}
-
-
-/*
- * cudaBindTexture1D
- *   builds the call expression
- *     cudaBindTexture(0, <mapped texture symbol>, <devptr symbol>, <size>)
- *   for one variable and returns it; see cudaBindTexture for details
- */
-static SgFunctionCallExp* cudaBindTexture1D(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals) {
-    SgFunctionDeclaration* bind_decl = buildNondefiningFunctionDeclaration(
-            SgName("cudaBindTexture"), buildVoidType(),
-            buildFunctionParameterList(), globals);
-
-    const char* mapped_name = arr_def->original_name.c_str();
-
-    SgExprListExp* call_args = buildExprListExp();
-    call_args->append_expression(buildIntVal(0));
-    call_args->append_expression(texture->get_mapped_symbol_exp(mapped_name));
-    call_args->append_expression(texture->get_devptr_symbol_exp(mapped_name));
-    call_args->append_expression(arr_def->size_expr);
-
-    SgFunctionRefExp* callee = buildFunctionRefExp(bind_decl);
-    return buildFunctionCallExp(callee, call_args);
-}
-
-/*
- * Two dimensional version of cudaBindTexture
- * see cudaBindTexture for details
- */
-//static SgFunctionCallExp* cudaBindTexture2D(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals) {
-// SgName cudaBindTexture_name("cudaBindTexture2D");
-// SgFunctionDeclaration* cudaBindTexture_decl = buildNondefiningFunctionDeclaration(
-// cudaBindTexture_name, buildVoidType(), buildFunctionParameterList(), globals);
-//
-// SgExprListExp* args = buildExprListExp();
-// args->append_expression(buildIntVal(0));
-// args->append_expression(texture->get_tex_mapped_symbol_exp(arr_def->original_name.c_str()));
-// args->append_expression(texture->get_devptr_symbol_exp(arr_def->original_name.c_str()));
-// args->append_expression(buildIntVal(texture->get_dim_length(arr_def->original_name.c_str(), 0)));
-// args->append_expression(buildIntVal(texture->get_dim_length(arr_def->original_name.c_str(), 1)));
-// args->append_expression(arr_def->size_expr);
-// return buildFunctionCallExp(buildFunctionRefExp(cudaBindTexture_decl), args);
-//}
-
-/*
- * cudaBindTexture
- *   appends a statement that binds a variable to its texture
- * parameters:
- *   texture   - the texture mapping object
- *   arr_def   - the VarDefs object
- *   globals   - Rose global scope, passed on to cudaBindTexture1D
- *   stmt_list - statement list that receives the binding call
- * notes:
- *   only supports binding 1D textures; the array's dimension count is not
- *   consulted. cudaBindTexture2D may need to be considered for 2D textures
- */
-static void cudaBindTexture(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals, SgStatementPtrList* stmt_list) {
-    SgFunctionCallExp* bind_call = cudaBindTexture1D(texture, arr_def, globals);
-    stmt_list->push_back(buildExprStatement(bind_call));
-}
-
-/*
- * texmapArrayRefs
- *   replaces array references to texture mapped variables with calls to
- *   tex1Dfetch; 2D references are linearized to a single index
- * parameters:
- *   texture - the texture mapping object, NULL when the feature is unused
- *   refs    - the array references to inspect
- *   globals - Rose global scope in which the fetch function is declared
- *   ir      - handles IR_Code operations
- *   ocg     - handles CG_roseBuilder operations
- * notes:
- *   processing stops at the first mapped reference with more than 2 dimensions
-**/
-static void texmapArrayRefs(texture_memory_mapping* texture, std::vector<IR_ArrayRef*>* refs, SgGlobal* globals, IR_Code* ir, CG_roseBuilder *ocg) {
-    // nothing to rewrite when texture mapping is not being used
-    if(!texture)
-        return;
-
-    for(int r = 0; r < refs->size(); r++) {
-        IR_ArrayRef* ref = (*refs)[r];
-        if(!texture->is_array_mapped(ref->name().c_str()))
-            continue;
-
-        VarDefs* def = texture->get_vardef(ref->name().c_str());
-        const int rank = ref->symbol()->n_dim();
-        if(rank > 2) {
-            printf(" \n CHiLL does not handle texture mapping for more than 2D arrays.\n");
-            // TODO throw some sort of error. or handle in texture_copy function
-            return;
-        }
-
-        // The fetch function is always tex1Dfetch, whatever the rank.
-        char fetch_name[16];
-        snprintf(fetch_name, sizeof(fetch_name), "tex%dDfetch", 1);
-        SgFunctionDeclaration* fetch_decl = buildNondefiningFunctionDeclaration(
-                SgName(fetch_name), buildFloatType(), buildFunctionParameterList(), globals);
-
-        // First argument is the texture reference, then the index.
-        SgExprListExp* fetch_args = buildExprListExp();
-        fetch_args->append_expression(texture->get_mapped_symbol_exp(ref->name().c_str()));
-
-        switch(rank) {
-        case 1: {
-            omega::CG_roseRepr* idx = static_cast<omega::CG_roseRepr*>(ref->index(0)->clone());
-            fetch_args->append_expression(idx->GetExpression());
-            break;
-        }
-        case 2: {
-            // linear index = size_multi_dim[0] * index0 + index1
-            CG_outputRepr* row = ref->index(0)->clone();
-            CG_outputRepr* col = ref->index(1)->clone();
-            CG_outputRepr* stride = new CG_roseRepr(buildIntVal(def->size_multi_dim[0]));
-            CG_outputRepr* linear = ocg->CreatePlus(ocg->CreateTimes(stride->clone(), row), col);
-            fetch_args->append_expression(static_cast<omega::CG_roseRepr*>(linear->clone())->GetExpression());
-            break;
-        }
-        default:
-            // no index argument is added for any other rank
-            break;
-        }
-
-        // build function call and replace original array ref
-        SgFunctionCallExp* fetch_call = buildFunctionCallExp(buildFunctionRefExp(fetch_decl), fetch_args);
-        ir->ReplaceExpression(ref, new CG_roseRepr(fetch_call));
-    }
-}
-
-SgNode* LoopCuda::cudaize_codegen_v2() {
- if(cudaDebug)
- printf("cudaize codegen V2\n");
- CG_roseBuilder *ocg = dynamic_cast<CG_roseBuilder*>(ir->builder());
- if (!ocg)
- return false;
-
- //protonu--adding an annote to track texture memory type
- //ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE);
- //ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE);
- int tex_mem_on = 0;
- int cons_mem_on = 0;
-
-
-
- CG_outputRepr* repr;
- std::vector<VarDefs> arrayVars;
- std::vector<VarDefs> localScopedVars;
-
- std::vector<IR_ArrayRef *> ro_refs;
- std::vector<IR_ArrayRef *> wo_refs;
- std::set<std::string> uniqueRefs;
- std::set<std::string> uniqueWoRefs;
- std::set<const SgVariableSymbol *> syms;
- std::set<const SgVariableSymbol *> psyms;
- std::set<const SgVariableSymbol *> pdSyms;
- SgStatementPtrList* replacement_list = new SgStatementPtrList;
-
- for (int j = 0; j < stmt.size(); j++) {
- std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[j].code);
- for (int i = 0; i < refs.size(); i++) {
- //printf("ref %s wo %d\n", static_cast<const char*>(refs[i]->name()), refs[i]->is_write());
- SgVariableSymbol* var = body_symtab->find_variable(
- SgName((char*) refs[i]->name().c_str()));
- SgVariableSymbol* var2 = parameter_symtab->find_variable(
- SgName((char*) refs[i]->name().c_str()));
-
- //If the array is not a parameter, then it's a local array and we
- //want to recreate it as a stack variable in the kernel as opposed to
- //passing it in.
- if (var != NULL) {
- //anand-- needs modification, if variable is parameter it wont be part of the
- // block's symbol table but the functiond definition's symbol table
-
- continue;
- }
- if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end()) {
-
- uniqueRefs.insert(refs[i]->name());
- if (refs[i]->is_write()) {
- uniqueWoRefs.insert(refs[i]->name());
- wo_refs.push_back(refs[i]);
- } else
- ro_refs.push_back(refs[i]);
- }
- if (refs[i]->is_write()
- && uniqueWoRefs.find(refs[i]->name())
- == uniqueWoRefs.end()) {
- uniqueWoRefs.insert(refs[i]->name());
- wo_refs.push_back(refs[i]);
- //printf("adding %s to wo\n", static_cast<const char*>(refs[i]->name()));
- }
- pdSyms.insert((const SgVariableSymbol*) var2);
- }
- }
-
- if (cudaDebug) {
- printf("reading from array ");
- for (int i = 0; i < ro_refs.size(); i++)
- printf("'%s' ", ro_refs[i]->name().c_str());
- printf("and writing to array ");
- for (int i = 0; i < wo_refs.size(); i++)
- printf("'%s' ", wo_refs[i]->name().c_str());
- printf("\n");
- }
- const char* gridName = "dimGrid";
- const char* blockName = "dimBlock";
-
- //TODO: Could allow for array_dims_vars to be a mapping from array
- //references to to variable names that define their length.
- SgVariableSymbol* dim1 = 0;
- SgVariableSymbol* dim2 = 0;
-
- for (int i = 0; i < wo_refs.size(); i++) {
- //TODO: Currently assume all arrays are floats of one or two dimentions
- SgVariableSymbol* outArray = 0;
- std::string name = wo_refs[i]->name();
- outArray = body_symtab->find_variable(SgName((char*) name.c_str()));
- int size_n_d;
- if (outArray == NULL)
- outArray = parameter_symtab->find_variable(
- SgName((char*) name.c_str()));
-
- VarDefs v;
- v.size_multi_dim = std::vector<int>();
- char buf[32];
- snprintf(buf, 32, "devO%dPtr", i + 1);
- v.name = buf;
- if (isSgPointerType(outArray->get_type())) {
- if (isSgArrayType(
- isSgNode(
- isSgPointerType(outArray->get_type())->get_base_type()))) {
- // v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type();
- SgType* t =
- isSgPointerType(outArray->get_type())->get_base_type();
- /* SgExprListExp* dimList = t->get_dim_info();
- SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
- SgExpression* expr=NULL;
- for (; j != dimList->get_expressions().end(); j++)
- expr = *j;
- */
- while (isSgArrayType(t))
- t = isSgArrayType(t)->get_base_type();
-
- if (!isSgType(t)) {
- char buf[1024];
- sprintf(buf, "CudaizeCodeGen: Array type undetected!");
- throw std::runtime_error(buf);
-
- }
-
- v.type = t;
- } else
- v.type = isSgPointerType(outArray->get_type())->get_base_type();
- } else if (isSgArrayType(outArray->get_type())) {
- if (isSgArrayType(
- isSgNode(
- isSgArrayType(outArray->get_type())->get_base_type()))) {
- // v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type();
- SgType* t =
- isSgArrayType(outArray->get_type())->get_base_type();
- /* SgExprListExp* dimList = t->get_dim_info();
- SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
- SgExpression* expr=NULL;
- for (; j != dimList->get_expressions().end(); j++)
- expr = *j;
- */
- while (isSgArrayType(t))
- t = isSgArrayType(t)->get_base_type();
-
- if (!isSgType(t)) {
- char buf[1024];
- sprintf(buf, "CudaizeCodeGen: Array type undetected!");
- throw std::runtime_error(buf);
-
- }
-
- v.type = t;
- } else
- v.type = isSgArrayType(outArray->get_type())->get_base_type();
- } else
- v.type = buildFloatType();
- v.tex_mapped = false;
- v.cons_mapped = false;
- v.original_name = wo_refs[i]->name();
- //Size of the array = dim1 * dim2 * num bytes of our array type
-
- //If our input array is 2D (non-linearized), we want the actual
- //dimentions of the array
- CG_outputRepr* size;
- //Lookup in array_dims
- std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
- if (isSgPointerType(outArray->get_type())
- && isSgArrayType(
- isSgNode(
- isSgPointerType(outArray->get_type())->get_base_type()))) {
- SgType* t = isSgPointerType(outArray->get_type())->get_base_type();
- /* SgExprListExp* dimList = t->get_dim_info();
- SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
- SgExpression* expr=NULL;
- for (; j != dimList->get_expressions().end(); j++)
- expr = *j;
- */
- if (isSgIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedLongVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgAddOp(isSgArrayType(t)->get_index())) {
- SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-
- SgExpression *lhs = op_add->get_lhs_operand();
- SgExpression *rhs = op_add->get_rhs_operand();
-
- if (isSgIntVal(lhs))
- size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
- else if (isSgUnsignedIntVal(lhs))
- size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
- + (int) isSgUnsignedIntVal(rhs)->get_value();
- else if (isSgUnsignedLongVal(lhs))
- size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongLongIntVal(lhs))
- size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
- + isSgLongIntVal(rhs)->get_value());
- else if (isSgUnsignedLongLongIntVal(lhs))
- size_n_d =
- (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongLongIntVal(rhs)->get_value());
-
- }
- t = isSgArrayType(t)->get_base_type();
- while (isSgArrayType(t)) {
- int dim;
- if (isSgIntVal(isSgArrayType(t)->get_index()))
- dim =
- (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedLongVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgAddOp(isSgArrayType(t)->get_index())) {
- SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-
- SgExpression *lhs = op_add->get_lhs_operand();
- SgExpression *rhs = op_add->get_rhs_operand();
-
- if (isSgIntVal(lhs))
- dim = (int) isSgIntVal(lhs)->get_value()
- + (int) (isSgIntVal(rhs)->get_value());
- else if (isSgUnsignedIntVal(lhs))
- dim = (int) isSgUnsignedIntVal(lhs)->get_value()
- + (int) isSgUnsignedIntVal(rhs)->get_value();
- else if (isSgUnsignedLongVal(lhs))
- dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongLongIntVal(lhs))
- dim = (int) (isSgLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- dim = (int) (isSgLongIntVal(lhs)->get_value()
- + isSgLongIntVal(rhs)->get_value());
- else if (isSgUnsignedLongLongIntVal(lhs))
- dim =
- (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongLongIntVal(rhs)->get_value());
-
- }
- size_n_d *= dim;
- v.size_multi_dim.push_back(dim);
- t = isSgArrayType(t)->get_base_type();
- }
- //v.size_2d = (int) (isSgIntVal(t->get_index())->get_value());
-
- if (cudaDebug)
- printf("Detected Multi-dimensional array sized of %d for %s\n",
- size_n_d, (char*) wo_refs[i]->name().c_str());
- size = ocg->CreateInt(size_n_d);
- } else if (isSgArrayType(outArray->get_type())
- && isSgArrayType(
- isSgNode(
- isSgArrayType(outArray->get_type())->get_base_type()))) {
- SgType* t = outArray->get_type();
- /* SgExprListExp* dimList = t->get_dim_info();
- SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
- SgExpression* expr=NULL;
- for (; j != dimList->get_expressions().end(); j++)
- expr = *j;
- */
-
- if (isSgIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedLongVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgAddOp(isSgArrayType(t)->get_index())) {
- SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-
- SgExpression *lhs = op_add->get_lhs_operand();
- SgExpression *rhs = op_add->get_rhs_operand();
-
- if (isSgIntVal(lhs))
- size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
- else if (isSgUnsignedIntVal(lhs))
- size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
- + (int) isSgUnsignedIntVal(rhs)->get_value();
- else if (isSgUnsignedLongVal(lhs))
- size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongLongIntVal(lhs))
- size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
- + isSgLongIntVal(rhs)->get_value());
- else if (isSgUnsignedLongLongIntVal(lhs))
- size_n_d =
- (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongLongIntVal(rhs)->get_value());
-
- }
- t = isSgArrayType(t)->get_base_type();
- while (isSgArrayType(t)) {
- int dim;
- if (isSgIntVal(isSgArrayType(t)->get_index()))
- dim =
- (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedLongVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgAddOp(isSgArrayType(t)->get_index())) {
- SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-
- SgExpression *lhs = op_add->get_lhs_operand();
- SgExpression *rhs = op_add->get_rhs_operand();
-
- if (isSgIntVal(lhs))
- dim = (int) isSgIntVal(lhs)->get_value()
- + (int) (isSgIntVal(rhs)->get_value());
- else if (isSgUnsignedIntVal(lhs))
- dim = (int) isSgUnsignedIntVal(lhs)->get_value()
- + (int) isSgUnsignedIntVal(rhs)->get_value();
- else if (isSgUnsignedLongVal(lhs))
- dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongLongIntVal(lhs))
- dim = (int) (isSgLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- dim = (int) (isSgLongIntVal(lhs)->get_value()
- + isSgLongIntVal(rhs)->get_value());
- else if (isSgUnsignedLongLongIntVal(lhs))
- dim =
- (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongLongIntVal(rhs)->get_value());
-
- }
- size_n_d *= dim;
- v.size_multi_dim.push_back(dim);
- t = isSgArrayType(t)->get_base_type();
- }
-
- //v.size_2d = (int) (isSgIntVal(t->get_index())->get_value());
-
- if (cudaDebug)
- printf("Detected Multi-Dimensional array sized of %d for %s\n",
- size_n_d, (char*) wo_refs[i]->name().c_str());
- size = ocg->CreateInt(size_n_d);
- } else if (it != array_dims.end()) {
- int ref_size = it->second;
- //size =
- // ocg->CreateInt(
- // isSgIntVal(
- // isSgArrayType(outArray->get_type())->get_index())->get_value());
- //v.size_2d = isSgArrayType(outArray->get_type())->get_rank();
- //v.var_ref_size = ref_size;
- size = ocg->CreateInt(ref_size);
-
- } else {
- if (dim1) {
- size = ocg->CreateTimes(
- new CG_roseRepr(isSgExpression(buildVarRefExp(dim1))),
- new CG_roseRepr(isSgExpression(buildVarRefExp(dim2))));
- } else {
- char buf[1024];
- sprintf(buf,
- "CudaizeCodeGen: Array reference %s does not have a "
- "detectable size or specififed dimentions",
- name.c_str());
- throw std::runtime_error(buf);
- }
- }
-
- v.size_expr =
- static_cast<CG_roseRepr*>(ocg->CreateTimes(size,
- new omega::CG_roseRepr(
- isSgExpression(buildSizeOfOp(v.type)))))->GetExpression();
-
- v.in_data = 0;
- v.out_data = outArray;
- //Check for in ro_refs and remove it at this point
- std::vector<IR_ArrayRef *>::iterator it_;
- for (it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++) {
- if ((*it_)->name() == wo_refs[i]->name()) {
- break;
- }
- }
- if (it_ != ro_refs.end()) {
- v.in_data = outArray;
- ro_refs.erase(it_);
- }
-
- arrayVars.push_back(v);
-
- }
-
- //protonu-- assuming that all texture mapped memories were originally read only mems
- //there should be safety checks for that, will implement those later
-
- for (int i = 0; i < ro_refs.size(); i++) {
- SgVariableSymbol* inArray = 0;
- std::string name = ro_refs[i]->name();
- inArray = body_symtab->find_variable(SgName((char*) name.c_str()));
- if (inArray == NULL)
- inArray = parameter_symtab->find_variable(
- SgName((char*) name.c_str()));
-
- VarDefs v;
- v.size_multi_dim = std::vector<int>();
- char buf[32];
- snprintf(buf, 32, "devI%dPtr", i + 1);
- v.name = buf;
- int size_n_d;
- if (isSgPointerType(inArray->get_type())) {
- if (isSgArrayType(
- isSgNode(
- isSgPointerType(inArray->get_type())->get_base_type()))) {
-
- SgType* t =
- isSgPointerType(inArray->get_type())->get_base_type();
-
- while (isSgArrayType(t))
- t = isSgArrayType(t)->get_base_type();
-
- if (!isSgType(t)) {
- char buf[1024];
- sprintf(buf, "CudaizeCodeGen: Array type undetected!");
- throw std::runtime_error(buf);
-
- }
- v.type = t;
- } else
- v.type = isSgPointerType(inArray->get_type())->get_base_type();
- } else if (isSgArrayType(inArray->get_type())) {
- if (isSgArrayType(
- isSgNode(
- isSgArrayType(inArray->get_type())->get_base_type()))) {
-
- SgType* t = inArray->get_type();
- while (isSgArrayType(t))
- t = isSgArrayType(t)->get_base_type();
-
- if (!isSgType(t)) {
- char buf[1024];
- sprintf(buf, "CudaizeCodeGen: Array type undetected!");
- throw std::runtime_error(buf);
-
- }
- v.type = t;
- } else
- v.type = isSgArrayType(inArray->get_type())->get_base_type();
- }
-
- else
- v.type = buildFloatType();
-
- v.tex_mapped = false;
- v.cons_mapped = false;
- v.original_name = ro_refs[i]->name();
-
- //derick -- adding texture and constant mapping
- if ( texture != NULL)
- v.tex_mapped = (texture->is_array_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
- if (v.tex_mapped){
- printf("this variable %s is mapped to texture memory", name.c_str());
- }
- //derick -- this is commented out until constant memory is implemeted
- if ( constant_mem != NULL)
- v.cons_mapped = (constant_mem->is_array_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars
- if (v.cons_mapped){
- printf("this variable %s is mapped to constant memory", name.c_str());
- }
-
- //Size of the array = dim1 * dim2 * num bytes of our array type
- //If our input array is 2D (non-linearized), we want the actual
- //dimentions of the array (as it might be less than cu_n
- CG_outputRepr* size;
- //Lookup in array_dims
- std::map<std::string, int>::iterator it = array_dims.find(name.c_str());
- if (isSgPointerType(inArray->get_type())
- && isSgArrayType(
- isSgPointerType(inArray->get_type())->get_base_type())) {
- //SgArrayType* t = isSgArrayType(isSgArrayType(inArray->get_type())->get_base_type());
- //v.size_2d = t->get_rank();
- SgType* t = isSgPointerType(inArray->get_type())->get_base_type();
- /* SgExprListExp* dimList = t->get_dim_info();
- SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
- SgExpression* expr=NULL;
- for (; j != dimList->get_expressions().end(); j++)
- expr = *j;
- */
- //v.size_2d = 1;
- if (isSgIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedLongVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgAddOp(isSgArrayType(t)->get_index())) {
- SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-
- SgExpression *lhs = op_add->get_lhs_operand();
- SgExpression *rhs = op_add->get_rhs_operand();
-
- if (isSgIntVal(lhs))
- size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
- else if (isSgUnsignedIntVal(lhs))
- size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
- + (int) isSgUnsignedIntVal(rhs)->get_value();
- else if (isSgUnsignedLongVal(lhs))
- size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongLongIntVal(lhs))
- size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
- + isSgLongIntVal(rhs)->get_value());
- else if (isSgUnsignedLongLongIntVal(lhs))
- size_n_d =
- (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongLongIntVal(rhs)->get_value());
-
- }
- t = isSgArrayType(t)->get_base_type();
- while (isSgArrayType(t)) {
- int dim;
- if (isSgIntVal(isSgArrayType(t)->get_index()))
- dim =
- (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedLongVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgAddOp(isSgArrayType(t)->get_index())) {
- SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-
- SgExpression *lhs = op_add->get_lhs_operand();
- SgExpression *rhs = op_add->get_rhs_operand();
-
- if (isSgIntVal(lhs))
- dim = (int) isSgIntVal(lhs)->get_value()
- + (int) (isSgIntVal(rhs)->get_value());
- else if (isSgUnsignedIntVal(lhs))
- dim = (int) isSgUnsignedIntVal(lhs)->get_value()
- + (int) isSgUnsignedIntVal(rhs)->get_value();
- else if (isSgUnsignedLongVal(lhs))
- dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongLongIntVal(lhs))
- dim = (int) (isSgLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- dim = (int) (isSgLongIntVal(lhs)->get_value()
- + isSgLongIntVal(rhs)->get_value());
- else if (isSgUnsignedLongLongIntVal(lhs))
- dim =
- (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongLongIntVal(rhs)->get_value());
-
- }
- size_n_d *= dim;
- v.size_multi_dim.push_back(dim);
- t = isSgArrayType(t)->get_base_type();
- }
- if (cudaDebug)
- printf("Detected Multi-dimensional array sized of %d for %s\n",
- size_n_d, (char*) ro_refs[i]->name().c_str());
- size = ocg->CreateInt(size_n_d);
- } else if (isSgArrayType(inArray->get_type())
- && isSgArrayType(
- isSgArrayType(inArray->get_type())->get_base_type())) {
- //SgArrayType* t = isSgArrayType(isSgArrayType(inArray->get_type())->get_base_type());
- //v.size_2d = t->get_rank();
- SgType* t = inArray->get_type();
- /* SgExprListExp* dimList = t->get_dim_info();
- SgExpressionPtrList::iterator j= dimList->get_expressions().begin();
- SgExpression* expr=NULL;
- for (; j != dimList->get_expressions().end(); j++)
- expr = *j;
- */
-
- if (isSgIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedLongVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d =
- (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index()))
- size_n_d = (int) (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgAddOp(isSgArrayType(t)->get_index())) {
- SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-
- SgExpression *lhs = op_add->get_lhs_operand();
- SgExpression *rhs = op_add->get_rhs_operand();
-
- if (isSgIntVal(lhs))
- size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value());
- else if (isSgUnsignedIntVal(lhs))
- size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value()
- + (int) isSgUnsignedIntVal(rhs)->get_value();
- else if (isSgUnsignedLongVal(lhs))
- size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongLongIntVal(lhs))
- size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- size_n_d = (int) (isSgLongIntVal(lhs)->get_value()
- + isSgLongIntVal(rhs)->get_value());
- else if (isSgUnsignedLongLongIntVal(lhs))
- size_n_d =
- (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongLongIntVal(rhs)->get_value());
-
- }
- t = isSgArrayType(t)->get_base_type();
- while (isSgArrayType(t)) {
- int dim;
- if (isSgIntVal(isSgArrayType(t)->get_index()))
- dim =
- (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedLongVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgLongIntVal(isSgArrayType(t)->get_index()))
- dim = (int) (isSgLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index()))
- dim = (int) (isSgUnsignedLongLongIntVal(
- isSgArrayType(t)->get_index())->get_value());
- else if (isSgAddOp(isSgArrayType(t)->get_index())) {
- SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index());
-
- SgExpression *lhs = op_add->get_lhs_operand();
- SgExpression *rhs = op_add->get_rhs_operand();
-
- if (isSgIntVal(lhs))
- dim = (int) isSgIntVal(lhs)->get_value()
- + (int) (isSgIntVal(rhs)->get_value());
- else if (isSgUnsignedIntVal(lhs))
- dim = (int) isSgUnsignedIntVal(lhs)->get_value()
- + (int) isSgUnsignedIntVal(rhs)->get_value();
- else if (isSgUnsignedLongVal(lhs))
- dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- dim = (int) (isSgUnsignedLongVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongLongIntVal(lhs))
- dim = (int) (isSgLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongVal(rhs)->get_value());
- else if (isSgLongIntVal(lhs))
- dim = (int) (isSgLongIntVal(lhs)->get_value()
- + isSgLongIntVal(rhs)->get_value());
- else if (isSgUnsignedLongLongIntVal(lhs))
- dim =
- (int) (isSgUnsignedLongLongIntVal(lhs)->get_value()
- + isSgUnsignedLongLongIntVal(rhs)->get_value());
-
- }
- size_n_d *= dim;
- v.size_multi_dim.push_back(dim);
- t = isSgArrayType(t)->get_base_type();
- }
- if (cudaDebug)
- printf("Detected Multi-Dimensional array sized of %d for %s\n",
- size_n_d, (char*) ro_refs[i]->name().c_str());
- size = ocg->CreateInt(size_n_d);
- }
-
- else if (it != array_dims.end()) {
- int ref_size = it->second;
- // v.var_ref_size = ref_size;
- size = ocg->CreateInt(ref_size);
- } else {
- if (dim1) {
- size = ocg->CreateTimes(
- new CG_roseRepr(isSgExpression(buildVarRefExp(dim1))),
- new CG_roseRepr(isSgExpression(buildVarRefExp(dim2))));
- } else {
- char buf[1024];
- sprintf(buf,
- "CudaizeCodeGen: Array reference %s does not have a "
- "detectable size or specififed dimentions",
- name.c_str());
- throw std::runtime_error(buf);
- }
- }
- v.size_expr =
- static_cast<CG_roseRepr*>(ocg->CreateTimes(size,
- new omega::CG_roseRepr(
- isSgExpression(buildSizeOfOp(v.type)))))->GetExpression();
-
- v.in_data = inArray;
- v.out_data = 0;
- arrayVars.push_back(v);
- }
-
- if (arrayVars.size() < 2) {
- fprintf(stderr,
- "cudaize error: Did not find two arrays being accessed\n");
- return false;
- }
-
- //protonu--debugging tool--the printf statement
- //tex_mem_on signals use of tex mem
- /* derick -- texmapping near malloc mcopy
- for(int i=0; i<arrayVars.size(); i++)
- {
- //printf("var name %s, tex_mem used %s\n", arrayVars[i].name.c_str(), (arrayVars[i].tex_mapped)?"true":"false");
- if (arrayVars[i].tex_mapped ) tex_mem_on ++;
- //if (arrayVars[i].cons_mapped ) cons_mem_on ++;
- }
- */
-
- //Add our mallocs (and input array memcpys)
- for (int i = 0; i < arrayVars.size(); i++) {
- if(arrayVars[i].cons_mapped) {
- setupConstantVar(constant_mem, &arrayVars[i], globals, i, symtab);
- SgStatementPtrList *tnl = new SgStatementPtrList;
- cudaBindConstantVar(constant_mem, &arrayVars[i], globals, tnl);
- setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
- }
- else {
- SgVariableDeclaration* defn = buildVariableDeclaration(
- SgName(arrayVars[i].name.c_str()),
- buildPointerType(arrayVars[i].type));
- SgInitializedNamePtrList& variables = defn->get_variables();
- SgInitializedNamePtrList::const_iterator j = variables.begin();
- SgInitializedName* initializedName = *j;
- SgVariableSymbol* dvs = new SgVariableSymbol(initializedName);
- prependStatement(defn, func_body);
-
- dvs->set_parent(body_symtab);
- body_symtab->insert(SgName(arrayVars[i].name.c_str()), dvs);
-
-// SgVariableSymbol* dvs = body_symtab->find_variable(SgName(arrayVars[i].name.c_str()));
-
- // if(dvs == NULL)
- // dvs = parameter_symtab->find_variable(SgName(arrayVars[i].name.c_str()));
-
- //cudaMalloc args
- // SgBasicBlock* block = buildBasicBlock();
- SgName name_cuda_malloc("cudaMalloc");
- SgFunctionDeclaration * decl_cuda_malloc =
- buildNondefiningFunctionDeclaration(name_cuda_malloc,
- buildVoidType(), buildFunctionParameterList(), globals);
-
- SgName name_cuda_copy("cudaMemcpy");
- SgFunctionDeclaration * decl_cuda_copy =
- buildNondefiningFunctionDeclaration(name_cuda_copy,
- buildVoidType(), buildFunctionParameterList(), globals);
-
- SgExprListExp* args = buildExprListExp();
- args->append_expression(
- buildCastExp(buildAddressOfOp(buildVarRefExp(dvs)),
- buildPointerType(buildPointerType(buildVoidType()))));
- args->append_expression(arrayVars[i].size_expr);
-
-// decl_cuda_malloc->get_parameterList()->append_arg
- SgFunctionCallExp *the_call = buildFunctionCallExp(
- buildFunctionRefExp(decl_cuda_malloc), args);
-
- SgExprStatement* stmt = buildExprStatement(the_call);
-
- // (*replacement_list).push_back (stmt);
-
- SgStatementPtrList* tnl = new SgStatementPtrList;
- (*tnl).push_back(stmt);
- setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
- if (arrayVars[i].in_data) {
-
- SgExprListExp * cuda_copy_in_args = buildExprListExp();
- cuda_copy_in_args->append_expression(
- isSgExpression(buildVarRefExp(dvs)));
- cuda_copy_in_args->append_expression(
- isSgExpression(buildVarRefExp(arrayVars[i].in_data)));
- CG_roseRepr* size_exp = new CG_roseRepr(arrayVars[i].size_expr);
- cuda_copy_in_args->append_expression(
- static_cast<CG_roseRepr*>(size_exp->clone())->GetExpression());
- cuda_copy_in_args->append_expression(
- buildOpaqueVarRefExp("cudaMemcpyHostToDevice", globals));
-
-// cuda_copy_in_args->append_expression(
-// new SgVarRefExp(sourceLocation, )
-// );
- SgFunctionCallExp * cuda_copy_in_func_call = buildFunctionCallExp(
- buildFunctionRefExp(decl_cuda_copy), cuda_copy_in_args);
-
- SgExprStatement* stmt = buildExprStatement(cuda_copy_in_func_call);
-
- SgStatementPtrList *tnl = new SgStatementPtrList;
- (*tnl).push_back(stmt);
- setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
-
- if(arrayVars[i].tex_mapped) {
- setupTexmappingVar(texture, &arrayVars[i], globals, i, dvs, symtab);
- SgStatementPtrList *tnl = new SgStatementPtrList;
- cudaBindTexture(texture, &arrayVars[i], globals, tnl);
- setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl));
- }
- }
- }
- }
-
- //Build dimGrid dim3 variables based on loop dimentions and ti/tj
- char blockD1[120];
- char blockD2[120];
- if (dim1) {
- snprintf(blockD1, 120, "%s/%d",
- dim1->get_declaration()->get_name().getString().c_str(), cu_tx);
- snprintf(blockD2, 120, "%s/%d",
- dim2->get_declaration()->get_name().getString().c_str(), cu_ty);
- } else {
- snprintf(blockD1, 120, "%d", cu_bx);
- snprintf(blockD2, 120, "%d", cu_by);
- //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx);
- //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty);
- }
-
- SgInitializedName* arg1 = buildInitializedName("i", buildIntType());
- SgInitializedName* arg2 = buildInitializedName("j", buildIntType());
- SgInitializedName* arg3 = buildInitializedName("k", buildIntType());
- SgName type_name("dim3");
- //SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(type_name);
-
- //ROSE_ASSERT(type_symbol != NULL);
-
- //SgClassDeclaration * dim3classdecl = isSgClassDeclaration(
- // type_symbol->get_declaration());
-
- SgFunctionDeclaration * funcdecl = buildNondefiningFunctionDeclaration(
- SgName("dim3"), buildOpaqueType("dim3", globalScope),
- //isSgType(dim3classdecl->get_type()),
- buildFunctionParameterList(arg1, arg2, arg3), globalScope);
-
- if (cu_bx && cu_by)
- repr = ocg->CreateDim3((const char*) gridName, ocg->CreateInt(cu_bx),
- ocg->CreateInt(cu_by));
- else if (cu_bx_repr && cu_by_repr)
- repr = ocg->CreateDim3((const char*) gridName, cu_bx_repr, cu_by_repr);
- else if (cu_bx_repr)
- repr = ocg->CreateDim3((const char*) gridName, cu_bx_repr,
- ocg->CreateInt(1));
- setup_code = ocg->StmtListAppend(setup_code, repr);
- //SgStatementPtrList* dimList = static_cast<CG_roseRepr *>(repr)->GetList();
-
- //for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++)
- // (*replacement_list).push_back (*it);
-
- // repr = ocg->CreateDim3((const char*)blockName, cu_tx,cu_ty);
-
- if (cu_tz > 1 || cu_tz_repr) {
-
- if (cu_tx && cu_ty && cu_tz)
- repr = ocg->CreateDim3((char*) blockName, ocg->CreateInt(cu_tx),
- ocg->CreateInt(cu_ty), ocg->CreateInt(cu_tz));
- else if (cu_tx_repr && cu_ty_repr && cu_tz_repr)
- repr = ocg->CreateDim3((char*) blockName, cu_tx_repr, cu_ty_repr,
- cu_tz_repr);
- // SgStatementPtrList* dimList = static_cast<CG_roseRepr *>(repr)->GetList();
-
- // for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++)
- // (*replacement_list).push_back (*it);
-
- } else {
- if (cu_tx && cu_ty)
- repr = ocg->CreateDim3((char*) blockName, ocg->CreateInt(cu_tx),
- ocg->CreateInt(cu_ty));
- else if (cu_tx_repr && cu_ty_repr)
- repr = ocg->CreateDim3((char*) blockName, cu_tx_repr, cu_ty_repr);
- //SgStatementPtrList* dimList = static_cast<CG_roseRepr *>(repr)->GetList();
-
- //for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++)
- // (*replacement_list).push_back (*it);
-
- }
-
- setup_code = ocg->StmtListAppend(setup_code, repr);
-
- SgCudaKernelExecConfig* config = new SgCudaKernelExecConfig(
- buildVarRefExp(gridName), buildVarRefExp(blockName), NULL, NULL);
- //SgCudaKernelExecConfig* config = new SgCudaKernelExecConfig(buildIntVal(cu_bx), , NULL, NULL);
- SgExprListExp* iml = new SgExprListExp();
- SgCastExp* dim_s;
-
- //Creating Kernel function
- SgBasicBlock* bb = new SgBasicBlock(TRANSFORMATION_FILE_INFO);
- SgFunctionDefinition* kernel_defn = new SgFunctionDefinition(
- TRANSFORMATION_FILE_INFO, bb);
- SgFunctionDeclaration* kernel_decl_ = new SgFunctionDeclaration(
- TRANSFORMATION_FILE_INFO, SgName((char*)cu_kernel_name.c_str()),buildFunctionType(buildVoidType(), buildFunctionParameterList()), kernel_defn);
- SgFunctionDeclaration* kernel_decl = new SgFunctionDeclaration(
- TRANSFORMATION_FILE_INFO, SgName((char*)cu_kernel_name.c_str()),buildFunctionType(buildVoidType(), buildFunctionParameterList()), kernel_defn);
-
- //((kernel_decl->get_declarationModifier()).get_storageModifier()).setStatic();
-
- kernel_decl->set_definingDeclaration(kernel_decl);
- kernel_defn->set_parent(kernel_decl);
- bb->set_parent(kernel_defn);
- bb->set_endOfConstruct(TRANSFORMATION_FILE_INFO);
- bb->get_endOfConstruct()->set_parent(bb);
-
- //SgFunctionSymbol* functionSymbol = new SgFunctionSymbol(kernel_decl_);
- //globals->insert_symbol(SgName((char*) cu_kernel_name.c_str()),
- // functionSymbol);
- SgFunctionSymbol* functionSymbol2 = new SgFunctionSymbol(kernel_decl);
-
- globals->insert_symbol(SgName((char*) cu_kernel_name.c_str()),
- functionSymbol2);
-
- kernel_decl_->set_parent(globals);
-
- kernel_decl_->set_scope(globals);
-
- kernel_decl_->setForward();
-
- globals->prepend_declaration(kernel_decl_);
-
- kernel_decl->set_endOfConstruct(TRANSFORMATION_FILE_INFO);
- kernel_decl->get_endOfConstruct()->set_parent(kernel_decl);
-
- kernel_decl->set_parent(globals);
- kernel_decl->set_scope(globals);
-
- kernel_decl->get_definition()->set_endOfConstruct(TRANSFORMATION_FILE_INFO);
- kernel_decl->get_definition()->get_endOfConstruct()->set_parent(
- kernel_decl->get_definition());
-
- globals->append_statement(kernel_decl);
-
- //printf("%s %s\n", static_cast<const char*>(cu_kernel_name), dims);
- //--derick - kernel function parameters
- for (int i = 0; i < arrayVars.size(); i++)
- //Throw in a type cast if our kernel takes 2D array notation
- //like (float(*) [1024])
- {
- //protonu--throwing in another hack to stop the caller from passing tex mapped
- //vars to the kernel.
- if (arrayVars[i].tex_mapped == true || arrayVars[i].cons_mapped)
- continue;
- if (!(arrayVars[i].size_multi_dim.empty())) {
- //snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d,
- // const_cast<char*>(arrayVars[i].name.c_str()));
-
- SgType* t = arrayVars[i].type;
- for (int k = arrayVars[i].size_multi_dim.size() - 1; k >= 0; k--) {
- t = buildArrayType(t,
- buildIntVal(arrayVars[i].size_multi_dim[k]));
- }
- SgVariableSymbol* temp = body_symtab->find_variable(
- SgName((char*) arrayVars[i].name.c_str()));
- if (temp == NULL)
- temp = parameter_symtab->find_variable(
- SgName((char*) arrayVars[i].name.c_str()));
-
- dim_s = buildCastExp(buildVarRefExp(temp), buildPointerType(t),
- SgCastExp::e_C_style_cast);
-
- //printf("%d %s\n", i, dims);
- iml->append_expression(dim_s);
-
- SgInitializedName* id = buildInitializedName(
- (char*) arrayVars[i].original_name.c_str(),
- buildPointerType(t));
- kernel_decl->get_parameterList()->append_arg(id);
- kernel_decl_->get_parameterList()->append_arg(id);
- id->set_file_info(TRANSFORMATION_FILE_INFO);
-
- // DQ (9/8/2007): We now test this, so it has to be set explicitly.
- id->set_scope(kernel_decl->get_definition());
-
- // DQ (9/8/2007): Need to add variable symbol to global scope!
- //printf ("Fixing up the symbol table in scope = %p = %s for SgInitializedName = %p = %s \n",globalScope,globalScope->class_name().c_str(),var1_init_name,var1_init_name->get_name().str());
- SgVariableSymbol *var_symbol = new SgVariableSymbol(id);
- kernel_decl->get_definition()->insert_symbol(id->get_name(),
- var_symbol);
-
- // if(kernel_decl->get_definition()->get_symbol_table()->find((const) id) == NULL)
-
- } else {
- //printf("%d %s\n", i, static_cast<const char*>(arrayVars[i].name));
- SgVariableSymbol* temp = body_symtab->find_variable(
- SgName((char*) arrayVars[i].name.c_str()));
- if (temp == NULL)
- temp = parameter_symtab->find_variable(
- SgName((char*) arrayVars[i].name.c_str()));
- iml->append_expression(buildVarRefExp(temp));
- SgInitializedName* id = buildInitializedName(
- (char*) arrayVars[i].original_name.c_str(),
- buildPointerType(arrayVars[i].type));
- kernel_decl->get_parameterList()->append_arg(id);
- kernel_decl_->get_parameterList()->append_arg(id);
- id->set_file_info(TRANSFORMATION_FILE_INFO);
-
- // DQ (9/8/2007): We now test this, so it has to be set explicitly.
- id->set_scope(kernel_decl->get_definition());
-
- // DQ (9/8/2007): Need to add variable symbol to global scope!
- //printf ("Fixing up the symbol table in scope = %p = %s for SgInitializedName = %p = %s \n"$
- SgVariableSymbol *var_symbol = new SgVariableSymbol(id);
- kernel_decl->get_definition()->insert_symbol(id->get_name(),
- var_symbol);
-
- }
-
- }
- if (dim1) {
- iml->append_expression(buildVarRefExp(dim1));
- SgInitializedName* id = buildInitializedName(
- dim1->get_name().getString().c_str(), dim1->get_type());
- kernel_decl->get_parameterList()->append_arg(id);
-
- iml->append_expression(buildVarRefExp(dim2));
- SgInitializedName* id2 = buildInitializedName(
- dim2->get_name().getString().c_str(), dim2->get_type());
-
- kernel_decl->get_parameterList()->append_arg(id);
- kernel_decl_->get_parameterList()->append_arg(id);
- }
-
- kernel_decl->get_functionModifier().setCudaKernel();
- kernel_decl_->get_functionModifier().setCudaKernel();
- SgCudaKernelCallExp * cuda_call_site = new SgCudaKernelCallExp(
- TRANSFORMATION_FILE_INFO, buildFunctionRefExp(kernel_decl), iml,config);
-
- // SgStatementPtrList *tnl2 = new SgStatementPtrList;
-
- (*replacement_list).push_back(buildExprStatement(cuda_call_site));
-
- setup_code = ocg->StmtListAppend(setup_code,
- new CG_roseRepr(replacement_list));
-
- //cuda free variables
- for (int i = 0; i < arrayVars.size(); i++) {
- if (arrayVars[i].out_data) {
-
- SgName name_cuda_copy("cudaMemcpy");
- SgFunctionDeclaration * decl_cuda_copyout =
- buildNondefiningFunctionDeclaration(name_cuda_copy,
- buildVoidType(), buildFunctionParameterList(),
- globals);
-
- SgExprListExp* args = buildExprListExp();
- SgExprListExp * cuda_copy_out_args = buildExprListExp();
- cuda_copy_out_args->append_expression(
- isSgExpression(buildVarRefExp(arrayVars[i].out_data)));
- cuda_copy_out_args->append_expression(
- isSgExpression(buildVarRefExp(arrayVars[i].name)));
- CG_roseRepr* size_exp = new CG_roseRepr(arrayVars[i].size_expr);
- cuda_copy_out_args->append_expression(
- static_cast<CG_roseRepr*>(size_exp->clone())->GetExpression());
- cuda_copy_out_args->append_expression(
- buildOpaqueVarRefExp("cudaMemcpyDeviceToHost", globals));
-
-// cuda_copy_in_args->append_expression(
-// new SgVarRefExp(sourceLocation, )
-// );
- SgFunctionCallExp * cuda_copy_out_func_call = buildFunctionCallExp(
- buildFunctionRefExp(decl_cuda_copyout), cuda_copy_out_args);
-
- SgFunctionCallExp *the_call = buildFunctionCallExp(
- buildFunctionRefExp(decl_cuda_copyout), cuda_copy_out_args);
-
- SgExprStatement* stmt = buildExprStatement(the_call);
-
- SgStatementPtrList* tnl3 = new SgStatementPtrList;
-
- (*tnl3).push_back(stmt);
-
- // tree_node_list* tnl = new tree_node_list;
- // tnl->append(new tree_instr(the_call));
- setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl3));
-
- }
- if(!arrayVars[i].cons_mapped) {
- SgName name_cuda_free("cudaFree");
- SgFunctionDeclaration * decl_cuda_free =
- buildNondefiningFunctionDeclaration(name_cuda_free,
- buildVoidType(), buildFunctionParameterList(), globals);
-
- SgExprListExp* args3 = buildExprListExp();
-
- SgVariableSymbol* tmp = body_symtab->find_variable(
- SgName(arrayVars[i].name.c_str()));
- if (tmp == NULL)
- tmp = parameter_symtab->find_variable(
- SgName(arrayVars[i].name.c_str()));
-
- args3->append_expression(buildVarRefExp(tmp));
-
- SgFunctionCallExp *the_call2 = buildFunctionCallExp(
- buildFunctionRefExp(decl_cuda_free), args3);
-
- SgExprStatement* stmt2 = buildExprStatement(the_call2);
-
- SgStatementPtrList* tnl4 = new SgStatementPtrList;
-
- (*tnl4).push_back(stmt2);
- //(*replacement_list).push_back (stmt2);
-
- setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl4));
- }
- }
-
- // ---------------
- // BUILD THE KERNEL
- // ---------------
-
- //Extract out kernel body
- SgNode* code = getCode();
- //Create kernel function body
- //Add Params
- std::map<std::string, SgVariableSymbol*> loop_vars;
- //In-Out arrays
- for (int i = 0; i < arrayVars.size(); i++) {
- /* if(arrayVars[i].in_data)
- fptr = arrayVars[i].in_data->type()->clone();
- else
- fptr = arrayVars[i].out_data->type()->clone();
- */
-
- // fptr = new_proc_syms->install_type(fptr);
- std::string name =
- arrayVars[i].in_data ?
- arrayVars[i].in_data->get_declaration()->get_name().getString() :
- arrayVars[i].out_data->get_declaration()->get_name().getString();
- //SgVariableSymbol* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name());
-
- SgVariableSymbol* sym =
- kernel_decl->get_definition()->get_symbol_table()->find_variable(
- (const char*) name.c_str());
- /* SgVariableDeclaration* defn = buildVariableDeclaration(SgName(name.c_str()), buildFloatType());
- SgInitializedNamePtrList& variables = defn->get_variables();
- SgInitializedNamePtrList::const_iterator i = variables.begin();
- SgInitializedName* initializedName = *i;
- SgVariableSymbol* sym = new SgVariableSymbol(initializedName);
- prependStatement(defn, isSgScopeStatement(root_));
-
- vs->set_parent(symtab2_);
- symtab2_->insert(SgName(_s.c_str()), vs);
- */
-
- if (sym != NULL)
- loop_vars.insert(
- std::pair<std::string, SgVariableSymbol*>(std::string(name),
- sym));
- }
-
- //Figure out which loop variables will be our thread and block dimention variables
- std::vector<SgVariableSymbol *> loop_syms;
- //Get our indexes
- std::vector<const char*> indexes; // = get_loop_indexes(code,cu_num_reduce);
- int threadsPos = 0;
-
- CG_outputRepr *body = NULL;
- SgFunctionDefinition* func_d = func_definition;
- //std::vector<SgVariableSymbol *> symbols = recursiveFindRefs(code);
-
- SgName name_sync("__syncthreads");
- SgFunctionDeclaration * decl_sync = buildNondefiningFunctionDeclaration(
- name_sync, buildVoidType(), buildFunctionParameterList(),
- globalScope);
-
- recursiveFindRefs(code, syms, func_d);
-
- //SgFunctionDeclaration* func = Outliner::generateFunction (code, (char*)cu_kernel_name.c_str(), syms, pdSyms, psyms, NULL, globalScope);
-
- if (cu_bx > 1 || cu_bx_repr) {
- indexes.push_back("bx");
- SgName type_name("blockIdx.x");
- SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
- type_name);
- SgVariableDeclaration * var_decl = buildVariableDeclaration("bx",
- buildIntType(), NULL,
- isSgScopeStatement(kernel_decl->get_definition()->get_body()));
- SgStatementPtrList *tnl = new SgStatementPtrList;
- // (*tnl).push_back(isSgStatement(var_decl));
- appendStatement(var_decl, kernel_decl->get_definition()->get_body());
-
- SgVariableSymbol* bx =
- kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
- SgName("bx"));
- SgStatement* assign = isSgStatement(
- buildAssignStatement(buildVarRefExp(bx),
- buildOpaqueVarRefExp("blockIdx.x",
- kernel_decl->get_definition()->get_body())));
- (*tnl).push_back(assign);
- // body = ocg->StmtListAppend(body,
- // new CG_roseRepr(tnl));
- appendStatement(assign, kernel_decl->get_definition()->get_body());
-
- }
- if (cu_by > 1 || cu_by_repr) {
- indexes.push_back("by");
- SgName type_name("blockIdx.y");
- SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
- type_name);
- SgVariableDeclaration * var_decl = buildVariableDeclaration("by",
- buildIntType(), NULL,
- isSgScopeStatement(kernel_decl->get_definition()->get_body()));
- // SgStatementPtrList *tnl = new SgStatementPtrList;
- // (*tnl).push_back(isSgStatement(var_decl));
- appendStatement(var_decl, kernel_decl->get_definition()->get_body());
-
- SgVariableSymbol* by =
- kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
- SgName("by"));
- SgStatement* assign = isSgStatement(
- buildAssignStatement(buildVarRefExp(by),
- buildOpaqueVarRefExp("blockIdx.y",
- kernel_decl->get_definition()->get_body())));
- //(*tnl).push_back(assign);
- // body = ocg->StmtListAppend(body,
- // new CG_roseRepr(tnl));
- appendStatement(assign, kernel_decl->get_definition()->get_body());
-
- }
- if (cu_tx_repr || cu_tx > 1) {
- threadsPos = indexes.size();
- indexes.push_back("tx");
- SgName type_name("threadIdx.x");
- SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
- type_name);
- SgVariableDeclaration * var_decl = buildVariableDeclaration("tx",
- buildIntType(), NULL,
- isSgScopeStatement(kernel_decl->get_definition()->get_body()));
- // SgStatementPtrList *tnl = new SgStatementPtrList;
- // (*tnl).push_back(isSgStatement(var_decl));
- appendStatement(var_decl, kernel_decl->get_definition()->get_body());
-
- SgVariableSymbol* tx =
- kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
- SgName("tx"));
- SgStatement* assign = isSgStatement(
- buildAssignStatement(buildVarRefExp(tx),
- buildOpaqueVarRefExp("threadIdx.x",
- kernel_decl->get_definition()->get_body())));
- //(*tnl).push_back(assign);
- // body = ocg->StmtListAppend(body,
- // new CG_roseRepr(tnl));
- appendStatement(assign, kernel_decl->get_definition()->get_body());
-
- }
- if (cu_ty_repr || cu_ty > 1) {
- indexes.push_back("ty");
- SgName type_name("threadIdx.y");
- SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
- type_name);
- SgVariableDeclaration * var_decl = buildVariableDeclaration("ty",
- buildIntType(), NULL,
- isSgScopeStatement(kernel_decl->get_definition()->get_body()));
- appendStatement(var_decl, kernel_decl->get_definition()->get_body());
-
- // SgStatementPtrList *tnl = new SgStatementPtrList;
- // (*tnl).push_back(isSgStatement(var_decl));
- SgVariableSymbol* ty =
- kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
- SgName("ty"));
- SgStatement* assign = isSgStatement(
- buildAssignStatement(buildVarRefExp(ty),
- buildOpaqueVarRefExp("threadIdx.y",
- kernel_decl->get_definition()->get_body())));
- // (*tnl).push_back(assign);
- // body = ocg->StmtListAppend(body,
- // new CG_roseRepr(tnl));
- appendStatement(assign, kernel_decl->get_definition()->get_body());
-
- }
- if (cu_tz_repr || cu_tz > 1) {
- indexes.push_back("tz");
- SgName type_name("threadIdx.z");
- SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(
- type_name);
- SgVariableDeclaration * var_decl = buildVariableDeclaration("tz",
- buildIntType(), NULL,
- isSgScopeStatement(kernel_decl->get_definition()->get_body()));
- // SgStatementPtrList *tnl = new SgStatementPtrList;
- // (*tnl).push_back(isSgStatement(var_decl));
- appendStatement(var_decl, kernel_decl->get_definition()->get_body());
-
- SgVariableSymbol* tz =
- kernel_decl->get_definition()->get_body()->lookup_variable_symbol(
- "tz");
- SgStatement* assign = isSgStatement(
- buildAssignStatement(buildVarRefExp(tz),
- buildOpaqueVarRefExp("threadIdx.z",
- kernel_decl->get_definition()->get_body())));
- // (*tnl).push_back(assign);
- // body = ocg->StmtListAppend(body,
- // new CG_roseRepr(tnl));
- appendStatement(assign, kernel_decl->get_definition()->get_body());
-
- }
-
- std::map<std::string, SgVariableSymbol*> loop_idxs; //map from idx names to their new syms
-
- SgNode* swapped_ = swapVarReferences(code, syms,
- kernel_decl->get_definition()->get_symbol_table(),
- kernel_decl->get_definition()->get_body()->get_symbol_table(),
- kernel_decl->get_definition()->get_body());
-
- //std::cout << swapped_->unparseToString() << std::endl << std::endl;
-
- SgNode *swapped = recursiveFindReplacePreferedIdxs(swapped_,
- kernel_decl->get_definition()->get_body()->get_symbol_table(),
- kernel_decl->get_definition()->get_symbol_table(),
- kernel_decl->get_definition()->get_body(), loop_idxs, globalScope); //in-place swapping
- //swapped->print();
-
- if (!isSgBasicBlock(swapped)) {
- appendStatement(isSgStatement(swapped),
- kernel_decl->get_definition()->get_body());
- swapped->set_parent(
- isSgNode(kernel_decl->get_definition()->get_body()));
- } else {
-
- for (SgStatementPtrList::iterator it =
- isSgBasicBlock(swapped)->get_statements().begin();
- it != isSgBasicBlock(swapped)->get_statements().end(); it++) {
- appendStatement(*it, kernel_decl->get_definition()->get_body());
- (*it)->set_parent(
- isSgNode(kernel_decl->get_definition()->get_body()));
-
- }
-
- }
-
- for (int i = 0; i < indexes.size(); i++) {
- std::vector<SgForStatement*> tfs = findCommentedFors(indexes[i],
- swapped);
- for (int k = 0; k < tfs.size(); k++) {
- //printf("replacing %p tfs for index %s\n", tfs[k], indexes[i]);
- SgNode* newBlock = forReduce(tfs[k], loop_idxs[indexes[i]],
- kernel_decl->get_definition());
- //newBlock->print();
- swap_node_for_node_list(tfs[k], newBlock);
- //printf("AFTER SWAP\n"); newBlock->print();
- }
- }
-
- //--derick replace array refs of texture mapped vars here
- body = new CG_roseRepr(kernel_decl->get_definition()->get_body());
- std::vector<IR_ArrayRef*> refs = ir->FindArrayRef(body);
- texmapArrayRefs(texture, &refs, globals, ir, ocg);
- // do the same for constant mapped vars
- consmapArrayRefs(constant_mem, &refs, globals, ir, ocg);
-
- return swapped;
-}
-
- // Returns the given index names with the dummy (empty-string) placeholders
- // filtered out; the relative order of the remaining names is preserved.
- std::vector<std::string> cleanOrder(std::vector<std::string> idxNames) {
-     std::vector<std::string> kept;
-     for (std::vector<std::string>::const_iterator name = idxNames.begin();
-             name != idxNames.end(); ++name) {
-         if (name->empty())
-             continue; // dummy level, nothing to keep
-         kept.push_back(*name);
-     }
-     return kept;
- }
-
- // Finds the first level at or after `level` whose index name for statement
- // `stmt` is not a dummy (empty) entry. `level` is 1-based on the way in and
- // the result is 1-based on the way out.
- // Throws std::runtime_error when no such level exists.
- int LoopCuda::nonDummyLevel(int stmt, int level) {
-     const std::vector<std::string> &names = idxNames[stmt];
-     int pos = level - 1; // 0-based position matching the 1-based level
-     while (pos < names.size()) {
-         if (!names[pos].empty())
-             return pos + 1;
-         ++pos;
-     }
-
-     char levelText[128];
-     sprintf(levelText, "%d", level);
-     std::string message("Unable to find a non-dummy level starting from ");
-     message += levelText;
-     throw std::runtime_error(message);
- }
-
- // Returns the 1-based level at which index name `idx` currently sits for
- // statement `stmt`. Throws std::runtime_error if the name is not present.
- int LoopCuda::findCurLevel(int stmt, std::string idx) {
-     const std::vector<std::string> &names = idxNames[stmt];
-     int pos = 0;
-     while (pos < names.size()) {
-         if (strcmp(names[pos].c_str(), idx.c_str()) == 0)
-             return pos + 1;
-         pos++;
-     }
-
-     std::string message("Unable to find index ");
-     message += idx;
-     message += " in current list of indexes";
-     throw std::runtime_error(message);
- }
-
- // Reorders the loops of statement `stmt` so that its non-dummy index names
- // appear in the order given by `curOrder`, then records the new names.
- //
- // stmt     - statement whose loop nest is permuted.
- // curOrder - desired order of index names. Every entry must match one of the
- //            statement's current non-dummy index names, otherwise
- //            std::runtime_error is thrown. It may name fewer loops than the
- //            statement has.
- //
- // Nothing is changed when the requested order equals the current one.
- void LoopCuda::permute_cuda(int stmt,
-         const std::vector<std::string>& curOrder) {
-     std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt]);
-     std::vector<char> used(cIdxNames.size(), 0); // levels already placed in pi
-     std::vector<int> pi; // 1-based source level for each new position
-     bool same = true;
-     for (size_t i = 0; i < curOrder.size(); i++) {
-         bool found = false;
-         for (size_t j = 0; j < cIdxNames.size(); j++) {
-             if (strcmp(cIdxNames[j].c_str(), curOrder[i].c_str()) != 0)
-                 continue;
-             pi.push_back(static_cast<int>(j) + 1);
-             used[j] = 1;
-             found = true;
-             if (j != i)
-                 same = false;
-         }
-         if (!found) {
-             throw std::runtime_error(
-                     "One of the indexes in the permute order were not "
-                     "found in the current set of indexes.");
-         }
-     }
-     // Levels not named in curOrder follow the named ones in ascending order.
-     // The previous code pushed the raw 0-based position here, which could
-     // repeat a level that was already placed and hand permute() something
-     // that is not a permutation; appending the still-unused levels is
-     // identical whenever the old result was a valid permutation.
-     for (size_t j = 0;
-             j < cIdxNames.size() && pi.size() < cIdxNames.size(); j++) {
-         if (!used[j])
-             pi.push_back(static_cast<int>(j) + 1);
-     }
-     if (same)
-         return;
-     permute(stmt, pi);
-     //Set old index names as new
-     // NOTE(review): only the first curOrder.size() raw entries of
-     // idxNames[stmt] are renamed, and dummy entries are not skipped - confirm
-     // this is intended for partial orders. What about sibling stmts?
-     for (size_t i = 0; i < curOrder.size(); i++) {
-         idxNames[stmt][i] = curOrder[i];
-     }
- }
-
- // Permutes the loop levels of statement `stmt_num` according to `pi`.
- //
- // stmt_num - index into stmt; std::invalid_argument is thrown when it is out
- //            of range.
- // pi       - pi[i] is the 1-based loop level placed at position i + 1. It
- //            may not be longer than the number of loop levels, and every
- //            entry must be a valid level, otherwise std::invalid_argument
- //            is thrown.
- //
- // Returns true. Previously the function fell off its end after calling
- // Loop::permute(), which is undefined behaviour for a bool function.
- // NOTE(review): whatever Loop::permute() reports is not propagated because its
- // return type is not visible here - confirm whether it should be.
- bool LoopCuda::permute(int stmt_num, const std::vector<int> &pi) {
-     // check for sanity of parameters
-     if (stmt_num < 0 || stmt_num >= static_cast<int>(stmt.size()))
-         throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-     const int n = stmt[stmt_num].xform.n_out();
-     const int max_level = (n - 1) / 2;
-     if (static_cast<int>(pi.size()) > max_level)
-         throw std::invalid_argument(
-                 "iteration space dimensionality does not match permute dimensionality");
-
-     // first_level: outermost level that actually moves (0 = identity).
-     int first_level = 0;
-     for (int i = 0; i < static_cast<int>(pi.size()); i++) {
-         if (pi[i] > max_level || pi[i] <= 0)
-             throw std::invalid_argument(
-                     "invalid loop level " + to_string(pi[i])
-                             + " in permuation");
-
-         if (pi[i] != i + 1 && first_level == 0)
-             first_level = i + 1;
-     }
-     if (first_level == 0)
-         return true; // identity permutation, nothing to do
-
-     // Apply the permutation to the statements getStatements() selects for the
-     // lexical-order prefix that ends just before the first moved level.
-     std::vector<int> lex = getLexicalOrder(stmt_num);
-     std::set<int> active = getStatements(lex, 2 * first_level - 2);
-     Loop::permute(active, pi);
-     return true;
- }
-
- // Convenience overload: tiles `level` of `stmt` with a tile size of 1 and
- // empty index/control names, using the CountedTile method.
- void LoopCuda::tile_cuda(int stmt, int level, int outer_level) {
-     const int unit_tile = 1;
-     const std::string no_name;
-     tile_cuda(stmt, level, unit_tile, outer_level, no_name, no_name,
-             CountedTile);
- }
- // Convenience overload that applies the tiling to statement 0.
- void LoopCuda::tile_cuda(int level, int tile_size, int outer_level,
-         std::string idxName, std::string ctrlName, TilingMethodType method) {
-     const int first_stmt = 0;
-     tile_cuda(first_stmt, level, tile_size, outer_level, idxName, ctrlName,
-             method);
- }
-
- // Performs a regular tile() of `level` in statement `stmt` and then updates
- // idxNames[stmt] so the index and control loop names reflect the new state.
- //
- // idxName  - when non-empty, becomes the name stored for `level`.
- // ctrlName - name inserted at `outer_level` when tile_size is not 1; an empty
- //            name in that case raises std::runtime_error.
- void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level,
-         std::string idxName, std::string ctrlName, TilingMethodType method) {
-     tile(stmt, level, tile_size, outer_level, method);
-
-     if (!idxName.empty())
-         idxNames[stmt][level - 1] = idxName.c_str();
-
-     if (tile_size != 1) {
-         if (ctrlName.empty())
-             throw std::runtime_error("No ctrl loop name for tile");
-         // record the control loop's name at outer_level
-         idxNames[stmt].insert(idxNames[stmt].begin() + (outer_level - 1),
-                 ctrlName.c_str());
-         return;
-     }
-
-     // Tile size of one: potentially rearrange loops by moving the name stored
-     // at `level` to `outer_level`, shifting the names in between by one.
-     if (outer_level < level) {
-         const std::string moved = idxNames[stmt][level - 1];
-         int pos = level - 1;
-         while (pos > outer_level - 1) {
-             if (pos >= 1)
-                 idxNames[stmt][pos] = idxNames[stmt][pos - 1];
-             --pos;
-         }
-         idxNames[stmt][outer_level - 1] = moved;
-     }
-     //TODO: even with a tile size of one, you need a insert (of a dummy loop)
-     idxNames[stmt].insert(idxNames[stmt].begin() + level, "");
- }
-
- // Runs datacopy_privatized() for `array_name` and then appends an idxNames
- // entry (and an empty stmt_nonSplitLevels entry) for every statement that the
- // copy added to stmt.
- //
- // stmt_num, level, array_name, privatized_levels, allow_extra_read,
- // fastest_changing_dimension, padding_stride, padding_alignment
- //             - forwarded unchanged to datacopy_privatized().
- // cuda_shared - selects the last argument passed on: 1 when true, 0 when false.
- //
- // Returns true once the bookkeeping is done. Previously the function had no
- // return statement at all, which is undefined behaviour for a bool function.
- // NOTE(review): the result of datacopy_privatized() is not propagated because
- // its return type is not visible here - confirm whether it should be.
- bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level,
-         const std::string &array_name,
-         const std::vector<int> &privatized_levels, bool allow_extra_read,
-         int fastest_changing_dimension, int padding_stride,
-         int padding_alignment, bool cuda_shared) {
-     const int old_stmts = stmt.size();
-     // printf("before datacopy_privatized:\n");
-     printIS();
-     datacopy_privatized(stmt_num, level, array_name, privatized_levels,
-             allow_extra_read, fastest_changing_dimension, padding_stride,
-             padding_alignment, cuda_shared ? 1 : 0);
-     // printf("after datacopy_privatized:\n");
-     printIS();
-
-     //Adjust idxNames to reflect updated state
-     const std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
-     const int new_stmts = stmt.size();
-     for (int i = old_stmts; i < new_stmts; i++) {
-         std::vector<std::string> idxs;
-
-         //protonu-making sure the vector of nonSplitLevels grows along with
-         //the statement structure
-         stmt_nonSplitLevels.push_back(std::vector<int>());
-
-         //Indexes up to level will be the same
-         for (int j = 0; j < level - 1; j++)
-             idxs.push_back(cIdxNames[j]);
-
-         //Expect privatized_levels to match
-         for (size_t j = 0; j < privatized_levels.size(); j++)
-             idxs.push_back(cIdxNames[privatized_levels[j] - 1]); //level is one-based
-
-         //all further levels should match order they are in originally
-         if (!privatized_levels.empty()) {
-             const int last_privatized = privatized_levels.back();
-             const int top_level = last_privatized
-                     + (stmt[i].IS.n_set() - static_cast<int>(idxs.size()));
-             for (int j = last_privatized; j < top_level; j++)
-                 idxs.push_back(cIdxNames[j]);
-         }
-         idxNames.push_back(idxs);
-     }
-     return true;
- }
-
- // Runs datacopy() for `array_name` and then appends an idxNames entry (and an
- // empty stmt_nonSplitLevels entry) for every statement that the copy added.
- //
- // stmt_num, level, array_name, allow_extra_read, fastest_changing_dimension,
- // padding_stride, padding_alignment
- //             - forwarded unchanged to datacopy().
- // new_idxs    - names for the loop levels of the new statements from `level`
- //               onwards; std::runtime_error is thrown when there are too few.
- // cuda_shared - selects the last argument passed on: 1 when true, 0 when false.
- //
- // Returns true once the bookkeeping is done. Previously the function had no
- // return statement at all, which is undefined behaviour for a bool function.
- // NOTE(review): the result of datacopy() is not propagated because its return
- // type is not visible here - confirm whether it should be.
- bool LoopCuda::datacopy_cuda(int stmt_num, int level,
-         const std::string &array_name,
-         const std::vector<std::string> new_idxs,
-         bool allow_extra_read, int fastest_changing_dimension,
-         int padding_stride, int padding_alignment, bool cuda_shared) {
-
-     const int old_stmts = stmt.size();
-     // printf("before datacopy:\n");
-     // printIS();
-     datacopy(stmt_num, level, array_name, allow_extra_read,
-             fastest_changing_dimension, padding_stride, padding_alignment,
-             cuda_shared ? 1 : 0);
-     // printf("after datacopy:\n");
-     printIS();
-
-     //Adjust idxNames to reflect updated state
-     const std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]);
-     const int new_stmts = stmt.size();
-     for (int i = old_stmts; i < new_stmts; i++) {
-         std::vector<std::string> idxs;
-
-         //protonu-making sure the vector of nonSplitLevels grows along with
-         //the statement structure
-         stmt_nonSplitLevels.push_back(std::vector<int>());
-
-         //Indexes up to level will be the same
-         for (int j = 0; j < level - 1; j++)
-             idxs.push_back(cIdxNames[j]);
-
-         //all further levels should get names from new_idxs
-         const int top_level = stmt[i].IS.n_set();
-         // Compare as signed values: with the old unsigned comparison a
-         // negative count of needed names wrapped around and always threw.
-         const int needed = top_level - level + 1;
-         if (static_cast<int>(new_idxs.size()) < needed)
-             throw std::runtime_error(
-                     "Need more new index names for new datacopy loop levels");
-
-         for (int j = level - 1; j < top_level; j++)
-             idxs.push_back(new_idxs[j - level + 1]);
-         idxNames.push_back(idxs);
-     }
-     return true;
- }
-
- bool LoopCuda::unroll_cuda(int stmt_num, int level, int unroll_amount) {
- int old_stmts = stmt.size(); // statement count before unrolling; indexes >= old_stmts are statements added by unroll()
- // Unrolls the loop at `level` of statement `stmt_num` by `unroll_amount` through unroll(), then extends idxNames, syncs and stmt_nonSplitLevels for every statement that was added.
- // Returns true when the set returned by unroll() is empty, false otherwise.
- int dim = 2 * level - 1;
- std::vector<int> lex = getLexicalOrder(stmt_num);
- std::set<int> same_loop = getStatements(lex, dim - 1);
- // dim above was derived from the caller's level; from here on level is the first non-dummy level at or after it
- level = nonDummyLevel(stmt_num, level);
- //printf("unrolling %d at level %d\n", stmt_num,level);
-
- //protonu--using the new version of unroll, which returns
- //a set of ints instead of a bool. To keep Gabe's logic,
- //the size of the set is checked: if it is 0, true is returned.
- //previously: bool b= unroll(stmt_num, level, unroll_amount);
- std::set<int> b_set = unroll(stmt_num, level, unroll_amount, idxNames);
- bool b = false;
- if (b_set.size() == 0)
- b = true;
- //end--protonu
-
- //Adjust idxNames to reflect updated state
- std::vector<std::string> cIdxNames = cleanOrder(idxNames[stmt_num]); // NOTE(review): not read again in this function
- std::vector<std::string> origSource = idxNames[stmt_num]; // names as they are before any entry is cleared below
- ;
- //Drop index names at level (only when unroll_amount is 0)
- if (unroll_amount == 0) {
- //For all statements that were in this unroll together, drop the index name for the unrolled level
- idxNames[stmt_num][level - 1] = "";
- for (std::set<int>::iterator i = same_loop.begin();
- i != same_loop.end(); i++) {
- //printf("in same loop as %d is %d\n", stmt_num, (*i));
- //idxNames[(*i)][level-1] = "";
- idxNames[(*i)] = idxNames[stmt_num]; // each statement in the group gets a full copy of stmt_num's names
- }
- }
- // Recompute the statement group now that unroll() has run
- lex = getLexicalOrder(stmt_num);
- same_loop = getStatements(lex, dim - 1);
-
- bool same_as_source = false; // only ever set to true below, so once set it also applies to all later new statements
- int new_stmts = stmt.size();
- for (int i = old_stmts; i < new_stmts; i++) {
- //Check whether we had a sync for the statement we are unrolling, if
- //so, propagate that to newly created statements so that if they are
- //in a different loop structure, they will also get a syncthreads
- int size = syncs.size(); // captured up front so entries appended in this loop are not revisited
- for (int j = 0; j < size; j++) {
- if (syncs[j].first == stmt_num)
- syncs.push_back(make_pair(i, syncs[j].second));
- }
-
- //protonu-making sure the vector of nonSplitLevels grows along with
- //the statement structure
- stmt_nonSplitLevels.push_back(std::vector<int>());
-
- //We expect that new statements have a constant for the variable in
- //stmt[i].IS at level (as seen with print_with_subs), otherwise there
- //will be a for loop at level and idxNames should match stmt's
- //idxNames pre-unrolled
- Relation IS = stmt[i].IS;
- //Ok, if you know how the hell to get anything out of a Relation, you
- //should probably be able to do this more elegantly. But for now, I'm
- //hacking it.
- std::string s = IS.print_with_subs_to_string();
- //s looks like
- //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128}
- //where level == 5, you see an integer in the input set
-
- //If that's not an integer and this is the first new statement, then
- //we think codegen will have a loop at that level. It's not perfect,
- //not sure if it can be determined without round-tripping to codegen.
- int sIdx = 0;
- int eIdx = 0;
- for (int j = 0; j < level - 1; j++) { // advance sIdx to the (level - 1)-th comma of s
- sIdx = s.find(",", sIdx + 1);
- if (sIdx < 0) // find() reported no further comma
- break;
- }
- if (sIdx > 0) { // NOTE(review): for level == 1 the loop above never runs, sIdx stays 0 and this check is skipped
- eIdx = s.find("]");
- int tmp = s.find(",", sIdx + 1);
- if (tmp > 0 && tmp < eIdx)
- eIdx = tmp; //", before ]"
- if (eIdx > 0) {
- sIdx++; // step past the comma so var starts at the entry itself
- std::string var = s.substr(sIdx, eIdx - sIdx);
- //printf("%s\n", s.c_str());
- //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str());
- if (atoi(var.c_str()) == 0 && i == old_stmts) { // atoi() == 0 is the "not an integer" test; NOTE(review): a literal 0 entry also yields 0
- //TODO:Maybe do see if this new statement would be in the same
- //group as the original and if it would, don't say
- //same_as_source
- if (same_loop.find(i) == same_loop.end()) {
- printf(
- "stmt %d level %d, newly created unroll statement should have same level indexes as source\n",
- i, level);
- same_as_source = true;
- }
- }
- }
- }
-
- //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1);
- if (same_as_source)
- idxNames.push_back(origSource); // names as saved before any entry was cleared
- else
- idxNames.push_back(idxNames[stmt_num]); // current names of the unrolled statement
- }
-
- return b;
- }
-
- // Registers `array_name` as an array to be texture mapped. The mapping object
- // is created on the first call; later calls add further names to it.
- void LoopCuda::copy_to_texture(const char *array_name) {
-     if (texture) {
-         texture->add(array_name);
-         return;
-     }
-     texture = new texture_memory_mapping(true, array_name);
- }
-
-//void LoopCuda::copy_to_texture_2d(const char *array_name, int width, int height) {
-// if (!texture)
-// texture = new texture_memory_mapping(true, array_name, width, height);
-// else
-// texture->add(array_name, width, height);
-//}
-
- // Registers `array_name` as an array to be mapped to constant memory. The
- // mapping object is created on the first call; later calls add to it.
- void LoopCuda::copy_to_constant(const char *array_name) {
-     if (constant_mem) {
-         constant_mem->add(array_name);
-         return;
-     }
-     constant_mem = new constant_memory_mapping(true, array_name);
- }
-
-//protonu--moving this from Loop
- SgNode* LoopCuda::codegen() {
-     // Without the GenCudaizeV2 flag there is nothing special to do: return
-     // the plain vanilla generated code.
-     const bool wants_cudaize_v2 = (code_gen_flags & GenCudaizeV2) != 0;
-     if (!wants_cudaize_v2)
-         return getCode();
-     return cudaize_codegen_v2();
- }
-
- //These four variables are in Omega's code_gen.cc and are used as a massive hack
- //to get some info out of MMGenerateCode through side effects.
- namespace omega {
- extern int checkLoopLevel; // extractCudaUB sets this to 2 * level before generating code and back to 0 afterwards
- extern int stmtForLoopCheck; // extractCudaUB sets this to the statement number being queried
- extern int upperBoundForLevel; // preset to -1 by extractCudaUB and read back after code generation
- extern int lowerBoundForLevel; // preset to -1 by extractCudaUB and read back after code generation
- }
-
- CG_outputRepr* LoopCuda::extractCudaUB(int stmt_num, int level,
- int &outUpperBound, int &outLowerBound) {
- // Reports the bounds of the loop at `level` of statement `stmt_num` through outUpperBound/outLowerBound; returns an upper-bound expression only when no numeric upper bound was reported (-1), NULL otherwise. First check the parameters for sanity.
- const int m = stmt.size();
- if (stmt_num >= m || stmt_num < 0)
- throw std::invalid_argument("invalid statement " + to_string(stmt_num));
- const int n = stmt[stmt_num].xform.n_out();
- if (level > (n - 1) / 2 || level <= 0)
- throw std::invalid_argument("invalid loop level " + to_string(level));
- // dim = 2 * level - 1; the set variable for this loop is addressed as dim + 1 below
- int dim = 2 * level - 1;
- // same_loop: the statements getStatements() returns for the lexical order of stmt_num at dim - 1
- std::vector<int> lex = getLexicalOrder(stmt_num);
- std::set<int> same_loop = getStatements(lex, dim - 1);
-
- // extract the intersection of the iteration space to be considered
- Relation hull;
- {
- hull = Relation::True(n);
- for (std::set<int>::iterator i = same_loop.begin();
- i != same_loop.end(); i++) {
- Relation r = getNewIS(*i);
- for (int j = dim + 2; j <= r.n_set(); j++) // project out the set variables from position dim + 2 onwards
- r = Project(r, r.set_var(j));
- hull = Intersection(hull, r);
- hull.simplify(2, 4);
- }
- // give every even-numbered set variable up to dim + 1 a fresh "_t<N>" name
- for (int i = 2; i <= dim + 1; i += 2) {
- //std::string name = std::string("_t") + to_string(t_counter++);
- std::string name = std::string("_t")
- + to_string(tmp_loop_var_name_counter++);
- hull.name_set_var(i, name);
- }
- hull.setup_names();
- }
-
- // extract the exact loop bound of the dimension being queried
- if (is_single_iteration(hull, dim)) {
- throw std::runtime_error(
- "No loop availabe at level to extract upper bound.");
- }
- Relation bound = get_loop_bound(hull, dim);
- if (!bound.has_single_conjunct() || !bound.is_satisfiable()
- || bound.is_tautology())
- throw loop_error(
- "loop error: unable to extract loop bound for cudaize");
-
- // extract the loop stride
- EQ_Handle stride_eq; // only referenced by the commented-out code below
- /*int stride = 1;
- {
- bool simple_stride = true;
- int strides = countStrides(bound.query_DNF()->single_conjunct(),
- bound.set_var(dim + 1), stride_eq, simple_stride);
- if (strides > 1)
- throw loop_error("loop error: too many strides");
- else if (strides == 1) {
- int sign = stride_eq.get_coef(bound.set_var(dim + 1));
- // assert(sign == 1 || sign == -1);
- Constr_Vars_Iter it(stride_eq, true);
- stride = abs((*it).coef / sign);
- }
- }
- */
- int stride = 1;
- {
- // NOTE(review): the coef_t stride declared next shadows the outer int stride, so the outer one stays 1 and the "stride != 1" check after this block can never fire - confirm which error was intended
- coef_t stride;
- std::pair<EQ_Handle, Variable_ID> result = find_simplest_stride(bound,
- bound.set_var(dim + 1));
- if (result.second == NULL)
- stride = 1;
- else
- stride = abs(result.first.get_coef(result.second))
- / gcd(abs(result.first.get_coef(result.second)),
- abs(result.first.get_coef(bound.set_var(dim + 1))));
-
- if (stride > 1) // any stride above 1 is rejected here, despite the wording of the message
- throw loop_error("loop error: too many strides");
- /*else if (stride == 1) {
- int sign = result.first.get_coef(bound.set_var(dim+1));
- assert(sign == 1 || sign == -1);
- } */
- }
-
- if (stride != 1) {
- char buf[1024];
- sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d",
- level, stride);
- throw std::runtime_error(buf);
- }
-
- //Use the code generation system to tell us our bound information. We
- //need a hard upper bound and a 0 lower bound.
-
- checkLoopLevel = level * 2;
- stmtForLoopCheck = stmt_num;
- upperBoundForLevel = -1;
- lowerBoundForLevel = -1;
- printCode(1, false); // actuallyPrint is false; presumably run only so code generation fills in upperBoundForLevel/lowerBoundForLevel - confirm in printCode
- checkLoopLevel = 0;
-
- outUpperBound = upperBoundForLevel;
- outLowerBound = lowerBoundForLevel;
- // -1 is the value preset above, i.e. no upper bound was written back; look for one in the saved codegen result instead
- if (outUpperBound == -1) {
-
- CG_result* temp = last_compute_cgr_;
- // follow the body_ chain of CG_loop nodes until the loop at level_ == 2 * level is reached or passed
- while (temp) {
- CG_loop * loop;
- if (loop = dynamic_cast<CG_loop*>(temp)) { // assignment intended: loop is non-NULL only when temp is a CG_loop
- if (loop->level_ == 2 * level) {
- Relation bound = copy(loop->bounds_); // shadows the outer `bound` computed earlier
- Variable_ID v = bound.set_var(2 * level);
- for (GEQ_Iterator e(
- const_cast<Relation &>(bound).single_conjunct()->GEQs());
- e; e++) {
- if ((*e).get_coef(v) < 0
- && (*e).is_const_except_for_global(v)) // first inequality with a negative coefficient on v that passes this test is returned as the bound
- return output_upper_bound_repr(ir->builder(), *e, v,
- bound,
- std::vector<std::pair<CG_outputRepr *, int> >(
- bound.n_set(),
- std::make_pair(
- static_cast<CG_outputRepr *>(NULL),
- 0)));
- }
- }
- if (loop->level_ > 2 * level)
- break;
- else
- temp = loop->body_;
- } else
- break;
- }
- }
- // no upper-bound expression is returned on any other path
- return NULL;
- }
-
-void LoopCuda::printCode(int effort, bool actuallyPrint) const {
- const int m = stmt.size();
- if (m == 0)
- return;
- const int n = stmt[0].xform.n_out();
-
- /*or (int i = 0; i < m; i++) {
- IS[i + 1] = stmt[i].IS;
- xform[i + 1] = stmt[i].xform;
-
- //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
- }
- */
-
- // invalidate saved codegen computation
- if (last_compute_cgr_ != NULL) {
- delete last_compute_cgr_;
- last_compute_cgr_ = NULL;
- }
-
- if (last_compute_cg_ != NULL) {
- delete last_compute_cg_;
- last_compute_cg_ = NULL;
- }
-
- //Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
- /*CG_stringBuilder *ocg = new CG_stringBuilder();
- Tuple<CG_outputRepr *> nameInfo;
- for (int i = 1; i <= m; i++)
- nameInfo.append(new CG_stringRepr("s" + to_string(i)));
- */
-
- // -- replacing MMGenerateCode
- // -- formally CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort);
- // -- in the future, these if statements need to be cleaned up.
- // -- something like check_lastComputeCG might be a decent protected member function
- // -- and/or something that returns a std::vector<CG_outputRepr*> that also checks last_compute_cg_
- //if (last_compute_cg_ == NULL) {
- std::vector<Relation> IS(m);
- std::vector<Relation> xforms(m);
- std::vector<std::vector<int> > nonSplitLevels(m);
-
- /* std::vector < std::vector <std::string> > idxTupleNames;
- if (useIdxNames) {
- for (int i = 0; i < idxNames.size(); i++) {
- Tuple<std::string> idxs;
- for (int j = 0; j < idxNames[i].size(); j++)
- idxs.append(idxNames[i][j]);
- idxTupleNames.append(idxs);
- }
- }
- */
- for (int i = 0; i < m; i++) {
- IS[i] = stmt[i].IS;
- xforms[i] = stmt[i].xform;
- nonSplitLevels[i] = stmt_nonSplitLevels[i];
- }
- Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-
- last_compute_cg_ = new CodeGen(xforms, IS, known, nonSplitLevels, idxNames,
- syncs);
-
- delete last_compute_cgr_; // this was just done above?
- last_compute_cgr_ = NULL;
- //}
-
- if (last_compute_cgr_ == NULL || last_compute_effort_ != effort) {
- delete last_compute_cgr_;
- last_compute_cgr_ = last_compute_cg_->buildAST(effort);
- last_compute_effort_ = effort;
- }
-
- //std::vector<CG_outputRepr *> stmts(m);
- //for (int i = 0; i < m; i++)
- // stmts[i] = stmt[i].code;
- //CG_outputRepr* repr = last_compute_cgr_->printRepr(ocg, stmts);
- // -- end replacing MMGenerateCode
- std::string repr = last_compute_cgr_->printString();
-
- if (actuallyPrint)
- std::cout << repr << std::endl;
- //std::cout << static_cast<CG_stringRepr*>(repr)->GetString();
- /*
- for (int i = 1; i <= m; i++)
- delete nameInfo[i];
- */
-
- //delete ocg;
-}
-
-void LoopCuda::printRuntimeInfo() const {
- for (int i = 0; i < stmt.size(); i++) {
- Relation IS = stmt[i].IS;
- Relation xform = stmt[i].xform;
- printf("stmt[%d]\n", i);
- printf("IS\n");
- IS.print_with_subs();
-
- printf("xform[%d]\n", i);
- xform.print_with_subs();
-
- }
-}
-
-void LoopCuda::printIndexes() const {
- for (int i = 0; i < stmt.size(); i++) {
- printf("stmt %d nset %d ", i, stmt[i].IS.n_set());
-
- for (int j = 0; j < idxNames[i].size(); j++) {
- if (j > 0)
- printf(",");
- printf("%s", idxNames[i][j].c_str());
- }
- printf("\n");
- }
-}
-
-SgNode* LoopCuda::getCode(int effort) const {
- const int m = stmt.size();
- if (m == 0)
- return new SgNode;
- const int n = stmt[0].xform.n_out();
- /*
- Tuple<CG_outputRepr *> ni(m);
- Tuple < Relation > IS(m);
- Tuple < Relation > xform(m);
- vector < vector <int> > nonSplitLevels(m);
- for (int i = 0; i < m; i++) {
- ni[i + 1] = stmt[i].code;
- IS[i + 1] = stmt[i].IS;
- xform[i + 1] = stmt[i].xform;
- nonSplitLevels[i + 1] = stmt_nonSplitLevels[i];
-
- //nonSplitLevels[i+1] = stmt[i].nonSplitLevels;
- }
- */
- //Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-//#ifdef DEBUG
-//#endif
- //std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, ni, known,
- // nonSplitLevels, syncs, idxTupleNames, effort));
- if (last_compute_cgr_ != NULL) {
- delete last_compute_cgr_;
- last_compute_cgr_ = NULL;
- }
-
- if (last_compute_cg_ != NULL) {
- delete last_compute_cg_;
- last_compute_cg_ = NULL;
- }
-
- CG_outputBuilder *ocg = ir->builder();
- // -- replacing MMGenerateCode
- // -- formally CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort);
- // -- in the future, these if statements need to be cleaned up.
- // -- something like check_lastComputeCG might be a decent protected member function
- // -- and/or something that returns a std::vector<CG_outputRepr*> that also checks last_compute_cg_
- //if (last_compute_cg_ == NULL) {
- std::vector<Relation> IS(m);
- std::vector<Relation> xforms(m);
- std::vector<std::vector<int> > nonSplitLevels(m);
- for (int i = 0; i < m; i++) {
- IS[i] = stmt[i].IS;
- xforms[i] = stmt[i].xform;
- nonSplitLevels[i] = stmt_nonSplitLevels[i];
- }
-
- /*std::vector < std::vector<std::string> > idxTupleNames;
- if (useIdxNames) {
- for (int i = 0; i < idxNames.size(); i++) {
- std::vector<std::string> idxs;
- for (int j = 0; j < idxNames[i].size(); j++)
- idxs.push_back(idxNames[i][j]);
- idxTupleNames.push_back(idxs);
- }
- }
- */
- Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
-
- last_compute_cg_ = new CodeGen(xforms, IS, known, nonSplitLevels, idxNames,
- syncs);
- delete last_compute_cgr_;
- last_compute_cgr_ = NULL;
- //}
-
- if (last_compute_cgr_ == NULL || last_compute_effort_ != effort) {
- delete last_compute_cgr_;
- last_compute_cgr_ = last_compute_cg_->buildAST(effort);
- last_compute_effort_ = effort;
- }
-
- std::vector<CG_outputRepr *> stmts(m);
- for (int i = 0; i < m; i++)
- stmts[i] = stmt[i].code;
- CG_outputRepr* repr = last_compute_cgr_->printRepr(ocg, stmts);
- // -- end replacing MMGenerateCode
-
- //CG_outputRepr *overflow_initialization = ocg->CreateStmtList();
- CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL);
- for (std::map<int, std::vector<Free_Var_Decl *> >::const_iterator i =
- overflow.begin(); i != overflow.end(); i++)
- for (std::vector<Free_Var_Decl *>::const_iterator j = i->second.begin();
- j != i->second.end(); j++)
- //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0))));
- overflow_initialization = ocg->StmtListAppend(
- overflow_initialization,
- ocg->StmtListAppend(
- ocg->CreateAssignment(0,
- ocg->CreateIdent((*j)->base_name()),
- ocg->CreateInt(0)), NULL));
-
- repr = ocg->StmtListAppend(overflow_initialization, repr);
- SgNode *tnl = static_cast<CG_roseRepr *>(repr)->GetCode();
- SgStatementPtrList *list = static_cast<CG_roseRepr *>(repr)->GetList();
-
- if (tnl != NULL)
- return tnl;
- else if (tnl == NULL && list != NULL) {
- SgBasicBlock* bb2 = buildBasicBlock();
-
- for (SgStatementPtrList::iterator it = (*list).begin();
- it != (*list).end(); it++)
- bb2->append_statement(*it);
-
- tnl = isSgNode(bb2);
- } else
- throw loop_error("codegen failed");
-
- delete repr;
- /*
- for (int i = 1; i <= m; i++)
- delete ni[i];
- */
- return tnl;
-
-}
-
-//protonu--adding constructors for the new derived class
-LoopCuda::LoopCuda() :
- Loop(), code_gen_flags(GenInit) {
-}
-
-LoopCuda::LoopCuda(IR_Control *irc, int loop_num) :
- Loop(irc) {
- setup_code = NULL;
- teardown_code = NULL;
- code_gen_flags = 0;
- cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1;
- cu_bx_repr = NULL;
- cu_tx_repr = NULL;
- cu_by_repr = NULL;
- cu_ty_repr = NULL;
- cu_tz_repr = NULL;
-
- cu_num_reduce = 0;
- cu_mode = GlobalMem;
- texture = NULL;
- constant_mem = NULL;
-
- int m = stmt.size();
- //printf("\n the size of stmt(initially) is: %d\n", stmt.size());
- for (int i = 0; i < m; i++)
- stmt_nonSplitLevels.push_back(std::vector<int>());
-
- globals = ((IR_cudaroseCode *) ir)->gsym_;
- globalScope = ((IR_cudaroseCode *) ir)->first_scope;
- parameter_symtab = ((IR_cudaroseCode *) ir)->parameter;
- body_symtab = ((IR_cudaroseCode *) ir)->body;
- func_body = ((IR_cudaroseCode *) ir)->defn;
- func_definition = ((IR_cudaroseCode *) ir)->func_defn;
- std::vector<SgForStatement *> tf = ((IR_cudaroseCode *) ir)->get_loops();
-
- symtab = tf[loop_num]->get_symbol_table();
-
- std::vector<SgForStatement *> deepest = find_deepest_loops(
- isSgNode(tf[loop_num]));
-
- for (int i = 0; i < deepest.size(); i++) {
- SgVariableSymbol* vs;
- SgForInitStatement* list = deepest[i]->get_for_init_stmt();
- SgStatementPtrList& initStatements = list->get_init_stmt();
- SgStatementPtrList::const_iterator j = initStatements.begin();
- if (SgExprStatement *expr = isSgExprStatement(*j))
- if (SgAssignOp* op = isSgAssignOp(expr->get_expression()))
- if (SgVarRefExp* var_ref = isSgVarRefExp(op->get_lhs_operand()))
- vs = var_ref->get_symbol();
-
- index.push_back(vs->get_name().getString().c_str()); //reflects original code index names
- }
-
- for (int i = 0; i < stmt.size(); i++)
- idxNames.push_back(index); //refects prefered index names (used as handles in cudaize v2)
- useIdxNames = false;
-
-}
-
-void LoopCuda::printIS() {
- if (!cudaDebug) return;
- int k = stmt.size();
- for (int i = 0; i < k; i++) {
- printf(" printing statement:%d\n", i);
- stmt[i].IS.print();
- }
-}
-
diff --git a/loop_modified.cc b/loop_modified.cc
deleted file mode 100644
index 9686f6d..0000000
--- a/loop_modified.cc
+++ /dev/null
@@ -1,4234 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2008 University of Southern California
- Copyright (C) 2009-2010 University of Utah
- All Rights Reserved.
-
- Purpose:
- Core loop transformation functionality.
-
- Notes:
- "level" (starting from 1) means loop level and it corresponds to "dim"
- (starting from 0) in transformed iteration space [c_1,l_1,c_2,l_2,....,
- c_n,l_n,c_(n+1)], e.g., l_2 is loop level 2 in generated code, dim 3
- in transformed iteration space, and variable 4 in Omega relation.
- All c's are constant numbers only and they will not show up as actual loops.
- Formula:
- dim = 2*level - 1
- var = dim + 1
-
- History:
- 10/2005 Created by Chun Chen.
- 09/2009 Expand tile functionality, -chun
- 10/2009 Initialize unfusible loop nest without bailing out, -chun
-*****************************************************************************/
-
-#include <limits.h>
-#include <math.h>
-#include <code_gen/code_gen.h>
-#include <code_gen/CG_outputBuilder.h>
-#include <code_gen/output_repr.h>
-#include <iostream>
-#include <map>
-#include "loop.hh"
-#include "omegatools.hh"
-#include "irtools.hh"
-#include "chill_error.hh"
-#include <string.h>
-using namespace omega;
-
-const std::string Loop::tmp_loop_var_name_prefix = std::string("_t");
-const std::string Loop::overflow_var_name_prefix = std::string("over");
-
-//-----------------------------------------------------------------------------
-// Class Loop
-//-----------------------------------------------------------------------------
-
-bool Loop::init_loop(std::vector<ir_tree_node *> &ir_tree,
- std::vector<ir_tree_node *> &ir_stmt) {
- ir_stmt = extract_ir_stmts(ir_tree);
- stmt_nesting_level_.resize(ir_stmt.size());
- std::vector<int> stmt_nesting_level(ir_stmt.size());
- for (int i = 0; i < ir_stmt.size(); i++) {
- ir_stmt[i]->payload = i;
- int t = 0;
- ir_tree_node *itn = ir_stmt[i];
- while (itn->parent != NULL) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP)
- t++;
- }
- stmt_nesting_level_[i] = t;
- stmt_nesting_level[i] = t;
- }
-
- stmt = std::vector<Statement>(ir_stmt.size());
- int n_dim = -1;
- int max_loc;
- //std::vector<std::string> index;
- for (int i = 0; i < ir_stmt.size(); i++) {
- int max_nesting_level = -1;
- int loc;
- for (int j = 0; j < ir_stmt.size(); j++)
- if (stmt_nesting_level[j] > max_nesting_level) {
- max_nesting_level = stmt_nesting_level[j];
- loc = j;
- }
-
- // most deeply nested statement acting as a reference point
- if (n_dim == -1) {
- n_dim = max_nesting_level;
- max_loc = loc;
-
- index = std::vector<std::string>(n_dim);
-
- ir_tree_node *itn = ir_stmt[loc];
- int cur_dim = n_dim - 1;
- while (itn->parent != NULL) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP) {
- index[cur_dim] =
- static_cast<IR_Loop *>(itn->content)->index()->name();
- itn->payload = cur_dim--;
- }
- }
- }
-
- // align loops by names, temporary solution
- ir_tree_node *itn = ir_stmt[loc];
- int depth = stmt_nesting_level_[loc] - 1;
- /* while (itn->parent != NULL) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == -1) {
- std::string name = static_cast<IR_Loop *>(itn->content)->index()->name();
- for (int j = 0; j < n_dim; j++)
- if (index[j] == name) {
- itn->payload = j;
- break;
- }
- if (itn->payload == -1)
- throw loop_error("no complex alignment yet");
- }
- }
- */
- for (int t = depth; t >= 0; t--) {
- int y = t;
- ir_tree_node *itn = ir_stmt[loc];
-
- while ((itn->parent != NULL) && (y >= 0)) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP)
- y--;
- }
-
- if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == -1) {
- CG_outputBuilder *ocg = ir_->builder();
-
- itn->payload = depth - t;
-
- CG_outputRepr *code =
- static_cast<IR_Block *>(ir_stmt[loc]->content)->extract();
-
- Tuple<CG_outputRepr *> index_expr;
- Tuple<std::string> old_index;
- CG_outputRepr *repl = ocg->CreateIdent(index[itn->payload]);
- index_expr.append(repl);
- old_index.append(
- static_cast<IR_Loop *>(itn->content)->index()->name());
-
- code = ocg->CreatePlaceHolder(0, code, index_expr, old_index);
- replace.insert(std::pair<int, CG_outputRepr*>(loc, code));
- //stmt[loc].code = code;
-
- }
- }
-
- // set relation variable names
- Relation r(n_dim);
- F_And *f_root = r.add_and();
- itn = ir_stmt[loc];
- int temp_depth = depth;
- while (itn->parent != NULL) {
-
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP) {
- r.name_set_var(itn->payload + 1, index[temp_depth]);
-
- temp_depth--;
- }
- //static_cast<IR_Loop *>(itn->content)->index()->name());
- }
-
- /*while (itn->parent != NULL) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP)
- r.name_set_var(itn->payload+1, static_cast<IR_Loop *>(itn->content)->index()->name());
- }*/
-
- // extract information from loop/if structures
- std::vector<bool> processed(n_dim, false);
- Tuple<std::string> vars_to_be_reversed;
- itn = ir_stmt[loc];
- while (itn->parent != NULL) {
- itn = itn->parent;
-
- switch (itn->content->type()) {
- case IR_CONTROL_LOOP: {
- IR_Loop *lp = static_cast<IR_Loop *>(itn->content);
- Variable_ID v = r.set_var(itn->payload + 1);
- int c;
-
- try {
- c = lp->step_size();
- if (c > 0) {
- CG_outputRepr *lb = lp->lower_bound();
- exp2formula(ir, r, f_root, freevar, lb, v, 's',
- IR_COND_GE, true);
- CG_outputRepr *ub = lp->upper_bound();
- IR_CONDITION_TYPE cond = lp->stop_cond();
- if (cond == IR_COND_LT || cond == IR_COND_LE)
- exp2formula(ir, r, f_root, freevar, ub, v, 's',
- cond, true);
- else
- throw ir_error("loop condition not supported");
-
- } else if (c < 0) {
- CG_outputBuilder *ocg = ir->builder();
- CG_outputRepr *lb = lp->lower_bound();
- lb = ocg->CreateMinus(NULL, lb);
- exp2formula(ir, r, f_root, freevar, lb, v, 's',
- IR_COND_GE, true);
- CG_outputRepr *ub = lp->upper_bound();
- ub = ocg->CreateMinus(NULL, ub);
- IR_CONDITION_TYPE cond = lp->stop_cond();
- if (cond == IR_COND_GE)
- exp2formula(ir, r, f_root, freevar, ub, v, 's',
- IR_COND_LE, true);
- else if (cond == IR_COND_GT)
- exp2formula(ir, r, f_root, freevar, ub, v, 's',
- IR_COND_LT, true);
- else
- throw ir_error("loop condition not supported");
-
- vars_to_be_reversed.append(lp->index()->name());
- } else
- throw ir_error("loop step size zero");
- } catch (const ir_error &e) {
- for (int i = 0; i < itn->children.size(); i++)
- delete itn->children[i];
- itn->children = std::vector<ir_tree_node *>();
- itn->content = itn->content->convert();
- return false;
- }
-
- if (abs(c) != 1) {
- F_Exists *f_exists = f_root->add_exists();
- Variable_ID e = f_exists->declare();
- F_And *f_and = f_exists->add_and();
- Stride_Handle h = f_and->add_stride(abs(c));
- if (c > 0)
- h.update_coef(e, 1);
- else
- h.update_coef(e, -1);
- h.update_coef(v, -1);
- CG_outputRepr *lb = lp->lower_bound();
- exp2formula(ir, r, f_and, freevar, lb, e, 's', IR_COND_EQ,
- true);
- }
-
- processed[itn->payload] = true;
- break;
- }
- case IR_CONTROL_IF: {
- CG_outputRepr *cond =
- static_cast<IR_If *>(itn->content)->condition();
- try {
- if (itn->payload % 2 == 1)
- exp2constraint(ir, r, f_root, freevar, cond, true);
- else {
- F_Not *f_not = f_root->add_not();
- F_And *f_and = f_not->add_and();
- exp2constraint(ir, r, f_and, freevar, cond, true);
- }
- } catch (const ir_error &e) {
- std::vector<ir_tree_node *> *t;
- if (itn->parent == NULL)
- t = &ir_tree;
- else
- t = &(itn->parent->children);
- int id = itn->payload;
- int i = t->size() - 1;
- while (i >= 0) {
- if ((*t)[i] == itn) {
- for (int j = 0; j < itn->children.size(); j++)
- delete itn->children[j];
- itn->children = std::vector<ir_tree_node *>();
- itn->content = itn->content->convert();
- } else if ((*t)[i]->payload >> 1 == id >> 1) {
- delete (*t)[i];
- t->erase(t->begin() + i);
- }
- i--;
- }
- return false;
- }
-
- break;
- }
- default:
- for (int i = 0; i < itn->children.size(); i++)
- delete itn->children[i];
- itn->children = std::vector<ir_tree_node *>();
- itn->content = itn->content->convert();
- return false;
- }
- }
-
- // add information for missing loops
- for (int j = 0; j < n_dim; j++)
- if (!processed[j]) {
- ir_tree_node *itn = ir_stmt[max_loc];
- while (itn->parent != NULL) {
- itn = itn->parent;
- if (itn->content->type() == IR_CONTROL_LOOP
- && itn->payload == j)
- break;
- }
-
- Variable_ID v = r.set_var(j + 1);
- if (loc < max_loc) {
- CG_outputRepr *lb =
- static_cast<IR_Loop *>(itn->content)->lower_bound();
- exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_EQ,
- true);
- } else { // loc > max_loc
- CG_outputRepr *ub =
- static_cast<IR_Loop *>(itn->content)->upper_bound();
- exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_EQ,
- true);
- }
- }
-
- r.setup_names();
- r.simplify();
-
- // insert the statement
- CG_outputBuilder *ocg = ir->builder();
- Tuple<CG_outputRepr *> reverse_expr;
- for (int j = 1; j <= vars_to_be_reversed.size(); j++) {
- CG_outputRepr *repl = ocg->CreateIdent(vars_to_be_reversed[j]);
- repl = ocg->CreateMinus(NULL, repl);
- reverse_expr.append(repl);
- }
- CG_outputRepr *code =
- static_cast<IR_Block *>(ir_stmt[loc]->content)->original();
- code = ocg->CreatePlaceHolder(0, code, reverse_expr,
- vars_to_be_reversed);
- stmt[loc].code = code;
- stmt[loc].IS = r;
- stmt[loc].loop_level = std::vector<LoopLevel>(n_dim);
- for (int i = 0; i < n_dim; i++) {
- stmt[loc].loop_level[i].type = LoopLevelOriginal;
- stmt[loc].loop_level[i].payload = i;
- stmt[loc].loop_level[i].parallel_level = 0;
- }
-
- stmt_nesting_level[loc] = -1;
- }
-
- return true;
-}
-
-Loop::Loop(const IR_Control *control) {
- ir = const_cast<IR_Code *>(control->ir_);
- init_code = NULL;
- cleanup_code = NULL;
- tmp_loop_var_name_counter = 1;
- overflow_var_name_counter = 1;
- known = Relation::True(0);
-
- std::vector<ir_tree_node *> ir_tree = build_ir_tree(control->clone(), NULL);
- std::vector<ir_tree_node *> ir_stmt;
-
- while (!init_loop(ir_tree, ir_stmt)) {
- }
-
- // init the dependence graph
- for (int i = 0; i < stmt.size(); i++)
- dep.insert();
-
- for (int i = 0; i < stmt.size(); i++)
- for (int j = i; j < stmt.size(); j++) {
- std::pair<std::vector<DependenceVector>,
- std::vector<DependenceVector> > dv = test_data_dependences(
- ir_, stmt[i].code, stmt[i].IS, stmt[j].code, stmt[j].IS,
- freevar, index, stmt_nesting_level_[i],
- stmt_nesting_level[j]);
-
- for (int k = 0; k < dv.first.size(); k++) {
- if (is_dependence_valid(ir_stmt[i], ir_stmt[j], dv.first[k],
- true))
- dep.connect(i, j, dv.first[k]);
- else {
- dep.connect(j, i, dv.first[k].reverse());
- }
-
- }
- for (int k = 0; k < dv.second.size(); k++)
- if (is_dependence_valid(ir_stmt[j], ir_stmt[i], dv.second[k],
- false))
- dep.connect(j, i, dv.second[k]);
- else {
- dep.connect(i, j, dv.second[k].reverse());
- }
- // std::pair<std::vector<DependenceVector>,
- // std::vector<DependenceVector> > dv_ = test_data_dependences(
-
- }
-
- for (int i = 0; i < stmt.size(); i++) {
- std::map<int, CG_outputRepr*>::iterator it = replace.find(i);
-
- if (it != replace.end())
- stmt[i].code = (it->second)->clone();
- else
- stmt[i].code = stmt[i].code->clone();
- }
-
- // cleanup the IR tree
- for (int i = 0; i < ir_tree.size(); i++)
- delete ir_tree[i];
-
- // init dumb transformation relations e.g. [i, j] -> [ 0, i, 0, j, 0]
- for (int i = 0; i < stmt.size(); i++) {
- int n = stmt[i].IS.n_set();
- stmt[i].xform = Relation(n, 2 * n + 1);
- F_And *f_root = stmt[i].xform.add_and();
-
- for (int j = 1; j <= n; j++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(stmt[i].xform.output_var(2 * j), 1);
- h.update_coef(stmt[i].xform.input_var(j), -1);
- }
-
- for (int j = 1; j <= 2 * n + 1; j += 2) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(stmt[i].xform.output_var(j), 1);
- }
- stmt[i].xform.simplify();
- }
-
- if (stmt.size() != 0)
- num_dep_dim = stmt[0].IS.n_set();
- else
- num_dep_dim = 0;
-}
-
-Loop::~Loop() {
- for (int i = 0; i < stmt.size(); i++)
- if (stmt[i].code != NULL) {
- stmt[i].code->clear();
- delete stmt[i].code;
- }
- if (init_code != NULL) {
- init_code->clear();
- delete init_code;
- }
- if (cleanup_code != NULL) {
- cleanup_code->clear();
- delete cleanup_code;
- }
-}
-
-int Loop::get_dep_dim_of(int stmt_num, int level) const {
- if (stmt_num < 0 || stmt_num >= stmt.size())
- throw std::invalid_argument("invaid statement " + to_string(stmt_num));
-
- if (level < 1 || level > stmt[stmt_num].loop_level.size())
- return -1;
-
- int trip_count = 0;
- while (true) {
- switch (stmt[stmt_num].loop_level[level - 1].type) {
- case LoopLevelOriginal:
- return stmt[stmt_num].loop_level[level - 1].payload;
- case LoopLevelTile:
- level = stmt[stmt_num].loop_level[level - 1].payload;
- if (level < 1)
- return -1;
- if (level > stmt[stmt_num].loop_level.size())
- throw loop_error(
- "incorrect loop level information for statement "
- + to_string(stmt_num));
- break;
- default:
- throw loop_error(
- "unknown loop level information for statement "
- + to_string(stmt_num));
- }
- trip_count++;
- if (trip_count >= stmt[stmt_num].loop_level.size())
- throw loop_error(
- "incorrect loop level information for statement "
- + to_string(stmt_num));
- }
-}
-
-int Loop::get_last_dep_dim_before(int stmt_num, int level) const {
- if (stmt_num < 0 || stmt_num >= stmt.size())
- throw std::invalid_argument("invaid statement " + to_string(stmt_num));
-
- if (level < 1)
- return -1;
- if (level > stmt[stmt_num].loop_level.size())
- level = stmt[stmt_num].loop_level.size() + 1;
-
- for (int i = level - 1; i >= 1; i--)
- if (stmt[stmt_num].loop_level[i - 1].type == LoopLevelOriginal)
- return stmt[stmt_num].loop_level[i - 1].payload;
-
- return -1;
-}
-
-void Loop::print_internal_loop_structure() const {
- for (int i = 0; i < stmt.size(); i++) {
- std::vector<int> lex = getLexicalOrder(i);
- std::cout << "s" << i + 1 << ": ";
- for (int j = 0; j < stmt[i].loop_level.size(); j++) {
- if (2 * j < lex.size())
- std::cout << lex[2 * j];
- switch (stmt[i].loop_level[j].type) {
- case LoopLevelOriginal:
- std::cout << "(dim:" << stmt[i].loop_level[j].payload << ")";
- break;
- case LoopLevelTile:
- std::cout << "(tile:" << stmt[i].loop_level[j].payload << ")";
- break;
- default:
- std::cout << "(unknown)";
- }
- std::cout << ' ';
- }
- for (int j = 2 * stmt[i].loop_level.size(); j < lex.size(); j += 2) {
- std::cout << lex[j];
- if (j != lex.size() - 1)
- std::cout << ' ';
- }
- std::cout << std::endl;
- }
-}
-
-CG_outputRepr *Loop::getCode(int effort) const {
- const int m = stmt.size();
- if (m == 0)
- return NULL;
- const int n = stmt[0].xform.n_out();
-
- Tuple<CG_outputRepr *> ni(m);
- Tuple < Relation > IS(m);
- Tuple < Relation > xform(m);
- for (int i = 0; i < m; i++) {
- ni[i + 1] = stmt[i].code;
- IS[i + 1] = stmt[i].IS;
- xform[i + 1] = stmt[i].xform;
- }
-
- Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
- CG_outputBuilder *ocg = ir->builder();
- CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, effort);
-
- if (init_code != NULL)
- repr = ocg->StmtListAppend(init_code->clone(), repr);
- if (cleanup_code != NULL)
- repr = ocg->StmtListAppend(repr, cleanup_code->clone());
-
- return repr;
-}
-
-void Loop::printCode(int effort) const {
- const int m = stmt.size();
- if (m == 0)
- return;
- const int n = stmt[0].xform.n_out();
-
- Tuple < Relation > IS(m);
- Tuple < Relation > xform(m);
- for (int i = 0; i < m; i++) {
- IS[i + 1] = stmt[i].IS;
- xform[i + 1] = stmt[i].xform;
- }
-
- Relation known = Extend_Set(copy(this->known), n - this->known.n_set());
- std::cout << MMGenerateCode(xform, IS, known, effort);
-}
-
-Relation Loop::getNewIS(int stmt_num) const {
- Relation result;
-
- if (stmt[stmt_num].xform.is_null()) {
- Relation known = Extend_Set(copy(this->known),
- stmt[stmt_num].IS.n_set() - this->known.n_set());
- result = Intersection(copy(stmt[stmt_num].IS), known);
- } else {
- Relation known = Extend_Set(copy(this->known),
- stmt[stmt_num].xform.n_out() - this->known.n_set());
- result = Intersection(
- Range(
- Restrict_Domain(copy(stmt[stmt_num].xform),
- copy(stmt[stmt_num].IS))), known);
- }
-
- result.simplify(2, 4);
-
- return result;
-}
-
-std::vector<Relation> Loop::getNewIS() const {
- const int m = stmt.size();
-
- std::vector<Relation> new_IS(m);
- for (int i = 0; i < m; i++)
- new_IS[i] = getNewIS(i);
-
- return new_IS;
-}
-
-void Loop::permute(const std::vector<int> &pi) {
- std::set<int> active;
- for (int i = 0; i < stmt.size(); i++)
- active.insert(i);
-
- permute(active, pi);
-}
-
-void Loop::original() {
- std::set<int> active;
- for (int i = 0; i < stmt.size(); i++)
- active.insert(i);
- setLexicalOrder(0, active);
-}
-
-void Loop::permute(const std::set<int> &active, const std::vector<int> &pi) {
- if (active.size() == 0 || pi.size() == 0)
- return;
-
- // check for sanity of parameters
- int level = pi[0];
- for (int i = 1; i < pi.size(); i++)
- if (pi[i] < level)
- level = pi[i];
- if (level < 1)
- throw std::invalid_argument("invalid permuation");
- std::vector<int> reverse_pi(pi.size(), 0);
- for (int i = 0; i < pi.size(); i++)
- if (pi[i] >= level + pi.size())
- throw std::invalid_argument("invalid permutation");
- else
- reverse_pi[pi[i] - level] = i + level;
- for (int i = 0; i < reverse_pi.size(); i++)
- if (reverse_pi[i] == 0)
- throw std::invalid_argument("invalid permuation");
- int ref_stmt_num;
- std::vector<int> lex;
- for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
- if (*i < 0 || *i >= stmt.size())
- throw std::invalid_argument("invalid statement " + to_string(*i));
- if (i == active.begin()) {
- ref_stmt_num = *i;
- lex = getLexicalOrder(*i);
- } else {
- if (level + pi.size() - 1 > stmt[*i].loop_level.size())
- throw std::invalid_argument("invalid permuation");
- std::vector<int> lex2 = getLexicalOrder(*i);
- for (int j = 0; j < 2 * level - 3; j += 2)
- if (lex[j] != lex2[j])
- throw std::invalid_argument(
- "statements to permute must be in the same subloop");
- for (int j = 0; j < pi.size(); j++)
- if (!(stmt[*i].loop_level[level + j - 1].type
- == stmt[ref_stmt_num].loop_level[level + j - 1].type
- && stmt[*i].loop_level[level + j - 1].payload
- == stmt[ref_stmt_num].loop_level[level + j - 1].payload))
- throw std::invalid_argument(
- "permuted loops must have the same loop level types");
- }
- }
-
- // Update transformation relations
- for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
- int n = stmt[*i].xform.n_out();
- Relation mapping(n, n);
- F_And *f_root = mapping.add_and();
- for (int j = 1; j <= n; j += 2) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(j), 1);
- h.update_coef(mapping.input_var(j), -1);
- }
- for (int j = 0; j < pi.size(); j++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(2 * (level + j)), 1);
- h.update_coef(mapping.input_var(2 * pi[j]), -1);
- }
- for (int j = 1; j < level; j++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(2 * j), 1);
- h.update_coef(mapping.input_var(2 * j), -1);
- }
- for (int j = level + pi.size(); j <= stmt[*i].loop_level.size(); j++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(2 * j), 1);
- h.update_coef(mapping.input_var(2 * j), -1);
- }
-
- stmt[*i].xform = Composition(mapping, stmt[*i].xform);
- stmt[*i].xform.simplify();
- }
-
- // get the permuation for dependence vectors
- std::vector<int> t;
- for (int i = 0; i < pi.size(); i++)
- if (stmt[ref_stmt_num].loop_level[pi[i] - 1].type == LoopLevelOriginal)
- t.push_back(stmt[ref_stmt_num].loop_level[pi[i] - 1].payload);
- int max_dep_dim = -1;
- int min_dep_dim = num_dep_dim;
- for (int i = 0; i < t.size(); i++) {
- if (t[i] > max_dep_dim)
- max_dep_dim = t[i];
- if (t[i] < min_dep_dim)
- min_dep_dim = t[i];
- }
- if (min_dep_dim > max_dep_dim)
- return;
- if (max_dep_dim - min_dep_dim + 1 != t.size())
- throw loop_error("cannot update the dependence graph after permuation");
- std::vector<int> dep_pi(num_dep_dim);
- for (int i = 0; i < min_dep_dim; i++)
- dep_pi[i] = i;
- for (int i = min_dep_dim; i <= max_dep_dim; i++)
- dep_pi[i] = t[i - min_dep_dim];
- for (int i = max_dep_dim + 1; i < num_dep_dim; i++)
- dep_pi[i] = i;
-
- // update the dependence graph
- DependenceGraph g;
- for (int i = 0; i < dep.vertex.size(); i++)
- g.insert();
- for (int i = 0; i < dep.vertex.size(); i++)
- for (DependenceGraph::EdgeList::iterator j =
- dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();
- j++) {
- if ((active.find(i) != active.end()
- && active.find(j->first) != active.end())) {
- std::vector<DependenceVector> dv = j->second;
- for (int k = 0; k < dv.size(); k++) {
- switch (dv[k].type) {
- case DEP_W2R:
- case DEP_R2W:
- case DEP_W2W:
- case DEP_R2R: {
- std::vector<coef_t> lbounds(num_dep_dim);
- std::vector<coef_t> ubounds(num_dep_dim);
- for (int d = 0; d < num_dep_dim; d++) {
- lbounds[d] = dv[k].lbounds[dep_pi[d]];
- ubounds[d] = dv[k].ubounds[dep_pi[d]];
- }
- dv[k].lbounds = lbounds;
- dv[k].ubounds = ubounds;
- break;
- }
- case DEP_CONTROL: {
- break;
- }
- default:
- throw loop_error("unknown dependence type");
- }
- }
- g.connect(i, j->first, dv);
- } else if (active.find(i) == active.end()
- && active.find(j->first) == active.end()) {
- std::vector<DependenceVector> dv = j->second;
- g.connect(i, j->first, dv);
- } else {
- std::vector<DependenceVector> dv = j->second;
- for (int k = 0; k < dv.size(); k++)
- switch (dv[k].type) {
- case DEP_W2R:
- case DEP_R2W:
- case DEP_W2W:
- case DEP_R2R: {
- for (int d = 0; d < num_dep_dim; d++)
- if (dep_pi[d] != d) {
- dv[k].lbounds[d] = -posInfinity;
- dv[k].ubounds[d] = posInfinity;
- }
- break;
- }
- case DEP_CONTROL:
- break;
- default:
- throw loop_error("unknown dependence type");
- }
- g.connect(i, j->first, dv);
- }
- }
- dep = g;
-
- // update loop level information
- for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
- int cur_dep_dim = min_dep_dim;
- std::vector<LoopLevel> new_loop_level(stmt[*i].loop_level.size());
- for (int j = 1; j <= stmt[*i].loop_level.size(); j++)
- if (j >= level && j < level + pi.size()) {
- switch (stmt[*i].loop_level[reverse_pi[j - level] - 1].type) {
- case LoopLevelOriginal:
- new_loop_level[j - 1].type = LoopLevelOriginal;
- new_loop_level[j - 1].payload = cur_dep_dim++;
- new_loop_level[j - 1].parallel_level =
- stmt[*i].loop_level[reverse_pi[j - level] - 1].parallel_level;
- break;
- case LoopLevelTile: {
- new_loop_level[j - 1].type = LoopLevelTile;
- int ref_level = stmt[*i].loop_level[reverse_pi[j - level]
- - 1].payload;
- if (ref_level >= level && ref_level < level + pi.size())
- new_loop_level[j - 1].payload = reverse_pi[ref_level
- - level];
- else
- new_loop_level[j - 1].payload = ref_level;
- new_loop_level[j - 1].parallel_level =
- stmt[*i].loop_level[reverse_pi[j - level] - 1].parallel_level;
- break;
- }
- default:
- throw loop_error(
- "unknown loop level information for statement "
- + to_string(*i));
- }
- } else {
- switch (stmt[*i].loop_level[j - 1].type) {
- case LoopLevelOriginal:
- new_loop_level[j - 1].type = LoopLevelOriginal;
- new_loop_level[j - 1].payload =
- stmt[*i].loop_level[j - 1].payload;
- new_loop_level[j - 1].parallel_level = stmt[*i].loop_level[j
- - 1].parallel_level;
- break;
- case LoopLevelTile: {
- new_loop_level[j - 1].type = LoopLevelTile;
- int ref_level = stmt[*i].loop_level[j - 1].payload;
- if (ref_level >= level && ref_level < level + pi.size())
- new_loop_level[j - 1].payload = reverse_pi[ref_level
- - level];
- else
- new_loop_level[j - 1].payload = ref_level;
- new_loop_level[j - 1].parallel_level = stmt[*i].loop_level[j
- - 1].parallel_level;
- break;
- }
- default:
- throw loop_error(
- "unknown loop level information for statement "
- + to_string(*i));
- }
- }
- stmt[*i].loop_level = new_loop_level;
- }
-
- setLexicalOrder(2 * level - 2, active);
-}
-
-/**
- * Split the loop at `level` that surrounds statement `stmt_num` using the
- * constraints of `cond`.
- *
- * `cond` is simplified, its equalities are converted to inequalities, and
- * each resulting inequality of its single conjunct is processed in turn.
- * For every statement that shares the loop (same lexical prefix up to
- * dim - 1):
- *   - the statement keeps the part of its iteration space satisfying the
- *     inequality (part1);
- *   - if the complement (part2) intersected with `known` is satisfiable, a
- *     cloned statement is appended to `stmt` with part2 as iteration space.
- * Afterwards the lexical order is shifted and the dependence graph is
- * extended with edges for the newly created statements.
- *
- * @param stmt_num  index into `stmt` of the reference statement
- * @param level     loop level (1-based) to split
- * @param cond      split condition
- * @return          indices of the new statements that were cloned from
- *                  `stmt_num` itself
- * @throws std::invalid_argument  if `stmt_num` or `level` is out of range
- * @throws loop_error             if a dependence check rejects the split
- */
-std::set<int> Loop::split(int stmt_num, int level, const Relation &cond) {
-  // check for sanity of parameters
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-
-  std::set<int> result;
-  int dim = 2 * level - 1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_loop = getStatements(lex, dim - 1);
-
-  Relation cond2 = copy(cond);
-  cond2.simplify();
-  cond2 = EQs_to_GEQs(cond2);
-  Conjunct *c = cond2.single_conjunct();
-  int cur_lex = lex[dim - 1];
-  for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-    int max_level = (*gi).max_tuple_pos();
-    Relation single_cond(max_level);
-    single_cond.and_with_GEQ(*gi);
-
-    // TODO: should decide where to place newly created statements with
-    // complementary split condition from dependence graph.
-    bool place_after;
-    if (max_level == 0)
-      place_after = true;
-    else if ((*gi).get_coef(cond2.set_var(max_level)) < 0)
-      place_after = true;
-    else
-      place_after = false;
-
-    // original statements with split condition,
-    // new statements with complement of split condition
-    int old_num_stmt = stmt.size();
-    std::map<int, int> what_stmt_num;  // original statement -> its clone
-    apply_xform(same_loop);
-    for (std::set<int>::iterator i = same_loop.begin();
-         i != same_loop.end(); i++) {
-      int n = stmt[*i].IS.n_set();
-      Relation part1, part2;
-      if (max_level > n) {
-        // the condition refers to a deeper level than this statement has
-        part1 = copy(stmt[*i].IS);
-        part2 = Relation::False(0);
-      } else {
-        part1 = Intersection(copy(stmt[*i].IS),
-                             Extend_Set(copy(single_cond), n - max_level));
-        part2 = Intersection(copy(stmt[*i].IS),
-                             Extend_Set(Complement(copy(single_cond)),
-                                        n - max_level));
-      }
-
-      // split dependence check
-      if (max_level > level) {
-
-        // compare the original iteration space against part1, walking the
-        // conjuncts and constraints of both in lock step
-        DNF_Iterator di1(stmt[*i].IS.query_DNF());
-        DNF_Iterator di2(part1.query_DNF());
-        for (; di1 && di2; di1++, di2++) {
-          EQ_Iterator ei1 = (*di1)->EQs();
-          EQ_Iterator ei2 = (*di2)->EQs();
-          for (; ei1 && ei2; ei1++, ei2++) {
-            Constr_Vars_Iter cvi1(*ei1);
-            Constr_Vars_Iter cvi2(*ei2);
-            int dimension = (*cvi1).var->get_position();
-            int same = 0;  // counts mismatching terms, despite the name
-            bool identical = !strcmp((*cvi1).var->char_name(),
-                                     (*cvi2).var->char_name());
-            if (identical) {
-              for (; cvi1 && cvi2; cvi1++, cvi2++) {
-                if (((*cvi1).coef != (*cvi2).coef
-                     || (*ei1).get_const() != (*ei2).get_const())
-                    || (strcmp((*cvi1).var->char_name(),
-                               (*cvi2).var->char_name()))) {
-                  same++;
-                }
-              }
-            }
-            if ((same != 0) || !identical) {
-              // the constraint changed: map its position to a dependence
-              // dimension and reject the split on any violating dependence
-              dimension = dimension - 1;
-
-              while (stmt[*i].loop_level[dimension].type == LoopLevelTile)
-                dimension = xform_index[dimension].first;
-
-              dimension = stmt[*i].loop_level[dimension].payload;
-
-              for (int s = 0; s < stmt.size(); s++) {
-                for (DependenceGraph::EdgeList::iterator j =
-                       dep.vertex[s].second.begin();
-                     j != dep.vertex[s].second.end(); j++) {
-                  for (int k = 0; k < j->second.size(); k++) {
-                    DependenceVector dv = j->second[k];
-                    if ((dv.hasNegative(dimension) && !dv.quasi)
-                        || (dv.hasPositive(dimension) && dv.quasi))
-                      throw loop_error(
-                        "loop error: Split is illegal, dependence violation!");
-                  }
-                }
-              }
-            }
-
-            // NOTE(review): this inequality comparison is nested inside
-            // the equality loop here, whereas the part2 comparison below
-            // runs it as a sibling of the equality loop. Nesting is kept
-            // as found -- confirm which structure is intended.
-            GEQ_Iterator gi1 = (*di1)->GEQs();
-            GEQ_Iterator gi2 = (*di2)->GEQs();
-
-            // advance gi1 (not the outer condition iterator gi) together
-            // with gi2 so constraints are compared pairwise
-            for (; gi1 && gi2; gi1++, gi2++) {
-              Constr_Vars_Iter cvi1(*gi1);
-              Constr_Vars_Iter cvi2(*gi2);
-              int dimension = (*cvi1).var->get_position();
-              int same = 0;
-              bool identical = !strcmp((*cvi1).var->char_name(),
-                                       (*cvi2).var->char_name());
-              if (identical) {
-                for (; cvi1 && cvi2; cvi1++, cvi2++) {
-                  if (((*cvi1).coef != (*cvi2).coef
-                       || (*gi1).get_const() != (*gi2).get_const())
-                      || (strcmp((*cvi1).var->char_name(),
-                                 (*cvi2).var->char_name()))) {
-                    same++;
-                  }
-                }
-              }
-              if ((same != 0) || !identical) {
-                dimension = dimension - 1;
-
-                while (stmt[*i].loop_level[dimension].type == LoopLevelTile)
-                  dimension = xform_index[dimension].first;
-
-                dimension = stmt[*i].loop_level[dimension].payload;
-
-                for (int s = 0; s < stmt.size(); s++) {
-                  for (DependenceGraph::EdgeList::iterator j =
-                         dep.vertex[s].second.begin();
-                       j != dep.vertex[s].second.end(); j++) {
-                    for (int k = 0; k < j->second.size(); k++) {
-                      DependenceVector dv = j->second[k];
-                      if ((dv.hasNegative(dimension) && !dv.quasi)
-                          || (dv.hasPositive(dimension) && dv.quasi))
-                        throw loop_error(
-                          "loop error: Split is illegal, dependence violation!");
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-
-        // same comparison, this time original iteration space vs. part2
-        DNF_Iterator di3(stmt[*i].IS.query_DNF());
-        DNF_Iterator di4(part2.query_DNF());
-        for (; di3 && di4; di3++, di4++) {
-          EQ_Iterator ei1 = (*di3)->EQs();
-          EQ_Iterator ei2 = (*di4)->EQs();
-          for (; ei1 && ei2; ei1++, ei2++) {
-            Constr_Vars_Iter cvi1(*ei1);
-            Constr_Vars_Iter cvi2(*ei2);
-            int dimension = (*cvi1).var->get_position();
-            int same = 0;
-            bool identical = !strcmp((*cvi1).var->char_name(),
-                                     (*cvi2).var->char_name());
-            if (identical) {
-              for (; cvi1 && cvi2; cvi1++, cvi2++) {
-                if (((*cvi1).coef != (*cvi2).coef
-                     || (*ei1).get_const() != (*ei2).get_const())
-                    || (strcmp((*cvi1).var->char_name(),
-                               (*cvi2).var->char_name()))) {
-                  same++;
-                }
-              }
-            }
-            if ((same != 0) || !identical) {
-              dimension = dimension - 1;
-
-              while (stmt[*i].loop_level[dimension].type == LoopLevelTile)
-                dimension = xform_index[dimension].first;
-
-              dimension = stmt[*i].loop_level[dimension].payload;
-
-              for (int s = 0; s < stmt.size(); s++) {
-                for (DependenceGraph::EdgeList::iterator j =
-                       dep.vertex[s].second.begin();
-                     j != dep.vertex[s].second.end(); j++) {
-                  for (int k = 0; k < j->second.size(); k++) {
-                    DependenceVector dv = j->second[k];
-                    if ((dv.hasNegative(dimension) && !dv.quasi)
-                        || (dv.hasPositive(dimension) && dv.quasi))
-                      throw loop_error(
-                        "loop error: Split is illegal, dependence violation!");
-                  }
-                }
-              }
-            }
-          }
-
-          GEQ_Iterator gi1 = (*di3)->GEQs();
-          GEQ_Iterator gi2 = (*di4)->GEQs();
-
-          // advance gi1 (not the outer condition iterator gi) together
-          // with gi2 so constraints are compared pairwise
-          for (; gi1 && gi2; gi1++, gi2++) {
-            Constr_Vars_Iter cvi1(*gi1);
-            Constr_Vars_Iter cvi2(*gi2);
-            int dimension = (*cvi1).var->get_position();
-            int same = 0;
-            bool identical = !strcmp((*cvi1).var->char_name(),
-                                     (*cvi2).var->char_name());
-            if (identical) {
-              for (; cvi1 && cvi2; cvi1++, cvi2++) {
-                if (((*cvi1).coef != (*cvi2).coef
-                     || (*gi1).get_const() != (*gi2).get_const())
-                    || (strcmp((*cvi1).var->char_name(),
-                               (*cvi2).var->char_name()))) {
-                  same++;
-                }
-              }
-            }
-            if ((same != 0) || !identical) {
-              dimension = dimension - 1;
-
-              while (stmt[*i].loop_level[dimension].type == LoopLevelTile)
-                dimension = xform_index[dimension].first;
-
-              dimension = stmt[*i].loop_level[dimension].payload;
-
-              for (int s = 0; s < stmt.size(); s++) {
-                for (DependenceGraph::EdgeList::iterator j =
-                       dep.vertex[s].second.begin();
-                     j != dep.vertex[s].second.end(); j++) {
-                  for (int k = 0; k < j->second.size(); k++) {
-                    DependenceVector dv = j->second[k];
-                    if ((dv.hasNegative(dimension) && !dv.quasi)
-                        || (dv.hasPositive(dimension) && dv.quasi))
-                      throw loop_error(
-                        "loop error: Split is illegal, dependence violation!");
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-
-      stmt[*i].IS = part1;
-
-      // only materialise the complement if it can be non-empty under the
-      // known constraints
-      if (Intersection(copy(part2),
-                       Extend_Set(copy(this->known), n - this->known.n_set())).is_upper_bound_satisfiable()) {
-        Statement new_stmt;
-        new_stmt.code = stmt[*i].code->clone();
-        new_stmt.IS = part2;
-        new_stmt.xform = copy(stmt[*i].xform);
-
-        new_stmt.loop_level = stmt[*i].loop_level;
-        stmt.push_back(new_stmt);
-        dep.insert();
-        what_stmt_num[*i] = stmt.size() - 1;
-        if (*i == stmt_num)
-          result.insert(stmt.size() - 1);
-
-        // NOTE(review): both stmt_nesting_level_ and stmt_nesting_level are
-        // used here; confirm they refer to the intended containers.
-        stmt_nesting_level_.push_back(stmt_nesting_level[*i]);
-        std::pair<std::vector<DependenceVector>,
-                  std::vector<DependenceVector> > dv =
-          test_data_dependences(ir_, stmt[*i].code, part1,
-                                stmt[*i].code, part2, freevar, index,
-                                stmt_nesting_level[*i],
-                                stmt_nesting_level[stmt.size() - 1]);
-
-        // count dependences in each direction between the two parts
-        int part1_to_part2 = 0;
-        int part2_to_part1 = 0;
-
-        for (int k = 0; k < dv.first.size(); k++)
-          if (is_dependence_valid_based_on_lex_order(*i,
-                                                     what_stmt_num[*i], dv.first[k], true))
-            part1_to_part2++;
-          else
-            part2_to_part1++;
-
-        if (part1_to_part2 > 0 && part2_to_part1 > 0)
-          throw loop_error(
-            "loop error: Aborting, split resulted in impossible dependence cycle!");
-
-        for (int k = 0; k < dv.second.size(); k++)
-          if (is_dependence_valid_based_on_lex_order(
-                what_stmt_num[*i], *i, dv.second[k], false))
-            part2_to_part1++;
-          else
-            part1_to_part2++;
-
-        if (part1_to_part2 > 0 && part2_to_part1 > 0)
-          throw loop_error(
-            "loop error: Aborting, split resulted in impossible dependence cycle!");
-
-        bool temp_place_after;
-        if (part2_to_part1 > 0)
-          temp_place_after = false;
-        else
-          temp_place_after = true;
-
-        // the first statement of the loop decides the placement; every
-        // other statement has to agree with it
-        if (i == same_loop.begin())
-          place_after = temp_place_after;
-        else {
-          if (temp_place_after != place_after)
-            throw loop_error(
-              "loop error: Aborting, split resulted in impossible dependence cycle!");
-        }
-
-        // NOTE(review): new_stmt was already copied into stmt above;
-        // whether this assignment reaches the stored copy depends on
-        // Relation's copy semantics -- confirm.
-        if (place_after)
-          assign_const(new_stmt.xform, dim - 1, cur_lex + 1);
-        else
-          assign_const(new_stmt.xform, dim - 1, cur_lex - 1);
-
-      }
-
-    }
-    // make adjacent lexical number available for new statements
-    if (place_after) {
-      lex[dim - 1] = cur_lex + 1;
-      shiftLexicalOrder(lex, dim - 1, 1);
-    } else {
-      lex[dim - 1] = cur_lex - 1;
-      shiftLexicalOrder(lex, dim - 1, -1);
-    }
-    // update dependence graph
-    int dep_dim = get_dep_dim_of(stmt_num, level);
-    for (int i = 0; i < old_num_stmt; i++) {
-      // edges leaving i that are added after the edge list has been walked
-      std::vector<std::pair<int, std::vector<DependenceVector> > > D;
-
-      for (DependenceGraph::EdgeList::iterator j =
-             dep.vertex[i].second.begin();
-           j != dep.vertex[i].second.end(); j++) {
-        if (same_loop.find(i) != same_loop.end()) {
-          if (same_loop.find(j->first) != same_loop.end()) {
-            // both ends were split: duplicate the edge between the clones
-            if (what_stmt_num.find(i) != what_stmt_num.end()
-                && what_stmt_num.find(j->first) != what_stmt_num.end())
-              dep.connect(what_stmt_num[i],
-                          what_stmt_num[j->first], j->second);
-            if (place_after
-                && what_stmt_num.find(j->first) != what_stmt_num.end()) {
-              // original -> clone: the distance along the split dimension
-              // is no longer known, so widen it to (-inf, +inf)
-              std::vector<DependenceVector> dvs;
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.is_data_dependence() && dep_dim != -1) {
-                  dv.lbounds[dep_dim] = -posInfinity;
-                  dv.ubounds[dep_dim] = posInfinity;
-                }
-                dvs.push_back(dv);
-              }
-              if (dvs.size() > 0)
-                D.push_back(
-                  std::make_pair(what_stmt_num[j->first], dvs));
-            } else if (!place_after
-                       && what_stmt_num.find(i) != what_stmt_num.end()) {
-              // clone -> original, with the same widening
-              std::vector<DependenceVector> dvs;
-              for (int k = 0; k < j->second.size(); k++) {
-                DependenceVector dv = j->second[k];
-                if (dv.is_data_dependence() && dep_dim != -1) {
-                  dv.lbounds[dep_dim] = -posInfinity;
-                  dv.ubounds[dep_dim] = posInfinity;
-                }
-                dvs.push_back(dv);
-              }
-              if (dvs.size() > 0)
-                dep.connect(what_stmt_num[i], j->first, dvs);
-
-            }
-          } else {
-            // only the source was split: clone inherits the outgoing edge
-            if (what_stmt_num.find(i) != what_stmt_num.end())
-              dep.connect(what_stmt_num[i], j->first, j->second);
-          }
-        } else if (same_loop.find(j->first) != same_loop.end()) {
-          // only the sink was split: clone inherits the incoming edge
-          if (what_stmt_num.find(j->first) != what_stmt_num.end())
-            D.push_back(
-              std::make_pair(what_stmt_num[j->first],
-                             j->second));
-        }
-      }
-
-      for (int j = 0; j < D.size(); j++)
-        dep.connect(i, D[j].first, D[j].second);
-    }
-
-  }
-
-  return result;
-}
-
-/**
- * Tile the loop at `level` of statement `stmt_num`, inserting the tile
- * controlling loop at `outer_level`.
- *
- * The transformation relation of every statement that shares the loop at
- * `outer_level` is extended by two output dimensions at `outer_level`, and
- * a new LoopLevelTile entry is inserted into its loop_level list. With
- * `tile_size == 0` the two new dimensions are simply set to zero;
- * otherwise tiling constraints are added according to `method`.
- *
- * @param stmt_num            index into `stmt` of the reference statement
- * @param level               loop level (1-based) to tile
- * @param tile_size           tile size; 0 selects the "no tiling" case
- * @param outer_level         level at which the controlling loop is placed;
- *                            must satisfy 1 <= outer_level <= level
- * @param method              StridedTile or CountedTile
- * @param alignment_offset    offset applied to the lower bound (>= 0)
- * @param alignment_multiple  multiple the lower bound is aligned to (>= 1)
- * @throws std::invalid_argument  if any parameter fails the checks below
- * @throws loop_error             on a dependence violation or when the
- *                                tile bounds cannot be handled
- */
-void Loop::tile(int stmt_num, int level, int tile_size, int outer_level,
-                TilingMethodType method, int alignment_offset, int alignment_multiple) {
-  // check for sanity of parameters
-  if (tile_size < 0)
-    throw std::invalid_argument("invalid tile size");
-  if (alignment_multiple < 1 || alignment_offset < 0)
-    throw std::invalid_argument("invalid alignment for tile");
-  if (stmt_num < 0 || stmt_num >= stmt.size())
-    throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-  if (level <= 0)
-    throw std::invalid_argument("invalid loop level " + to_string(level));
-  if (level > stmt[stmt_num].loop_level.size())
-    throw std::invalid_argument(
-      "there is no loop level " + to_string(level) + " for statement "
-      + to_string(stmt_num));
-  if (outer_level <= 0 || outer_level > level)
-    throw std::invalid_argument(
-      "invalid tile controlling loop level "
-      + to_string(outer_level));
-
-  int dim = 2 * level - 1;
-  int outer_dim = 2 * outer_level - 1;
-  std::vector<int> lex = getLexicalOrder(stmt_num);
-  std::set<int> same_tiled_loop = getStatements(lex, dim - 1);
-  std::set<int> same_tile_controlling_loop = getStatements(lex,
-                                                           outer_dim - 1);
-
-  // legality check against every data dependence in the graph
-  for (int i = 0; i < stmt.size(); i++) {
-    for (DependenceGraph::EdgeList::iterator j =
-           dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();
-         j++) {
-      for (int k = 0; k < j->second.size(); k++) {
-        DependenceVector dv = j->second[k];
-        int dim2 = level - 1;
-        if ((dv.type != DEP_CONTROL) && (dv.type != DEP_UNKNOWN)) {
-          // follow tile levels until an original loop level is reached,
-          // then take its dependence dimension
-          while (stmt[i].loop_level[dim2].type == LoopLevelTile) {
-            dim2 = stmt[i].loop_level[dim2].payload;
-          }
-          dim2 = stmt[i].loop_level[dim2].payload;
-
-          if ((dv.hasNegative(dim2) && (!dv.quasi))
-              || (dv.quasi && dv.hasPositive(dim2))) {
-            for (int l = outer_level; l < level; l++)
-              if (stmt[i].loop_level[l - 1].type
-                  != LoopLevelTile) {
-                if (dv.isCarried(
-                      stmt[i].loop_level[l - 1].payload))
-                  throw loop_error(
-                    "loop error: Tiling is illegal, dependence violation!");
-              } else {
-                // level l is a tile level here, so its payload is used
-                // directly
-                int dim3 = stmt[i].loop_level[l - 1].payload;
-                if (dim3 < level - 1)
-                  if (dv.isCarried(dim3))
-                    throw loop_error(
-                      "loop error: Tiling is illegal, dependence violation!");
-              }
-          }
-        }
-      }
-    }
-  }
-  // special case for no tiling
-  if (tile_size == 0) {
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-         i != same_tile_controlling_loop.end(); i++) {
-      // r keeps the first 2*outer_level-1 dimensions, inserts two zero
-      // dimensions, and shifts the remaining dimensions by two
-      Relation r(stmt[*i].xform.n_out(), stmt[*i].xform.n_out() + 2);
-      F_And *f_root = r.add_and();
-      for (int j = 1; j <= 2 * outer_level - 1; j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.input_var(j), 1);
-        h.update_coef(r.output_var(j), -1);
-      }
-      EQ_Handle h1 = f_root->add_EQ();
-      h1.update_coef(r.output_var(2 * outer_level), 1);
-      EQ_Handle h2 = f_root->add_EQ();
-      h2.update_coef(r.output_var(2 * outer_level + 1), 1);
-      for (int j = 2 * outer_level; j <= stmt[*i].xform.n_out(); j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.input_var(j), 1);
-        h.update_coef(r.output_var(j + 2), -1);
-      }
-
-      stmt[*i].xform = Composition(copy(r), stmt[*i].xform);
-    }
-  }
-  // normal tiling
-  else {
-    std::set<int> private_stmt;
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-         i != same_tile_controlling_loop.end(); i++) {
-//       if (same_tiled_loop.find(*i) == same_tiled_loop.end() && !is_single_iteration(getNewIS(*i), dim))
-//         same_tiled_loop.insert(*i);
-
-      // should test dim's value directly but it is ok for now
-//       if (same_tiled_loop.find(*i) == same_tiled_loop.end() && get_const(stmt[*i].xform, dim+1, Output_Var) == posInfinity)
-      if (same_tiled_loop.find(*i) == same_tiled_loop.end()
-          && overflow.find(*i) != overflow.end())
-        private_stmt.insert(*i);
-    }
-
-    // extract the union of the iteration space to be considered
-    Relation hull;
-    {
-      Tuple < Relation > r_list;
-      Tuple<int> r_mask;
-
-      for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-           i != same_tile_controlling_loop.end(); i++)
-        if (private_stmt.find(*i) == private_stmt.end()) {
-          Relation r = project_onto_levels(getNewIS(*i), dim + 1,
-                                           true);
-          for (int j = outer_dim; j < dim; j++)
-            r = Project(r, j + 1, Set_Var);
-          for (int j = 0; j < outer_dim; j += 2)
-            r = Project(r, j + 1, Set_Var);
-          r_list.append(r);
-          r_mask.append(1);
-        }
-
-      hull = Hull(r_list, r_mask, 1, true);
-    }
-
-    // extract the bound of the dimension to be tiled
-    Relation bound = get_loop_bound(hull, dim);
-    if (!bound.has_single_conjunct()) {
-      // further simplify the bound
-      hull = Approximate(hull);
-      bound = get_loop_bound(hull, dim);
-
-      // project away outer dimensions one by one until a single conjunct
-      // remains
-      int i = outer_dim - 2;
-      while (!bound.has_single_conjunct() && i >= 0) {
-        hull = Project(hull, i + 1, Set_Var);
-        bound = get_loop_bound(hull, dim);
-        i -= 2;
-      }
-
-      if (!bound.has_single_conjunct())
-        throw loop_error("cannot handle tile bounds");
-    }
-
-    // separate lower and upper bounds
-    std::vector<GEQ_Handle> lb_list, ub_list;
-    {
-      Conjunct *c = bound.query_DNF()->single_conjunct();
-      for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-        int coef = (*gi).get_coef(bound.set_var(dim + 1));
-        if (coef < 0)
-          ub_list.push_back(*gi);
-        else if (coef > 0)
-          lb_list.push_back(*gi);
-      }
-    }
-    if (lb_list.size() == 0)
-      throw loop_error(
-        "unable to calculate tile controlling loop lower bound");
-    if (ub_list.size() == 0)
-      throw loop_error(
-        "unable to calculate tile controlling loop upper bound");
-
-    // find the simplest lower bound for StridedTile or simplest iteration count for CountedTile
-    int simplest_lb = 0, simplest_ub = 0;
-    if (method == StridedTile) {
-      int best_cost = INT_MAX;
-      for (int i = 0; i < lb_list.size(); i++) {
-        int cost = 0;
-        for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
-          switch ((*ci).var->kind()) {
-          case Input_Var: {
-            cost += 5;
-            break;
-          }
-          case Global_Var: {
-            cost += 2;
-            break;
-          }
-          default:
-            cost += 15;
-            break;
-          }
-        }
-
-        if (cost < best_cost) {
-          best_cost = cost;
-          simplest_lb = i;
-        }
-      }
-    } else if (method == CountedTile) {
-      int best_cost = INT_MAX;
-      for (int i = 0; i < lb_list.size(); i++)
-        for (int j = 0; j < ub_list.size(); j++) {
-          // per-pair coefficient sums; created fresh for each
-          // (lower, upper) pair so earlier pairs do not leak into the cost
-          std::map<Variable_ID, coef_t> s1, s2, s3;
-          int cost = 0;
-
-          for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              s1[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Global_Var: {
-              s2[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Exists_Var:
-            case Wildcard_Var: {
-              s3[(*ci).var] += (*ci).coef;
-              break;
-            }
-            default:
-              cost = INT_MAX - 2;
-              break;
-            }
-          }
-
-          for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              s1[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Global_Var: {
-              s2[(*ci).var] += (*ci).coef;
-              break;
-            }
-            case Exists_Var:
-            case Wildcard_Var: {
-              s3[(*ci).var] += (*ci).coef;
-              break;
-            }
-            default:
-              if (cost == INT_MAX - 2)
-                cost = INT_MAX - 1;
-              else
-                cost = INT_MAX - 3;
-              break;
-            }
-          }
-
-          // only variables whose coefficients do not cancel between the
-          // two bounds contribute to the cost
-          if (cost == 0) {
-            for (std::map<Variable_ID, coef_t>::iterator k =
-                   s1.begin(); k != s1.end(); k++)
-              if ((*k).second != 0)
-                cost += 5;
-            for (std::map<Variable_ID, coef_t>::iterator k =
-                   s2.begin(); k != s2.end(); k++)
-              if ((*k).second != 0)
-                cost += 2;
-            for (std::map<Variable_ID, coef_t>::iterator k =
-                   s3.begin(); k != s3.end(); k++)
-              if ((*k).second != 0)
-                cost += 15;
-          }
-
-          if (cost < best_cost) {
-            best_cost = cost;
-            simplest_lb = i;
-            simplest_ub = j;
-          }
-        }
-    }
-
-    // prepare the new transformation relations
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-         i != same_tile_controlling_loop.end(); i++) {
-      Relation r(stmt[*i].xform.n_out(), stmt[*i].xform.n_out() + 2);
-      F_And *f_root = r.add_and();
-      for (int j = 0; j < outer_dim - 1; j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.output_var(j + 1), 1);
-        h.update_coef(r.input_var(j + 1), -1);
-      }
-
-      for (int j = outer_dim - 1; j < stmt[*i].xform.n_out(); j++) {
-        EQ_Handle h = f_root->add_EQ();
-        h.update_coef(r.output_var(j + 3), 1);
-        h.update_coef(r.input_var(j + 1), -1);
-      }
-
-      EQ_Handle h = f_root->add_EQ();
-      h.update_coef(r.output_var(outer_dim), 1);
-      h.update_const(-lex[outer_dim - 1]);
-
-      stmt[*i].xform = Composition(r, stmt[*i].xform);
-    }
-
-    // add tiling constraints.
-    for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-         i != same_tile_controlling_loop.end(); i++) {
-      F_And *f_super_root = stmt[*i].xform.and_with_and();
-      F_Exists *f_exists = f_super_root->add_exists();
-      F_And *f_root = f_exists->add_and();
-
-      // create a lower bound variable for easy formula creation later
-      Variable_ID aligned_lb;
-      {
-        Variable_ID lb = f_exists->declare();
-        coef_t coef = lb_list[simplest_lb].get_coef(
-          bound.set_var(dim + 1));
-        if (coef == 1) { // e.g. if i >= m+5, then LB = m+5
-          EQ_Handle h = f_root->add_EQ();
-          h.update_coef(lb, 1);
-          for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos != dim + 1)
-                h.update_coef(stmt[*i].xform.output_var(pos),
-                              (*ci).coef);
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g,
-                                             (*ci).var->function_of());
-              h.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h.update_const(lb_list[simplest_lb].get_const());
-        } else { // e.g. if 2i >= m+5, then m+5 <= 2*LB < m+5+2
-          GEQ_Handle h1 = f_root->add_GEQ();
-          GEQ_Handle h2 = f_root->add_GEQ();
-          for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos == dim + 1) {
-                h1.update_coef(lb, (*ci).coef);
-                h2.update_coef(lb, -(*ci).coef);
-              } else {
-                h1.update_coef(stmt[*i].xform.output_var(pos),
-                               (*ci).coef);
-                h2.update_coef(stmt[*i].xform.output_var(pos),
-                               -(*ci).coef);
-              }
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g,
-                                             (*ci).var->function_of());
-              h1.update_coef(v, (*ci).coef);
-              h2.update_coef(v, -(*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h1.update_const(lb_list[simplest_lb].get_const());
-          h2.update_const(-lb_list[simplest_lb].get_const());
-          h2.update_const(coef - 1);
-        }
-
-        Variable_ID offset_lb;
-        if (alignment_offset == 0)
-          offset_lb = lb;
-        else {
-          // offset_lb = lb - alignment_offset
-          EQ_Handle h = f_root->add_EQ();
-          offset_lb = f_exists->declare();
-          h.update_coef(offset_lb, 1);
-          h.update_coef(lb, -1);
-          h.update_const(alignment_offset);
-        }
-
-        if (alignment_multiple == 1) { // trivial
-          aligned_lb = offset_lb;
-        } else { // e.g. to align at 4, aligned_lb = 4*alpha && LB-4 < 4*alpha <= LB
-          aligned_lb = f_exists->declare();
-          Variable_ID e = f_exists->declare();
-
-          EQ_Handle h = f_root->add_EQ();
-          h.update_coef(aligned_lb, 1);
-          h.update_coef(e, -alignment_multiple);
-
-          GEQ_Handle h1 = f_root->add_GEQ();
-          GEQ_Handle h2 = f_root->add_GEQ();
-          h1.update_coef(e, alignment_multiple);
-          h2.update_coef(e, -alignment_multiple);
-          h1.update_coef(offset_lb, -1);
-          h2.update_coef(offset_lb, 1);
-          h1.update_const(alignment_multiple - 1);
-        }
-      }
-
-      // create an upper bound variable for easy formula creation later
-      Variable_ID ub = f_exists->declare();
-      {
-        coef_t coef = -ub_list[simplest_ub].get_coef(
-          bound.set_var(dim + 1));
-        if (coef == 1) { // e.g. if i <= m+5, then UB = m+5
-          EQ_Handle h = f_root->add_EQ();
-          h.update_coef(ub, -1);
-          for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos != dim + 1)
-                h.update_coef(stmt[*i].xform.output_var(pos),
-                              (*ci).coef);
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g,
-                                             (*ci).var->function_of());
-              h.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h.update_const(ub_list[simplest_ub].get_const());
-        } else { // e.g. if 2i <= m+5, then m+5-2 < 2*UB <= m+5
-          GEQ_Handle h1 = f_root->add_GEQ();
-          GEQ_Handle h2 = f_root->add_GEQ();
-          for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) {
-            switch ((*ci).var->kind()) {
-            case Input_Var: {
-              int pos = (*ci).var->get_position();
-              if (pos == dim + 1) {
-                h1.update_coef(ub, -(*ci).coef);
-                h2.update_coef(ub, (*ci).coef);
-              } else {
-                h1.update_coef(stmt[*i].xform.output_var(pos),
-                               -(*ci).coef);
-                h2.update_coef(stmt[*i].xform.output_var(pos),
-                               (*ci).coef);
-              }
-              break;
-            }
-            case Global_Var: {
-              Global_Var_ID g = (*ci).var->get_global_var();
-              Variable_ID v;
-              if (g->arity() == 0)
-                v = stmt[*i].xform.get_local(g);
-              else
-                v = stmt[*i].xform.get_local(g,
-                                             (*ci).var->function_of());
-              h1.update_coef(v, -(*ci).coef);
-              h2.update_coef(v, (*ci).coef);
-              break;
-            }
-            default:
-              throw loop_error("cannot handle tile bounds");
-            }
-          }
-          h1.update_const(-ub_list[simplest_ub].get_const());
-          h2.update_const(ub_list[simplest_ub].get_const());
-          h1.update_const(coef - 1);
-        }
-      }
-
-      // insert tile controlling loop constraints
-      if (method == StridedTile) { // e.g. ii = LB + 32 * alpha && alpha >= 0
-        Variable_ID e = f_exists->declare();
-        GEQ_Handle h1 = f_root->add_GEQ();
-        h1.update_coef(e, 1);
-
-        EQ_Handle h2 = f_root->add_EQ();
-        h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1), 1);
-        h2.update_coef(e, -tile_size);
-        h2.update_coef(aligned_lb, -1);
-      } else if (method == CountedTile) { // e.g. 0 <= ii < ceiling((UB-LB+1)/32)
-        GEQ_Handle h1 = f_root->add_GEQ();
-        h1.update_coef(stmt[*i].xform.output_var(outer_dim + 1), 1);
-
-        GEQ_Handle h2 = f_root->add_GEQ();
-        h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1),
-                       -tile_size);
-        h2.update_coef(aligned_lb, -1);
-        h2.update_coef(ub, 1);
-      }
-
-      // special care for private statements like overflow assignment
-      if (private_stmt.find(*i) != private_stmt.end()) { // e.g. ii <= UB
-        GEQ_Handle h = f_root->add_GEQ();
-        h.update_coef(stmt[*i].xform.output_var(outer_dim + 1), -1);
-        h.update_coef(ub, 1);
-      }
-      // if (private_stmt.find(*i) != private_stmt.end()) {
-      //   if (stmt[*i].xform.n_out() > dim+3) { // e.g. ii <= UB && i = ii
-      //     GEQ_Handle h = f_root->add_GEQ();
-      //     h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
-      //     h.update_coef(ub, 1);
-
-      //     stmt[*i].xform = Project(stmt[*i].xform, dim+3, Output_Var);
-      //     f_root = stmt[*i].xform.and_with_and();
-      //     EQ_Handle h1 = f_root->add_EQ();
-      //     h1.update_coef(stmt[*i].xform.output_var(dim+3), 1);
-      //     h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
-      //   }
-      //   else if (method == StridedTile) { // e.g. ii <= UB since i does not exist
-      //     GEQ_Handle h = f_root->add_GEQ();
-      //     h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1);
-      //     h.update_coef(ub, 1);
-      //   }
-      // }
-
-      // restrict original loop index inside the tile
-      else {
-        if (method == StridedTile) { // e.g. ii <= i < ii + tile_size
-          GEQ_Handle h1 = f_root->add_GEQ();
-          h1.update_coef(stmt[*i].xform.output_var(dim + 3), 1);
-          h1.update_coef(stmt[*i].xform.output_var(outer_dim + 1),
-                         -1);
-
-          GEQ_Handle h2 = f_root->add_GEQ();
-          h2.update_coef(stmt[*i].xform.output_var(dim + 3), -1);
-          h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1), 1);
-          h2.update_const(tile_size - 1);
-        } else if (method == CountedTile) { // e.g. LB+32*ii <= i < LB+32*ii+tile_size
-          GEQ_Handle h1 = f_root->add_GEQ();
-          h1.update_coef(stmt[*i].xform.output_var(outer_dim + 1),
-                         -tile_size);
-          h1.update_coef(stmt[*i].xform.output_var(dim + 3), 1);
-          h1.update_coef(aligned_lb, -1);
-
-          GEQ_Handle h2 = f_root->add_GEQ();
-          h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1),
-                         tile_size);
-          h2.update_coef(stmt[*i].xform.output_var(dim + 3), -1);
-          h2.update_const(tile_size - 1);
-          h2.update_coef(aligned_lb, 1);
-        }
-      }
-    }
-  }
-
-  // update loop level information
-  for (std::set<int>::iterator i = same_tile_controlling_loop.begin();
-       i != same_tile_controlling_loop.end(); i++) {
-    // existing tile levels that refer to a level at or below the insertion
-    // point now refer to a level one deeper
-    for (int j = 1; j <= stmt[*i].loop_level.size(); j++)
-      switch (stmt[*i].loop_level[j - 1].type) {
-      case LoopLevelOriginal:
-        break;
-      case LoopLevelTile:
-        if (stmt[*i].loop_level[j - 1].payload >= outer_level)
-          stmt[*i].loop_level[j - 1].payload++;
-        break;
-      default:
-        throw loop_error(
-          "unknown loop level type for statement "
-          + to_string(*i));
-      }
-
-    // the new controlling loop records the (shifted) level it tiles
-    LoopLevel ll;
-    ll.type = LoopLevelTile;
-    ll.payload = level + 1;
-    ll.parallel_level = 0;
-    stmt[*i].loop_level.insert(
-      stmt[*i].loop_level.begin() + (outer_level - 1), ll);
-  }
-}
-
- /**
-  * Unroll the loop at depth `level` that surrounds statement `stmt_num`.
-  *
-  * Steps performed, in order:
-  *   1. validate the arguments;
-  *   2. scan the dependence graph and reject the unroll if a carried
-  *      dependence would be violated;
-  *   3. build the common iteration-space hull of all statements in the loop
-  *      and extract the bound/stride of the unrolled dimension;
-  *   4. compute the "overflow" (iterations left over after grouping by
-  *      `unroll_amount`), possibly emitting a helper statement that assigns a
-  *      fresh overflow variable;
-  *   5. split the loop on the overflow conditions;
-  *   6. append the unrolled statement copies (one lumped statement when the
-  *      loop body allows it, otherwise one copy per statement and offset) and
-  *      rewrite the dependence graph accordingly.
-  *
-  * @param stmt_num       index into `stmt` of a statement inside the loop.
-  * @param level          1-based loop level to unroll.
-  * @param unroll_amount  unroll factor; 1 is a no-op, 0 asks for the factor to
-  *                       be derived from the constant bound difference.
-  * @return the union of the statement sets returned by the two split() calls,
-  *         or an empty set when nothing was done (factor 1, single-iteration
-  *         loop, or factor 0 with no determinable iteration count).
-  * @throws std::invalid_argument for a negative factor, an out-of-range
-  *         statement number or an out-of-range level.
-  * @throws loop_error when the unroll is illegal, the bound cannot be
-  *         extracted, there is more than one stride, or the overflow amount
-  *         cannot be computed.
-  */
- std::set<int> Loop::unroll(int stmt_num, int level, int unroll_amount) {
-   // check for sanity of parameters
-   if (unroll_amount < 0)
-     throw std::invalid_argument(
-         "invalid unroll amount " + to_string(unroll_amount));
-   if (stmt_num < 0 || stmt_num >= stmt.size())
-     throw std::invalid_argument("invalid statement " + to_string(stmt_num));
-   if (level <= 0 || level > stmt[stmt_num].loop_level.size())
-     throw std::invalid_argument("invalid loop level " + to_string(level));
-
-   // position of this loop's iterator in the 2*level+1 schedule layout
-   int dim = 2 * level - 1;
-   std::vector<int> lex = getLexicalOrder(stmt_num);
-   std::set<int> same_loop = getStatements(lex, dim - 1);
-
-   // nothing to do
-   if (unroll_amount == 1)
-     return std::set<int>();
-
-   // ---- legality check against every dependence vector in the graph ----
-   for (int i = 0; i < stmt.size(); i++) {
-     for (DependenceGraph::EdgeList::iterator j =
-              dep.vertex[i].second.begin();
-          j != dep.vertex[i].second.end(); j++) {
-       for (int k = 0; k < j->second.size(); k++) {
-         DependenceVector dv = j->second[k];
-         int dim2 = level - 1;
-         if ((dv.type != DEP_CONTROL) && (dv.type != DEP_UNKNOWN)) {
-
-           // NOTE(review): xform_index is not declared in this function;
-           // confirm it maps a tile level to the level it controls.
-           while (stmt[i].loop_level[dim2].type == LoopLevelTile) {
-             dim2 = xform_index[dim2].first;
-           }
-           dim2 = stmt[i].loop_level[dim2].payload;
-
-           if (dv.isCarried(dim2) && (dv.hasNegative(dim2) && !dv.quasi))
-             throw loop_error(
-                 "loop error: Unrolling is illegal, dependence violation!");
-
-           if (dv.isCarried(dim2) && (dv.hasPositive(dim2) && dv.quasi))
-             throw loop_error(
-                 "loop error: Unrolling is illegal, dependence violation!");
-           bool safe = false;
-
-           if (dv.isCarried(dim2)) {
-
-             // a dependence distance larger than the unroll factor (or an
-             // infinite bound on the relevant side) is treated as safe
-             if (!dv.quasi) {
-               if (dv.lbounds[dim2] != posInfinity) {
-                 if (dv.lbounds[dim2] != negInfinity)
-                   if (dv.lbounds[dim2] > unroll_amount)
-                     safe = true;
-               } else
-                 safe = true;
-             } else {
-               if (dv.ubounds[dim2] != negInfinity) {
-                 if (dv.ubounds[dim2] != posInfinity)
-                   if ((-(dv.ubounds[dim2])) > unroll_amount)
-                     safe = true;
-               } else
-                 safe = true;
-             }
-
-             if (!safe) {
-               // NOTE(review): `n` is not declared in this function; confirm
-               // which object supplies it.
-               for (int l = level; l <= (n - 1) / 2; l++) {
-                 int dim3 = l - 1;
-
-                 if (stmt[i].loop_level[dim3].type != LoopLevelTile)
-                   dim3 = stmt[i].loop_level[dim3].payload;
-                 else {
-                   // NOTE(review): the condition tests dim2 while the body
-                   // only changes dim3, so this loop either never runs or
-                   // never ends. It probably should test dim3; left as-is
-                   // because the payload/index convention is not visible
-                   // here.
-                   while (stmt[i].loop_level[dim2].type == LoopLevelTile) {
-                     dim3 = stmt[i].loop_level[dim3].payload;
-                   }
-                   dim3 = stmt[i].loop_level[dim3].payload;
-                 }
-
-                 if (dim3 > dim2) {
-                   if ((dv.hasPositive(dim3) && !dv.quasi)
-                       || (dv.hasNegative(dim3) && dv.quasi))
-                     break;
-                   else if ((dv.hasNegative(dim3) && !dv.quasi)
-                       || (dv.hasPositive(dim3) && dv.quasi))
-                     throw loop_error(
-                         "loop error: Unrolling is illegal, dependence violation!");
-                 }
-               }
-             }
-           }
-         }
-       }
-     }
-   }
-
-   // extract the intersection of the iteration space to be considered
-   Relation hull = Relation::True(level);
-   apply_xform(same_loop);
-   for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end();
-        i++) {
-     if (stmt[*i].IS.is_upper_bound_satisfiable()) {
-       // project the statement's iteration space onto its first `level` dims
-       Relation mapping(stmt[*i].IS.n_set(), level);
-       F_And *f_root = mapping.add_and();
-       for (int j = 1; j <= level; j++) {
-         EQ_Handle h = f_root->add_EQ();
-         h.update_coef(mapping.input_var(j), 1);
-         h.update_coef(mapping.output_var(j), -1);
-       }
-       hull = Intersection(hull,
-           Range(Restrict_Domain(mapping, copy(stmt[*i].IS))));
-       hull.simplify(2, 4);
-     }
-   }
-   for (int i = 1; i <= level; i++) {
-     std::string name = tmp_loop_var_name_prefix + to_string(i);
-     hull.name_set_var(i, name);
-   }
-   hull.setup_names();
-
-   // extract the exact loop bound of the dimension to be unrolled
-   if (is_single_loop_iteration(hull, level, this->known))
-     return std::set<int>();
-   Relation bound = get_loop_bound(hull, level, this->known);
-   if (!bound.has_single_conjunct() || !bound.is_satisfiable()
-       || bound.is_tautology())
-     throw loop_error("unable to extract loop bound for unrolling");
-
-   // extract the loop stride
-   EQ_Handle stride_eq;
-   int stride = 1;
-   {
-     bool simple_stride = true;
-     int strides = countStrides(bound.query_DNF()->single_conjunct(),
-         bound.set_var(level), stride_eq, simple_stride);
-     if (strides > 1)
-       throw loop_error("too many strides");
-     else if (strides == 1) {
-       int sign = stride_eq.get_coef(bound.set_var(level));
-       Constr_Vars_Iter it(stride_eq, true);
-       stride = abs((*it).coef / sign);
-     }
-   }
-
-   // separate lower and upper bounds
-   std::vector<GEQ_Handle> lb_list, ub_list;
-   {
-     Conjunct *c = bound.query_DNF()->single_conjunct();
-     for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
-       int coef = (*gi).get_coef(bound.set_var(level));
-       if (coef < 0)
-         ub_list.push_back(*gi);
-       else if (coef > 0)
-         lb_list.push_back(*gi);
-     }
-   }
-
-   // simplify overflow expression for each pair of upper and lower bounds;
-   // overflow_table[i][j] maps each variable to its coefficient in the
-   // expression for the (lb i, ub j) pair, the NULL key holds the constant term
-   std::vector<std::vector<std::map<Variable_ID, int> > > overflow_table(
-       lb_list.size(),
-       std::vector<std::map<Variable_ID, int> >(ub_list.size(),
-           std::map<Variable_ID, int>()));
-   bool is_overflow_simplifiable = true;
-   for (int i = 0; i < lb_list.size(); i++) {
-     if (!is_overflow_simplifiable)
-       break;
-
-     for (int j = 0; j < ub_list.size(); j++) {
-       // lower bound or upper bound has non-unit coefficient, can't simplify
-       if (ub_list[j].get_coef(bound.set_var(level)) != -1
-           || lb_list[i].get_coef(bound.set_var(level)) != 1) {
-         is_overflow_simplifiable = false;
-         break;
-       }
-
-       // accumulate the upper bound's terms
-       for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) {
-         switch ((*ci).var->kind()) {
-         case Input_Var: {
-           if ((*ci).var != bound.set_var(level))
-             overflow_table[i][j][(*ci).var] += (*ci).coef;
-
-           break;
-         }
-         case Global_Var: {
-           // NOTE(review): v is not read afterwards; the get_local call is
-           // kept in case it is needed for a side effect on `bound`.
-           Global_Var_ID g = (*ci).var->get_global_var();
-           Variable_ID v;
-           if (g->arity() == 0)
-             v = bound.get_local(g);
-           else
-             v = bound.get_local(g, (*ci).var->function_of());
-           overflow_table[i][j][(*ci).var] += (*ci).coef;
-           break;
-         }
-         default:
-           throw loop_error("failed to calculate overflow amount");
-         }
-       }
-       overflow_table[i][j][NULL] += ub_list[j].get_const();
-
-       // add the lower bound's terms, dropping variables that cancel out
-       for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) {
-         switch ((*ci).var->kind()) {
-         case Input_Var: {
-           if ((*ci).var != bound.set_var(level)) {
-             overflow_table[i][j][(*ci).var] += (*ci).coef;
-             if (overflow_table[i][j][(*ci).var] == 0)
-               overflow_table[i][j].erase(
-                   overflow_table[i][j].find((*ci).var));
-           }
-           break;
-         }
-         case Global_Var: {
-           Global_Var_ID g = (*ci).var->get_global_var();
-           Variable_ID v;
-           if (g->arity() == 0)
-             v = bound.get_local(g);
-           else
-             v = bound.get_local(g, (*ci).var->function_of());
-           overflow_table[i][j][(*ci).var] += (*ci).coef;
-           if (overflow_table[i][j][(*ci).var] == 0)
-             overflow_table[i][j].erase(
-                 overflow_table[i][j].find((*ci).var));
-           break;
-         }
-         default:
-           throw loop_error("failed to calculate overflow amount");
-         }
-       }
-       overflow_table[i][j][NULL] += lb_list[i].get_const();
-
-       overflow_table[i][j][NULL] += stride;
-       // a map with only the NULL key is a compile-time constant; use it to
-       // pick the factor when none was given, or to cap the requested one
-       if (unroll_amount == 0
-           || (overflow_table[i][j].size() == 1
-               && overflow_table[i][j][NULL] / stride < unroll_amount))
-         unroll_amount = overflow_table[i][j][NULL] / stride;
-     }
-   }
-
-   // loop iteration count can't be determined, bail out gracefully
-   if (unroll_amount == 0)
-     return std::set<int>();
-
-   // further simplify overflow calculation using the coefficients' modulus
-   if (is_overflow_simplifiable) {
-     for (int i = 0; i < lb_list.size(); i++)
-       for (int j = 0; j < ub_list.size(); j++)
-         if (stride == 1) {
-           for (std::map<Variable_ID, int>::iterator k =
-                    overflow_table[i][j].begin();
-                k != overflow_table[i][j].end();)
-             if ((*k).first != NULL) {
-               int t = int_mod_hat((*k).second, unroll_amount);
-               if (t == 0) {
-                 overflow_table[i][j].erase(k++);
-               } else {
-                 int t2 = hull.query_variable_mod((*k).first, unroll_amount);
-                 if (t2 != INT_MAX) {
-                   overflow_table[i][j][NULL] += t * t2;
-                   overflow_table[i][j].erase(k++);
-                 } else {
-                   (*k).second = t;
-                   k++;
-                 }
-               }
-             } else
-               k++;
-
-           overflow_table[i][j][NULL] = int_mod_hat(
-               overflow_table[i][j][NULL], unroll_amount);
-
-           // Since we don't have MODULO instruction in SUIF yet (only MOD),
-           // make all coef positive in the final formula
-           for (std::map<Variable_ID, int>::iterator k =
-                    overflow_table[i][j].begin();
-                k != overflow_table[i][j].end(); k++)
-             if ((*k).second < 0)
-               (*k).second += unroll_amount;
-         }
-   }
-
-   // build overflow statement
-   CG_outputBuilder *ocg = ir->builder();
-   CG_outputRepr *overflow_code = NULL;
-   Relation cond_upper(level), cond_lower(level);
-   Relation overflow_constraint(0);
-   F_And *overflow_constraint_root = overflow_constraint.add_and();
-   std::vector<Free_Var_Decl *> over_var_list;
-   if (is_overflow_simplifiable && lb_list.size() == 1) {
-     // single lower bound: peel the overflow off each upper bound
-     for (int i = 0; i < ub_list.size(); i++) {
-       if (overflow_table[0][i].size() == 1) {
-         // upper splitting condition
-         GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-         h.update_const(
-             ((overflow_table[0][i][NULL] / stride) % unroll_amount)
-                 * -stride);
-       } else {
-         // upper splitting condition
-         std::string over_name = overflow_var_name_prefix
-             + to_string(overflow_var_name_counter++);
-         Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-         over_var_list.push_back(over_free_var);
-         GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-         h.update_coef(cond_upper.get_local(over_free_var), -stride);
-
-         // insert constraint 0 <= overflow < unroll_amount
-         Variable_ID v = overflow_constraint.get_local(over_free_var);
-         GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-         h1.update_coef(v, 1);
-         GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-         h2.update_coef(v, -1);
-         h2.update_const(unroll_amount - 1);
-
-         // create overflow assignment
-         bound.setup_names();
-         CG_outputRepr *rhs = NULL;
-         for (std::map<Variable_ID, int>::iterator j =
-                  overflow_table[0][i].begin();
-              j != overflow_table[0][i].end(); j++)
-           if ((*j).first != NULL) {
-             CG_outputRepr *t = ocg->CreateIdent((*j).first->name());
-             if ((*j).second != 1)
-               t = ocg->CreateTimes(ocg->CreateInt((*j).second), t);
-             rhs = ocg->CreatePlus(rhs, t);
-           } else if ((*j).second != 0)
-             rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second));
-
-         if (stride != 1)
-           rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride));
-         rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-
-         CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-         init_code = ocg->StmtListAppend(init_code,
-             ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-         lhs = ocg->CreateIdent(over_name);
-         overflow_code = ocg->StmtListAppend(overflow_code,
-             ocg->CreateAssignment(0, lhs, rhs));
-       }
-     }
-
-     // lower splitting condition
-     cond_lower.and_with_GEQ(lb_list[0]);
-   } else if (is_overflow_simplifiable && ub_list.size() == 1) {
-     // single upper bound: peel the overflow off each lower bound
-     for (int i = 0; i < lb_list.size(); i++) {
-
-       if (overflow_table[i][0].size() == 1) {
-         // lower splitting condition
-         GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]);
-         h.update_const(overflow_table[i][0][NULL] * -stride);
-       } else {
-         // lower splitting condition
-         std::string over_name = overflow_var_name_prefix
-             + to_string(overflow_var_name_counter++);
-         Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-         over_var_list.push_back(over_free_var);
-         GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]);
-         h.update_coef(cond_lower.get_local(over_free_var), -stride);
-
-         // insert constraint 0 <= overflow < unroll_amount
-         Variable_ID v = overflow_constraint.get_local(over_free_var);
-         GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-         h1.update_coef(v, 1);
-         GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-         h2.update_coef(v, -1);
-         h2.update_const(unroll_amount - 1);
-
-         // create overflow assignment; the table is indexed [lb][ub] and
-         // this branch has exactly one upper bound, so the entry for lower
-         // bound i is overflow_table[i][0] (indexing [0][i] would read the
-         // wrong entry and run past the single column for i >= 1)
-         bound.setup_names();
-         CG_outputRepr *rhs = NULL;
-         for (std::map<Variable_ID, int>::iterator j =
-                  overflow_table[i][0].begin();
-              j != overflow_table[i][0].end(); j++)
-           if ((*j).first != NULL) {
-             CG_outputRepr *t = ocg->CreateIdent((*j).first->name());
-             if ((*j).second != 1)
-               t = ocg->CreateTimes(ocg->CreateInt((*j).second), t);
-             rhs = ocg->CreatePlus(rhs, t);
-           } else if ((*j).second != 0)
-             rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second));
-
-         if (stride != 1)
-           rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride));
-         rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-
-         CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-         init_code = ocg->StmtListAppend(init_code,
-             ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-         lhs = ocg->CreateIdent(over_name);
-         overflow_code = ocg->StmtListAppend(overflow_code,
-             ocg->CreateAssignment(0, lhs, rhs));
-       }
-     }
-
-     // upper splitting condition
-     cond_upper.and_with_GEQ(ub_list[0]);
-   } else {
-     // general case: overflow = ((min(ubs) - max(lbs) + 1) / stride) % amount
-     std::string over_name = overflow_var_name_prefix
-         + to_string(overflow_var_name_counter++);
-     Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name);
-     over_var_list.push_back(over_free_var);
-
-     Tuple<CG_outputRepr *> lb_repr_list, ub_repr_list;
-     for (int i = 0; i < lb_list.size(); i++) {
-       lb_repr_list.append(
-           outputLBasRepr(ocg, lb_list[i], bound,
-               bound.set_var(dim + 1), stride, stride_eq,
-               Relation::True(bound.n_set()),
-               std::vector<CG_outputRepr *>(bound.n_set())));
-       cond_lower.and_with_GEQ(lb_list[i]);
-     }
-     for (int i = 0; i < ub_list.size(); i++) {
-       ub_repr_list.append(
-           outputUBasRepr(ocg, ub_list[i], bound,
-               bound.set_var(dim + 1), stride, stride_eq,
-               std::vector<CG_outputRepr *>(bound.n_set())));
-       GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]);
-       h.update_coef(cond_upper.get_local(over_free_var), -stride);
-     }
-
-     // initialised so an empty bound list yields NULL instead of an
-     // indeterminate pointer; Tuple indices are used 1-based below
-     CG_outputRepr *lbRepr = NULL, *ubRepr = NULL;
-     if (lb_repr_list.size() > 1)
-       lbRepr = ocg->CreateInvoke("max", lb_repr_list);
-     else if (lb_repr_list.size() == 1)
-       lbRepr = lb_repr_list[1];
-
-     if (ub_repr_list.size() > 1)
-       ubRepr = ocg->CreateInvoke("min", ub_repr_list);
-     else if (ub_repr_list.size() == 1)
-       ubRepr = ub_repr_list[1];
-
-     // create overflow assignment
-     bound.setup_names();
-     CG_outputRepr *rhs = ocg->CreatePlus(ocg->CreateMinus(ubRepr, lbRepr),
-         ocg->CreateInt(1));
-     if (stride != 1)
-       rhs = ocg->CreateIntegerDivide(rhs, ocg->CreateInt(stride));
-     rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount));
-     CG_outputRepr *lhs = ocg->CreateIdent(over_name);
-     init_code = ocg->StmtListAppend(init_code,
-         ocg->CreateAssignment(0, lhs, ocg->CreateInt(0)));
-     lhs = ocg->CreateIdent(over_name);
-     overflow_code = ocg->CreateAssignment(0, lhs, rhs);
-
-     // insert constraint 0 <= overflow < unroll_amount
-     Variable_ID v = overflow_constraint.get_local(over_free_var);
-     GEQ_Handle h1 = overflow_constraint_root->add_GEQ();
-     h1.update_coef(v, 1);
-     GEQ_Handle h2 = overflow_constraint_root->add_GEQ();
-     h2.update_coef(v, -1);
-     h2.update_const(unroll_amount - 1);
-   }
-
-   // insert overflow statement
-   int overflow_stmt_num = -1;
-   if (overflow_code != NULL) {
-     // build iteration space for overflow statement (hull minus the
-     // unrolled dimension)
-     Relation mapping(level, level - 1);
-     F_And *f_root = mapping.add_and();
-     for (int i = 1; i < level; i++) {
-       EQ_Handle h = f_root->add_EQ();
-       h.update_coef(mapping.output_var(i), 1);
-       h.update_coef(mapping.input_var(i), -1);
-     }
-     Relation overflow_IS = Range(Restrict_Domain(mapping, copy(hull)));
-     for (int i = 1; i < level; i++)
-       overflow_IS.name_set_var(i, hull.set_var(i)->name());
-     overflow_IS.setup_names();
-
-     // build dumb transformation relation for overflow statement
-     Relation overflow_xform(level - 1, 2 * (level - 1) + 1);
-     f_root = overflow_xform.add_and();
-     for (int i = 1; i <= level - 1; i++) {
-       EQ_Handle h = f_root->add_EQ();
-       h.update_coef(overflow_xform.output_var(2 * i), 1);
-       h.update_coef(overflow_xform.input_var(i), -1);
-
-       h = f_root->add_EQ();
-       h.update_coef(overflow_xform.output_var(2 * i - 1), 1);
-       h.update_const(-lex[2 * i - 2]);
-     }
-     EQ_Handle h = f_root->add_EQ();
-     h.update_coef(overflow_xform.output_var(2 * (level - 1) + 1), 1);
-     h.update_const(-lex[2 * (level - 1)]);
-
-     // shift lexical order at the loop's position by one to make room for
-     // the overflow statement
-     shiftLexicalOrder(lex, dim - 1, 1);
-     Statement overflow_stmt;
-     overflow_stmt.code = overflow_code;
-     overflow_stmt.IS = overflow_IS;
-     overflow_stmt.xform = overflow_xform;
-     overflow_stmt.loop_level = std::vector<LoopLevel>(level - 1);
-     for (int i = 0; i < level - 1; i++) {
-       overflow_stmt.loop_level[i].type = stmt[stmt_num].loop_level[i].type;
-       if (stmt[stmt_num].loop_level[i].type == LoopLevelTile
-           && stmt[stmt_num].loop_level[i].payload >= level)
-         overflow_stmt.loop_level[i].payload = -1;
-       else
-         overflow_stmt.loop_level[i].payload =
-             stmt[stmt_num].loop_level[i].payload;
-       overflow_stmt.loop_level[i].parallel_level =
-           stmt[stmt_num].loop_level[i].parallel_level;
-     }
-     stmt.push_back(overflow_stmt);
-     dep.insert();
-     overflow_stmt_num = stmt.size() - 1;
-     overflow[overflow_stmt_num] = over_var_list;
-
-     // update the global known information on overflow variable
-     this->known = Intersection(this->known,
-         Extend_Set(copy(overflow_constraint),
-             this->known.n_set() - overflow_constraint.n_set()));
-
-     // update dependence graph
-     DependenceVector dv;
-     dv.type = DEP_CONTROL;
-     for (std::set<int>::iterator i = same_loop.begin();
-          i != same_loop.end(); i++)
-       dep.connect(overflow_stmt_num, *i, dv);
-     dv.type = DEP_W2W;
-     {
-       // pick the scalar written by the overflow assignment as the
-       // dependence symbol, then release the temporary reference objects
-       IR_ScalarSymbol *overflow_sym = NULL;
-       std::vector<IR_ScalarRef *> scalars = ir->FindScalarRef(overflow_code);
-       for (int i = scalars.size() - 1; i >= 0; i--)
-         if (scalars[i]->is_write()) {
-           overflow_sym = scalars[i]->symbol();
-           break;
-         }
-       for (int i = scalars.size() - 1; i >= 0; i--)
-         delete scalars[i];
-       dv.sym = overflow_sym;
-     }
-     dv.lbounds = std::vector<coef_t>(num_dep_dim, 0);
-     dv.ubounds = std::vector<coef_t>(num_dep_dim, 0);
-     int dep_dim = get_last_dep_dim_before(stmt_num, level);
-     for (int i = dep_dim + 1; i < num_dep_dim; i++) {
-       dv.lbounds[i] = -posInfinity;
-       dv.ubounds[i] = posInfinity;
-     }
-     // one self-dependence per leading dimension: zero in all earlier
-     // dimensions, positive in dimension i
-     for (int i = 0; i <= dep_dim; i++) {
-       if (i != 0) {
-         dv.lbounds[i - 1] = 0;
-         dv.ubounds[i - 1] = 0;
-       }
-       dv.lbounds[i] = 1;
-       dv.ubounds[i] = posInfinity;
-       dep.connect(overflow_stmt_num, overflow_stmt_num, dv);
-     }
-   }
-
-   // split the loop so it can be fully unrolled
-   std::set<int> result = split(stmt_num, level, cond_upper);
-   std::set<int> result2 = split(stmt_num, level, cond_lower);
-   for (std::set<int>::iterator i = result2.begin(); i != result2.end(); i++)
-     result.insert(*i);
-
-   // check if unrolled statements can be trivially lumped together as one
-   // statement: same loop nest shape, same inner lexical order, inner bounds
-   // independent of this level, identical iteration spaces, and no
-   // control/unknown dependences among them
-   bool can_be_lumped = true;
-   if (can_be_lumped) {
-     for (std::set<int>::iterator i = same_loop.begin();
-          i != same_loop.end(); i++)
-       if (*i != stmt_num) {
-         if (stmt[*i].loop_level.size() != stmt[stmt_num].loop_level.size()) {
-           can_be_lumped = false;
-           break;
-         }
-         for (int j = 0; j < stmt[stmt_num].loop_level.size(); j++)
-           if (!(stmt[*i].loop_level[j].type
-                   == stmt[stmt_num].loop_level[j].type
-               && stmt[*i].loop_level[j].payload
-                   == stmt[stmt_num].loop_level[j].payload)) {
-             can_be_lumped = false;
-             break;
-           }
-         if (!can_be_lumped)
-           break;
-         std::vector<int> lex2 = getLexicalOrder(*i);
-         for (int j = 2 * level; j < lex.size() - 1; j += 2)
-           if (lex[j] != lex2[j]) {
-             can_be_lumped = false;
-             break;
-           }
-         if (!can_be_lumped)
-           break;
-       }
-   }
-   if (can_be_lumped) {
-     for (std::set<int>::iterator i = same_loop.begin();
-          i != same_loop.end(); i++)
-       if (is_inner_loop_depend_on_level(stmt[*i].IS, level, known)) {
-         can_be_lumped = false;
-         break;
-       }
-   }
-   if (can_be_lumped) {
-     for (std::set<int>::iterator i = same_loop.begin();
-          i != same_loop.end(); i++)
-       if (*i != stmt_num) {
-         if (!(Must_Be_Subset(copy(stmt[*i].IS), copy(stmt[stmt_num].IS))
-             && Must_Be_Subset(copy(stmt[stmt_num].IS), copy(stmt[*i].IS)))) {
-           can_be_lumped = false;
-           break;
-         }
-       }
-   }
-   if (can_be_lumped) {
-     for (std::set<int>::iterator i = same_loop.begin();
-          i != same_loop.end(); i++) {
-       for (DependenceGraph::EdgeList::iterator j =
-                dep.vertex[*i].second.begin();
-            j != dep.vertex[*i].second.end(); j++)
-         if (same_loop.find(j->first) != same_loop.end()) {
-           for (int k = 0; k < j->second.size(); k++)
-             if (j->second[k].type == DEP_CONTROL
-                 || j->second[k].type == DEP_UNKNOWN) {
-               can_be_lumped = false;
-               break;
-             }
-           if (!can_be_lumped)
-             break;
-         }
-       if (!can_be_lumped)
-         break;
-     }
-   }
-
-   // TODO: check whether an existing overflow variable depends on this loop
-   // index and by how much; this matters when loops are unrolled in
-   // arbitrary order.
-
-   // insert unrolled statements
-   int old_num_stmt = stmt.size();
-   if (!can_be_lumped) {
-     // what_stmt_num[s][c - 1] is the index of the c-th extra copy of
-     // original statement s
-     std::map<int, std::vector<int> > what_stmt_num;
-
-     for (int j = 1; j < unroll_amount; j++) {
-       for (std::set<int>::iterator i = same_loop.begin();
-            i != same_loop.end(); i++) {
-         Statement new_stmt;
-
-         // copy of the statement body with the loop variable replaced by
-         // (loop variable + j * stride)
-         Tuple<CG_outputRepr *> funcList;
-         Tuple<std::string> loop_vars;
-         loop_vars.append(stmt[*i].IS.set_var(level)->name());
-         funcList.append(
-             ocg->CreatePlus(
-                 ocg->CreateIdent(stmt[*i].IS.set_var(level)->name()),
-                 ocg->CreateInt(j * stride)));
-         new_stmt.code = ocg->CreatePlaceHolder(0, stmt[*i].code->clone(),
-             funcList, loop_vars);
-
-         new_stmt.IS = adjust_loop_bound(stmt[*i].IS, level, j * stride);
-         add_loop_stride(new_stmt.IS, bound, level - 1,
-             unroll_amount * stride);
-
-         new_stmt.xform = copy(stmt[*i].xform);
-         new_stmt.loop_level = stmt[*i].loop_level;
-         stmt.push_back(new_stmt);
-         dep.insert();
-         what_stmt_num[*i].push_back(stmt.size() - 1);
-       }
-     }
-     for (std::set<int>::iterator i = same_loop.begin();
-          i != same_loop.end(); i++)
-       add_loop_stride(stmt[*i].IS, bound, level - 1, unroll_amount * stride);
-
-     // update dependence graph
-     if (stmt[stmt_num].loop_level[level - 1].type == LoopLevelOriginal) {
-       int dep_dim = stmt[stmt_num].loop_level[level - 1].payload;
-       int new_stride = unroll_amount * stride;
-       for (int i = 0; i < old_num_stmt; i++) {
-         // edges out of i that must be (re)connected once iteration over
-         // i's edge list is finished
-         std::vector<std::pair<int, DependenceVector> > D;
-
-         for (DependenceGraph::EdgeList::iterator j =
-                  dep.vertex[i].second.begin();
-              j != dep.vertex[i].second.end();) {
-           if (same_loop.find(i) != same_loop.end()) {
-             if (same_loop.find(j->first) != same_loop.end()) {
-               // both endpoints are in the unrolled loop
-               for (int k = 0; k < j->second.size(); k++) {
-                 DependenceVector dv = j->second[k];
-                 if (dv.type == DEP_CONTROL || dv.type == DEP_UNKNOWN) {
-                   D.push_back(std::make_pair(j->first, dv));
-                   for (int kk = 0; kk < unroll_amount - 1; kk++)
-                     if (what_stmt_num[i][kk] != -1
-                         && what_stmt_num[j->first][kk] != -1)
-                       dep.connect(what_stmt_num[i][kk],
-                           what_stmt_num[j->first][kk], dv);
-                 } else {
-                   coef_t lb = dv.lbounds[dep_dim];
-                   coef_t ub = dv.ubounds[dep_dim];
-                   if (ub == lb
-                       && int_mod(lb, static_cast<coef_t>(new_stride)) == 0) {
-                     // exact distance that is a multiple of the new stride:
-                     // copy c depends only on copy c
-                     D.push_back(std::make_pair(j->first, dv));
-                     for (int kk = 0; kk < unroll_amount - 1; kk++)
-                       if (what_stmt_num[i][kk] != -1
-                           && what_stmt_num[j->first][kk] != -1)
-                         dep.connect(what_stmt_num[i][kk],
-                             what_stmt_num[j->first][kk], dv);
-                   } else if (lb == -posInfinity && ub == posInfinity) {
-                     // unbounded distance: connect every copy to every copy
-                     D.push_back(std::make_pair(j->first, dv));
-                     for (int kk = 0; kk < unroll_amount; kk++)
-                       if (kk == 0)
-                         D.push_back(std::make_pair(j->first, dv));
-                       else if (what_stmt_num[j->first][kk - 1] != -1)
-                         D.push_back(
-                             std::make_pair(
-                                 what_stmt_num[j->first][kk - 1], dv));
-                     for (int t = 0; t < unroll_amount - 1; t++)
-                       if (what_stmt_num[i][t] != -1)
-                         for (int kk = 0; kk < unroll_amount; kk++)
-                           if (kk == 0)
-                             dep.connect(what_stmt_num[i][t], j->first, dv);
-                           else if (what_stmt_num[j->first][kk - 1] != -1)
-                             dep.connect(what_stmt_num[i][t],
-                                 what_stmt_num[j->first][kk - 1], dv);
-                   } else {
-                     // bounded distance range: round the bounds to multiples
-                     // of the new stride for each target copy kk, keeping
-                     // only non-empty ranges
-                     for (int kk = 0; kk < unroll_amount; kk++) {
-                       if (lb != -posInfinity) {
-                         if (kk * stride
-                             < int_mod(lb, static_cast<coef_t>(new_stride)))
-                           dv.lbounds[dep_dim] =
-                               floor(static_cast<double>(lb) / new_stride)
-                                   * new_stride + new_stride;
-                         else
-                           dv.lbounds[dep_dim] =
-                               floor(static_cast<double>(lb) / new_stride)
-                                   * new_stride;
-                       }
-                       if (ub != posInfinity) {
-                         if (kk * stride
-                             > int_mod(ub, static_cast<coef_t>(new_stride)))
-                           dv.ubounds[dep_dim] =
-                               floor(static_cast<double>(ub) / new_stride)
-                                   * new_stride - new_stride;
-                         else
-                           dv.ubounds[dep_dim] =
-                               floor(static_cast<double>(ub) / new_stride)
-                                   * new_stride;
-                       }
-                       if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) {
-                         if (kk == 0)
-                           D.push_back(std::make_pair(j->first, dv));
-                         else if (what_stmt_num[j->first][kk - 1] != -1)
-                           D.push_back(
-                               std::make_pair(
-                                   what_stmt_num[j->first][kk - 1], dv));
-                       }
-                     }
-                     // same rounding for edges leaving source copy t + 1.
-                     // NOTE(review): the int_mod arguments add (t + 1)
-                     // while the floor arguments add (t + 1) * stride; the
-                     // two differ when stride != 1 — confirm which is
-                     // intended.
-                     for (int t = 0; t < unroll_amount - 1; t++)
-                       if (what_stmt_num[i][t] != -1)
-                         for (int kk = 0; kk < unroll_amount; kk++) {
-                           if (lb != -posInfinity) {
-                             if (kk * stride
-                                 < int_mod(lb + t + 1,
-                                     static_cast<coef_t>(new_stride)))
-                               dv.lbounds[dep_dim] =
-                                   floor(
-                                       static_cast<double>(lb
-                                           + (t + 1) * stride) / new_stride)
-                                       * new_stride + new_stride;
-                             else
-                               dv.lbounds[dep_dim] =
-                                   floor(
-                                       static_cast<double>(lb
-                                           + (t + 1) * stride) / new_stride)
-                                       * new_stride;
-                           }
-                           if (ub != posInfinity) {
-                             if (kk * stride
-                                 > int_mod(ub + t + 1,
-                                     static_cast<coef_t>(new_stride)))
-                               dv.ubounds[dep_dim] =
-                                   floor(
-                                       static_cast<double>(ub
-                                           + (t + 1) * stride) / new_stride)
-                                       * new_stride - new_stride;
-                             else
-                               dv.ubounds[dep_dim] =
-                                   floor(
-                                       static_cast<double>(ub
-                                           + (t + 1) * stride) / new_stride)
-                                       * new_stride;
-                           }
-                           if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) {
-                             if (kk == 0)
-                               dep.connect(what_stmt_num[i][t], j->first, dv);
-                             else if (what_stmt_num[j->first][kk - 1] != -1)
-                               dep.connect(what_stmt_num[i][t],
-                                   what_stmt_num[j->first][kk - 1], dv);
-                           }
-                         }
-                   }
-                 }
-               }
-
-               // the original edge is replaced by the entries queued in D
-               dep.vertex[i].second.erase(j++);
-             } else {
-               // source in the loop, target outside: copies inherit the edge
-               for (int kk = 0; kk < unroll_amount - 1; kk++)
-                 if (what_stmt_num[i][kk] != -1)
-                   dep.connect(what_stmt_num[i][kk], j->first, j->second);
-
-               j++;
-             }
-           } else {
-             // source outside the loop, target inside: also point at the
-             // target's copies
-             if (same_loop.find(j->first) != same_loop.end())
-               for (int k = 0; k < j->second.size(); k++)
-                 for (int kk = 0; kk < unroll_amount - 1; kk++)
-                   if (what_stmt_num[j->first][kk] != -1)
-                     D.push_back(
-                         std::make_pair(what_stmt_num[j->first][kk],
-                             j->second[k]));
-             j++;
-           }
-         }
-
-         for (int j = 0; j < D.size(); j++)
-           dep.connect(i, D[j].first, D[j].second);
-       }
-     }
-
-     // reset lexical order for the unrolled loop body
-     std::set<int> new_same_loop;
-     for (std::map<int, std::vector<int> >::iterator i =
-              what_stmt_num.begin(); i != what_stmt_num.end(); i++) {
-       new_same_loop.insert(i->first);
-       for (int j = 0; j < i->second.size(); j++)
-         new_same_loop.insert(i->second[j]);
-     }
-     setLexicalOrder(dim + 1, new_same_loop);
-   } else {
-     for (std::set<int>::iterator i = same_loop.begin();
-          i != same_loop.end(); i++)
-       add_loop_stride(stmt[*i].IS, bound, level - 1, unroll_amount * stride);
-
-     // order the statements by their innermost lexical constant so the
-     // lumped copy preserves the original statement order
-     int max_level = stmt[stmt_num].loop_level.size();
-     std::vector<std::pair<int, int> > stmt_order;
-     for (std::set<int>::iterator i = same_loop.begin();
-          i != same_loop.end(); i++)
-       stmt_order.push_back(
-           std::make_pair(
-               get_const(stmt[*i].xform, 2 * max_level, Output_Var), *i));
-     sort(stmt_order.begin(), stmt_order.end());
-
-     // a single new statement holds every shifted copy of every statement
-     Statement new_stmt;
-     new_stmt.code = NULL;
-     for (int j = 1; j < unroll_amount; j++)
-       for (int i = 0; i < stmt_order.size(); i++) {
-         Tuple<CG_outputRepr *> funcList;
-         Tuple<std::string> loop_vars;
-         loop_vars.append(
-             stmt[stmt_order[i].second].IS.set_var(level)->name());
-         funcList.append(
-             ocg->CreatePlus(
-                 ocg->CreateIdent(
-                     stmt[stmt_order[i].second].IS.set_var(level)->name()),
-                 ocg->CreateInt(j * stride)));
-         CG_outputRepr *code = ocg->CreatePlaceHolder(0,
-             stmt[stmt_order[i].second].code->clone(), funcList, loop_vars);
-         new_stmt.code = ocg->StmtListAppend(new_stmt.code, code);
-       }
-
-     new_stmt.IS = copy(stmt[stmt_num].IS);
-     new_stmt.xform = copy(stmt[stmt_num].xform);
-     // place the lumped statement right after the last original statement
-     assign_const(new_stmt.xform, 2 * max_level,
-         stmt_order[stmt_order.size() - 1].first + 1);
-     new_stmt.loop_level = stmt[stmt_num].loop_level;
-     stmt.push_back(new_stmt);
-     dep.insert();
-
-     // update dependence graph; old_num_stmt is the index of the lumped
-     // statement just appended
-     if (stmt[stmt_num].loop_level[level - 1].type == LoopLevelOriginal) {
-       int dep_dim = stmt[stmt_num].loop_level[level - 1].payload;
-       int new_stride = unroll_amount * stride;
-       for (int i = 0; i < old_num_stmt; i++) {
-         std::vector<std::pair<int, std::vector<DependenceVector> > > D;
-
-         for (DependenceGraph::EdgeList::iterator j =
-                  dep.vertex[i].second.begin();
-              j != dep.vertex[i].second.end();) {
-           if (same_loop.find(i) != same_loop.end()) {
-             if (same_loop.find(j->first) != same_loop.end()) {
-               // dvsXY: 1 = original statement, 2 = lumped statement;
-               // X is the source, Y the target
-               std::vector<DependenceVector> dvs11, dvs12, dvs22, dvs21;
-               for (int k = 0; k < j->second.size(); k++) {
-                 DependenceVector dv = j->second[k];
-                 if (dv.type == DEP_CONTROL || dv.type == DEP_UNKNOWN) {
-                   if (i == j->first) {
-                     dvs11.push_back(dv);
-                     dvs22.push_back(dv);
-                   } else
-                     throw loop_error(
-                         "unrolled statements lumped together illegally");
-                 } else {
-                   coef_t lb = dv.lbounds[dep_dim];
-                   coef_t ub = dv.ubounds[dep_dim];
-                   if (ub == lb
-                       && int_mod(lb, static_cast<coef_t>(new_stride)) == 0) {
-                     dvs11.push_back(dv);
-                     dvs22.push_back(dv);
-                   } else {
-                     if (lb != -posInfinity)
-                       dv.lbounds[dep_dim] = ceil(
-                           static_cast<double>(lb) / new_stride) * new_stride;
-                     if (ub != posInfinity)
-                       dv.ubounds[dep_dim] = floor(
-                           static_cast<double>(ub) / new_stride) * new_stride;
-                     if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                       dvs11.push_back(dv);
-
-                     if (lb != -posInfinity)
-                       dv.lbounds[dep_dim] = ceil(
-                           static_cast<double>(lb) / new_stride) * new_stride;
-                     if (ub != posInfinity)
-                       dv.ubounds[dep_dim] = ceil(
-                           static_cast<double>(ub) / new_stride) * new_stride;
-                     if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                       dvs21.push_back(dv);
-
-                     if (lb != -posInfinity)
-                       dv.lbounds[dep_dim] = floor(
-                           static_cast<double>(lb) / new_stride) * new_stride;
-                     if (ub != posInfinity)
-                       dv.ubounds[dep_dim] = floor(
-                           static_cast<double>(ub - stride) / new_stride)
-                           * new_stride;
-                     if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                       dvs12.push_back(dv);
-
-                     if (lb != -posInfinity)
-                       dv.lbounds[dep_dim] = floor(
-                           static_cast<double>(lb) / new_stride) * new_stride;
-                     if (ub != posInfinity)
-                       dv.ubounds[dep_dim] = ceil(
-                           static_cast<double>(ub - stride) / new_stride)
-                           * new_stride;
-                     if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim])
-                       dvs22.push_back(dv);
-                   }
-                 }
-               }
-               if (dvs11.size() > 0)
-                 D.push_back(std::make_pair(i, dvs11));
-               if (dvs22.size() > 0)
-                 dep.connect(old_num_stmt, old_num_stmt, dvs22);
-               if (dvs12.size() > 0)
-                 D.push_back(std::make_pair(old_num_stmt, dvs12));
-               if (dvs21.size() > 0)
-                 dep.connect(old_num_stmt, i, dvs21);
-
-               dep.vertex[i].second.erase(j++);
-             } else {
-               dep.connect(old_num_stmt, j->first, j->second);
-               j++;
-             }
-           } else {
-             if (same_loop.find(j->first) != same_loop.end())
-               D.push_back(std::make_pair(old_num_stmt, j->second));
-             j++;
-           }
-         }
-
-         for (int j = 0; j < D.size(); j++)
-           dep.connect(i, D[j].first, D[j].second);
-       }
-     }
-   }
-
-   return result;
- }
-
-// Returns a vector with one entry per output variable of stmt[stmt_num].xform.
-// Every even position holds the constant read from the transformation via
-// get_const(); odd positions are left at 0.
-std::vector<int> Loop::getLexicalOrder(int stmt_num) const {
-  assert(stmt_num < stmt.size());
-
-  const int num_out = stmt[stmt_num].xform.n_out();
-  std::vector<int> order(num_out, 0);
-
-  // walk the even slots only
-  int pos = 0;
-  while (pos < num_out) {
-    order[pos] = get_const(stmt[stmt_num].xform, pos, Output_Var);
-    pos += 2;
-  }
-
-  return order;
-}
-
-// Returns the indices of all statements whose lexical-order constants agree
-// with `lex` at every even position 0, 2, ..., dim. A negative `dim` selects
-// every statement without looking at `lex`.
-std::set<int> Loop::getStatements(const std::vector<int> &lex, int dim) const {
-  const int count = stmt.size();
-
-  std::set<int> matches;
-  for (int s = 0; s < count; ++s) {
-    if (dim < 0) {
-      matches.insert(s);
-      continue;
-    }
-
-    const std::vector<int> other = getLexicalOrder(s);
-
-    // stop comparing at the first even slot that differs
-    bool same_prefix = true;
-    for (int pos = 0; pos <= dim && same_prefix; pos += 2)
-      same_prefix = (lex[pos] == other[pos]);
-
-    if (same_prefix)
-      matches.insert(s);
-  }
-
-  return matches;
-}
-
-// Adds `amount` to the constant at position `dim` of every statement whose
-// lexical order matches `lex` on positions [0, dim) and whose value at `dim`
-// lies on the side of lex[dim] selected by the sign of `amount`:
-// a positive amount moves statements at or after lex[dim], a negative amount
-// moves statements at or before it. A zero amount is a no-op.
-void Loop::shiftLexicalOrder(const std::vector<int> &lex, int dim, int amount) {
-  if (amount == 0)
-    return;
-
-  const int total = stmt.size();
-  for (int s = 0; s < total; ++s) {
-    const std::vector<int> cur = getLexicalOrder(s);
-
-    // every position before dim (not only the even ones) has to agree
-    int pos = 0;
-    while (pos < dim && cur[pos] == lex[pos])
-      ++pos;
-    if (pos < dim)
-      continue;
-
-    const bool before_ref = amount > 0 && cur[dim] < lex[dim];
-    const bool after_ref = amount < 0 && cur[dim] > lex[dim];
-    if (before_ref || after_ref)
-      continue;
-
-    assign_const(stmt[s].xform, dim, cur[dim] + amount);
-  }
-}
-
-// Recomputes the lexical-order constant stored at output position `dim`
-// (0-based, must be even) of stmt[*].xform for every statement in `active`,
-// numbering groups consecutively from `starting_order`.
-//
-// Statements are first bucketed by the (type, payload) of their loop level
-// (dim + 2) / 2; statements that have fewer loop levels are kept apart and
-// may split a bucket according to control dependences. If more than one
-// group results, the groups are ordered by a topological sort over
-// control/data dependences. Otherwise the statements are split into
-// single-iteration and looping groups, ordered the same way, and the
-// function recurses on dim + 2 for every group with more than one statement.
-//
-// Throws std::invalid_argument if `dim` is negative, odd, or not smaller than
-// a statement's xform.n_out(), if a statement number is out of range, or if
-// the active statements differ in an even lexical position before `dim`.
-// Throws loop_error if topoSort() does not return one entry per group when
-// separating groups with different loop types.
-void Loop::setLexicalOrder(int dim, const std::set<int> &active,
-                           int starting_order) {
-  if (active.size() == 0)
-    return;
-
-  // check for sanity of parameters
-  if (dim < 0 || dim % 2 != 0)
-    throw std::invalid_argument(
-        "invalid constant loop level to set lexicographical order");
-  std::vector<int> lex;
-  int ref_stmt_num;
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    if ((*i) < 0 || (*i) >= stmt.size())
-      throw std::invalid_argument(
-          "invalid statement number " + to_string(*i));
-    if (dim >= stmt[*i].xform.n_out())
-      throw std::invalid_argument(
-          "invalid constant loop level to set lexicographical order");
-    if (i == active.begin()) {
-      // the first active statement is the reference for all later checks
-      lex = getLexicalOrder(*i);
-      ref_stmt_num = *i;
-    } else {
-      std::vector<int> lex2 = getLexicalOrder(*i);
-      for (int j = 0; j < dim; j += 2)
-        if (lex[j] != lex2[j])
-          throw std::invalid_argument(
-              "statements are not in the same sub loop nest");
-    }
-  }
-
-  // separate statements by current loop level types
-  int level = (dim + 2) / 2;
-  std::map<std::pair<LoopLevelType, int>, std::set<int> > active_by_level_type;
-  std::set<int> active_by_no_level;
-  for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
-    if (level > stmt[*i].loop_level.size())
-      active_by_no_level.insert(*i);
-    else
-      active_by_level_type[std::make_pair(
-          stmt[*i].loop_level[level - 1].type,
-          stmt[*i].loop_level[level - 1].payload)].insert(*i);
-  }
-
-  // further separate statements due to control dependences
-  std::vector<std::set<int> > active_by_level_type_splitted;
-  for (std::map<std::pair<LoopLevelType, int>, std::set<int> >::iterator i =
-           active_by_level_type.begin(); i != active_by_level_type.end(); i++)
-    active_by_level_type_splitted.push_back(i->second);
-  for (std::set<int>::iterator i = active_by_no_level.begin();
-       i != active_by_no_level.end(); i++)
-    for (int j = active_by_level_type_splitted.size() - 1; j >= 0; j--) {
-      std::set<int> controlled, not_controlled;
-      for (std::set<int>::iterator k =
-               active_by_level_type_splitted[j].begin();
-           k != active_by_level_type_splitted[j].end(); k++) {
-        std::vector<DependenceVector> dvs = dep.getEdge(*i, *k);
-        bool is_controlled = false;
-        for (int kk = 0; kk < dvs.size(); kk++)
-          // compare, do not assign: the former `type = DEP_CONTROL`
-          // overwrote the local copy and classified edges by the value of
-          // DEP_CONTROL instead of by the dependence type
-          if (dvs[kk].type == DEP_CONTROL) {
-            is_controlled = true;
-            break;
-          }
-        if (is_controlled)
-          controlled.insert(*k);
-        else
-          not_controlled.insert(*k);
-      }
-      // split the bucket only when it really contains both kinds
-      if (controlled.size() != 0 && not_controlled.size() != 0) {
-        active_by_level_type_splitted.erase(
-            active_by_level_type_splitted.begin() + j);
-        active_by_level_type_splitted.push_back(controlled);
-        active_by_level_type_splitted.push_back(not_controlled);
-      }
-    }
-
-  // set lexical order separating loops with different loop types first
-  if (active_by_level_type_splitted.size() + active_by_no_level.size() > 1) {
-    int dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1;
-
-    // one graph vertex per bucket, plus one per statement without this level
-    Graph<std::set<int>, Empty> g;
-    for (std::vector<std::set<int> >::iterator i =
-             active_by_level_type_splitted.begin();
-         i != active_by_level_type_splitted.end(); i++)
-      g.insert(*i);
-    for (std::set<int>::iterator i = active_by_no_level.begin();
-         i != active_by_no_level.end(); i++) {
-      std::set<int> t;
-      t.insert(*i);
-      g.insert(t);
-    }
-    for (int i = 0; i < g.vertex.size(); i++)
-      for (int j = i + 1; j < g.vertex.size(); j++) {
-        // edge i -> j if any member of i has a qualifying dependence to j
-        bool connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin();
-             ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin();
-               jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*ii,
-                                                            *jj);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence()
-                  || (dvs[k].is_data_dependence()
-                      && !dvs[k].has_been_carried_before(
-                             dep_dim))) {
-                g.connect(i, j);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-        // same test in the opposite direction: edge j -> i
-        connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin();
-             ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin();
-               jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*jj,
-                                                            *ii);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence()
-                  || (dvs[k].is_data_dependence()
-                      && !dvs[k].has_been_carried_before(
-                             dep_dim))) {
-                g.connect(j, i);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-      }
-
-    std::vector<std::set<int> > s = g.topoSort();
-    if (s.size() != g.vertex.size())
-      throw loop_error(
-          "cannot separate statements with different loop types at loop level "
-          + to_string(level));
-
-    // assign lexical order
-    int order = starting_order;
-    for (int i = 0; i < s.size(); i++) {
-      std::set<int> &cur_scc = g.vertex[*(s[i].begin())].first;
-      int sz = cur_scc.size();
-      if (sz == 1) {
-        int cur_stmt = *(cur_scc.begin());
-        assign_const(stmt[cur_stmt].xform, dim, order);
-        // a lone statement gets 0 at every deeper constant position
-        for (int j = dim + 2; j < stmt[cur_stmt].xform.n_out(); j += 2)
-          assign_const(stmt[cur_stmt].xform, j, 0);
-        order++;
-      } else {
-        setLexicalOrder(dim, cur_scc, order);
-        order += sz;
-      }
-    }
-  }
-  // set lexical order separating single iteration statements and loops
-  else {
-    std::set<int> true_singles;
-    std::set<int> nonsingles;
-    std::map<coef_t, std::set<int> > fake_singles;
-
-    // sort out statements that do not require loops
-    for (std::set<int>::iterator i = active.begin(); i != active.end();
-         i++) {
-      Relation cur_IS = getNewIS(*i);
-      if (is_single_iteration(cur_IS, dim + 1)) {
-        bool is_all_single = true;
-        for (int j = dim + 3; j < stmt[*i].xform.n_out(); j += 2)
-          if (!is_single_iteration(cur_IS, j)) {
-            is_all_single = false;
-            break;
-          }
-        if (is_all_single)
-          true_singles.insert(*i);
-        else {
-          // group by the constant value at this level; statements for
-          // which get_const() throws are grouped under posInfinity
-          try {
-            fake_singles[get_const(cur_IS, dim + 1, Set_Var)].insert(
-                *i);
-          } catch (const std::exception &e) {
-            fake_singles[posInfinity].insert(*i);
-          }
-        }
-      } else
-        nonsingles.insert(*i);
-    }
-
-    // split nonsingles forcibly according to negative dependences present (loop unfusible)
-    int dep_dim = get_dep_dim_of(ref_stmt_num, level);
-    Graph<int, Empty> g2;
-    for (std::set<int>::iterator i = nonsingles.begin();
-         i != nonsingles.end(); i++)
-      g2.insert(*i);
-    for (int i = 0; i < g2.vertex.size(); i++)
-      for (int j = i + 1; j < g2.vertex.size(); j++) {
-        std::vector<DependenceVector> dvs = dep.getEdge(
-            g2.vertex[i].first, g2.vertex[j].first);
-        for (int k = 0; k < dvs.size(); k++)
-          if (dvs[k].is_control_dependence()
-              || (dvs[k].is_data_dependence()
-                  && dvs[k].has_negative_been_carried_at(
-                         dep_dim))) {
-            g2.connect(i, j);
-            break;
-          }
-        dvs = dep.getEdge(g2.vertex[j].first, g2.vertex[i].first);
-        for (int k = 0; k < dvs.size(); k++)
-          if (dvs[k].is_control_dependence()
-              || (dvs[k].is_data_dependence()
-                  && dvs[k].has_negative_been_carried_at(
-                         dep_dim))) {
-            g2.connect(j, i);
-            break;
-          }
-      }
-
-    std::vector<std::set<int> > s2 = g2.packed_topoSort();
-
-    // translate g2 vertex indices back into statement numbers
-    std::vector<std::set<int> > splitted_nonsingles;
-    for (int i = 0; i < s2.size(); i++) {
-      std::set<int> cur_scc;
-      for (std::set<int>::iterator j = s2[i].begin(); j != s2[i].end();
-           j++)
-        cur_scc.insert(g2.vertex[*j].first);
-      splitted_nonsingles.push_back(cur_scc);
-    }
-
-    // convert to dependence graph for grouped statements
-    dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1;
-    Graph<std::set<int>, Empty> g;
-    for (std::set<int>::iterator i = true_singles.begin();
-         i != true_singles.end(); i++) {
-      std::set<int> t;
-      t.insert(*i);
-      g.insert(t);
-    }
-    for (int i = 0; i < splitted_nonsingles.size(); i++) {
-      g.insert(splitted_nonsingles[i]);
-    }
-    for (std::map<coef_t, std::set<int> >::iterator i =
-             fake_singles.begin(); i != fake_singles.end(); i++)
-      g.insert((*i).second);
-
-    for (int i = 0; i < g.vertex.size(); i++)
-      for (int j = i + 1; j < g.vertex.size(); j++) {
-        // edge i -> j if any member of i has a qualifying dependence to j
-        bool connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin();
-             ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin();
-               jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*ii,
-                                                            *jj);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence()
-                  || (dvs[k].is_data_dependence()
-                      && !dvs[k].has_been_carried_before(
-                             dep_dim))) {
-                g.connect(i, j);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-        // same test in the opposite direction: edge j -> i
-        connected = false;
-        for (std::set<int>::iterator ii = g.vertex[i].first.begin();
-             ii != g.vertex[i].first.end(); ii++) {
-          for (std::set<int>::iterator jj = g.vertex[j].first.begin();
-               jj != g.vertex[j].first.end(); jj++) {
-            std::vector<DependenceVector> dvs = dep.getEdge(*jj,
-                                                            *ii);
-            for (int k = 0; k < dvs.size(); k++)
-              if (dvs[k].is_control_dependence()
-                  || (dvs[k].is_data_dependence()
-                      && !dvs[k].has_been_carried_before(
-                             dep_dim))) {
-                g.connect(j, i);
-                connected = true;
-                break;
-              }
-            if (connected)
-              break;
-          }
-          if (connected)
-            break;
-        }
-      }
-
-    // topological sort according to chun's permute algorithm
-    std::vector<std::set<int> > s = g.topoSort();
-
-    // assign lexical order
-    int order = starting_order;
-    for (int i = 0; i < s.size(); i++) {
-      // translate each SCC into original statements
-      std::set<int> cur_scc;
-      for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-        copy(g.vertex[*j].first.begin(), g.vertex[*j].first.end(),
-             inserter(cur_scc, cur_scc.begin()));
-
-      // now assign the constant
-      for (std::set<int>::iterator j = cur_scc.begin();
-           j != cur_scc.end(); j++)
-        assign_const(stmt[*j].xform, dim, order);
-
-      // several statements share this slot: order them one level deeper
-      if (cur_scc.size() > 1)
-        setLexicalOrder(dim + 2, cur_scc);
-      else if (cur_scc.size() == 1) {
-        int cur_stmt = *(cur_scc.begin());
-        for (int j = dim + 2; j < stmt[cur_stmt].xform.n_out(); j += 2)
-          assign_const(stmt[cur_stmt].xform, j, 0);
-      }
-
-      if (cur_scc.size() > 0)
-        order++;
-    }
-  }
-}
-
-// Applies the pending transformation of every statement by delegating to
-// the set-based overload with all statement indices.
-void Loop::apply_xform() {
-  std::set<int> every_stmt;
-  int idx = 0;
-  while (idx < stmt.size()) {
-    every_stmt.insert(idx);
-    ++idx;
-  }
-  apply_xform(every_stmt);
-}
-
-// Applies the pending transformation of a single statement by delegating to
-// the set-based overload with a one-element set.
-void Loop::apply_xform(int stmt_num) {
-  std::set<int> only_this(&stmt_num, &stmt_num + 1);
-  apply_xform(only_this);
-}
-
-void Loop::apply_xform(std::set<int> &active) { // For each statement in `active`: rewrites its code and IS through its current xform, then replaces xform with a plain 1-1 mapping that keeps the lexical-order constants.
- int max_n = 0;
- // max_n ends up as the largest loop_level count among the active statements
- CG_outputBuilder *ocg = ir->builder();
- for (std::set<int>::iterator i = active.begin(); i != active.end(); i++) {
- int n = stmt[*i].loop_level.size();
- if (n > max_n)
- max_n = n;
-
- std::vector<int> lex = getLexicalOrder(*i); // saved before xform is overwritten; reused for the constants of the new xform below
- // projection with 2n+1 inputs and n outputs: output j equals input 2j
- Relation mapping(2 * n + 1, n);
- F_And *f_root = mapping.add_and();
- for (int j = 1; j <= n; j++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(j), 1);
- h.update_coef(mapping.input_var(2 * j), -1);
- }
- mapping = Composition(mapping, stmt[*i].xform); // mapping now goes from the statement's iteration variables to the n projected outputs
- mapping.simplify();
-
- // match omega input/output variables to variable names in the code
- for (int j = 1; j <= stmt[*i].IS.n_set(); j++)
- mapping.name_input_var(j, stmt[*i].IS.set_var(j)->name());
- for (int j = 1; j <= n; j++)
- mapping.name_output_var(j,
- tmp_loop_var_name_prefix
- + to_string(tmp_loop_var_name_counter + j - 1)); // outputs get fresh names: prefix followed by a running counter
- mapping.setup_names();
- // local `known` (shadows the member): a copy of this->known extended to mapping.n_out() set variables
- Relation known = Extend_Set(copy(this->known),
- mapping.n_out() - this->known.n_set());
- //stmt[*i].code = outputStatement(ocg, stmt[*i].code, 0, mapping, known, std::vector<CG_outputRepr *>(mapping.n_out(), NULL));
- stmt[*i].code = outputStatement(ocg, stmt[*i].code, 0, mapping, known,
- std::vector<CG_outputRepr *>(mapping.n_out())); // statement code is regenerated through `mapping`
- stmt[*i].IS = Range(Restrict_Domain(mapping, stmt[*i].IS)); // new iteration space = range of mapping restricted to the old one
- stmt[*i].IS.simplify();
-
- // replace original transformation relation with straight 1-1 mapping
- mapping = Relation(n, 2 * n + 1);
- f_root = mapping.add_and();
- for (int j = 1; j <= n; j++) { // even outputs: output 2j equals input j
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(2 * j), 1);
- h.update_coef(mapping.input_var(j), -1);
- }
- for (int j = 1; j <= 2 * n + 1; j += 2) { // odd outputs: pinned to the lexical-order constants saved in lex
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.output_var(j), 1);
- h.update_const(-lex[j - 1]);
- }
- stmt[*i].xform = mapping;
- }
- // advance the counter past the names handed out above so later calls get fresh ones
- tmp_loop_var_name_counter += max_n;
-}
-
-// Intersects the stored `known` relation with `cond`. Whichever of the two
-// has fewer set variables is first widened with Extend_Set so that both have
-// the same number of set variables; `cond` itself is copied, not modified.
-void Loop::addKnown(const Relation &cond) {
-  const int known_arity = known.n_set();
-
-  Relation extra = copy(cond);
-  const int extra_arity = extra.n_set();
-
-  // positive gap: `known` is narrower; negative gap: the copy is narrower
-  const int gap = extra_arity - known_arity;
-  if (gap > 0)
-    known = Extend_Set(known, gap);
-  else if (gap < 0)
-    extra = Extend_Set(extra, -gap);
-
-  known = Intersection(known, extra);
-}
-
-// Applies the affine transformation described by matrix T to every statement
-// and updates the dependence graph accordingly.
-//
-// T must have num_dep_dim rows; each row has num_dep_dim coefficients and an
-// optional trailing constant (num_dep_dim + 1 entries). Row i defines the new
-// loop variable at output position 2 * (i + 1) as a combination of the old
-// loop variables at input positions 2 * (j + 1).
-//
-// Returns true; with no statements the call is a no-op that also returns
-// true.
-// Throws std::invalid_argument if any statement is not part of an original
-// nest with exactly num_dep_dim LoopLevelOriginal levels, or if T has the
-// wrong number of rows or a row of the wrong width.
-bool Loop::nonsingular(const std::vector<std::vector<int> > &T) {
-  if (stmt.size() == 0)
-    return true;
-
-  // check for sanity of parameters
-  for (int i = 0; i < stmt.size(); i++) {
-    if (stmt[i].loop_level.size() != num_dep_dim)
-      throw std::invalid_argument(
-          "nonsingular loop transformations must be applied to original perfect loop nest");
-    for (int j = 0; j < stmt[i].loop_level.size(); j++)
-      if (stmt[i].loop_level[j].type != LoopLevelOriginal)
-        throw std::invalid_argument(
-            "nonsingular loop transformations must be applied to original perfect loop nest");
-  }
-  if (T.size() != num_dep_dim)
-    throw std::invalid_argument("invalid transformation matrix");
-  // validate the width of every ROW of T; the bound used to be stmt.size(),
-  // which read past the end of T when there were more statements than rows
-  // and left rows unchecked when there were fewer
-  for (int i = 0; i < T.size(); i++)
-    if (T[i].size() != num_dep_dim + 1 && T[i].size() != num_dep_dim)
-      throw std::invalid_argument("invalid transformation matrix");
-
-  // build relation from matrix
-  Relation mapping(2 * num_dep_dim + 1, 2 * num_dep_dim + 1);
-  F_And *f_root = mapping.add_and();
-  for (int i = 0; i < num_dep_dim; i++) {
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(mapping.output_var(2 * (i + 1)), -1);
-    for (int j = 0; j < num_dep_dim; j++)
-      if (T[i][j] != 0)
-        h.update_coef(mapping.input_var(2 * (j + 1)), T[i][j]);
-    if (T[i].size() == num_dep_dim + 1)
-      h.update_const(T[i][num_dep_dim]);
-  }
-  // odd positions (the lexical-order constants) pass through unchanged
-  for (int i = 1; i <= 2 * num_dep_dim + 1; i += 2) {
-    EQ_Handle h = f_root->add_EQ();
-    h.update_coef(mapping.output_var(i), -1);
-    h.update_coef(mapping.input_var(i), 1);
-  }
-
-  // update transformation relations
-  for (int i = 0; i < stmt.size(); i++)
-    stmt[i].xform = Composition(copy(mapping), stmt[i].xform);
-
-  // update dependence graph
-  for (int i = 0; i < dep.vertex.size(); i++)
-    for (DependenceGraph::EdgeList::iterator j =
-             dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();
-         j++) {
-      std::vector<DependenceVector> dvs = j->second;
-      for (int k = 0; k < dvs.size(); k++) {
-        DependenceVector &dv = dvs[k];
-        switch (dv.type) {
-        case DEP_W2R:
-        case DEP_R2W:
-        case DEP_W2W:
-        case DEP_R2R: {
-          // interval arithmetic: new bound p = sum over q of T[p][q] * old
-          // bound q, taking the opposite bound for negative coefficients;
-          // an infinite operand makes the result infinite
-          std::vector<coef_t> lbounds(num_dep_dim), ubounds(
-              num_dep_dim);
-          for (int p = 0; p < num_dep_dim; p++) {
-            coef_t lb = 0;
-            coef_t ub = 0;
-            for (int q = 0; q < num_dep_dim; q++) {
-              if (T[p][q] > 0) {
-                if (lb == -posInfinity
-                    || dv.lbounds[q] == -posInfinity)
-                  lb = -posInfinity;
-                else
-                  lb += T[p][q] * dv.lbounds[q];
-                if (ub == posInfinity
-                    || dv.ubounds[q] == posInfinity)
-                  ub = posInfinity;
-                else
-                  ub += T[p][q] * dv.ubounds[q];
-              } else if (T[p][q] < 0) {
-                if (lb == -posInfinity
-                    || dv.ubounds[q] == posInfinity)
-                  lb = -posInfinity;
-                else
-                  lb += T[p][q] * dv.ubounds[q];
-                if (ub == posInfinity
-                    || dv.lbounds[q] == -posInfinity)
-                  ub = posInfinity;
-                else
-                  ub += T[p][q] * dv.lbounds[q];
-              }
-            }
-            // NOTE(review): the optional constant column is added to the
-            // dependence bounds as well; confirm this is intended
-            if (T[p].size() == num_dep_dim + 1) {
-              if (lb != -posInfinity)
-                lb += T[p][num_dep_dim];
-              if (ub != posInfinity)
-                ub += T[p][num_dep_dim];
-            }
-            lbounds[p] = lb;
-            ubounds[p] = ub;
-          }
-          dv.lbounds = lbounds;
-          dv.ubounds = ubounds;
-
-          break;
-        }
-        default:
-          ;
-        }
-      }
-      j->second = dvs;
-    }
-
-  // set constant loop values
-  std::set<int> active;
-  for (int i = 0; i < stmt.size(); i++)
-    active.insert(i);
-  setLexicalOrder(0, active);
-
-  return true;
-}
-
-void Loop::skew(const std::set<int> &stmt_nums, int level, // Skews loop `level` of the given statements: the new variable at output 2*level is the sum of skew_amount[j] * (input 2*(j+1)).
- const std::vector<int> &skew_amount) {
- if (stmt_nums.size() == 0)
- return;
- // Throws std::invalid_argument on a bad statement number, a bad level, or a nonzero coefficient beyond a statement's loop depth; throws loop_error when an updated dependence vector is carried with a negative component at the skewed dimension and is not marked quasi.
- // check for sanity of parameters
- int ref_stmt_num = *(stmt_nums.begin());
- std::vector<std::set<int> > array_of_deps; // one set per processed statement: the statements whose dependences with it are rebuilt at the end
- for (std::set<int>::const_iterator i = stmt_nums.begin();
- i != stmt_nums.end(); i++) {
- if (*i < 0 || *i >= stmt.size())
- throw std::invalid_argument(
- "invalid statement number " + to_string(*i));
- if (level < 1 || level > stmt[*i].loop_level.size())
- throw std::invalid_argument(
- "invalid loop level " + to_string(level));
- for (int j = stmt[*i].loop_level.size(); j < skew_amount.size(); j++)
- if (skew_amount[j] != 0)
- throw std::invalid_argument("invalid skewing formula");
- }
-
- // set transformation relations
- for (std::set<int>::const_iterator i = stmt_nums.begin();
- i != stmt_nums.end(); i++) {
- int n = stmt[*i].xform.n_out();
- Relation r(n, n);
- F_And *f_root = r.add_and();
- for (int j = 1; j <= n; j++)
- if (j != 2 * level) { // every position except the skewed one is passed through unchanged
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(r.input_var(j), 1);
- h.update_coef(r.output_var(j), -1);
- }
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(r.output_var(2 * level), -1);
- for (int j = 0; j < skew_amount.size(); j++)
- if (skew_amount[j] != 0)
- h.update_coef(r.input_var(2 * (j + 1)), skew_amount[j]);
-
- stmt[*i].xform = Composition(r, stmt[*i].xform);
- stmt[*i].xform.simplify();
- applyXform(*i); // NOTE(review): the overloads defined in this file are spelled apply_xform; confirm applyXform exists
- std::set<int> dont_consider;
- //} (the loop's closing brace is commented out here, so the per-statement loop body continues below)
- // NOTE(review): as a consequence the dependence update below runs once per skewed statement
- // update dependence graph
- if (stmt[ref_stmt_num].loop_level[level - 1].type
- == LoopLevelOriginal) {
- int dep_dim = stmt[ref_stmt_num].loop_level[level - 1].payload;
- for (std::set<int>::const_iterator i = stmt_nums.begin(); // this `i` shadows the outer loop's `i`
- i != stmt_nums.end(); i++)
- for (DependenceGraph::EdgeList::iterator j =
- dep.vertex[*i].second.begin();
- j != dep.vertex[*i].second.end(); j++)
- if (stmt_nums.find(j->first) != stmt_nums.end()) {
- // dependence between skewed statements
- std::vector<DependenceVector> dvs = j->second;
- for (int k = 0; k < dvs.size(); k++) {
- DependenceVector &dv = dvs[k];
- if (dv.is_data_dependence()) {
- coef_t lb = 0; // lb/ub accumulate the new distance bounds at the skewed dimension
- coef_t ub = 0;
- for (int kk = 0; kk < skew_amount.size();
- kk++) {
- int cur_dep_dim = get_dep_dim_of(*i,
- kk + 1);
- if (skew_amount[kk] > 0) { // positive coefficient: lower bound uses lbounds, upper bound uses ubounds
- if (lb != -posInfinity
- && stmt[*i].loop_level[kk].type
- == LoopLevelOriginal
- && dv.lbounds[cur_dep_dim]
- != -posInfinity)
- lb += skew_amount[kk]
- * dv.lbounds[cur_dep_dim];
- else {
- if (cur_dep_dim != -1
- && !(dv.lbounds[cur_dep_dim]
- == 0
- && dv.ubounds[cur_dep_dim]
- == 0))
- lb = -posInfinity;
- }
- if (ub != posInfinity
- && stmt[*i].loop_level[kk].type
- == LoopLevelOriginal
- && dv.ubounds[cur_dep_dim]
- != posInfinity)
- ub += skew_amount[kk]
- * dv.ubounds[cur_dep_dim];
- else {
- if (cur_dep_dim != -1
- && !(dv.lbounds[cur_dep_dim]
- == 0
- && dv.ubounds[cur_dep_dim]
- == 0))
- ub = posInfinity;
- }
- } else if (skew_amount[kk] < 0) { // negative coefficient: the roles of lbounds and ubounds are swapped
- if (lb != -posInfinity
- && stmt[*i].loop_level[kk].type
- == LoopLevelOriginal
- && dv.ubounds[cur_dep_dim]
- != posInfinity)
- lb += skew_amount[kk]
- * dv.ubounds[cur_dep_dim];
- else {
- if (cur_dep_dim != -1
- && !(dv.lbounds[cur_dep_dim]
- == 0
- && dv.ubounds[cur_dep_dim]
- == 0))
- lb = -posInfinity;
- }
- if (ub != posInfinity
- && stmt[*i].loop_level[kk].type
- == LoopLevelOriginal
- && dv.lbounds[cur_dep_dim]
- != -posInfinity)
- ub += skew_amount[kk]
- * dv.lbounds[cur_dep_dim];
- else {
- if (cur_dep_dim != -1
- && !(dv.lbounds[cur_dep_dim]
- == 0
- && dv.ubounds[cur_dep_dim]
- == 0))
- ub = posInfinity;
- }
- }
- }
- if ((dv.isCarried(dep_dim) // legality check on the OLD bounds; repeated below after the bounds are replaced
- && dv.hasPositive(dep_dim)) && dv.quasi)
- dv.quasi = false;
-
- if ((dv.isCarried(dep_dim)
- && dv.hasNegative(dep_dim))
- && !dv.quasi)
- throw loop_error(
- "loop error: Skewing is illegal, dependence violation!");
- dv.lbounds[dep_dim] = lb;
- dv.ubounds[dep_dim] = ub;
- if ((dv.isCarried(dep_dim)
- && dv.hasPositive(dep_dim)) && dv.quasi)
- dv.quasi = false;
-
- if ((dv.isCarried(dep_dim)
- && dv.hasNegative(dep_dim))
- && !dv.quasi)
- throw loop_error(
- "loop error: Skewing is illegal, dependence violation!");
- }
- }
- // write the updated copy back into the graph edge
- j->second = dvs;
- }
- } else {
- // dependence from skewed statement to unskewed statement becomes jumbled,
- // put distance value at skewed dimension to unknown
- /*std::vector<DependenceVector> dvs = j->second;
- for (int k = 0; k < dvs.size(); k++) {
- DependenceVector &dv = dvs[k];
- if (dv.is_data_dependence()) {
- dv.lbounds[dep_dim] = -posInfinity;
- dv.ubounds[dep_dim] = posInfinity;
- }
- }
- j->second = dvs;
- */
- dont_consider.insert(j->first); // NOTE(review): by brace matching this else pairs with the LoopLevelOriginal test, where `j` is no longer in scope; confirm the intended nesting
- }
- for (int l = 0; l < dep.vertex.size(); l++)
- if (stmt_nums.find(l) == stmt_nums.end())
- if (dont_consider.find(l) == stmt_nums.end() // NOTE(review): compares an iterator of dont_consider with stmt_nums.end(); presumably dont_consider.end() was meant
- && (dep.vertex[l].second.find(*i)
- != dep.vertex[l].second.end()))
- dont_consider.insert(l);
- array_of_deps.push_back(dont_consider);
- }
- /*for (int i = 0; i < dep.vertex.size(); i++)
- if (stmt_nums.find(i) == stmt_nums.end())
- for (DependenceGraph::EdgeList::iterator j =
- dep.vertex[i].second.begin();
- j != dep.vertex[i].second.end(); j++)
- if (stmt_nums.find(j->first) != stmt_nums.end()) {
- // dependence from unskewed statement to skewed statement becomes jumbled,
- // put distance value at skewed dimension to unknown
- std::vector<DependenceVector> dvs = j->second;
- for (int k = 0; k < dvs.size(); k++) {
- DependenceVector &dv = dvs[k];
- if (dv.is_data_dependence()) {
- dv.lbounds[dep_dim] = -posInfinity;
- dv.ubounds[dep_dim] = posInfinity;
- }
- }
- j->second = dvs;
- }
- }*/
- std::set<int>::const_iterator w = stmt_nums.begin(); // final pass: drop the existing edges between *w and each collected statement, then re-test their data dependences
- for (int i = 0; i < array_of_deps.size() && w != stmt_nums.end(); i++)
- for (std::set<int>::const_iterator j = array_of_deps[i].begin();
- j != array_of_deps[i].end(); j++) {
- if (dep.vertex[*w].second.find(*j) != dep.vertex[*w].second.end())
- dep.disconnect(*w, *j);
- if (dep.vertex[*j].second.find(*w) != dep.vertex[*j].second.end())
- dep.disconnect(*j, *w);
- int x, y; // x is the smaller and y the larger of the two statement numbers
- std::pair<std::vector<DependenceVector>,
- std::vector<DependenceVector> > dv_s;
- if ((*w) <= (*j)) {
- x = *w;
- y = *j;
-
- dv_s = test_data_dependences(ir_, stmt[x].code, stmt[x].IS, // NOTE(review): `ir_` here versus `ir` in apply_xform; confirm the member name
- stmt[y].code, stmt[y].IS, freevar, index, x, y);
- } else {
- x = *j;
- y = *w;
- dv_s = test_data_dependences(ir_, stmt[y].code, stmt[y].IS, // statement y is passed first in this branch, but the trailing indices stay (x, y)
- stmt[x].code, stmt[x].IS, freevar, index, x, y);
- }
- for (int k = 0; k < dv_s.first.size(); k++) { // each vector is connected x->y if valid for the lexical order, otherwise reversed and connected y->x
- if (is_dependence_valid_based_on_lex_order(x, y, dv_s.first[k],
- true))
- dep.connect(x, y, dv_s.first[k]);
- else
- dep.connect(y, x, dv_s.first[k].reverse());
- }
- for (int k = 0; k < dv_s.second.size(); k++) {
- if (is_dependence_valid_based_on_lex_order(x, y, dv_s.second[k],
- false))
- dep.connect(y, x, dv_s.second[k]);
- else
- dep.connect(x, y, dv_s.second[k].reverse());
- }
- w++; // NOTE(review): advanced once per inner iteration and dereferenced above without an end check; confirm it should not advance once per array_of_deps entry instead
- }
-}
-
-// Shifts loop `level` of the given statements by `shift_amount`.
-//
-// The xform of every listed statement is composed with a relation that
-// passes each position through and carries `shift_amount` as the constant of
-// the equality at position 2 * level. If that level is a LoopLevelOriginal
-// level of the first listed statement, data-dependence bounds at the
-// corresponding dependence dimension are adjusted: edges from shifted to
-// unshifted statements are decreased by `shift_amount`, edges from unshifted
-// to shifted statements are increased by it. Infinite bounds stay untouched.
-//
-// Throws std::invalid_argument for an out-of-range statement number or loop
-// level. An empty statement set or a zero shift changes nothing.
-void Loop::shift(const std::set<int> &stmt_nums, int level, int shift_amount) {
-  if (stmt_nums.empty())
-    return;
-
-  typedef std::set<int>::const_iterator StmtIter;
-  typedef DependenceGraph::EdgeList::iterator EdgeIter;
-
-  // validate all arguments before touching any state
-  const int first_stmt = *(stmt_nums.begin());
-  for (StmtIter it = stmt_nums.begin(); it != stmt_nums.end(); ++it) {
-    const int s = *it;
-    if (s < 0 || s >= stmt.size())
-      throw std::invalid_argument(
-          "invalid statement number " + to_string(s));
-    if (level < 1 || level > stmt[s].loop_level.size())
-      throw std::invalid_argument(
-          "invalid loop level " + to_string(level));
-  }
-
-  // a zero shift leaves everything as it is
-  if (shift_amount == 0)
-    return;
-
-  // compose each statement's transformation with the offset relation
-  for (StmtIter it = stmt_nums.begin(); it != stmt_nums.end(); ++it) {
-    const int arity = stmt[*it].xform.n_out();
-
-    Relation offset(arity, arity);
-    F_And *conj = offset.add_and();
-    for (int v = 1; v <= arity; ++v) {
-      EQ_Handle eq = conj->add_EQ();
-      eq.update_coef(offset.input_var(v), 1);
-      eq.update_coef(offset.output_var(v), -1);
-      if (v == 2 * level)
-        eq.update_const(shift_amount);
-    }
-
-    stmt[*it].xform = Composition(offset, stmt[*it].xform);
-    stmt[*it].xform.simplify();
-  }
-
-  // the dependence graph only needs fixing for an original loop level
-  if (stmt[first_stmt].loop_level[level - 1].type != LoopLevelOriginal)
-    return;
-  const int dep_dim = stmt[first_stmt].loop_level[level - 1].payload;
-
-  // edges leaving a shifted statement towards an unshifted one
-  for (StmtIter it = stmt_nums.begin(); it != stmt_nums.end(); ++it)
-    for (EdgeIter e = dep.vertex[*it].second.begin();
-         e != dep.vertex[*it].second.end(); ++e) {
-      if (stmt_nums.count(e->first) != 0)
-        continue;
-
-      std::vector<DependenceVector> updated = e->second;
-      for (int k = 0; k < updated.size(); ++k) {
-        DependenceVector &dv = updated[k];
-        if (!dv.is_data_dependence())
-          continue;
-        if (dv.lbounds[dep_dim] != -posInfinity)
-          dv.lbounds[dep_dim] -= shift_amount;
-        if (dv.ubounds[dep_dim] != posInfinity)
-          dv.ubounds[dep_dim] -= shift_amount;
-      }
-      e->second = updated;
-    }
-
-  // edges leaving an unshifted statement towards a shifted one
-  for (int v = 0; v < dep.vertex.size(); ++v) {
-    if (stmt_nums.count(v) != 0)
-      continue;
-
-    for (EdgeIter e = dep.vertex[v].second.begin();
-         e != dep.vertex[v].second.end(); ++e) {
-      if (stmt_nums.count(e->first) == 0)
-        continue;
-
-      std::vector<DependenceVector> updated = e->second;
-      for (int k = 0; k < updated.size(); ++k) {
-        DependenceVector &dv = updated[k];
-        if (!dv.is_data_dependence())
-          continue;
-        if (dv.lbounds[dep_dim] != -posInfinity)
-          dv.lbounds[dep_dim] += shift_amount;
-        if (dv.ubounds[dep_dim] != posInfinity)
-          dv.ubounds[dep_dim] += shift_amount;
-      }
-      e->second = updated;
-    }
-  }
-}
-
-// bool Loop::fuse(const std::set<int> &stmt_nums, int level) {
-// if (stmt_nums.size() == 0 || stmt_nums.size() == 1)
-// return true;
-// int dim = 2*level-1;
-
-// // check for sanity of parameters
-// std::vector<int> ref_lex;
-// for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-// if (*i < 0 || *i >= stmt.size())
-// throw std::invalid_argument("invalid statement number " + to_string(*i));
-// if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2)
-// throw std::invalid_argument("invalid loop level " + to_string(level));
-// if (ref_lex.size() == 0)
-// ref_lex = getLexicalOrder(*i);
-// else {
-// std::vector<int> lex = getLexicalOrder(*i);
-// for (int j = 0; j < dim-1; j+=2)
-// if (lex[j] != ref_lex[j])
-// throw std::invalid_argument("statements for fusion must be in the same level-" + to_string(level-1) + " subloop");
-// }
-// }
-
-// // collect lexicographical order values from to-be-fused statements
-// std::set<int> lex_values;
-// for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-// std::vector<int> lex = getLexicalOrder(*i);
-// lex_values.insert(lex[dim-1]);
-// }
-// if (lex_values.size() == 1)
-// return true;
-
-// // negative dependence would prevent fusion
-// int dep_dim = xform_index[dim].first;
-// for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++) {
-// ref_lex[dim-1] = *i;
-// std::set<int> a = getStatements(ref_lex, dim-1);
-// std::set<int>::iterator j = i;
-// j++;
-// for (; j != lex_values.end(); j++) {
-// ref_lex[dim-1] = *j;
-// std::set<int> b = getStatements(ref_lex, dim-1);
-// for (std::set<int>::iterator ii = a.begin(); ii != a.end(); ii++)
-// for (std::set<int>::iterator jj = b.begin(); jj != b.end(); jj++) {
-// std::vector<DependenceVector> dvs;
-// dvs = dep.getEdge(*ii, *jj);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim))
-// throw loop_error("loop error: statements " + to_string(*ii) + " and " + to_string(*jj) + " cannot be fused together due to negative dependence");
-// dvs = dep.getEdge(*jj, *ii);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim))
-// throw loop_error("loop error: statements " + to_string(*jj) + " and " + to_string(*ii) + " cannot be fused together due to negative dependence");
-// }
-// }
-// }
-
-// // collect all other lexicographical order values from the subloop
-// // enclosing these to-be-fused loops
-// std::set<int> same_loop = getStatements(ref_lex, dim-3);
-// std::set<int> other_lex_values;
-// for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-// std::vector<int> lex = getLexicalOrder(*i);
-// if (lex_values.find(lex[dim-1]) == lex_values.end())
-// other_lex_values.insert(lex[dim-1]);
-// }
-
-// // update to-be-fused loops due to dependence cycle
-// Graph<std::set<int>, Empty> g;
-// {
-// std::set<int> t;
-// for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++) {
-// ref_lex[dim-1] = *i;
-// std::set<int> t2 = getStatements(ref_lex, dim-1);
-// std::set_union(t.begin(), t.end(), t2.begin(), t2.end(), inserter(t, t.begin()));
-// }
-// g.insert(t);
-// }
-// for (std::set<int>::iterator i = other_lex_values.begin(); i != other_lex_values.end(); i++) {
-// ref_lex[dim-1] = *i;
-// std::set<int> t = getStatements(ref_lex, dim-1);
-// g.insert(t);
-// }
-// for (int i = 0; i < g.vertex.size(); i++)
-// for (int j = i+1; j < g.vertex.size(); j++)
-// for (std::set<int>::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++)
-// for (std::set<int>::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) {
-// std::vector<DependenceVector> dvs;
-// dvs = dep.getEdge(*ii, *jj);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g.connect(i, j);
-// break;
-// }
-// dvs = dep.getEdge(*jj, *ii);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g.connect(j, i);
-// break;
-// }
-// }
-// std::vector<std::set<int> > s = g.topoSort();
-// int fused_lex_value = 0;
-// for (int i = 0; i < s.size(); i++)
-// if (s[i].find(0) != s[i].end()) {
-// // now add additional lexicographical order values
-// for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-// if (*j != 0) {
-// int stmt = *(g.vertex[*j].first.begin());
-// std::vector<int> lex = getLexicalOrder(stmt);
-// lex_values.insert(lex[dim-1]);
-// }
-
-// if (s.size() > 1) {
-// if (i == 0) {
-// int min_lex_value;
-// for (std::set<int>::iterator j = s[i+1].begin(); j != s[i+1].end(); j++) {
-// int stmt = *(g.vertex[*j].first.begin());
-// std::vector<int> lex = getLexicalOrder(stmt);
-// if (j == s[i+1].begin())
-// min_lex_value = lex[dim-1];
-// else if (lex[dim-1] < min_lex_value)
-// min_lex_value = lex[dim-1];
-// }
-// fused_lex_value = min_lex_value - 1;
-// }
-// else {
-// int max_lex_value;
-// for (std::set<int>::iterator j = s[i-1].begin(); j != s[i-1].end(); j++) {
-// int stmt = *(g.vertex[*j].first.begin());
-// std::vector<int> lex = getLexicalOrder(stmt);
-// if (j == s[i-1].begin())
-// max_lex_value = lex[dim-1];
-// else if (lex[dim-1] > max_lex_value)
-// max_lex_value = lex[dim-1];
-// }
-// fused_lex_value = max_lex_value + 1;
-// }
-// }
-
-// break;
-// }
-
-// // sort the newly updated to-be-fused lexicographical order values
-// std::vector<int> ordered_lex_values;
-// for (std::set<int>::iterator i = lex_values.begin(); i != lex_values.end(); i++)
-// ordered_lex_values.push_back(*i);
-// std::sort(ordered_lex_values.begin(), ordered_lex_values.end());
-
-// // make sure internal loops inside to-be-fused loops have the same
-// // lexicographical order before and after fusion
-// std::vector<std::pair<int, int> > inside_lex_range(ordered_lex_values.size());
-// for (int i = 0; i < ordered_lex_values.size(); i++) {
-// ref_lex[dim-1] = ordered_lex_values[i];
-// std::set<int> the_stmts = getStatements(ref_lex, dim-1);
-// std::set<int>::iterator j = the_stmts.begin();
-// std::vector<int> lex = getLexicalOrder(*j);
-// int min_inside_lex_value = lex[dim+1];
-// int max_inside_lex_value = lex[dim+1];
-// j++;
-// for (; j != the_stmts.end(); j++) {
-// std::vector<int> lex = getLexicalOrder(*j);
-// if (lex[dim+1] < min_inside_lex_value)
-// min_inside_lex_value = lex[dim+1];
-// if (lex[dim+1] > max_inside_lex_value)
-// max_inside_lex_value = lex[dim+1];
-// }
-// inside_lex_range[i].first = min_inside_lex_value;
-// inside_lex_range[i].second = max_inside_lex_value;
-// }
-// for (int i = 1; i < ordered_lex_values.size(); i++)
-// if (inside_lex_range[i].first <= inside_lex_range[i-1].second) {
-// int shift_lex_value = inside_lex_range[i-1].second - inside_lex_range[i].first + 1;
-// ref_lex[dim-1] = ordered_lex_values[i];
-// ref_lex[dim+1] = inside_lex_range[i].first;
-// shiftLexicalOrder(ref_lex, dim+1, shift_lex_value);
-// inside_lex_range[i].first += shift_lex_value;
-// inside_lex_range[i].second += shift_lex_value;
-// }
-
-// // set lexicographical order for fused loops
-// for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
-// std::vector<int> lex = getLexicalOrder(*i);
-// if (lex_values.find(lex[dim-1]) != lex_values.end())
-// assign_const(stmt[*i].xform, dim-1, fused_lex_value);
-// }
-
-// // no need to update dependence graph
-// ;
-
-// return true;
-// }
-
-// bool Loop::distribute(const std::set<int> &stmt_nums, int level) {
-// if (stmt_nums.size() == 0 || stmt_nums.size() == 1)
-// return true;
-// int dim = 2*level-1;
-
-// // check for sanity of parameters
-// std::vector<int> ref_lex;
-// for (std::set<int>::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) {
-// if (*i < 0 || *i >= stmt.size())
-// throw std::invalid_argument("invalid statement number " + to_string(*i));
-// if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2)
-// throw std::invalid_argument("invalid loop level " + to_string(level));
-// if (ref_lex.size() == 0)
-// ref_lex = getLexicalOrder(*i);
-// else {
-// std::vector<int> lex = getLexicalOrder(*i);
-// for (int j = 0; j <= dim-1; j+=2)
-// if (lex[j] != ref_lex[j])
-// throw std::invalid_argument("statements for distribution must be in the same level-" + to_string(level) + " subloop");
-// }
-// }
-
-// // find SCC in the to-be-distributed loop
-// int dep_dim = xform_index[dim].first;
-// std::set<int> same_loop = getStatements(ref_lex, dim-1);
-// Graph<int, Empty> g;
-// for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-// g.insert(*i);
-// for (int i = 0; i < g.vertex.size(); i++)
-// for (int j = i+1; j < g.vertex.size(); j++) {
-// std::vector<DependenceVector> dvs;
-// dvs = dep.getEdge(g.vertex[i].first, g.vertex[j].first);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g.connect(i, j);
-// break;
-// }
-// dvs = dep.getEdge(g.vertex[j].first, g.vertex[i].first);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g.connect(j, i);
-// break;
-// }
-// }
-// std::vector<std::set<int> > s = g.topoSort();
-
-// // find statements that cannot be distributed due to dependence cycle
-// Graph<std::set<int>, Empty> g2;
-// for (int i = 0; i < s.size(); i++) {
-// std::set<int> t;
-// for (std::set<int>::iterator j = s[i].begin(); j != s[i].end(); j++)
-// if (stmt_nums.find(g.vertex[*j].first) != stmt_nums.end())
-// t.insert(g.vertex[*j].first);
-// if (!t.empty())
-// g2.insert(t);
-// }
-// for (int i = 0; i < g2.vertex.size(); i++)
-// for (int j = i+1; j < g2.vertex.size(); j++)
-// for (std::set<int>::iterator ii = g2.vertex[i].first.begin(); ii != g2.vertex[i].first.end(); ii++)
-// for (std::set<int>::iterator jj = g2.vertex[j].first.begin(); jj != g2.vertex[j].first.end(); jj++) {
-// std::vector<DependenceVector> dvs;
-// dvs = dep.getEdge(*ii, *jj);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g2.connect(i, j);
-// break;
-// }
-// dvs = dep.getEdge(*jj, *ii);
-// for (int k = 0; k < dvs.size(); k++)
-// if (dvs[k].isCarried(dep_dim)) {
-// g2.connect(j, i);
-// break;
-// }
-// }
-// std::vector<std::set<int> > s2 = g2.topoSort();
-
-// // nothing to distribute
-// if (s2.size() == 1)
-// throw loop_error("loop error: no statement can be distributed due to dependence cycle");
-
-// std::vector<std::set<int> > s3;
-// for (int i = 0; i < s2.size(); i++) {
-// std::set<int> t;
-// for (std::set<int>::iterator j = s2[i].begin(); j != s2[i].end(); j++)
-// std::set_union(t.begin(), t.end(), g2.vertex[*j].first.begin(), g2.vertex[*j].first.end(), inserter(t, t.begin()));
-// s3.push_back(t);
-// }
-
-// // associate other affected statements with the right distributed statements
-// for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++)
-// if (stmt_nums.find(*i) == stmt_nums.end()) {
-// bool is_inserted = false;
-// int potential_insertion_point = 0;
-// for (int j = 0; j < s3.size(); j++) {
-// for (std::set<int>::iterator k = s3[j].begin(); k != s3[j].end(); k++) {
-// std::vector<DependenceVector> dvs;
-// dvs = dep.getEdge(*i, *k);
-// for (int kk = 0; kk < dvs.size(); kk++)
-// if (dvs[kk].isCarried(dep_dim)) {
-// s3[j].insert(*i);
-// is_inserted = true;
-// break;
-// }
-// dvs = dep.getEdge(*k, *i);
-// for (int kk = 0; kk < dvs.size(); kk++)
-// if (dvs[kk].isCarried(dep_dim))
-// potential_insertion_point = j;
-// }
-// if (is_inserted)
-// break;
-// }
-
-// if (!is_inserted)
-// s3[potential_insertion_point].insert(*i);
-// }
-
-// // set lexicographical order after distribution
-// int order = ref_lex[dim-1];
-// shiftLexicalOrder(ref_lex, dim-1, s3.size()-1);
-// for (std::vector<std::set<int> >::iterator i = s3.begin(); i != s3.end(); i++) {
-// for (std::set<int>::iterator j = (*i).begin(); j != (*i).end(); j++)
-// assign_const(stmt[*j].xform, dim-1, order);
-// order++;
-// }
-
-// // no need to update dependence graph
-// ;
-
-// return true;
-// }
-
diff --git a/mem_mapping_utils.cc b/mem_mapping_utils.cc
deleted file mode 100644
index 645fe59..0000000
--- a/mem_mapping_utils.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-#include <vector>
-#include <string.h>
-#include <map>
-#include "rose.h"
-#include "mem_mapping_utils.hh"
-
-using namespace SageBuilder;
-using namespace SageInterface;
-
-memory_mapping::memory_mapping (bool used, const char * array_name){
- this->mem_used = used;
- this->add(array_name);
-}
-
-texture_memory_mapping::texture_memory_mapping(bool used, const char* array_name) : memory_mapping(used, array_name) { }
-constant_memory_mapping::constant_memory_mapping(bool used, const char* array_name) : memory_mapping(used, array_name) { }
-//texture_memory_mapping::texture_memory_mapping (bool used, const char* array_name, int width, int height) {
-// tex_mem_used = used;
-// this->add(array_name, width, height);
-//}
-
-void memory_mapping::add(const char * array_name) {
- this->mapped_array_name.push_back(std::string(array_name));
- //std::vector<int> ivec = std::vector<int>();
- //dims[std::string(array_name)] = ivec;
-}
-//void texture_memory_mapping::add(const char* array_name, int width, int height) {
-// tex_mapped_array_name.push_back(std::string(array_name));
-// std::vector<int> ivec = std::vector<int>();
-// ivec.push_back(width);
-// ivec.push_back(height);
-// dims[std::string(array_name)] = ivec;
-//}
-
-bool memory_mapping::is_mem_used(){
- return this->mem_used;
-}
-bool memory_mapping::is_array_mapped(const char * array_name){
-
- for( int i=0; i<mapped_array_name.size(); i++){
- if(!(strcmp(array_name, mapped_array_name[i].c_str())))
- return true;
- }
- return false;
-}
-void memory_mapping::set_mapped_symbol(const char * array_name, SgVariableSymbol* sym) {
- this->mapped_symbol[std::string(array_name)] = sym;
-}
-void texture_memory_mapping::set_devptr_symbol(const char * array_name, SgVariableSymbol* sym) {
- devptr_symbol[std::string(array_name)] = sym;
-}
-void memory_mapping::set_vardef(const char* array_name, VarDefs* vardef) {
- this->vardefs[std::string(array_name)] = vardef;
-}
-SgVarRefExp* memory_mapping::get_mapped_symbol_exp(const char * array_name) {
- return buildVarRefExp(this->mapped_symbol[std::string(array_name)]);
-}
-SgVarRefExp* texture_memory_mapping::get_devptr_symbol_exp(const char * array_name) {
- return buildVarRefExp(devptr_symbol[std::string(array_name)]);
-}
-VarDefs* memory_mapping::get_vardef(const char* vardef_name) {
- return this->vardefs[std::string(vardef_name)];
-}
-//int texture_memory_mapping::get_dims(const char* array_name) {
-// return (int)(dims[std::string(array_name)].size());
-//}
-//int texture_memory_mapping::get_dim_length(const char* array_name, int dim) {
-// return dims[std::string(array_name)][dim];
-//}
-memory_mapping::memory_mapping() {
- mem_used = false;
-}
-texture_memory_mapping::texture_memory_mapping() : memory_mapping() { }
-constant_memory_mapping::constant_memory_mapping() : memory_mapping() { }
-
-
diff --git a/mem_mapping_utils.hh b/mem_mapping_utils.hh
deleted file mode 100644
index 8ff0545..0000000
--- a/mem_mapping_utils.hh
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef MEM_MAPPING_UTILS_HH
-#define MEM_MAPPING_UTILS_HH
-
-#include <vector>
-#include <string.h>
-#include <map>
-#include "rose.h"
-
-using namespace SageInterface;
-using namespace SageBuilder;
-
-struct VarDefs;
-
-class memory_mapping {
-private:
- bool mem_used;
- std::vector< std::string > mapped_array_name;
- std::map<std::string, SgVariableSymbol*> mapped_symbol;
- std::map<std::string, VarDefs*> vardefs;
-public:
- memory_mapping();
- memory_mapping(bool used, const char* array_name);
- void add(const char* array_name);
- bool is_mem_used();
- bool is_array_mapped(const char* array_name);
- void set_mapped_symbol(const char* array_name, SgVariableSymbol* sym);
- void set_vardef(const char* array_name, VarDefs* vardef);
- SgVarRefExp* get_mapped_symbol_exp(const char* array_name);
- VarDefs* get_vardef(const char* vardef_name);
-};
-
-//protonu --class introduced to hold texture memory information in one single place
-//this might help me get over the weird memory issues I am having with the Loop class
-//where someone/something corrupts my memory
-
-class texture_memory_mapping : public memory_mapping {
-private:
- std::map<std::string, SgVariableSymbol*> devptr_symbol;
- // a hack for multi-dimensional texture mapping
- //std::map<std::string, std::vector<int> > dims;
-public:
- texture_memory_mapping ( bool used, const char * array_name);
- //texture_memory_mapping (bool used, const char* array_name, int width, int height);
- // this function is a hack to get arround a bug
- // void add(const char* array_name, int width, int height);
- void set_devptr_symbol(const char * array_name, SgVariableSymbol* sym);
- SgVarRefExp* get_devptr_symbol_exp(const char * array_name);
- //int get_dim_length(const char* array_name, int dim);
- //int get_dims(const char* array_name);
- texture_memory_mapping();
-};
-
-class constant_memory_mapping : public memory_mapping {
-public:
- constant_memory_mapping();
- constant_memory_mapping(bool used, const char* array_name);
-};
-
-#endif
diff --git a/omega/INSTALL b/omega/INSTALL
deleted file mode 100644
index f3c3558..0000000
--- a/omega/INSTALL
+++ /dev/null
@@ -1,34 +0,0 @@
-BUILD
-=====
-
-0. Install Rose using the rose installation instructions given.
-
-1. Edit Makefile.config. Change BUILD_CODEGEN to false if you don't want
- CodeGen+ library to be built.
-
-2. Do "make depend".
-
-3. Optionally, do "make clean" to remove object files or "make veryclean"
- to additionally remove target files.
-
-4. Do "make".
-
-
-INSTALLATION
-============
-
-You can use Omega+ and CodeGen+ directly from the source directory, since all
-links are already created in the bin/, lib/ and include/ subdirectories.
-
-omega/ source directory root
- bin/ command line interface "oc"
- lib/ libraries "libomega.a" and "libcode_gen.a"
- include/
- omega.h main Omega+ header file
- omega/ Omega+ header files
- basic/ basic utility header files
- code_gen/ CodeGen+ header files
-
-You can also do "make install" to copy necessary files into
-/usr/local for root account, or use home directory for other accounts.
-
diff --git a/omega/README b/omega/README
deleted file mode 100644
index 378f4bd..0000000
--- a/omega/README
+++ /dev/null
@@ -1,96 +0,0 @@
-Omega+ and CodeGen+ 2.2 open source release
-See LICENSE file for copyright information.
-
-Omega+ is a mathematical library for manipulating integer linear
-constraints over integer variables in first-order logic, and for
-operating on integer sets and their mappings. CodeGen+ is a library that
-generates code by scanning the points in a union of polytopes.
-A command-line interface to both libraries is also included.
-
-
-What is new?
-============
-
-version 2.2:
- * Redesigned polyhedra scanning which generates higher quality code
- than before especially for complex set of polyhedra.
- * New SimpleHull for hull approximation (deprecate Hull).
- * Command line editing and history support in calculator.
-
-version 2.1:
- * Updated "effort" parameter's meaning in MMGenerateCode: value n
- (n >= 0, default to 1) means that control overheads are removed
- from all n-depth innermost loops.
- * Enhanced stride handling in the code generation.
- * Support code generation for a set of iteration spaces with different
- dimensionality.
- * New ConvexRepresentation that reduces the number of conjuncts in a union
- (deprecate CheckForConvexPairs and CheckForConvexRepresentation).
- * Handle floor/ceiling defined variables cleanly in output code.
- * Use namespace omega for the library.
- * New closure functions contributed by Klimek Tomasz (R^+ and R^@).
-
-version 2.0:
- * Improved internal code generation interface so that it generates both
- string and rose output now, and is more easily extensible to new compiler
- intermediate representations.
- * Improved gist function so that integer modular constraints are handled
- more gracefully.
- * Merge duplicate if-conditions in generated code, which might still miss
- a few opportunities due to the way AST is constructed.
- * Correct output/input variable substitution for non-unimodular
- mapping relations.
- * Deprecate Omega's assert/Exit interface.
- * Some fixing in calculator's parsing and interactive interface.
-
-version 1.2 (Omega Project):
- * Support for code generation with memory mappings, as described in
- Tina Shen's MASPLAS '98 paper. This is available in oc via the
- tcodegen function; see examples/calc/mm* for examples.
- * Use of the compile-time flags -DSTILL_CHECK_MULT=1 -DNDEBUG turns off
- all assertions and checks _except_ some checks for integer overflow
- during variable elimination in the omega core. Unless you know a priori
- that overflow cannot occur, you should use this instead of just plain
- -DNDEBUG when optimizing.
- * You can now use "assertUnsatisfiable relation" to cause oc to quit if
- "relation" could be satisfiable. This is mainly useful when running oc
- in a script.
-
-version 1.1 (Omega Project):
- * An exact convex hull computation.
- * An improved system for handling inexact relations, including taking
- upper and lower bounds, checking for subsets, and checking tautologies.
- * Better handling of existentially quantified variables: we can now
- negate and generate code for sets like:
- {[i]: 1 <= i <= n && exists (alpha: i <= 10*alpha <= i+k)}.
- * An Example operator, that gives a sample solution to set or relation.
-
-version 0.90 (Omega Project):
- * Initial release.
-
-
-DIRECTORIES
-===========
-
-omega/
- omega_lib/ source files for the Omega+ library
- code_gen/ source files for the CodeGen+ library
- omega_calc/ source files for the calculator
- examples/ script examples using calculator
- c_code/ code examples for using libraries
- bin/ links to executables: oc
- lib/ links to libraries: libomega.a, libcode_gen.a
- include/ links to header files
-
-
-DOCUMENTATION AND QUESTIONS
-===========================
-
-There are only old documents from the Omega Project under doc/ subdirectory
-for now.
-
-Software website:
- http://www.chunchen.info/omega
-
-For questions, bug reports or suggestions, please contact:
- mailto:riverofdreams@gmail.com
diff --git a/omega/ROSE_INSTALL.txt b/omega/ROSE_INSTALL.txt
deleted file mode 100644
index 79e0c43..0000000
--- a/omega/ROSE_INSTALL.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-INSTALLATION STEPS:
-
-1) Please install Boost library version <= 1.45.0 using these instructions
-
-1. Download BOOST.
-Download BOOST at www.boost.org/users/download.
-
-2. Untar BOOST.
-Type tar -zxf BOOST-[VersionNumber].tar.gz to untar the BOOST distribution.
-
-3. Create a separate install tree.
-Type mkdir installTree to create a location for the install.
-
-4. Run the bootstrap.sh script.
-Type ./bootstrap.sh --prefix=[installTree]
-
-5. Run bjam.
-Type ./bjam install --prefix=[installTree]
-
-
-6) set your BOOSTHOME environment variable to where you've installed BOOST.
-
-7) Download the latest version of rose from the website.
- https://outreach.scidac.gov/frs/?group_id=24
-
-8) set the JAVA_HOME environment variable in your ${HOME}/.bashrc
- eg. export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk
-
-9) add this to the LD_LIBRARY_PATH environment variable
-
- LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/i386/server:$LD_LIBRARY_PATH
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${BOOSTHOME}/lib
-
-10) make a new empty directory separate from the downloaded source directory($ROSE_SRC) for rose.
- eg. mkdir ${HOME}/compileTree
-
-11) set your ROSEHOME environment variable in ${HOME}/.bashrc to ${HOME}/compileTree
-
-12) run the following command from this ${ROSEHOME}
- ${ROSE_SRC}/configure --prefix=${ROSEHOME} --with-boost=${BOOSTHOME} --with-boost-libdir=${BOOSTHOME}/lib -with-haskell=no
-
-13) run the following command to compile:
- make install-core
-
-
-14) Install lua version <= 5.1 (usually not necessary to set the LUAHOME environment variable unless
- you installed it in a local directory, in which case set LUAHOME to that directory). Lua is only required for
- cuda-chill and not plain chill.
-
-15) If you are installing for CUDA-CHILL set the CUDACHILL environment variable to true
- else false
-
-
-16) Install omega by doing the following commands
- i) make clean
- ii) make veryclean
- iii)make depend
- iv) make
-
-17) Set your OMEGAHOME environment variable to the appropriate directory in ${HOME}/.bashrc
-
-18) Install cuda-chill by doing the following commands
- i) make clean
- ii) make veryclean
- iii)make depend-cuda-chill
- iv) make cuda-chill
-
- else if you are installing just plain chill
- export CUDACHILL=false; (remember to rebuild plain omega as well)
- i) make clean
- ii) make veryclean
- iii)make depend
- iv) make
-
-19) Go to examples/cuda-chill and run ../../cuda-chill mm.lua
-
-20) If running plain Chill go to examples/chill and run ../../chill gemm.script
diff --git a/omega/bin/oc b/omega/bin/oc
deleted file mode 120000
index be58273..0000000
--- a/omega/bin/oc
+++ /dev/null
@@ -1 +0,0 @@
-../omega_calc/obj/oc \ No newline at end of file
diff --git a/orig_loop_datacopy.cc b/orig_loop_datacopy.cc
deleted file mode 100644
index 04741bc..0000000
--- a/orig_loop_datacopy.cc
+++ /dev/null
@@ -1,1175 +0,0 @@
-/*****************************************************************************
- Copyright (C) 2008 University of Southern California
- Copyright (C) 2009-2010 University of Utah
- All Rights Reserved.
-
- Purpose:
- Various data copy schemes.
-
- Notes:
-
- History:
- 02/20/09 Created by Chun Chen by splitting original datacopy from loop.cc
-*****************************************************************************/
-
-#include <code_gen/code_gen.h>
-#include <code_gen/output_repr.h>
-#include "loop.hh"
-#include "omegatools.hh"
-#include "ir_code.hh"
-#include "chill_error.hh"
-
-using namespace omega;
-
-//
-// data copy function by referring arrays by numbers.
-// e.g. A[i] = A[i-1] + B[i]
-// parameter array_ref_num=[0,2] means to copy data touched by A[i-1] and A[i]
-//
-bool Loop::datacopy(const std::vector<std::pair<int, std::vector<int> > > &array_ref_nums, int level,
- bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) {
- // check for sanity of parameters
- std::set<int> same_loop;
- for (int i = 0; i < array_ref_nums.size(); i++) {
- int stmt_num = array_ref_nums[i].first;
- if (stmt_num < 0 || stmt_num >= stmt.size())
- throw std::invalid_argument("invalid statement number " + to_string(stmt_num));
- if (level <= 0 || level > stmt[stmt_num].loop_level.size())
- throw std::invalid_argument("invalid loop level " + to_string(level));
- if (i == 0) {
- std::vector<int> lex = getLexicalOrder(stmt_num);
- same_loop = getStatements(lex, 2*level-2);
- }
- else if (same_loop.find(stmt_num) == same_loop.end())
- throw std::invalid_argument("array references for data copy must be located in the same subloop");
- }
-
- // convert array reference numbering scheme to actual array references
- std::vector<std::pair<int, std::vector<IR_ArrayRef *> > > selected_refs;
- for (int i = 0; i < array_ref_nums.size(); i++) {
- if (array_ref_nums[i].second.size() == 0)
- continue;
-
- int stmt_num = array_ref_nums[i].first;
- selected_refs.push_back(std::make_pair(stmt_num, std::vector<IR_ArrayRef *>()));
- std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[stmt_num].code);
- std::vector<bool> selected(refs.size(), false);
- for (int j = 0; j < array_ref_nums[i].second.size(); j++) {
- int ref_num = array_ref_nums[i].second[j];
- if (ref_num < 0 || ref_num >= refs.size()) {
- for (int k = 0; k < refs.size(); k++)
- delete refs[k];
- throw std::invalid_argument("invalid array reference number " + to_string(ref_num) + " in statement " + to_string(stmt_num));
- }
- selected_refs[selected_refs.size()-1].second.push_back(refs[ref_num]);
- selected[ref_num] = true;
- }
- for (int j = 0; j < refs.size(); j++)
- if (!selected[j])
- delete refs[j];
- }
- if (selected_refs.size() == 0)
- throw std::invalid_argument("found no array references to copy");
-
- // do the copy
- return datacopy_privatized(selected_refs, level, std::vector<int>(), allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type);
-}
-
-//
-// data copy function by referring arrays by name.
-// e.g. A[i] = A[i-1] + B[i]
-// parameter array_name=A means to copy data touched by A[i-1] and A[i]
-//
-bool Loop::datacopy(int stmt_num, int level, const std::string &array_name,
- bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) {
- // check for sanity of parameters
- if (stmt_num < 0 || stmt_num >= stmt.size())
- throw std::invalid_argument("invalid statement number " + to_string(stmt_num));
- if (level <= 0 || level > stmt[stmt_num].loop_level.size())
- throw std::invalid_argument("invalid loop level " + to_string(level));
-
- // collect array references by name
- std::vector<int> lex = getLexicalOrder(stmt_num);
- int dim = 2*level - 1;
- std::set<int> same_loop = getStatements(lex, dim-1);
-
- std::vector<std::pair<int, std::vector<IR_ArrayRef *> > > selected_refs;
- for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
- std::vector<IR_ArrayRef *> t;
- std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[*i].code);
- for (int j = 0; j < refs.size(); j++)
- if (refs[j]->name() == array_name)
- t.push_back(refs[j]);
- else
- delete refs[j];
- if (t.size() != 0)
- selected_refs.push_back(std::make_pair(*i, t));
- }
- if (selected_refs.size() == 0)
- throw std::invalid_argument("found no array references with name " + to_string(array_name) + " to copy");
-
- // do the copy
- return datacopy_privatized(selected_refs, level, std::vector<int>(), allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type);
-}
-
-
-bool Loop::datacopy_privatized(int stmt_num, int level, const std::string &array_name, const std::vector<int> &privatized_levels,
- bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) {
- // check for sanity of parameters
- if (stmt_num < 0 || stmt_num >= stmt.size())
- throw std::invalid_argument("invalid statement number " + to_string(stmt_num));
- if (level <= 0 || level > stmt[stmt_num].loop_level.size())
- throw std::invalid_argument("invalid loop level " + to_string(level));
-
- // collect array references by name
- std::vector<int> lex = getLexicalOrder(stmt_num);
- int dim = 2*level - 1;
- std::set<int> same_loop = getStatements(lex, dim-1);
-
- std::vector<std::pair<int, std::vector<IR_ArrayRef *> > > selected_refs;
- for (std::set<int>::iterator i = same_loop.begin(); i != same_loop.end(); i++) {
- selected_refs.push_back(std::make_pair(*i, std::vector<IR_ArrayRef *>()));
-
- std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[*i].code);
- for (int j = 0; j < refs.size(); j++)
- if (refs[j]->name() == array_name)
- selected_refs[selected_refs.size()-1].second.push_back(refs[j]);
- else
- delete refs[j];
- }
- if (selected_refs.size() == 0)
- throw std::invalid_argument("found no array references with name " + to_string(array_name) + " to copy");
-
- // do the copy
- return datacopy_privatized(selected_refs, level, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type);
-}
-
-
-bool Loop::datacopy_privatized(const std::vector<std::pair<int, std::vector<int> > > &array_ref_nums, int level, const std::vector<int> &privatized_levels, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) {
- // check for sanity of parameters
- std::set<int> same_loop;
- for (int i = 0; i < array_ref_nums.size(); i++) {
- int stmt_num = array_ref_nums[i].first;
- if (stmt_num < 0 || stmt_num >= stmt.size())
- throw std::invalid_argument("invalid statement number " + to_string(stmt_num));
- if (level <= 0 || level > stmt[stmt_num].loop_level.size())
- throw std::invalid_argument("invalid loop level " + to_string(level));
- if (i == 0) {
- std::vector<int> lex = getLexicalOrder(stmt_num);
- same_loop = getStatements(lex, 2*level-2);
- }
- else if (same_loop.find(stmt_num) == same_loop.end())
- throw std::invalid_argument("array references for data copy must be located in the same subloop");
- }
-
- // convert array reference numbering scheme to actual array references
- std::vector<std::pair<int, std::vector<IR_ArrayRef *> > > selected_refs;
- for (int i = 0; i < array_ref_nums.size(); i++) {
- if (array_ref_nums[i].second.size() == 0)
- continue;
-
- int stmt_num = array_ref_nums[i].first;
- selected_refs.push_back(std::make_pair(stmt_num, std::vector<IR_ArrayRef *>()));
- std::vector<IR_ArrayRef *> refs = ir->FindArrayRef(stmt[stmt_num].code);
- std::vector<bool> selected(refs.size(), false);
- for (int j = 0; j < array_ref_nums[i].second.size(); j++) {
- int ref_num = array_ref_nums[i].second[j];
- if (ref_num < 0 || ref_num >= refs.size()) {
- for (int k = 0; k < refs.size(); k++)
- delete refs[k];
- throw std::invalid_argument("invalid array reference number " + to_string(ref_num) + " in statement " + to_string(stmt_num));
- }
- selected_refs[selected_refs.size()-1].second.push_back(refs[ref_num]);
- selected[ref_num] = true;
- }
- for (int j = 0; j < refs.size(); j++)
- if (!selected[j])
- delete refs[j];
- }
- if (selected_refs.size() == 0)
- throw std::invalid_argument("found no array references to copy");
-
- // do the copy
- return datacopy_privatized(selected_refs, level, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type);
-}
-
-
-//
-// Implement low level datacopy function with lots of options.
-//
-bool Loop::datacopy_privatized(const std::vector<std::pair<int, std::vector<IR_ArrayRef *> > > &stmt_refs, int level,
- const std::vector<int> &privatized_levels,
- bool allow_extra_read, int fastest_changing_dimension,
- int padding_stride, int padding_alignment, int memory_type) {
- if (stmt_refs.size() == 0)
- return true;
-
- // check for sanity of parameters
- IR_ArraySymbol *sym = NULL;
- std::vector<int> lex;
- std::set<int> active;
- if (level <= 0)
- throw std::invalid_argument("invalid loop level " + to_string(level));
- for (int i = 0; i < privatized_levels.size(); i++) {
- if (i == 0) {
- if (privatized_levels[i] < level)
- throw std::invalid_argument("privatized loop levels must be no less than level " + to_string(level));
- }
- else if (privatized_levels[i] <= privatized_levels[i-1])
- throw std::invalid_argument("privatized loop levels must be in ascending order");
- }
- for (int i = 0; i < stmt_refs.size(); i++) {
- int stmt_num = stmt_refs[i].first;
- active.insert(stmt_num);
- if (stmt_num < 0 || stmt_num >= stmt.size())
- throw std::invalid_argument("invalid statement number " + to_string(stmt_num));
- if (privatized_levels.size() != 0) {
- if (privatized_levels[privatized_levels.size()-1] > stmt[stmt_num].loop_level.size())
- throw std::invalid_argument("invalid loop level " + to_string(privatized_levels[privatized_levels.size()-1]) + " for statement " + to_string(stmt_num));
- }
- else {
- if (level > stmt[stmt_num].loop_level.size())
- throw std::invalid_argument("invalid loop level " + to_string(level) + " for statement " + to_string(stmt_num));
- }
- for (int j = 0; j < stmt_refs[i].second.size(); j++) {
- if (sym == NULL) {
- sym = stmt_refs[i].second[j]->symbol();
- lex = getLexicalOrder(stmt_num);
- }
- else {
- IR_ArraySymbol *t = stmt_refs[i].second[j]->symbol();
- if (t->name() != sym->name()) {
- delete t;
- delete sym;
- throw std::invalid_argument("try to copy data from different arrays");
- }
- delete t;
- }
- }
- }
- if (!(fastest_changing_dimension >= -1 && fastest_changing_dimension < sym->n_dim()))
- throw std::invalid_argument("invalid fastest changing dimension for the array to be copied");
- if (padding_stride < 0)
- throw std::invalid_argument("invalid temporary array stride requirement");
- if (padding_alignment == -1 || padding_alignment == 0)
- throw std::invalid_argument("invalid temporary array alignment requirement");
-
- int dim = 2*level - 1;
- int n_dim = sym->n_dim();
-
- if (fastest_changing_dimension == -1)
- switch (sym->layout_type()) {
- case IR_ARRAY_LAYOUT_ROW_MAJOR:
- fastest_changing_dimension = n_dim - 1;
- break;
- case IR_ARRAY_LAYOUT_COLUMN_MAJOR:
- fastest_changing_dimension = 0;
- break;
- default:
- throw loop_error("unsupported array layout");
- }
-
-
- // build iteration spaces for all reads and for all writes separately
- apply_xform(active);
- bool has_write_refs = false;
- bool has_read_refs = false;
- Relation wo_copy_is = Relation::False(level-1+privatized_levels.size()+n_dim);
- Relation ro_copy_is = Relation::False(level-1+privatized_levels.size()+n_dim);
- for (int i = 0; i < stmt_refs.size(); i++) {
- int stmt_num = stmt_refs[i].first;
-
- for (int j = 0; j < stmt_refs[i].second.size(); j++) {
- Relation mapping(stmt[stmt_num].IS.n_set(), level-1+privatized_levels.size()+n_dim);
- for (int k = 1; k <= mapping.n_inp(); k++)
- mapping.name_input_var(k, stmt[stmt_num].IS.set_var(k)->name());
- mapping.setup_names();
- F_And *f_root = mapping.add_and();
- for (int k = 1; k <= level-1; k++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.input_var(k), 1);
- h.update_coef(mapping.output_var(k), -1);
- }
- for (int k = 0; k < privatized_levels.size(); k++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.input_var(privatized_levels[k]), 1);
- h.update_coef(mapping.output_var(level+k), -1);
- }
- for (int k = 0; k < n_dim; k++) {
- CG_outputRepr *repr = stmt_refs[i].second[j]->index(k);
- exp2formula(ir, mapping, f_root, freevar, repr, mapping.output_var(level-1+privatized_levels.size()+k+1), 'w', IR_COND_EQ, false);
- repr->clear();
- delete repr;
- }
- Relation r = Range(Restrict_Domain(mapping, Intersection(copy(stmt[stmt_num].IS), Extend_Set(copy(this->known), stmt[stmt_num].IS.n_set() - this->known.n_set()))));
- if (stmt_refs[i].second[j]->is_write()) {
- has_write_refs = true;
- wo_copy_is = Union(wo_copy_is, r);
- wo_copy_is.simplify(2, 4);
- }
- else {
- has_read_refs = true;
- //protonu--removing the next line for now
- ro_copy_is = Union(ro_copy_is, r);
- ro_copy_is.simplify(2, 4);
- //ro_copy_is = ConvexRepresentation(Union(ro_copy_is, r));
-
- }
- }
- }
-
- if (allow_extra_read) {
- Relation t = DecoupledConvexHull(copy(ro_copy_is));
- if (t.number_of_conjuncts() > 1)
- ro_copy_is = RectHull(ro_copy_is);
- else
- ro_copy_is = t;
- }
- else {
- Relation t = ConvexRepresentation(copy(ro_copy_is));
- if (t.number_of_conjuncts() > 1)
- ro_copy_is = RectHull(ro_copy_is);
- else
- ro_copy_is = t;
- }
- wo_copy_is = ConvexRepresentation(wo_copy_is);
-
- if (allow_extra_read) {
- Tuple<Relation> Rs;
- Tuple<int> active;
- for (DNF_Iterator di(ro_copy_is.query_DNF()); di; di++) {
- Rs.append(Relation(ro_copy_is, di.curr()));
- active.append(1);
- }
- Relation the_gcs = Relation::True(ro_copy_is.n_set());
- for (int i = level-1+privatized_levels.size()+1; i <= level-1+privatized_levels.size()+n_dim; i++) {
- Relation r = greatest_common_step(Rs, active, i, Relation::Null());
- the_gcs = Intersection(the_gcs, r);
- }
-
- ro_copy_is = Approximate(ro_copy_is);
- ro_copy_is = ConvexRepresentation(ro_copy_is);
- ro_copy_is = Intersection(ro_copy_is, the_gcs);
- ro_copy_is.simplify();
- }
- for (int i = 1; i <= level-1+privatized_levels.size()+n_dim; i++) {
- wo_copy_is.name_set_var(i, tmp_loop_var_name_prefix+to_string(i));
- ro_copy_is.name_set_var(i, tmp_loop_var_name_prefix+to_string(i));
- }
- wo_copy_is.setup_names();
- ro_copy_is.setup_names();
-
- // build merged iteration space for calculating temporary array size
- bool already_use_recthull = false;
- Relation untampered_copy_is = ConvexRepresentation(Union(copy(wo_copy_is), copy(ro_copy_is)));
- Relation copy_is = untampered_copy_is;
- if (copy_is.number_of_conjuncts() > 1) {
- try {
- copy_is = ConvexHull(copy(untampered_copy_is));
- }
- catch (const std::overflow_error &e) {
- copy_is = RectHull(copy(untampered_copy_is));
- already_use_recthull = true;
- }
- }
-
-
-Retry_copy_is:
- // extract temporary array information
- CG_outputBuilder *ocg = ir->builder();
- std::vector<CG_outputRepr *> index_lb(n_dim); // initialized to NULL
- std::vector<coef_t> index_stride(n_dim, 1);
- std::vector<bool> is_index_eq(n_dim, false);
- std::vector<std::pair<int, CG_outputRepr *> > index_sz(0);
- Relation reduced_copy_is = copy(copy_is);
-
- for (int i = 0; i < n_dim; i++) {
- if (i != 0)
- reduced_copy_is = Project(reduced_copy_is, level-1+privatized_levels.size()+i, Set_Var);
- Relation bound = get_loop_bound(reduced_copy_is, level-1+privatized_levels.size()+i);
-
- // extract stride
- EQ_Handle stride_eq;
- {
- bool simple_stride = true;
- int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(level-1+privatized_levels.size()+i+1), stride_eq, simple_stride);
- if (strides > 1) {
- throw loop_error("too many strides");
- }
- else if (strides == 1) {
- int sign = stride_eq.get_coef(bound.set_var(level-1+privatized_levels.size()+i+1));
- Constr_Vars_Iter it(stride_eq, true);
- index_stride[i] = abs((*it).coef/sign);
- }
- }
-
- // check if this arary index requires loop
- Conjunct *c = bound.query_DNF()->single_conjunct();
- for (EQ_Iterator ei(c->EQs()); ei; ei++) {
- if ((*ei).has_wildcards())
- continue;
-
- int coef = (*ei).get_coef(bound.set_var(level-1+privatized_levels.size()+i+1));
- if (coef != 0) {
- int sign = 1;
- if (coef < 0) {
- coef = -coef;
- sign = -1;
- }
-
- CG_outputRepr *op = NULL;
- for (Constr_Vars_Iter ci(*ei); ci; ci++) {
- switch ((*ci).var->kind()) {
- case Input_Var:
- {
- if ((*ci).var != bound.set_var(level-1+privatized_levels.size()+i+1))
- if ((*ci).coef*sign == 1)
- op = ocg->CreateMinus(op, ocg->CreateIdent((*ci).var->name()));
- else if ((*ci).coef*sign == -1)
- op = ocg->CreatePlus(op, ocg->CreateIdent((*ci).var->name()));
- else if ((*ci).coef*sign > 1)
- op = ocg->CreateMinus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent((*ci).var->name())));
- else // (*ci).coef*sign < -1
- op = ocg->CreatePlus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent((*ci).var->name())));
- break;
- }
- case Global_Var:
- {
- Global_Var_ID g = (*ci).var->get_global_var();
- if ((*ci).coef*sign == 1)
- op = ocg->CreateMinus(op, ocg->CreateIdent(g->base_name()));
- else if ((*ci).coef*sign == -1)
- op = ocg->CreatePlus(op, ocg->CreateIdent(g->base_name()));
- else if ((*ci).coef*sign > 1)
- op = ocg->CreateMinus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent(g->base_name())));
- else // (*ci).coef*sign < -1
- op = ocg->CreatePlus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent(g->base_name())));
- break;
- }
- default:
- throw loop_error("unsupported array index expression");
- }
- }
- if ((*ei).get_const() != 0)
- op = ocg->CreatePlus(op, ocg->CreateInt(-sign*((*ei).get_const())));
- if (coef != 1)
- op = ocg->CreateIntegerDivide(op, ocg->CreateInt(coef));
-
- index_lb[i] = op;
- is_index_eq[i] = true;
- break;
- }
- }
- if (is_index_eq[i])
- continue;
-
- // seperate lower and upper bounds
- std::vector<GEQ_Handle> lb_list, ub_list;
- for (GEQ_Iterator gi(c->GEQs()); gi; gi++) {
- int coef = (*gi).get_coef(bound.set_var(level-1+privatized_levels.size()+i+1));
- if (coef != 0 && (*gi).has_wildcards()) {
- bool clean_bound = true;
- GEQ_Handle h;
- for (Constr_Vars_Iter cvi(*gi, true); gi; gi++)
- if (!findFloorInequality(bound, (*cvi).var, h, bound.set_var(level-1+privatized_levels.size()+i+1))) {
- clean_bound = false;
- break;
- }
- if (!clean_bound)
- continue;
- }
-
- if (coef > 0)
- lb_list.push_back(*gi);
- else if (coef < 0)
- ub_list.push_back(*gi);
- }
- if (lb_list.size() == 0 || ub_list.size() == 0)
- if (already_use_recthull)
- throw loop_error("failed to calcuate array footprint size");
- else {
- copy_is = RectHull(copy(untampered_copy_is));
- already_use_recthull = true;
- goto Retry_copy_is;
- }
-
- // build lower bound representation
- Tuple<CG_outputRepr *> lb_repr_list;
- for (int j = 0; j < lb_list.size(); j++)
- lb_repr_list.append(outputLBasRepr(ocg, lb_list[j], bound,
- bound.set_var(level-1+privatized_levels.size()+i+1),
- index_stride[i], stride_eq, Relation::True(bound.n_set()),
- std::vector<CG_outputRepr *>(bound.n_set())));
-
- if (lb_repr_list.size() > 1)
- index_lb[i] = ocg->CreateInvoke("max", lb_repr_list);
- else if (lb_repr_list.size() == 1)
- index_lb[i] = lb_repr_list[1];
-
- // build temporary array size representation
- {
- Relation cal(copy_is.n_set(), 1);
- F_And *f_root = cal.add_and();
- for (int j = 0; j < ub_list.size(); j++)
- for (int k = 0; k < lb_list.size(); k++) {
- GEQ_Handle h = f_root->add_GEQ();
-
- for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) {
- switch ((*ci).var->kind()) {
- case Input_Var:
- {
- int pos = (*ci).var->get_position();
- h.update_coef(cal.input_var(pos), (*ci).coef);
- break;
- }
- case Global_Var:
- {
- Global_Var_ID g = (*ci).var->get_global_var();
- Variable_ID v;
- if (g->arity() == 0)
- v = cal.get_local(g);
- else
- v = cal.get_local(g, (*ci).var->function_of());
- h.update_coef(v, (*ci).coef);
- break;
- }
- default:
- throw loop_error("cannot calculate temporay array size statically");
- }
- }
- h.update_const(ub_list[j].get_const());
-
- for (Constr_Vars_Iter ci(lb_list[k]); ci; ci++) {
- switch ((*ci).var->kind()) {
- case Input_Var:
- {
- int pos = (*ci).var->get_position();
- h.update_coef(cal.input_var(pos), (*ci).coef);
- break;
- }
- case Global_Var:
- {
- Global_Var_ID g = (*ci).var->get_global_var();
- Variable_ID v;
- if (g->arity() == 0)
- v = cal.get_local(g);
- else
- v = cal.get_local(g, (*ci).var->function_of());
- h.update_coef(v, (*ci).coef);
- break;
- }
- default:
- throw loop_error("cannot calculate temporay array size statically");
- }
- }
- h.update_const(lb_list[k].get_const());
-
- h.update_const(1);
- h.update_coef(cal.output_var(1), -1);
- }
-
- cal = Restrict_Domain(cal, copy(copy_is));
- for (int j = 1; j <= cal.n_inp(); j++)
- cal = Project(cal, j, Input_Var);
- cal.simplify();
-
- // pad temporary array size
- // TODO: for variable array size, create padding formula
- Conjunct *c = cal.query_DNF()->single_conjunct();
- bool is_index_bound_const = false;
- for (GEQ_Iterator gi(c->GEQs()); gi && !is_index_bound_const; gi++)
- if ((*gi).is_const(cal.output_var(1))) {
- coef_t size = (*gi).get_const() / (-(*gi).get_coef(cal.output_var(1)));
- if (padding_stride != 0) {
- size = (size + index_stride[i] - 1) / index_stride[i];
- if (i == fastest_changing_dimension)
- size = size * padding_stride;
- }
- if (i == fastest_changing_dimension) {
- if (padding_alignment > 1) { // align to boundary for data packing
- int residue = size % padding_alignment;
- if (residue)
- size = size+padding_alignment-residue;
- }
- else if (padding_alignment < -1) { // un-alignment for memory bank conflicts
- while (gcd(size, static_cast<coef_t>(-padding_alignment)) != 1)
- size++;
- }
- }
- index_sz.push_back(std::make_pair(i, ocg->CreateInt(size)));
- is_index_bound_const = true;
- }
-
- if (!is_index_bound_const) {
- for (GEQ_Iterator gi(c->GEQs()); gi && !is_index_bound_const; gi++) {
- int coef = (*gi).get_coef(cal.output_var(1));
- if (coef < 0) {
- CG_outputRepr *op = NULL;
- for (Constr_Vars_Iter ci(*gi); ci; ci++) {
- if ((*ci).var != cal.output_var(1)) {
- switch((*ci).var->kind()) {
- case Global_Var:
- {
- Global_Var_ID g = (*ci).var->get_global_var();
- if ((*ci).coef == 1)
- op = ocg->CreatePlus(op, ocg->CreateIdent(g->base_name()));
- else if ((*ci).coef == -1)
- op = ocg->CreateMinus(op, ocg->CreateIdent(g->base_name()));
- else if ((*ci).coef > 1)
- op = ocg->CreatePlus(op, ocg->CreateTimes(ocg->CreateInt((*ci).coef), ocg->CreateIdent(g->base_name())));
- else // (*ci).coef < -1
- op = ocg->CreateMinus(op, ocg->CreateTimes(ocg->CreateInt(-(*ci).coef), ocg->CreateIdent(g->base_name())));
- break;
- }
- default:
- throw loop_error("failed to generate array index bound code");
- }
- }
- }
- int c = (*gi).get_const();
- if (c > 0)
- op = ocg->CreatePlus(op, ocg->CreateInt(c));
- else if (c < 0)
- op = ocg->CreateMinus(op, ocg->CreateInt(-c));
- if (padding_stride != 0) {
- if (i == fastest_changing_dimension) {
- coef_t g = gcd(index_stride[i], static_cast<coef_t>(padding_stride));
- coef_t t1 = index_stride[i] / g;
- if (t1 != 1)
- op = ocg->CreateIntegerDivide(ocg->CreatePlus(op, ocg->CreateInt(t1-1)), ocg->CreateInt(t1));
- coef_t t2 = padding_stride / g;
- if (t2 != 1)
- op = ocg->CreateTimes(op, ocg->CreateInt(t2));
- }
- else if (index_stride[i] != 1) {
- op = ocg->CreateIntegerDivide(ocg->CreatePlus(op, ocg->CreateInt(index_stride[i]-1)), ocg->CreateInt(index_stride[i]));
- }
- }
-
- index_sz.push_back(std::make_pair(i, op));
- break;
- }
- }
- }
- }
- }
-
- // change the temporary array index order
- for (int i = 0; i < index_sz.size(); i++)
- if (index_sz[i].first == fastest_changing_dimension)
- switch (sym->layout_type()) {
- case IR_ARRAY_LAYOUT_ROW_MAJOR:
- std::swap(index_sz[index_sz.size()-1], index_sz[i]);
- break;
- case IR_ARRAY_LAYOUT_COLUMN_MAJOR:
- std::swap(index_sz[0], index_sz[i]);
- break;
- default:
- throw loop_error("unsupported array layout");
- }
-
- // declare temporary array or scalar
- IR_Symbol *tmp_sym;
- if (index_sz.size() == 0) {
- tmp_sym = ir->CreateScalarSymbol(sym, memory_type);
- }
- else {
- std::vector<CG_outputRepr *> tmp_array_size(index_sz.size());
- for (int i = 0; i < index_sz.size(); i++)
- tmp_array_size[i] = index_sz[i].second->clone();
- tmp_sym = ir->CreateArraySymbol(sym, tmp_array_size, memory_type);
- }
-
- // create temporary array read initialization code
- CG_outputRepr *copy_code_read;
- if (has_read_refs)
- if (index_sz.size() == 0) {
- IR_ScalarRef *tmp_scalar_ref = ir->CreateScalarRef(static_cast<IR_ScalarSymbol *>(tmp_sym));
-
- std::vector<CG_outputRepr *> rhs_index(n_dim);
- for (int i = 0; i < index_lb.size(); i++)
- if (is_index_eq[i])
- rhs_index[i] = index_lb[i]->clone();
- else
- rhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name());
- IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, rhs_index);
-
- copy_code_read = ir->builder()->CreateAssignment(0, tmp_scalar_ref->convert(), copied_array_ref->convert());
- }
- else {
- std::vector<CG_outputRepr *> lhs_index(index_sz.size());
- for (int i = 0; i < index_sz.size(); i++) {
- int cur_index_num = index_sz[i].first;
- CG_outputRepr *cur_index_repr = ocg->CreateMinus(ocg->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+cur_index_num+1)->name()), index_lb[cur_index_num]->clone());
- if (padding_stride != 0) {
- if (i == n_dim-1) {
- coef_t g = gcd(index_stride[cur_index_num], static_cast<coef_t>(padding_stride));
- coef_t t1 = index_stride[cur_index_num] / g;
- if (t1 != 1)
- cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(t1));
- coef_t t2 = padding_stride / g;
- if (t2 != 1)
- cur_index_repr = ocg->CreateTimes(cur_index_repr, ocg->CreateInt(t2));
- }
- else if (index_stride[cur_index_num] != 1) {
- cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(index_stride[cur_index_num]));
- }
- }
-
- if (ir->ArrayIndexStartAt() != 0)
- cur_index_repr = ocg->CreatePlus(cur_index_repr, ocg->CreateInt(ir->ArrayIndexStartAt()));
- lhs_index[i] = cur_index_repr;
- }
-
- IR_ArrayRef *tmp_array_ref = ir->CreateArrayRef(static_cast<IR_ArraySymbol *>(tmp_sym), lhs_index);
-
- std::vector<CG_outputRepr *> rhs_index(n_dim);
- for (int i = 0; i < index_lb.size(); i++)
- if (is_index_eq[i])
- rhs_index[i] = index_lb[i]->clone();
- else
- rhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name());
- IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, rhs_index);
-
- copy_code_read = ir->builder()->CreateAssignment(0, tmp_array_ref->convert(), copied_array_ref->convert());
- }
-
- // create temporary array write back code
- CG_outputRepr *copy_code_write;
- if (has_write_refs)
- if (index_sz.size() == 0) {
- IR_ScalarRef *tmp_scalar_ref = ir->CreateScalarRef(static_cast<IR_ScalarSymbol *>(tmp_sym));
-
- std::vector<CG_outputRepr *> rhs_index(n_dim);
- for (int i = 0; i < index_lb.size(); i++)
- if (is_index_eq[i])
- rhs_index[i] = index_lb[i]->clone();
- else
- rhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name());
- IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, rhs_index);
-
- copy_code_write = ir->builder()->CreateAssignment(0, copied_array_ref->convert(), tmp_scalar_ref->convert());
- }
- else {
- std::vector<CG_outputRepr *> lhs_index(n_dim);
- for (int i = 0; i < index_lb.size(); i++)
- if (is_index_eq[i])
- lhs_index[i] = index_lb[i]->clone();
- else
- lhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name());
- IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, lhs_index);
-
- std::vector<CG_outputRepr *> rhs_index(index_sz.size());
- for (int i = 0; i < index_sz.size(); i++) {
- int cur_index_num = index_sz[i].first;
- CG_outputRepr *cur_index_repr = ocg->CreateMinus(ocg->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+cur_index_num+1)->name()), index_lb[cur_index_num]->clone());
- if (padding_stride != 0) {
- if (i == n_dim-1) {
- coef_t g = gcd(index_stride[cur_index_num], static_cast<coef_t>(padding_stride));
- coef_t t1 = index_stride[cur_index_num] / g;
- if (t1 != 1)
- cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(t1));
- coef_t t2 = padding_stride / g;
- if (t2 != 1)
- cur_index_repr = ocg->CreateTimes(cur_index_repr, ocg->CreateInt(t2));
- }
- else if (index_stride[cur_index_num] != 1) {
- cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(index_stride[cur_index_num]));
- }
- }
-
- if (ir->ArrayIndexStartAt() != 0)
- cur_index_repr = ocg->CreatePlus(cur_index_repr, ocg->CreateInt(ir->ArrayIndexStartAt()));
- rhs_index[i] = cur_index_repr;
- }
- IR_ArrayRef *tmp_array_ref = ir->CreateArrayRef(static_cast<IR_ArraySymbol *>(tmp_sym), rhs_index);
-
- copy_code_write = ir->builder()->CreateAssignment(0, copied_array_ref->convert(), tmp_array_ref->convert());
- }
-
- // now we can remove those loops for array indexes that are
- // dependent on others
- if (!(index_sz.size() == n_dim && (sym->layout_type() == IR_ARRAY_LAYOUT_ROW_MAJOR || n_dim <= 1))) {
- Relation mapping(level-1+privatized_levels.size()+n_dim, level-1+privatized_levels.size()+index_sz.size());
- F_And *f_root = mapping.add_and();
- for (int i = 1; i <= level-1+privatized_levels.size(); i++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.input_var(i), 1);
- h.update_coef(mapping.output_var(i), -1);
- }
-
- int cur_index = 0;
- std::vector<int> mapped_index(index_sz.size());
- for (int i = 0; i < n_dim; i++)
- if (!is_index_eq[i]) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(mapping.input_var(level-1+privatized_levels.size()+i+1), 1);
- switch (sym->layout_type()) {
- case IR_ARRAY_LAYOUT_COLUMN_MAJOR: {
- h.update_coef(mapping.output_var(level-1+privatized_levels.size()+index_sz.size()-cur_index), -1);
- mapped_index[index_sz.size()-cur_index-1] = i;
- break;
- }
- case IR_ARRAY_LAYOUT_ROW_MAJOR: {
- h.update_coef(mapping.output_var(level-1+privatized_levels.size()+cur_index+1), -1);
- mapped_index[cur_index] = i;
- break;
- }
- default:
- throw loop_error("unsupported array layout");
- }
- cur_index++;
- }
-
- wo_copy_is = Range(Restrict_Domain(copy(mapping), wo_copy_is));
- ro_copy_is = Range(Restrict_Domain(copy(mapping), ro_copy_is));
- for (int i = 1; i <= level-1+privatized_levels.size(); i++) {
- wo_copy_is.name_set_var(i, copy_is.set_var(i)->name());
- ro_copy_is.name_set_var(i, copy_is.set_var(i)->name());
- }
- for (int i = 0; i < index_sz.size(); i++) {
- wo_copy_is.name_set_var(level-1+privatized_levels.size()+i+1, copy_is.set_var(level-1+privatized_levels.size()+mapped_index[i]+1)->name());
- ro_copy_is.name_set_var(level-1+privatized_levels.size()+i+1, copy_is.set_var(level-1+privatized_levels.size()+mapped_index[i]+1)->name());
- }
- wo_copy_is.setup_names();
- ro_copy_is.setup_names();
- }
-
- // insert read copy statement
- int old_num_stmt = stmt.size();
- int ro_copy_stmt_num = -1;
- if (has_read_refs) {
- Relation copy_xform(ro_copy_is.n_set(), 2*ro_copy_is.n_set()+1);
- {
- F_And *f_root = copy_xform.add_and();
- for (int i = 1; i <= ro_copy_is.n_set(); i++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(copy_xform.input_var(i), 1);
- h.update_coef(copy_xform.output_var(2*i), -1);
- }
- for (int i = 1; i <= dim; i+=2) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(copy_xform.output_var(i), -1);
- h.update_const(lex[i-1]);
- }
- for (int i = dim+2; i <= copy_xform.n_out(); i+=2) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(copy_xform.output_var(i), 1);
- }
- }
-
- Statement copy_stmt_read;
- copy_stmt_read.IS = ro_copy_is;
- copy_stmt_read.xform = copy_xform;
- copy_stmt_read.code = copy_code_read;
- copy_stmt_read.loop_level = std::vector<LoopLevel>(ro_copy_is.n_set());
- for (int i = 0; i < level-1; i++) {
- copy_stmt_read.loop_level[i].type = stmt[*(active.begin())].loop_level[i].type;
- if (stmt[*(active.begin())].loop_level[i].type == LoopLevelTile &&
- stmt[*(active.begin())].loop_level[i].payload >= level) {
- int j;
- for (j = 0; j < privatized_levels.size(); j++)
- if (privatized_levels[j] == stmt[*(active.begin())].loop_level[i].payload)
- break;
- if (j == privatized_levels.size())
- copy_stmt_read.loop_level[i].payload = -1;
- else
- copy_stmt_read.loop_level[i].payload = level + j;
- }
- else
- copy_stmt_read.loop_level[i].payload = stmt[*(active.begin())].loop_level[i].payload;
- copy_stmt_read.loop_level[i].parallel_level = stmt[*(active.begin())].loop_level[i].parallel_level;
- }
- for (int i = 0; i < privatized_levels.size(); i++) {
- copy_stmt_read.loop_level[level-1+i].type = stmt[*(active.begin())].loop_level[privatized_levels[i]].type;
- copy_stmt_read.loop_level[level-1+i].payload = stmt[*(active.begin())].loop_level[privatized_levels[i]].payload;
- copy_stmt_read.loop_level[level-1+i].parallel_level = stmt[*(active.begin())].loop_level[privatized_levels[i]].parallel_level;
- }
- int left_num_dim = num_dep_dim - (get_last_dep_dim_before(*(active.begin()), level) + 1);
- for (int i = 0; i < min(left_num_dim, static_cast<int>(index_sz.size())); i++) {
- copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelOriginal;
- copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].payload = num_dep_dim-left_num_dim+i;
- copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0;
- }
- for (int i = min(left_num_dim, static_cast<int>(index_sz.size())); i < index_sz.size(); i++) {
- copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelUnknown;
- copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].payload = -1;
- copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0;
- }
-
- shiftLexicalOrder(lex, dim-1, 1);
- stmt.push_back(copy_stmt_read);
- ro_copy_stmt_num = stmt.size() - 1;
- dep.insert();
- }
-
- // insert write copy statement
- int wo_copy_stmt_num = -1;
- if (has_write_refs) {
- Relation copy_xform(wo_copy_is.n_set(), 2*wo_copy_is.n_set()+1);
- {
- F_And *f_root = copy_xform.add_and();
- for (int i = 1; i <= wo_copy_is.n_set(); i++) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(copy_xform.input_var(i), 1);
- h.update_coef(copy_xform.output_var(2*i), -1);
- }
- for (int i = 1; i <= dim; i+=2) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(copy_xform.output_var(i), -1);
- h.update_const(lex[i-1]);
- }
- for (int i = dim+2; i <= copy_xform.n_out(); i+=2) {
- EQ_Handle h = f_root->add_EQ();
- h.update_coef(copy_xform.output_var(i), 1);
- }
- }
-
- Statement copy_stmt_write;
- copy_stmt_write.IS = wo_copy_is;
- copy_stmt_write.xform = copy_xform;
- copy_stmt_write.code = copy_code_write;
- copy_stmt_write.loop_level = std::vector<LoopLevel>(wo_copy_is.n_set());
- for (int i = 0; i < level-1; i++) {
- copy_stmt_write.loop_level[i].type = stmt[*(active.begin())].loop_level[i].type;
- if (stmt[*(active.begin())].loop_level[i].type == LoopLevelTile &&
- stmt[*(active.begin())].loop_level[i].payload >= level) {
- int j;
- for (j = 0; j < privatized_levels.size(); j++)
- if (privatized_levels[j] == stmt[*(active.begin())].loop_level[i].payload)
- break;
- if (j == privatized_levels.size())
- copy_stmt_write.loop_level[i].payload = -1;
- else
- copy_stmt_write.loop_level[i].payload = level + j;
- }
- else
- copy_stmt_write.loop_level[i].payload = stmt[*(active.begin())].loop_level[i].payload;
- copy_stmt_write.loop_level[i].parallel_level = stmt[*(active.begin())].loop_level[i].parallel_level;
- }
- for (int i = 0; i < privatized_levels.size(); i++) {
- copy_stmt_write.loop_level[level-1+i].type = stmt[*(active.begin())].loop_level[privatized_levels[i]].type;
- copy_stmt_write.loop_level[level-1+i].payload = stmt[*(active.begin())].loop_level[privatized_levels[i]].payload;
- copy_stmt_write.loop_level[level-1+i].parallel_level = stmt[*(active.begin())].loop_level[privatized_levels[i]].parallel_level;
- }
- int left_num_dim = num_dep_dim - (get_last_dep_dim_before(*(active.begin()), level) + 1);
- for (int i = 0; i < min(left_num_dim, static_cast<int>(index_sz.size())); i++) {
- copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelOriginal;
- copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].payload = num_dep_dim-left_num_dim+i;
- copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0;
- }
- for (int i = min(left_num_dim, static_cast<int>(index_sz.size())); i < index_sz.size(); i++) {
- copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelUnknown;
- copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].payload = -1;
- copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0;
- }
-
- lex[dim-1]++;
- shiftLexicalOrder(lex, dim-1, -2);
- stmt.push_back(copy_stmt_write);
- wo_copy_stmt_num = stmt.size() - 1;
- dep.insert();
- }
-
- // replace original array accesses with temporary array accesses
- for (int i =0; i < stmt_refs.size(); i++)
- for (int j = 0; j < stmt_refs[i].second.size(); j++) {
- if (index_sz.size() == 0) {
- IR_ScalarRef *tmp_scalar_ref = ir->CreateScalarRef(static_cast<IR_ScalarSymbol *>(tmp_sym));
- ir->ReplaceExpression(stmt_refs[i].second[j], tmp_scalar_ref->convert());
- }
- else {
- std::vector<CG_outputRepr *> index_repr(index_sz.size());
- for (int k = 0; k < index_sz.size(); k++) {
- int cur_index_num = index_sz[k].first;
-
- CG_outputRepr *cur_index_repr = ocg->CreateMinus(stmt_refs[i].second[j]->index(cur_index_num), index_lb[cur_index_num]->clone());
- if (padding_stride != 0) {
- if (k == n_dim-1) {
- coef_t g = gcd(index_stride[cur_index_num], static_cast<coef_t>(padding_stride));
- coef_t t1 = index_stride[cur_index_num] / g;
- if (t1 != 1)
- cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(t1));
- coef_t t2 = padding_stride / g;
- if (t2 != 1)
- cur_index_repr = ocg->CreateTimes(cur_index_repr, ocg->CreateInt(t2));
- }
- else if (index_stride[cur_index_num] != 1) {
- cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(index_stride[cur_index_num]));
- }
- }
-
- if (ir->ArrayIndexStartAt() != 0)
- cur_index_repr = ocg->CreatePlus(cur_index_repr, ocg->CreateInt(ir->ArrayIndexStartAt()));
- index_repr[k] = cur_index_repr;
- }
-
- IR_ArrayRef *tmp_array_ref = ir->CreateArrayRef(static_cast<IR_ArraySymbol *>(tmp_sym), index_repr);
- ir->ReplaceExpression(stmt_refs[i].second[j], tmp_array_ref->convert());
- }
- }
-
- // update dependence graph
- int dep_dim = get_last_dep_dim_before(*(active.begin()), level) + 1;
- if (ro_copy_stmt_num != -1) {
- for (int i = 0; i < old_num_stmt; i++) {
- std::vector<std::vector<DependenceVector> > D;
-
- for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();) {
- if (active.find(i) != active.end() && active.find(j->first) == active.end()) {
- std::vector<DependenceVector> dvs1, dvs2;
- for (int k = 0; k < j->second.size(); k++) {
- DependenceVector dv = j->second[k];
- if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_R2R || dv.type == DEP_R2W))
- dvs1.push_back(dv);
- else
- dvs2.push_back(dv);
- }
- j->second = dvs2;
- if (dvs1.size() > 0)
- dep.connect(ro_copy_stmt_num, j->first, dvs1);
- }
- else if (active.find(i) == active.end() && active.find(j->first) != active.end()) {
- std::vector<DependenceVector> dvs1, dvs2;
- for (int k = 0; k < j->second.size(); k++) {
- DependenceVector dv = j->second[k];
- if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_R2R || dv.type == DEP_W2R))
- dvs1.push_back(dv);
- else
- dvs2.push_back(dv);
- }
- j->second = dvs2;
- if (dvs1.size() > 0)
- D.push_back(dvs1);
- }
-
- if (j->second.size() == 0)
- dep.vertex[i].second.erase(j++);
- else
- j++;
- }
-
- for (int j = 0; j < D.size(); j++)
- dep.connect(i, ro_copy_stmt_num, D[j]);
- }
-
- // insert dependences from copy statement loop to copied statements
- DependenceVector dv;
- dv.type = DEP_W2R;
- dv.sym = tmp_sym->clone();
- dv.lbounds = std::vector<coef_t>(num_dep_dim, 0);
- dv.ubounds = std::vector<coef_t>(num_dep_dim, 0);
- for (int i = dep_dim; i < num_dep_dim; i++) {
- dv.lbounds[i] = -posInfinity;
- dv.ubounds[i] = posInfinity;
- }
- for (std::set<int>::iterator i = active.begin(); i != active.end(); i++)
- dep.connect(ro_copy_stmt_num, *i, dv);
- }
-
- if (wo_copy_stmt_num != -1) {
- for (int i = 0; i < old_num_stmt; i++) {
- std::vector<std::vector<DependenceVector> > D;
-
- for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();) {
- if (active.find(i) != active.end() && active.find(j->first) == active.end()) {
- std::vector<DependenceVector> dvs1, dvs2;
- for (int k = 0; k < j->second.size(); k++) {
- DependenceVector dv = j->second[k];
- if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_W2R || dv.type == DEP_W2W))
- dvs1.push_back(dv);
- else
- dvs2.push_back(dv);
- }
- j->second = dvs2;
- if (dvs1.size() > 0)
- dep.connect(wo_copy_stmt_num, j->first, dvs1);
- }
- else if (active.find(i) == active.end() && active.find(j->first) != active.end()) {
- std::vector<DependenceVector> dvs1, dvs2;
- for (int k = 0; k < j->second.size(); k++) {
- DependenceVector dv = j->second[k];
- if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_R2W || dv.type == DEP_W2W))
- dvs1.push_back(dv);
- else
- dvs2.push_back(dv);
- }
- j->second = dvs2;
- if (dvs1.size() > 0)
- D.push_back(dvs1);
- }
-
- if (j->second.size() == 0)
- dep.vertex[i].second.erase(j++);
- else
- j++;
- }
-
- for (int j = 0; j < D.size(); j++)
- dep.connect(i, wo_copy_stmt_num, D[j]);
- }
-
- // insert dependences from copied statements to write statements
- DependenceVector dv;
- dv.type = DEP_W2R;
- dv.sym = tmp_sym->clone();
- dv.lbounds = std::vector<coef_t>(num_dep_dim, 0);
- dv.ubounds = std::vector<coef_t>(num_dep_dim, 0);
- for (int i = dep_dim; i < num_dep_dim; i++) {
- dv.lbounds[i] = -posInfinity;
- dv.ubounds[i] = posInfinity;
- }
- for (std::set<int>::iterator i = active.begin(); i != active.end(); i++)
- dep.connect(*i, wo_copy_stmt_num, dv);
-
- }
-
- // update variable name for dependences among copied statements
- for (int i = 0; i < old_num_stmt; i++) {
- if (active.find(i) != active.end())
- for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++)
- if (active.find(j->first) != active.end())
- for (int k = 0; k < j->second.size(); k++) {
- IR_Symbol *s = tmp_sym->clone();
- j->second[k].sym = s;
- }
- }
-
- // insert anti-dependence from write statement to read statement
- if (ro_copy_stmt_num != -1 && wo_copy_stmt_num != -1)
- if (dep_dim >= 0) {
- DependenceVector dv;
- dv.type = DEP_R2W;
- dv.sym = tmp_sym->clone();
- dv.lbounds = std::vector<coef_t>(num_dep_dim, 0);
- dv.ubounds = std::vector<coef_t>(num_dep_dim, 0);
- for (int k = dep_dim; k < num_dep_dim; k++) {
- dv.lbounds[k] = -posInfinity;
- dv.ubounds[k] = posInfinity;
- }
- for (int k = 0; k < dep_dim; k++) {
- if (k != 0) {
- dv.lbounds[k-1] = 0;
- dv.ubounds[k-1] = 0;
- }
- dv.lbounds[k] = 1;
- dv.ubounds[k] = posInfinity;
- dep.connect(wo_copy_stmt_num, ro_copy_stmt_num, dv);
- }
- }
-
-
- // cleanup
- delete sym;
- delete tmp_sym;
- for (int i = 0; i < index_lb.size(); i++) {
- index_lb[i]->clear();
- delete index_lb[i];
- }
- for (int i = 0; i < index_sz.size(); i++) {
- index_sz[i].second->clear();
- delete index_sz[i].second;
- }
-
- return true;
-}