From c285135eb903c31cd221f90f03e288a6b67770cd Mon Sep 17 00:00:00 2001 From: Derick Huth Date: Thu, 24 Sep 2015 11:26:53 -0600 Subject: pre-v0.2.1 --- INSTALL | 18 - Makefile-Old | 251 --- chill_run.cc | 15 +- chillmodule.cc | 4 +- dep.cc | 2 +- examples/chill/gemm.c | 18 - examples/chill/gemm.script | 31 - examples/chill/gemv.c | 15 - examples/chill/gemv.script | 9 - examples/chill/jacobi1.c | 13 - examples/chill/jacobi1.script | 18 - examples/chill/jacobi2.c | 15 - examples/chill/jacobi2.script | 21 - examples/chill/unroll.c | 33 - examples/chill/unroll.script | 35 - examples/cuda-chill/cp.c | 29 - examples/cuda-chill/cp.lua | 46 - examples/cuda-chill/cudaize.lua | 1004 --------- examples/cuda-chill/cudaize.py | 1047 --------- examples/cuda-chill/mm.c | 10 - examples/cuda-chill/mm.lua | 38 - examples/cuda-chill/mpeg4.c | 23 - examples/cuda-chill/mpeg4.lua | 45 - examples/cuda-chill/mriq-fh.c | 38 - examples/cuda-chill/mriq-fh.lua | 73 - examples/cuda-chill/mriq.c | 33 - examples/cuda-chill/mriq.lua | 55 - examples/cuda-chill/mv-shadow.c | 9 - examples/cuda-chill/mv-shadow.lua | 65 - examples/cuda-chill/mv.c | 9 - examples/cuda-chill/mv.lua | 65 - examples/cuda-chill/mv_try.c | 9 - examples/cuda-chill/mv_try.lua | 14 - examples/cuda-chill/nbody.c | 66 - examples/cuda-chill/nbody.lua | 53 - examples/cuda-chill/tmv-shadow.c | 9 - examples/cuda-chill/tmv-shadow.lua | 50 - examples/cuda-chill/tmv.c | 9 - examples/cuda-chill/tmv.lua | 50 - examples/fortran/README | 10 - examples/fortran/ccd.f | 32 - examples/fortran/ccd.script | 18 - examples/fortran/gemm.f90 | 58 - examples/fortran/gemm.script | 30 - examples/fortran/rose_gemm.f90 | 155 -- graph-test.cc | 148 -- graph.hh | 3 +- include/ir_suif.hh | 1 - include/ir_suif_utils.hh | 1 - ir_cuda_rose_utils.cc | 191 -- ir_cuda_suif_utils.cc | 54 - ir_cudarose.cc | 165 -- ir_cudarose.hh | 46 - ir_cudasuif.cc | 144 -- ir_cudasuif.hh | 36 - loop.cc | 1 + loop_backup.cc | 3311 ---------------------------- loop_cuda.cc | 2123 ------------------ loop_cuda_rose.cc | 3734 ------------------------------- loop_modified.cc | 4234 ------------------------------------ mem_mapping_utils.cc | 76 - mem_mapping_utils.hh | 59 - omega/INSTALL | 34 - omega/README | 96 - omega/ROSE_INSTALL.txt | 77 - omega/bin/oc | 1 - orig_loop_datacopy.cc | 1175 ---------- 67 files changed, 12 insertions(+), 19348 deletions(-) delete mode 100644 INSTALL delete mode 100644 Makefile-Old delete mode 100644 examples/chill/gemm.c delete mode 100644 examples/chill/gemm.script delete mode 100644 examples/chill/gemv.c delete mode 100644 examples/chill/gemv.script delete mode 100644 examples/chill/jacobi1.c delete mode 100644 examples/chill/jacobi1.script delete mode 100644 examples/chill/jacobi2.c delete mode 100644 examples/chill/jacobi2.script delete mode 100644 examples/chill/unroll.c delete mode 100644 examples/chill/unroll.script delete mode 100644 examples/cuda-chill/cp.c delete mode 100644 examples/cuda-chill/cp.lua delete mode 100644 examples/cuda-chill/cudaize.lua delete mode 100755 examples/cuda-chill/cudaize.py delete mode 100644 examples/cuda-chill/mm.c delete mode 100644 examples/cuda-chill/mm.lua delete mode 100755 examples/cuda-chill/mpeg4.c delete mode 100644 examples/cuda-chill/mpeg4.lua delete mode 100755 examples/cuda-chill/mriq-fh.c delete mode 100755 examples/cuda-chill/mriq-fh.lua delete mode 100644 examples/cuda-chill/mriq.c delete mode 100644 examples/cuda-chill/mriq.lua delete mode 100644 examples/cuda-chill/mv-shadow.c delete mode 100644 examples/cuda-chill/mv-shadow.lua delete mode 100644 examples/cuda-chill/mv.c delete mode 100644 examples/cuda-chill/mv.lua delete mode 100644 examples/cuda-chill/mv_try.c delete mode 100644 examples/cuda-chill/mv_try.lua delete mode 100644 examples/cuda-chill/nbody.c delete mode 100644 examples/cuda-chill/nbody.lua delete mode 100644 examples/cuda-chill/tmv-shadow.c delete mode 100644 examples/cuda-chill/tmv-shadow.lua delete mode 100644 examples/cuda-chill/tmv.c delete mode 100644 examples/cuda-chill/tmv.lua delete mode 100644 examples/fortran/README delete mode 100644 examples/fortran/ccd.f delete mode 100644 examples/fortran/ccd.script delete mode 100644 examples/fortran/gemm.f90 delete mode 100644 examples/fortran/gemm.script delete mode 100644 examples/fortran/rose_gemm.f90 delete mode 100644 graph-test.cc delete mode 120000 include/ir_suif.hh delete mode 120000 include/ir_suif_utils.hh delete mode 100644 ir_cuda_rose_utils.cc delete mode 100644 ir_cuda_suif_utils.cc delete mode 100644 ir_cudarose.cc delete mode 100644 ir_cudarose.hh delete mode 100644 ir_cudasuif.cc delete mode 100644 ir_cudasuif.hh delete mode 100644 loop_backup.cc delete mode 100644 loop_cuda.cc delete mode 100644 loop_cuda_rose.cc delete mode 100644 loop_modified.cc delete mode 100644 mem_mapping_utils.cc delete mode 100644 mem_mapping_utils.hh delete mode 100644 omega/INSTALL delete mode 100644 omega/README delete mode 100644 omega/ROSE_INSTALL.txt delete mode 120000 omega/bin/oc delete mode 100644 orig_loop_datacopy.cc diff --git a/INSTALL b/INSTALL deleted file mode 100644 index aef619a..0000000 --- a/INSTALL +++ /dev/null @@ -1,18 +0,0 @@ -BUILD -===== - -1. Edit Makefile. Change SUIFHOME and OMEGAHOME to correct paths. - -2. Do "make depend" in the chill directory. - -3. Optional, do "make clean" or "make veryclean" which removes additional - target files and flex/bison generated files. - -4. Do "make". - - -INSTALLATION -============ - -You can use CHiLL in source directory since all links are already -created in bin/, lib/ and include/ directories. diff --git a/Makefile-Old b/Makefile-Old deleted file mode 100644 index 7f2c8b5..0000000 --- a/Makefile-Old +++ /dev/null @@ -1,251 +0,0 @@ - -.SUFFIXES: -.PHONY: all depend depend-cuda-chill clean veryclean cuda-chill -.PHONY: chill - -CC = g++ -CFLAGS = -g -Wno-write-strings -DEPENDENCE_CFLAGS = -M -OMEGAHOME=./omega - -ifdef TEST_COVERAGE - CFLAGS := $(CFLAGS) -fprofile-arcs -ftest-coverage -endif - -# TODO auto-generate using config.h generated by autoconf? -CHILLVERSION = "\"0.2.0\"" -PYTHON=python #=$(shell `which python` ) -PYVERSION=$(shell $(PYTHON) -c "import sys; print(sys.version[:3])") # 2.6 -PYTHONVER = python$(PYVERSION) -PYTHONINCLUDE = $(shell $(PYTHON) -c "from distutils import sysconfig; print(sysconfig.get_python_inc())") -PYTHONLIBDIR = $(shell $(PYTHON) -c "from distutils import sysconfig; print(sysconfig.get_config_var('LIBDIR'))") -PYTHONCONFIG = $(shell $(PYTHON) -c "from distutils import sysconfig; print(sysconfig.get_config_var('LIBPL'))") -# SCRIPT_LANG = lua <-- supplied by the command line - - -# this creates a LUAHOME even if you don't have such a directory -ifeq ($(strip $(wildcard $(LUAHOME))),) -LUAHOME = $(HOME)/lua -endif -LUA_PATH = -L${LUAHOME}/lib - - -# where do include files live -INC_PATH = -I${PYTHONINCLUDE} -I${OMEGAHOME}/include -I${LUAHOME}/include - -# where do libraries live -LIB_PATH = -L${OMEGAHOME}/code_gen/obj -L${OMEGAHOME}/omega_lib/obj -# seemingly not needed -L${PYTHONCONFIG} - - - -CORE_LIBS = -lm -lcodegen -lomega -RUNNER_LIBS = -llua -ldl -lreadline -lhistory -lpthread -ldl -lutil -lm -l${PYTHONVER} - -TDLHOME = ${ROSEHOME}/libltdl - -BOOST_DATE_TIME_LIB = -lboost_date_time -BOOST_FILESYSTEM_LIB = -lboost_filesystem -BOOST_LDFLAGS = -L${BOOSTHOME}/lib -BOOST_PROGRAM_OPTIONS_LIB = -lboost_program_options -BOOST_REGEX_LIB = -lboost_regex -BOOST_SYSTEM_LIB = -lboost_system -BOOST_THREAD_LIB = -lboost_thread -BOOST_WAVE_LIB = -lboost_wave - -ROSE_LIBS = -lrose $(BOOST_LDFLAGS) $(BOOST_DATE_TIME_LIB)\ - $(BOOST_THREAD_LIB) $(BOOST_FILESYSTEM_LIB) $(BOOST_PROGRAM_OPTIONS_LIB)\ - $(BOOST_REGEX_LIB) $(BOOST_SYSTEM_LIB) $(BOOST_SERIALIZATION_LIB) \ - $(BOOST_WAVE_LIB) -lrt -ldl - - -# Source files common to both chill and cuda-chill -CORE_SRCS = dep.cc omegatools.cc irtools.cc loop.cc loop_basic.cc loop_datacopy.cc loop_unroll.cc loop_tile.cc loop_extra.cc -LIB_SRCS = $(CORE_SRCS) - -# files that will be generated by bison, flex, and make that need to be removed at clean. -GENERATED_SRCS = parser.tab.hh parser.tab.cc parse_expr.yy.cc parse_expr.ll.hh parse_expr.tab.cc parse_expr.tab.hh Makefile.deps -# object files that are specific to lua or python builds. -- This is used so that SCRIPT_LANG does not need to be specified during clean -ORPHAN_OBJS = chill_run_util.o chillmodule.o parse_expr.tab.o parse_expr.yy.o - -# files used in chill and cuda-chill interfaces -ifeq ($(SCRIPT_LANG),lua) - RUNNER_SRCS = chill_run.cc chill_env.cc -else - ifeq ($(SCRIPT_LANG),python) - RUNNER_SRCS = chill_run.cc chillmodule.cc - else - RUNNER_SRCS = chill_run.cc chill_env.cc - endif -endif - -# files used in chill but not cuda-chill -IR_CHILL_SRCS = ir_rose.cc ir_rose_utils.cc -ifeq ($(SCRIPT_LANG),lua) - YACC_SRCS = parse_expr.yy.cc parse_expr.tab.cc - CHILL_RUNNER_SRCS = chill_run_util.cc - CHILL_SRCS = $(CORE_SRCS) $(IR_CHILL_SRCS) $(CHILL_RUNNER_SRCS) $(RUNNER_SRCS) -else - ifeq ($(SCRIPT_LANG),python) - YACC_SRCS = parse_expr.yy.cc parse_expr.tab.cc - CHILL_RUNNER_SRCS = chill_run_util.cc - CHILL_SRCS = $(CORE_SRCS) $(IR_CHILL_SRCS) $(CHILL_RUNNER_SRCS) $(RUNNER_SRCS) - else - YACC_SRCS = lex.yy.cc parser.tab.cc - CHILL_RUNNER_SRCS = - CHILL_SRCS = $(CORE_SRCS) $(IR_CHILL_SRCS) $(YACC_SRCS) $(RUNNER_SRCS) - endif -endif - -# source files for cuda-chill but not chill -CUDACHILL_ONLY_SRCS = mem_mapping_utils.cc loop_cuda_rose.cc -IR_CUDACHILL_SRCS = ir_rose.cc ir_rose_utils.cc ir_cudarose.cc ir_cuda_rose_utils.cc -CUDACHILL_RUNNER_SRCS = -CUDACHILL_SRCS = $(CORE_SRCS) $(CUDACHILL_ONLY_SRCS) $(IR_CUDACHILL_SRCS) $(RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS) - -# set interface language flags -ifeq ($(SCRIPT_LANG),lua) - RUNNER_EXTRA_CFLAGS = -DLUA -else - ifeq ($(SCRIPT_LANG),python) - RUNNER_EXTRA_CFLAGS = -DPYTHON - endif -endif - -depend-cuda-chill: CFLAGS := $(CFLAGS) -DCUDACHILL -cuda-chill: CFLAGS := $(CFLAGS) -DCUDACHILL - -ALL_SRCS = $(CORE_SRCS) $(YACC_SRCS) $(IR_CHILL_SRCS) $(CUDACHILL_ONLY_SRCS) $(IR_CUDACHILL_SRCS) $(RUNNER_SRCS) $(CHILL_RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS) -ALL_OBJS = $(ALL_SRCS:.cc=.o) $(ORPHAN_OBJS) - -RUNNER_DEFINES = -DLUA_USE_LINUX -DCHILL_BUILD_VERSION=$(CHILLVERSION) -DCHILL_BUILD_DATE="\"$(CHILL_BUILD_DATE)\"" - - -YACC_EXTRA_CFLAGS = - -##################################################################### -# compiler intermediate code specific definitions -##################################################################### - - - -#LIBS := $(LIBS) $(ROSE_LIBS) -LIB_PATH := $(LIB_PATH) -L${ROSEHOME}/lib -L${TDLHOME} -#LIB_SRCS := $(LIB_SRCS) # $(IR_SRCS) -INC_PATH := $(INC_PATH) -I${ROSEHOME}/include -I${BOOSTHOME}/include -YACC_EXTRA_CFLAGS := -DBUILD_ROSE -RUNNER_EXTRA_CFLAGS := $(RUNNER_EXTRA_CFLAGS) -DBUILD_ROSE - - -##################################################################### -# build rules -##################################################################### - -YACC_OBJS = $(YACC_SRCS:.cc=.o) -RUNNER_OBJS = $(RUNNER_SRCS:.cc=.o) -CHILL_RUNNER_OBJS = $(CHILL_RUNNER_SRCS:.cc=.o) -CUDACHILL_RUNNER_OBJS = $(CUDACHILL_RUNNER_SRCS:.cc=.o) -LIB_OBJS = $(LIB_SRCS:.cc=.o) -IR_CHILL_OBJS = $(IR_CHILL_SRCS:.cc=.o) -IR_CUDACHILL_OBJS = $(IR_CUDACHILL_SRCS:.cc=.o) -CUDACHILL_ONLY_OBJS = $(CUDACHILL_ONLY_SRCS:.cc=.o) - -CHILL_OBJS = $(CHILL_SRCS:.cc=.o) -CUDACHILL_OBJS = $(CUDACHILL_SRCS:.cc=.o) - - -all: - $(MAKE) depend-chill - $(MAKE) chill - $(MAKE) depend-cuda-chill - $(MAKE) cuda-chill - - -# can't these be combined to a superset of all source files? -depend: depend-cuda-chill - -depend-chill: $(LIB_SRCS) $(RUNNER_SRCS) $(CHILL_RUNNER_SRCS) $(YACC_SRCS) - $(CC) $(DEPENDENCE_CFLAGS) $(INC_PATH) $(LIB_SRCS) $(RUNNER_SRCS) $(CHILL_RUNNER_SRCS) $(YACC_SRCS) > Makefile.deps - -depend-cuda-chill: $(LIB_SRCS) $(RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS) - $(CC) $(DEPENDENCE_CFLAGS) $(INC_PATH) $(LIB_SRCS) $(RUNNER_SRCS) $(CUDACHILL_RUNNER_SRCS) > Makefile.deps - -libchill_xform.a: $(LIB_OBJS) $(IR_CHILL_OBJS) - ar -rs $@ $(LIB_OBJS) $(IR_CHILL_OBJS) - -libcudachill_xform.a: $(LIB_OBJS) $(IR_CUDACHILL_OBJS) $(CUDACHILL_ONLY_OBJS) - ar -rs $@ $(LIB_OBJS) $(IR_CUDACHILL_OBJS) $(CUDACHILL_ONLY_OBJS) - -%.o: %.cc - $(CC) $(CFLAGS) $(INC_PATH) $< -c -o $@ - - -clean: - @rm -fr $(ALL_OBJS) $(YACC_SRCS) $(GENERATED_SRCS) - -veryclean: - @rm -fr $(ALL_OBJS) $(YACC_SRCS) libchill_xform.a libcudachill_xform.a chill cuda-chill - - -cuda-chill: libcudachill_xform.a $(CUDACHILL_RUNNER_OBJS) $(RUNNER_OBJS) - $(CC) $(CFLAGS) $(LIB_PATH) $(LUA_PATH) $(CUDACHILL_RUNNER_OBJS) $(RUNNER_OBJS) $< $(CORE_LIBS) $(ROSE_LIBS) $(RUNNER_LIBS) -o $@ - -ifeq ($(SCRIPT_LANG),lua) -chill: libchill_xform.a $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $(YACC_OBJS) - $(CC) $(CFLAGS) $(LIB_PATH) $(LUA_PATH) $(YACC_OBJS) $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $< $(CORE_LIBS) $(ROSE_LIBS) $(RUNNER_LIBS) -o $@ -else -ifeq ($(SCRIPT_LANG),python) -chill: libchill_xform.a $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $(YACC_OBJS) - $(CC) $(CFLAGS) $(LIB_PATH) $(YACC_OBJS) $(CHILL_RUNNER_OBJS) $(RUNNER_OBJS) $< $(CORE_LIBS) $(ROSE_LIBS) $(RUNNER_LIBS) -o $@ - -else -chill: libchill_xform.a $(YACC_OBJS) - $(CC) $(CFLAGS) $(LIB_PATH) $(YACC_OBJS) $< $(CORE_LIBS) $(ROSE_LIBS) -o $@ -endif -endif - - -lex.yy.cc: parser.ll parser.tab.hh - flex++ parser.ll - -lex.yy.o: lex.yy.cc - $(CC) $(CFLAGS) -c $< -o $@ - -parser.tab.hh parser.tab.cc: parser.yy - bison -t -d $< - -parser.tab.o: parser.tab.cc - $(CC) $(CFLAGS) $(YACC_EXTRA_CFLAGS) $(INC_PATH) -DCHILL_BUILD_DATE="\"$(CHILL_BUILD_DATE)\"" -c $< -o $@ - - -parse_expr.tab.cc: parse_expr.yy - bison -t -d parse_expr.yy - -parse_expr.tab.o: parse_expr.tab.cc - $(CC) $(CFLAGS) $(YACC_CFLAGS) $(INC_PATH) -o $@ -c parse_expr.tab.cc - -parse_expr.yy.cc: parse_expr.tab.cc parse_expr.ll - flex -o parse_expr.yy.cc parse_expr.ll - -parse_expr.yy.o: parse_expr.yy.cc - $(CC) $(CFLAGS) $(YACC_CFLAGS) $(INC_PATH) -o $@ -c parse_expr.yy.cc - -$(RUNNER_SRCS:.cc=.o): %.o: %.cc - $(CC) $(CFLAGS) $(RUNNER_EXTRA_CFLAGS) $(INC_PATH) $(RUNNER_DEFINES) $< -c -o $@ - -$(CHILL_RUNNER_SRCS:.cc=.o): %.o: %.cc - $(CC) $(CFLAGS) $(RUNNER_EXTRA_CFLAGS) $(INC_PATH) $(RUNNER_DEFINES) $< -c -o $@ - -$(CUDACHILL_RUNNER_SRCS:.cc=.o): %.o %.cc - $(CC) $(CFLAGS) $(RUNNER_EXTRA_CFLAGS) $(INC_PATH) $(RUNNER_DEFINES) $< -c -o $@ - - -$(IR_SRCS:.cc=.o): %.o: %.cc - $(CC) -Wno-write-strings $(CFLAGS) $(INC_PATH) $< -c -o $@ - -ifeq ($(shell test -f Makefile.deps && echo "true"), true) -include Makefile.deps -endif - -CHILL_BUILD_DATE = $(shell date +%m/%d/%Y) - diff --git a/chill_run.cc b/chill_run.cc index a3c9180..d33819b 100644 --- a/chill_run.cc +++ b/chill_run.cc @@ -281,14 +281,14 @@ int main( int argc, char* argv[] ) //--- // Run a CHiLL interpreter //--- - printf("CUDA-CHiLL v0.2.0 (built on %s)\n", CHILL_BUILD_DATE); + printf("CHiLL v0.2.1 (built on %s)\n", CHILL_BUILD_DATE); printf("Copyright (C) 2008 University of Southern California\n"); printf("Copyright (C) 2009-2012 University of Utah\n"); //is_interactive = true; // let the lua interpreter know. fflush(stdout); // TODO: read lines of python code. //Not sure if we should set fail from interactive mode - printf("CUDA-CHiLL ending...\n"); + printf("CHiLL ending...\n"); fflush(stdout); } @@ -336,7 +336,7 @@ int main( int argc, char* argv[] ) //--- // Run a CHiLL interpreter //--- - printf("CUDA-CHiLL v0.2.0 (built on %s)\n", CHILL_BUILD_DATE); + printf("CUDA-CHiLL v0.2.1 (built on %s)\n", CHILL_BUILD_DATE); printf("Copyright (C) 2008 University of Southern California\n"); printf("Copyright (C) 2009-2012 University of Utah\n"); is_interactive = true; // let the lua interpreter know. @@ -359,7 +359,6 @@ int main( int argc, char* argv[] ) #endif #ifdef BUILD_ROSE ((IR_cudaroseCode *)(ir_code))->commit_loop(myloop, lnum); - ((IR_roseCode*)(ir_code))->finalizeRose(); #elif BUILD_SUIF ((IR_cudasuifCode *)(ir_code))->commit_loop(myloop, lnum); #endif @@ -375,16 +374,14 @@ int main( int argc, char* argv[] ) lnum_end = get_loop_num_end(L); DEBUG_PRINT("calling ROSE code gen? loop num %d - %d\n", lnum_start, lnum_end); #endif - +#endif #ifdef BUILD_ROSE finalize_loop(lnum_start, lnum_end); //((IR_roseCode*)(ir_cide))->commit_loop(myloop, lnum); ((IR_roseCode*)(ir_code))->finalizeRose(); - #elif BUILD_SUIF - ((IR_suifCode*)(ir_code))->commit_loop(myloop, lnum); + //#elif BUILD_SUIF + //((IR_suifCode*)(ir_code))->commit_loop(myloop, lnum); #endif - -#endif delete ir_code; } #ifdef PYTHON diff --git a/chillmodule.cc b/chillmodule.cc index fa55199..fbeb477 100644 --- a/chillmodule.cc +++ b/chillmodule.cc @@ -1431,7 +1431,7 @@ static PyObject* chill_permute(PyObject* self, PyObject* args) { int stmt_num = intArg(args, 1); int level = intArg(args, 2); std::vector pi; - if(!tointvector(args, 2, pi)) + if(!tointvector(args, 3, pi)) throw std::runtime_error("the third argument in permute(stmt_num, level, pi) must be an int vector"); myloop->permute(stmt_num, level, pi); } @@ -1750,7 +1750,7 @@ static PyMethodDef ChillMethods[] = { {"print_space", chill_print_space, METH_VARARGS, "print something or other "}, {"add_sync", chill_add_sync, METH_VARARGS, "add sync, whatever that is"}, {"rename_index", chill_rename_index, METH_VARARGS, "rename a loop index"}, - {"permute", chill_permute_v2, METH_VARARGS, "change the order of loops?"}, + {"permute", chill_permute, METH_VARARGS, "change the order of loops?"}, {"tile3", chill_tile_v2_3arg, METH_VARARGS, "something to do with tile"}, {"tile7", chill_tile_v2_7arg, METH_VARARGS, "something to do with tile"}, {"thread_dims", thread_dims, METH_VARARGS, "tx, ty, tz "}, diff --git a/dep.cc b/dep.cc index 7bf781a..a675d03 100644 --- a/dep.cc +++ b/dep.cc @@ -37,7 +37,7 @@ std::ostream& operator<<(std::ostream &os, const DependenceVector &d) { switch (d.type) { case DEP_W2R: - os << "flow"; + os << "true"; if (d.is_reduction) os << "_reduction"; break; diff --git a/examples/chill/gemm.c b/examples/chill/gemm.c deleted file mode 100644 index a565511..0000000 --- a/examples/chill/gemm.c +++ /dev/null @@ -1,18 +0,0 @@ - -#define N 512 - -int main() { - - float a[N][N], b[N][N], c[N][N]; - - int i, j, k; - - for (j = 0; j < N; j++) - for (k = 0; k < N; k++) - for (i = 0; i < N; i++) { - c[i][j] = c[i][j] + a[i][k] * b[k][j]; - } - - return 0; -} - diff --git a/examples/chill/gemm.script b/examples/chill/gemm.script deleted file mode 100644 index ed91567..0000000 --- a/examples/chill/gemm.script +++ /dev/null @@ -1,31 +0,0 @@ -#matrix multiply large array size for intel machine -source: gemm.c -procedure: main -format: rose -loop: 0 - -TI = 128 -TJ = 8 -TK = 512 -UI = 2 -UJ = 2 - -permute([3,1,2]) -tile(0,2,TJ) -#print space -tile(0,2,TI) -#print space -tile(0,5,TK) -#print space - -datacopy(0,3,a,false,1) -#print space - -datacopy(0,4,b) -print -unroll(0,4,UI)#print space -print -unroll(0,5,UJ) -#print space -print - diff --git a/examples/chill/gemv.c b/examples/chill/gemv.c deleted file mode 100644 index 610d4cb..0000000 --- a/examples/chill/gemv.c +++ /dev/null @@ -1,15 +0,0 @@ -#define N 10 - -int main() { - // int n; - float a[N]; - float b[N]; - float c[N][N]; - - int i, j; - - for (i = 1; i < N; i++) - for (j = 1; j < N; j++) - a[i] = a[i] + c[i][j] * b[j]; - -} diff --git a/examples/chill/gemv.script b/examples/chill/gemv.script deleted file mode 100644 index f1d5f89..0000000 --- a/examples/chill/gemv.script +++ /dev/null @@ -1,9 +0,0 @@ -source: gemv.c # matrix-vector multiply -procedure: main -format : rose -loop: 0 - - - -original() -print diff --git a/examples/chill/jacobi1.c b/examples/chill/jacobi1.c deleted file mode 100644 index 0fcaee4..0000000 --- a/examples/chill/jacobi1.c +++ /dev/null @@ -1,13 +0,0 @@ -#define N 512 - -int main() { - int i, t; - - float a[N][N]; - - for (t = 2; t <= 100; t++) - for (i = 2; i <= N - 1; i++) - a[t][i] = a[t - 1][i - 1] + a[t - 1][i] + a[t - 1][i + 1]; - - return 0; -} diff --git a/examples/chill/jacobi1.script b/examples/chill/jacobi1.script deleted file mode 100644 index c0dec8d..0000000 --- a/examples/chill/jacobi1.script +++ /dev/null @@ -1,18 +0,0 @@ -# -# tiling perfect jacobi loop nest with time step, use -# unimodular transformation first (only applicable to the -# perfect loop nest) to make tiling legal. -# - -source: jacobi1.c -procedure: main -format : rose -loop: 0 - -print dep - -nonsingular([[1,0],[1,1]]) # unimodular matrix, determinant is one -tile(0,2,64) - -print dep -print diff --git a/examples/chill/jacobi2.c b/examples/chill/jacobi2.c deleted file mode 100644 index b8d8d7b..0000000 --- a/examples/chill/jacobi2.c +++ /dev/null @@ -1,15 +0,0 @@ -#define N 512 - -int main() { - double a[N]; - double b[N]; - int t, i; - for (t = 1; t <= 100; t++) { - for (i = 2; i <= N - 1; i++) - b[i] = (double) 0.25 * (a[i - 1] + a[i + 1]) + (double) 0.5 * a[i]; - - for (i = 2; i <= N - 1; i++) - a[i] = b[i]; - } - return 0; -} diff --git a/examples/chill/jacobi2.script b/examples/chill/jacobi2.script deleted file mode 100644 index afe14c6..0000000 --- a/examples/chill/jacobi2.script +++ /dev/null @@ -1,21 +0,0 @@ -# -# tiling imperfect jacobi loop nest, more details in the paper -# "Automatic Tiling of Iterative Stencil Loops" by Zhiyuan Li and -# Yonghong Song, TOPLAS, 2004. -# - -source: jacobi2.c -procedure: main -format: rose -loop: 0 - -print dep - -original() -shift([1], 2, 1) -fuse([0,1], 2) # optional -skew([0,1], 2, [2,1]) -tile(0, 2, 32, 1) - -print dep -print diff --git a/examples/chill/unroll.c b/examples/chill/unroll.c deleted file mode 100644 index e74dea3..0000000 --- a/examples/chill/unroll.c +++ /dev/null @@ -1,33 +0,0 @@ - -#define N 14 -#define DT 0.314 - -void foo(int n, float* x, float* y, float* z, float* f3, float* f1, float* w) { - - int i, j; - - for (i = 1; i <= 14; i++) - x[i] = 1.0; - - for (i = 1; i <= 14; i += 3) - y[i] = 1.0; - - for (i = N + 1; i <= N + 20; i += 3) - z[i] = 1.0; - - for (i = 0; i <= N; i++) { - for (j = i; j <= i + N; j++) - f3[i] = f3[i] + f1[j] * w[j - i]; - f3[i] = f3[i] * DT; - } - - return 0; -} - -int main() { - float x[N], y[N], z[N], f3[N], f1[N], w[N]; - - foo(N, x, y, z, f3, f1, w); - return 0; -} - diff --git a/examples/chill/unroll.script b/examples/chill/unroll.script deleted file mode 100644 index e64acb6..0000000 --- a/examples/chill/unroll.script +++ /dev/null @@ -1,35 +0,0 @@ -# -# Test unroll-and-jam. The last loop adapted from the simple -# convolution example from p463 of "Optimizing Compilers for -# Modern Architectures", by Randy Allen and Ken Kennedy. -# - -source: unroll.c -procedure: foo -format: rose -# fully unroll a loop with known iteration count -loop: 0 -original() -unroll(0,1,3) -print -print space - - -# a strided loop -loop: 1 -original() -unroll(0,1,2) -print -print space - -# lower and upper bounds are not constant -loop: 2 -original() -unroll(0,1,20) -print - -# parallelogram iteration space -loop: 3 -original() -unroll(0,1,2) -print diff --git a/examples/cuda-chill/cp.c b/examples/cuda-chill/cp.c deleted file mode 100644 index 837d7a6..0000000 --- a/examples/cuda-chill/cp.c +++ /dev/null @@ -1,29 +0,0 @@ -#define N 1 - -#define VOLSIZEY 512 -#define VOLSIZEX 512 -#define VOLSIZEZ 1 -#define ATOMCOUNT 4000 -#define GRIDSPACING 0.1 -#define zDim 0 - -extern float sqrtf(float); - -void cenergy_cpu(float atoms[ATOMCOUNT*4],float *energy,float z) -{ -int i,j,n;float dx,dy,dz; - - for (j=0; j 0) then - --print("Good enough"..(# cur_idxs[i])) - --print("returning "..i) - return i - end - end - return -1 --sentinal that there were no non-dummy indices left -end - -function build_order(final_order, tile_idx_names, ctrl_idx_names, tile_idx_map, cur_level) - order = {} - --print("\nbuild_order()") - --print("build_order(): final_order = ( "..list_to_string(final_order).." )") - --print("build_order(): ctrl_idx_names = ("..list_to_string(ctrl_idx_names).." )") - --print("cur_level "..cur_level.."") - --io.flush() - - for i,k in ipairs(final_order) do - skip = false - cur = final_order[i] - --print("\ncur "..cur.." = final_order["..i.."] = "..final_order[i].." ") - --control loops below our current level should not be in the current order - for j=cur_level+2,# ctrl_idx_names do - --print("j "..j.." final_order["..i.."] = "..final_order[i].." ") - if ctrl_idx_names[j] == final_order[i] then - skip = true - --print("SKIP "..final_order[i].." ") - --io.flush() - end - end - --possibly substitute tile indices ifn necessar - if table.contains_key(tile_idx_map,final_order[i]) then - approved_sub = false - sub_string = tile_idx_map[final_order[i]] - for j=cur_level+2,# tile_idx_names do - if tile_idx_names[j] == sub_string then - approved_sub = true - end - end - if approved_sub then - cur = sub_string - end - end - if not skip then - table.insert(order,cur) - end - end - return order -end - -function list_to_string(str_list) - --Helpful debug output - l = "" - for i,str in ipairs(str_list) do - if i > 1 then - l = l .. ", " .. str - else - l = str - end - end - return l -end - - -function find_cur_level(stmt,idx) - --Search cur_indices for a idx at stmt - cur = cur_indices(stmt) - --print(string.format("find_cur_level(stmt %d, idx %s) Cur indices %s", stmt, idx, list_to_string(cur))) - for i,cidx in ipairs(cur) do - if cidx == idx then - --print(string.format("found it at index %d", i)) - return i - end - end - error("Unable to find "..idx.." in current list of indices") -end - - -function chk_cur_level(stmt,idx) - --Search cur_indices for a idx at stmt - cur = cur_indices(stmt) - for i,cidx in ipairs(cur) do - if cidx == idx then - return i - end - end - return -1 -end - - -function find_offset(cur_order, tile, control) - --print("Looking for tile '"..tile.."' and control '"..control.."' in ( "..list_to_string(cur_order)..", )") - idx1 = -1 - idx2 = -1 - for i,cur in ipairs(cur_order) do - if(cur == tile) then - idx1 = i - end - if(cur == control) then - idx2 = i - end - end - if(idx1 < 0) then - error("Unable to find tile " .. tile .. " in current list of indices") - end - if(idx2 < 0) then - error("Unable to find control " .. control .. " in current list of indices") - end - --print("found at level " .. idx2 .. " and " .. idx1) - if(idx2 < idx1) then - return idx2-idx1+1 - else - return idx2-idx1 - end -end - -function tile_by_index(tile_indices, sizes, index_names, final_order, tile_method) - --print "STARTING TILE BY INDEX" - --io.flush() - stmt = 0 --assume stmt 0 - cur = cur_indices(stmt) - --print("Cur indices "..list_to_string(cur)) - if not valid_indices(stmt,tile_indices) then - error('One of the indices in the first parameter were not '.. - 'found in the current set of indices.') - end - if not tile_method then tile_method = counted end - tile_idx_names = {} - for i,s in ipairs(tile_indices) do tile_idx_names[i]=s end --shallow copy - --print("tile_index_names: ['"..list_to_string(tile_indices).."']") - - --print("index_names: ") - --for k,v in pairs(index_names) do print(k,v) end - - --io.flush() - - ctrl_idx_names = {} - tile_idx_map = {} - for k,v in pairs(index_names) do - valid = false - if(string.sub(k,1,1) == "l") then - if string.sub(k,-8) == "_control" then - i = tonumber(string.sub(k,2,-9)) - if i and i >= 1 and i <= (# tile_indices) then - ctrl_idx_names[i] = v - --print(string.format("Handling control %s for loop level %d",v,i)) - --print("control "..k.." name "..v.." ") - valid = true - end - elseif string.sub(k,-5) == "_tile" then - i = tonumber(string.sub(k,2,-6)) - if i and i >= 1 and i <= (# tile_indices) then - --print(string.format("tile %s -> %s",tile_indices[i], v)) - tile_idx_names[i] = v - tile_idx_map[v] = tile_indices[i] - --print(string.format("tile %s -> %s",tile_indices[i], v)) - valid = true - end - end - end - if not valid then error(string.format("%s is not a proper key for specifying ".. - "tile or control loop indices\n", k)) end - end - - --filter out control indices (and do name substitution of unprocessed tile indices) for a given level - cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, -1) - permute(stmt, cur_order) - - for i,cur_idx in ipairs(tile_indices) do - --print(string.format("i %d cur_idx %s calling build order ********", i-1, cur_idx)) - cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1) - --Find a offset between tile loop and control loop - -- 0 = control loop one level above tile loop - -- -1 = control loop two levels above tile loop - -- > 0 = tile loop above control loop - -- In the last case, we do two extra tile commands to get the control - -- above the tile and then rely on the final permute to handle the - -- rest - level = find_cur_level(stmt,cur_idx) - offset = find_offset(cur_order, tile_idx_names[i], ctrl_idx_names[i]) - --print(string.format("offset %d", offset)) - - if (offset <= 0) then - --print(string.format("[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %s)",stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method)) - tile(stmt, level, sizes[i], level+offset, tile_idx_names[i], ctrl_idx_names[i], tile_method) - else - --print(string.format("2tile(%d, %d, %d, %d, %s, %s, %s)", stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method)) - tile(stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method);--regular level - --flip tile and control loop - --print(string.format("3tile(%d, %d, %d)",stmt, level+1, level+1)) - tile(stmt, level+1, level+1); - --print(string.format("4tile(%d, %d, %d)",stmt, level+1, level)) - tile(stmt, level+1, level); - --print(string.format("\n[offset>0]tile(%d, %d, %d, %d,%s,%s,%s)",stmt, level, sizes[i], level, tile_idx_names[i], ctrl_idx_names[i], tile_method)) - --print_code() - - end - - --Do permutation based on cur_order - --print "permute based on build order calling build_order()" - --print "cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1)" - cur_order = build_order(final_order, tile_indices, ctrl_idx_names, tile_idx_map, i-1) - --print "permute(stmt, cur_order);" - permute(stmt, cur_order); - --print "\nafter permute(), code is:" - --print_code() - end - --print "ENDING TILE BY INDEX" - --print_code() -end - -function normalize_index(index) - stmt = 0 --assume stmt 0cur = cur_indices(stmt) - --print("Cur indices "..list_to_string(cur)) - l = find_cur_level(stmt, index) - tile(stmt, l, l) - --print(string.format("\n[Normalize]tile(%d, %d, %d)",stmt, l,l)) -end - -function is_in_indices(stmt, idx) - cur = cur_indices(stmt) - for i=0,#cur,1 do - if(cur[i]==idx) then - return true - end - end - return false - -end - - -function copy_to_registers(start_loop, array_name) - - --print("\n\n****** starting copy to registers") - io.flush() - - stmt = 0 --assume stmt 0 - - -- [Malik] first we make sure that tx and ty are consecutive loops in the 2D thread setup, otherwise all levels for subsequent operations are messed up. Start logic. - cur = cur_indices(stmt) - table_Size = table.getn(cur) - - --print(string.format("Cur indices %s,",list_to_string(cur))) - --print(string.format("The table size is %d", table_Size)) - --table.foreach(cur, print) - --print_code() - - level_tx = -1 - level_ty = -1 - if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end - if is_in_indices(stmt,"ty") then level_ty = find_cur_level(stmt,"ty") end - --print(string.format("level_tx %d level_ty %d", level_tx, level_ty)) - - ty_lookup_idx = "" - org_level_ty = level_ty - - --if(cur[level_tx+1]~=nil and cur[level_tx+1]~="") then ty_lookup = ty_lookup+1 end - if(cur[level_ty+1]~=nil and cur[level_ty+1]~="") then - --print(string.format("IF cur[%d] = %s", level_ty+1, cur[level_ty+1])) - ty_lookup_idx = cur[level_ty+1] - else - --if cur[level_ty] ~= nil then print(string.format("ELSE ty_lookup_idx = cur[%d] = %s", level_ty, cur[level_ty])) -- TODO - --else print "ELSE (dangerous)" end - ty_lookup_idx = cur[level_ty] -- may assign nil !? - end - --if ty_lookup_idx ~= nil then print(string.format("ty_lookup_idx '%s'", ty_lookup_idx)) -- TODO - --else print "ty_lookup_idx is NIL" - --end - - if level_ty > 0 then - --print(string.format("\ntile3(%d,%d,%d)",stmt,level_ty,level_tx+1)) - tile(stmt,level_ty,level_tx+1) - end - --print_code() - - --print("\ntylookup is %d",ty_lookup) - --exit(0) - -- - cur = cur_indices(stmt) - table_Size = table.getn(cur) - --print(string.format("Cur indices %s,",list_to_string(cur))) - --print("The table size is "..table.getn(cur)) - --table.foreach(cur, print) - - if is_in_indices(stmt,"tx") then level_tx = find_cur_level(stmt,"tx") end - if ty_lookup_idx then - if is_in_indices(stmt,ty_lookup_idx) then level_ty = find_cur_level(stmt,ty_lookup_idx) end - end - - ty_lookup = 1 - idx_flag = -1 - -- find the level of the next valid index after ty+1 - --print(string.format("\nlevel_ty %d", level_ty)) - if level_ty > 0 then - --print(string.format("table_Size %d", table_Size)) - for num= level_ty+ty_lookup,table_Size do - --print(string.format("num=%d cur[num] = '%s'",num, cur[num])) - if(cur[num] ~= "") then - idx_flag = find_cur_level(stmt,cur[num]) - --print (string.format("idx_flag = %d", idx_flag)) - break - end - end - end - - --print(string.format("\n(first) I am checking all indexes after ty+1 %s",idx_flag)) - --print_code() - --print "" - - how_many_levels = 1 - startat = idx_flag + 1 - if startat == 0 then startat = 1 end -- avoid attempt to examine an illegal array offset - --print(string.format("idx_flag = %d I will check levels starting with %d", idx_flag, idx_flag+1)) - - for ch_lev = startat,table_Size,1 do -- was for ch_lev = idx_flag+1,table_Size,1 do - --print(string.format("ch_lev %d", ch_lev)) - if(cur[ch_lev] ~= nil and cur[ch_lev] ~= "") then - --print(string.format("cur[%d] = '%s'", ch_lev, cur[ch_lev])) - how_many_levels = how_many_levels+1 - end - end - --print("\nHow Many Levels",how_many_levels) - - -- change this all to reflect the real logic which is to normalize all loops inside the thread loops. - if(how_many_levels <2) then - while( idx_flag >= 0) do - for num = level_ty+ty_lookup,(table_Size) do - --print(string.format("at top of loop, num is %d", num)) - --print(string.format("num %d", num)) - --print(string.format("cur[num] = '%s'", cur[num])) - if(cur[num] ~= "") then - idx=cur[num] - --print(string.format("idx '%s'", idx)) - - curlev = find_cur_level(stmt,idx) - --print(string.format("curlev %d", curlev)) - - --print_code() - --print(string.format("\n[COPYTOREG]tile(%d,%d,%d)",stmt,find_cur_level(stmt,idx),level_tx)) - tile(stmt,find_cur_level(stmt,idx),find_cur_level(stmt,idx)) - curlev = find_cur_level(stmt,idx) - --print(string.format("curlev %d", curlev)) - tile(stmt,find_cur_level(stmt,idx),level_tx) - --print(string.format("hehe '%s'",cur[num])) - - cur = cur_indices(stmt) - --print("Cur indices INSIDE"..list_to_string(cur)) - table_Size = table.getn(cur) - --print(string.format("Table Size is: %d",table_Size)) - level_tx = find_cur_level(stmt,"tx") - --print(string.format("\n level TX is: %d",level_tx)) - level_ty = find_cur_level(stmt,ty_lookup_idx) - --print(string.format("\n level TY is: %d",level_ty)) - idx_flag = -1 - --print "idx_flag = -1" - - -- find the level of the next valid index after ty+1 - - -- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?) - for num= level_ty+ty_lookup,table_Size do - --print(string.format("num mucking num = %d", num)) - if(cur[num] ~= nil and cur[num] ~= "") then - idx_flag = find_cur_level(stmt,cur[num]) - --print("\n(second) I am checking all indexes after ty+1 %s",cur[num]) - break - end - end - --print(string.format("num mucked to %d idx_flag = %d", num, idx_flag)) - - end - --print(string.format("at bottom of loop, num is %d", num)) - end - end - end - --print "done with levels" - - - - - --print "ARE WE SYNCED HERE?" - --print_code() - --print("\ntile(%d,%d,%d)",stmt,level_k,level_k) - --tile(stmt,level_k,level_k) - - -- [Malik] end logic - --print_code() - start_level = find_cur_level(stmt, start_loop) - --We should hold contant any block or tile loop - block_idxs = block_indices() - thread_idxs = thread_indices() - --print("\nblock indices are") - --table.foreach(block_idxs, print) - --print("\nthread indices are") - --table.foreach(thread_idxs, print) - --print(string.format("\nStart Level: %d",start_level)) - - hold_constant = {} - --print("\n Now in Blocks") - for i,idx in ipairs(block_idxs) do - --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx))) - if find_cur_level(stmt,idx) >= start_level then - table.insert(hold_constant, idx) - --print(string.format("\nJust inserted block %s in hold_constant",idx)) - end - end - - - --print("\n Now in Threads") - for i,idx in ipairs(thread_idxs) do - --print(string.format("\n Idx:%s : Level: %d",idx,find_cur_level(stmt,idx))) - if find_cur_level(stmt,idx) >= start_level then - table.insert(hold_constant, idx) - --print(string.format("\nJust inserted thread %s in hold_constant",idx)) - end - end - - --print "\nhold constant table is: " - --table.foreach(hold_constant, print) - - --print("\nbefore datacopy pvt") - old_num_stmts = num_statements() - --print_code() - --print(string.format("\n[DataCopy]datacopy_privatized(%d, %s, %s, vector having privatized levels)",stmt, start_loop, array_name)) - --table.foreach(hold_constant, print) - datacopy_privatized(stmt, start_loop, array_name, hold_constant) - - --print(hold_constant) - new_num_stmts = num_statements() - --print("\nthe num of statements:%d\n",new_num_stmt) - --print_code() - --exit(0) - -- [Malik] normalize the copy loops created. - cur = cur_indices(old_num_stmts) - --print("Cur indices "..list_to_string(cur)) - for cidx,i in ipairs(cur) do - if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then - --tile(old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i)) - --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts,find_cur_level(old_num_stmts,i),find_cur_level(old_num_stmts,i)) - end - end - --print_code() - --print("\nthe num of statements OLD+1 :",(old_num_stmts+1)) - - ---[[ - is this commented out? why yes, yes it is block comment - if( (old_num_stmts+1) <= new_num_stmts) then - cur = cur_indices(old_num_stmts+1) - --print("Cur indices+1 "..list_to_string(cur)) - for cidx,i in ipairs(cur) do - if i ~= "tx" and i~="ty" and i~="bx" and i~="by" then - tile(old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i)) - --print("\nTILE OF REG: tile(%d,%d,%d)",old_num_stmts+1,find_cur_level(old_num_stmts+1,i),find_cur_level(old_num_stmts+1,i)) - end - end - end ---]] - - - --Unroll to the last thread level - --for stmt=old_num_stmts,new_num_stmts-1 do - -- level = find_cur_level(stmt,thread_idxs[#thread_idxs])--get last thread level - --if level < #cur_indices(stmt) then - -- unroll(stmt,level+1,0) - --print(string.format("\n[Unroll]unroll(%d, %d, 0)",stmt, level+1)) - ----print_code() - --end - --end - io.flush() - --print("****** ending copy to registers\n\n") - --io.flush() -end - -function copy_to_shared(start_loop, array_name, alignment) - --print(string.format("\nstarting copy to shared(%s, %s, %d )",start_loop,array_name,alignment)) - stmt = 0 --assume stmt 0 - cur = cur_indices(stmt) - --print("Cur indices "..list_to_string(cur)) - - start_level = find_cur_level(stmt, start_loop) - --print(string.format("start_level %d", start_level)) - - old_num_stmts = num_statements() - --print(string.format("old_num_statements %d", old_num_stmts)) - - --Now, we give it indices for up to two dimentions for copy loop - copy_loop_idxs = {"tmp1","tmp2"} - --print(string.format("\n[DataCopy]datacopy(%d, %d, %s, {\"tmp1\",\"tmp2\"},false,0,1,%d,true)",stmt, start_level, array_name, alignment)) - datacopy(stmt, start_level, array_name, copy_loop_idxs, false, 0, 1, alignment,true) - - add_sync(stmt,start_loop) - new_num_stmts = num_statements() - - --This is fairly CUBLAS2 specific, not sure how well it generalizes, - --but for a 2D copy, what we want to do is "normalize" the first loop - --"tmp1" then get its hard upper bound. We then want to tile it to - --make the control loop of that tile "ty". We then tile "tmp2" with a - --size of 1 and make it "tx". - --print(string.format("fairly CUBLAS2 specific, OLD %d NEW %d", old_num_stmts, new_num_stmts )) - - for stmt=old_num_stmts,new_num_stmts-1 do - --print(string.format("for stmt = %d", stmt)) - was_no_error, level = pcall(find_cur_level, stmt, "tmp2") - - if was_no_error then - --print_code() - --print("\nCopy to shared: [If was no error]\n") - find_cur_level(stmt,"tmp2") - tile(stmt, level, level) - - lower,upper = hard_loop_bounds(stmt, level) - upper = upper + 1 - --print(string.format("lower %d upper %d", lower, upper)) - - tx,ty = thread_dims() - --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx) - - level = find_cur_level(stmt,"tmp1") - --print(string.format("level %d", level)) - - if tx == upper and ty == 1 then - --print(string.format("tx = %d upper = %d ty = %d", tx, upper, ty)) - --print "Don't need" - - --Don't need an extra tile level, just move this loop up - second_level = find_cur_level(stmt,"tmp2") - --print(string.format("\n[Tile0]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx")) - tile(stmt, second_level, 1, level, "tx", "tx", counted) - else - --print "DO need?" - --print_code() - if(ty == 1) then new_ctrl = "tmp3" else new_ctrl = "ty" end - - ---[[ Commenting out a block of Gabe's code in this control flow - -- level = find_cur_level(stmt,"tmp1") - tile(stmt, level, level) - - lower,upper = hard_loop_bounds(stmt, level) - upper = upper + 1 - --print_code() - --print("2-loop cleanup: lower, upper: "..lower..", "..upper..", tx: "..tx..", level: "..level) - if(math.ceil(upper/ty) > 1)then - tile(stmt, level, math.ceil(upper/ty), level, "tmp", new_ctrl, counted) - --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level, math.ceil(upper/ty),upper,ty, level, "tmp", new_ctrl)) - else - tile(stmt, level, math.ceil(upper/ty), level, "ty", new_ctrl, counted) - --print(string.format("\n[Tile1]tile(%d, %d, %f[%d,%d], %d,%s,%s,counted)",stmt, level, math.ceil(upper/ty),upper,ty, level, "tx", new_ctrl)) - end - - --print_code() - -- [Malik] If here we have the loop upper bound > tx, then we should tile once more after the next tile, to carve out the correct tx. - lower1,upper1 = hard_loop_bounds(stmt,level) - level1 = level - stmt1 = stmt - -- [Malik] Do the tile after the second level tile with if condition. Just to keep the original order, the tile is being pushed to the end. - - --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1) - - --print_code() - --level = find_cur_level(stmt,"tmp") - --tile(stmt,level,level) - --print_code() - - --[Malik] if you are moving the loop above the level1, you need to update level1 with new position which would be level1+2 or second_level - if(level <= level1) then level1 = level1+2 end - --print(string.format("\n[Tile2]tile(%d, %d, 1, %d,%s,%s,counted)",stmt, second_level, level, "tx", "tx")) - --print("\n----------------------------------") - --print_code() - --print("\n**********************************") - --print("[Malik]-loop cleanup: lower1, upper1: "..lower1..", "..upper1..", tx: "..tx..", level:"..level1) - -- [Malik] If the upper bound > tx, we do another tile to carve out the correct tx from a bigger loop. Else just normalize the bounds. - if( upper1 > ty) then - third_level = find_cur_level(stmt1,"tmp") - --print("\n\n\n\t\t\t\tthirdlevel:"..third_level) - tile(stmt1, third_level, ty, third_level, "ty", "tmp", counted) - --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt1, third_level, ty,third_level, "ty", "tmp")) - tile(stmt1,third_level+1,third_level+1) - --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level+1)) - tile(stmt1,third_level+1,third_level) - --print(string.format("\n[Tile3]tile(%d, %d, %d)",stmt1, third_level+1, third_level)) - else - tile(stmt1,level1,level1) - --print(string.format("\n[Tile3ELSE]tile(%d, %d, %d)",stmt1,level1,level1)) - end - - --print("\nStarting tmp2\n");--print_code(); - second_level = find_cur_level(stmt,"tmp2") - lower,upper = hard_loop_bounds(stmt,second_level) - level = second_level - --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..", level:"..level) - - if(math.ceil(upper/tx) > 1)then - tile(stmt, second_level,math.ceil(upper/tx), level, "tmp", "tx", counted) - --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tmp", "tx")) - else - tile(stmt, second_level,math.ceil(upper/tx), level, "tx", "tx", counted) - --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,math.ceil(upper/tx),second_level, "tx", "tx")) - end - --print_code() - lower2,upper2 = hard_loop_bounds(stmt,level) - level2 = level - stmt2 = stmt - --print("[Malik]-loop cleanup@tmp2: lower2, upper2: "..lower2..", "..upper2..", tx: "..tx..", level:"..level2) - -- now for the second level. - if( upper2 > tx) then - forth_level = find_cur_level(stmt2,"tmp") - --print("\n\n\n\t\t\t\tforthlevel:"..forth_level) - --print_code() - tile(stmt2, forth_level, 1, forth_level, "tx", "tmp", counted) - --print(string.format("\n[Tile3B]tile(%d, %d, %d,%d,%s,%s,counted)",stmt2, forth_level, tx,forth_level, "ty", "tmp")) - --print_code() - --tile(stmt2,forth_level+1,forth_level+1) - --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level+1)) - --tile(stmt2,forth_level+1,forth_level) - --print(string.format("\n[Tile3B]tile(%d, %d, %d)",stmt2, forth_level+1, forth_level)) - else - new_level = find_cur_level(stmt2,"ty") - tile(stmt2,level2,1,new_level,"tx","tx",counted) - --print(string.format("\n[Tile3BELSE]tile(%d, %d, %d)",stmt2,level2,level2)) - tmp_level = find_cur_level(stmt2,"tmp") - tile(stmt2,tmp_level,tmp_level) - end - - --print_code() - --print("\n----------------------------------") ---]] - - --print_code() - --print("\nStarting tmp2\n");--print_code(); - first_level = find_cur_level(stmt,"tmp1") - second_level = find_cur_level(stmt,"tmp2") - lower,upper = hard_loop_bounds(stmt,second_level) - - --print("[Malik]-loop cleanup@tmp2: lower, upper: "..lower..", "..upper..", tx: "..tx..",first level:"..first_level..",second_level:"..second_level) - - -- Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx. - --print(string.format("\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, second_level,1,first_level, "tx", "tx")) - tile(stmt,second_level,1,first_level,"tx","tx",counted) - --print_code() - - first_level = find_cur_level(stmt,"tmp1") - lower_1,upper_1 = hard_loop_bounds(stmt,first_level) - tx_level = find_cur_level(stmt,"tx") - lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level) - --print(string.format("UL_1 %d %d UL_tx %d %d", lower_1, upper_1, lower_tx, upper_tx)) - - if(math.ceil(upper_tx/tx) > 1)then - --print "ceil I say" - --print(string.format("\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,tx,tx_level, "tx", "tmp1")) - tile(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted) - --print_code() - - peat = find_cur_level(stmt,"tx") - --print(string.format("\n[Tile1]tile(%d, %d, %d)",stmt, peat, peat)) - tile(stmt, peat, peat ) --find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx")) - --print_code() - - if (find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx")) then - --print(string.format("\nagain [Tile1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))) - tile(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) - --print_code() - end - --else - --tile(stmt, tx_level,1, tx_level, "tx", "tx", counted) - --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, tx_level,1,tx_level, "tx", "tx")) - end - --print_code() - --]] -- this apparently is NOT the end of a block comment - - --print("\nStarting tmp1\n") - -- Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty". - tile(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1")) - --print_code() - - ty_level = find_cur_level(stmt,"tmp1") - lower_ty,upper_ty = hard_loop_bounds(stmt,ty_level) - - tx_level = find_cur_level(stmt,"tx") - lower_tx,upper_tx = hard_loop_bounds(stmt,tx_level) - --print("[Malik]-loop cleanup@tmp1: lowerty, upperty: "..lower_ty..", "..upper_ty..", ty: "..ty..",ty level:"..ty_level..",tx_level:"..tx_level..", stmt: "..stmt) - - --print "before ceil" - if(math.ceil(upper_ty/ty) > 1)then - --print "CEIL IF" - --print("\n Inside upper_ty/ty > 1\n"); - - --print(string.format("\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,ty,ty_level, "ty", "tmp_ty")) - tile(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted) - --print_code() - - --print(string.format("\n[Tile2-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt ,"ty"),find_cur_level(stmt,"ty"))) - tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty")) - --print_code() - - ----------------------------------------------------------------------- - ---------------------------------------------------------------------- - cur_idxs = cur_indices(stmt) - --print("\n cur indexes are "..list_to_string(cur_idxs)) - - -- Putting ty before any tmp_tx - idx_flag = -1 - for num= 0,table.getn(cur_idxs) do - if(cur[num] == "tmp_tx") then - idx_flag = find_cur_level(stmt,cur[num]) - break - end - end - --print(string.format("\n (1) so i have found out the value of idx flag as %d",idx_flag) ) - - if(idx_flag >=0 ) then - if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then - --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) - tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) - --print_code() - end - end - - -- Now Putting ty before any tmp_ty - idx_flag = -1 - for num= 0,table.getn(cur_idxs) do - if(cur[num] == "tmp_ty") then - idx_flag = find_cur_level(stmt,cur[num]) - break - end - end - --print(string.format("\n IF so i have found out the value of idx flag as %d",idx_flag) ) - if(idx_flag >=0 ) then - --print "one more test" - if ((find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"))) then - --print(string.format("\n[Tile2-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) - tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) - --print_code() - end - end - else - --print "CEIL ELSE" - --cur_idxs = cur_indices(stmt) - --print("\n Inside upper_ty/ty <= 1\n"); - - --print(string.format("\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)",stmt, ty_level,1,ty_level, "ty", "ty")) - tile(stmt, ty_level,1, ty_level, "ty", "ty", counted) - --print_code() - - --print(string.format("\n[Tile3-1]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)) - tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) - --print_code() - - idx_flag = -1 - if(cur_idxs) then - --print "CAN NEVER GET HERE? cur_idxs" - for num= 0,table.getn(cur_idxs) do - if(cur[num] == "tmp_ty") then - idx_flag = find_cur_level(stmt,cur[num]) - break - end - end - end - --print(string.format("\n ELSE so i have found out the value of idx flag as %d",idx_flag) ) - if(idx_flag >=0 ) then - if (find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty")) then - --print(string.format("tile( stmt %d, level ty %d, level ty %d",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) - tile(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) - --print(string.format("\n[Tile3-2]tile(%d, %d, %d)",stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))) - end - end - end - - --print_code() - end - - - --print "\n\n *** at bottom of if in copy to shared, " - --print_code() - --print "end of if" - - else - --copy to shared only created one level, not two, so we use a different approach (MV & TMV) - --print("\nCopy to shared: [If was error]\n") - level = find_cur_level(stmt,"tmp1") - tile(stmt, level, level) - - --print(string.format("\n[Tile]tile(%d, %d, %d)",stmt, level, level)) - tx,ty = thread_dims() - lower,upper = hard_loop_bounds(stmt, level) - upper = upper+1 --upper bound given as <=, compare to dimensions tx which is < - --print("upper "..upper.." tx "..tx) - if upper == tx then - rename_index(stmt, "tmp1", "tx") - else - --print("upper is not tx") - --TODO: Don't know, maybe do some tileing etc - --print_code() - --print("upper "..upper.." tx "..tx.." stmt: "..stmt.." level: "..level) - tile(stmt, level,tx,level, "tx", "tmp_tx", counted) - --print_code() - - --print("stmt:"..stmt.." level+1: "..level+1) - --print("TILE 7") - tile(stmt, level+1,1,level+1,"tx", "tx",counted) - --print("TILE 3") - tile(stmt,level+1,level) - --print_code() - - if(ty > 1) then - --print_code() - --print("GOING IN") - lower,upper = hard_loop_bounds(stmt, level+1) - --print(string.format("ty %d lower %d upper %d", ty, lower, upper)) - --upper=125 - --print("NOW FOR Y: upper "..upper.." ty "..ty.." stmt: "..stmt.." level: "..(level+1).." bound:"..math.ceil(upper/ty)) - tile(stmt, level+1,math.ceil(upper/ty),level+1, "tmp_ty", "ty", counted) - --tile(stmt, level+2,math.ceil(upper/ty),level+2, "tmp_ty", "ty", counted) - end - --print_code() - --rename_index(stmt, "tmp1", "tx") - --print("Warning: Need to implement some logic here to tile the single level shared copy loop to match thread dimensions") - end - end - --Always add sync - add_sync(stmt,start_loop) - - end - --print("ending copy to shared\n") - --print_code() -end - -function unroll_to_depth(max_depth) - --print(string.format("\n\nunroll_to_depth(%d)", max_depth )) - --print "SYNC UP" - - cur = cur_indices(0) - thread_idxs = thread_indices() - guard_idx = thread_idxs[#thread_idxs] - - --print(string.format("cur indices %s",list_to_string(cur))) - --print(string.format("thread indices %s",list_to_string(thread_idxs))) - --print(string.format("#thread_idxs = %d", #thread_idxs)) - --print(string.format("guard_idx = %s", guard_idx)) - - ---- HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS - common_loops = {} - comm_loops_cnt = 0 - num_stmts = num_statements() - --print(string.format("num statements %d", num_stmts)) - - for stmt=0,num_stmts-1 do - cur_idxs = cur_indices(stmt) - - --print(string.format("\nSTMT %d Current Indices: %s",stmt,list_to_string(cur_idxs))) - - if(chk_cur_level(stmt,"tx")>0) then - for ii=1,find_cur_level(stmt,"tx")-1 do -- started at 0 - --print(string.format("ii = %d", ii)) -- index starts at 1, what does index 0 do? - --if cur_idxs[ii] == nil then print "cur_idxs[i]] is NIL" - --else print(string.format("cur_idxs[%d] = '%s'", ii, cur_idxs[ii])) -- index starts at 1, what does index 0 do? - --end - - if(cur_idxs[ii] ~= "bx" and cur_idxs[ii] ~= "by" and cur_idxs[ii] ~= nil and cur_idxs[ii] ~= "tx" and cur_idxs[ii] ~= "ty" and cur_idxs[ii] ~= "") then - - --print(string.format("id %s is not in the list", cur_idxs[ii] )) - - for stmt1=stmt+1,num_stmts-1 do - --print(string.format("\nii %d stmt1 is %d", ii, stmt1)) - cur_idxs1 = cur_indices(stmt1) - --print("\nstmt1 cur_idxs1 is "..list_to_string(cur_idxs1)) - - --print(string.format("cur level(%d, %s) = %d", stmt, "tx", find_cur_level(stmt,"tx"))) - - endrange = find_cur_level(stmt,"tx")-1 - --print(string.format("for iii=1, %d do", endrange)) - - for iii=1,find_cur_level(stmt,"tx")-1 do -- started at 0 - --print(string.format("stmt %d ii %d iii %d ", stmt, ii, iii)) - --if(cur_idxs1[iii] ~= nil) then - -- print(string.format("stmt %d ii %d iii %d cur_idxs1[%d] = '%s'", stmt, ii, iii, iii, cur_idxs1[iii])) - --else - -- print(string.format("stmt %d ii %d iii %d cur_idxs1[%d] = NIL", stmt, ii, iii, iii)) - --end - - if(cur_idxs1[iii] ~= "bx" and cur_idxs1[iii] ~= "by" and cur_idxs1[iii] ~= nil and cur_idxs1[iii] ~= "tx" and cur_idxs1[iii] ~= "ty" and cur_idxs1[iii] ~= "") then - if(cur_idxs[ii] == cur_idxs1[iii]) then - --print("\nfound idx:"..cur_idxs[ii]) - --if(comm_loops_cnt == 0) then print "\n\n*** WARNING *** assigning to array index ZERO in Lua" end - common_loops[comm_loops_cnt] = cur_idxs[ii] - --print(string.format("cl[%d] = '%s'", comm_loops_cnt, common_loops[comm_loops_cnt])) - comm_loops_cnt = comm_loops_cnt + 1 - end - end - end - end - end - end - end - end - ---- - --if(comm_loops_cnt>0) then - -- print("\n COMM LOOPS :TOTAL "..comm_loops_cnt..", and are "..list_to_string(common_loops).." this loop :"..common_loops[0]) - --else - -- print "UNROLL can't unroll any loops?" - --end - - - - - repeat - old_num_stmts = num_statements() - --print(string.format("old_num_statements %d", old_num_stmts)) - - for stmt=0,old_num_stmts-1 do - cur_idxs = cur_indices(stmt) - --print(string.format("stmt %d cur_idxs = %s", stmt, list_to_string(cur_idxs))) - if(#cur_idxs > 0) then - gaurd_level = -1 - if(chk_cur_level(stmt,guard_idx)>0) then - gaurd_level = find_cur_level(stmt,guard_idx) - end - --print(string.format("guard_level(sp) = %d", gaurd_level)) - - if(gaurd_level>-1) then - level = next_clean_level(cur_idxs,gaurd_level) - --print(string.format("next clean level %d", level)) - - --need to handle max_depth - num_unrolled = 0 - level_unroll_comm = level - level_arr = {} - while level >= 0 do - --print(string.format("while: level = %d", level)) - - if num_unrolled == max_depth then break end - --print("Unrolling "..stmt.." at level "..(level).." index ".. cur_idxs[gaurd_level+1]) - - level_arr[num_unrolled] = level - num_unrolled = num_unrolled + 1 - - guard_level = find_cur_level(stmt,guard_idx) - level = next_clean_level(cur_idxs,level+1) - end - --dies print("How many levels for unroll commands"..table.getn(level_arr).." which is "..level_arr[0].." and "..level_arr[#level_arr]) - --if(table.getn(level_arr) ~= nil) then - - --print "OK, NOW WE UNROLL" - - if(level_unroll_comm >= 0)then - for i = table.getn(level_arr),0,-1 do - --print(string.format("\ni=%d", i)) - --print(string.format("[Unroll]unroll(%d, %d, 0)",stmt, level_arr[i])) - - unroll(stmt,level_arr[i],0) - --print("finished unroll]]\n") - --print_code() - end - end ------- - end ---[[ - -THERE WAS A BIG BLOCK OF COMMENTED OUT CODE HERE - - ---]] ------- - end - end - new_num_stmts = num_statements() - - until old_num_stmts == new_num_stmts - -end - - diff --git a/examples/cuda-chill/cudaize.py b/examples/cuda-chill/cudaize.py deleted file mode 100755 index ffef009..0000000 --- a/examples/cuda-chill/cudaize.py +++ /dev/null @@ -1,1047 +0,0 @@ -#! /usr/bin/python - -# THIS IS CUDAIZE.PY - -import chill -import sys -import math - -strided = 0 -counted = 1 - -def print_code(): - chill.print_code() - print "" - sys.stdout.flush() - - -def table_contains_key( table, key ): # use a dict for the 'table'? - return table.has_key(key) # (key in table)? - -def print_array( arr ): # a useful function to mimic lua output - for a in arr[:-1]: - print "%s," % a, - print "%s" % arr[-1] - sys.stdout.flush() - -def valid_indices( statement, indices ): - #print "valid_indices() python calling C cur_indices" - #print statement - cur = chill.cur_indices(statement) # calls C - #print "python valid_indices(), cur = ", - #print cur - #print "indices = ", - #print indices - - for index in indices: - if not index in cur: - return False - return True - -def next_clean_level( indices_at_each_level, level): - #print "next_clean_level( ..., %d )" % level - #print "indices_at_each_level ", - print_array( indices_at_each_level ) - - numlevels = len(indices_at_each_level) - #print "loop to %d" % numlevels - for i in range(level+1, numlevels+1): - pythoni = i-1 # LUA index starts at 1 - #print "Checking level %d = '%s'" % (i, indices_at_each_level[pythoni]) - sys.stdout.flush() - if len(indices_at_each_level[pythoni]) > 0: # LUA INDEX STARTS AT 1 - #print "returning %d" % i - return i # MATCH lua return value, LUA index starts at one - return -1 # no non-dummy indices - - - - -def build_order( final_order, tile_index_names, control_index_names, tile_index_map, current_level): - order = [] - #print "\nbuild_order()" - #print "build_order(): final_order = (", - count = 0 - for f in final_order: - #if count+1 == len(final_order): - # print "%s )" % f - #else: - # print "%s," % f , - count += 1 - - keys = control_index_names.keys() - keys.sort() - #if (2 == len(keys)): - # print "build_order(): ctrl_idx_names = (%s, %s)" % (control_index_names[0], control_index_names[1]) - #else: - # print "build_order(): ctrl_idx_names = (%s" % control_index_names[0], - # for k in keys[1:]: - # print ", %s" % control_index_names[k], - # print ")" - - #print control_index_names - #print "cur_level %d" % current_level - - #print "tile index map: ", - #print tile_index_map - - - for i in range(len(final_order)): - k = final_order[i] # not used? - skip = False - cur = final_order[i] - # control loops below our current level should not be in the current order - - # skip = cur in control_index_names[current_level+2:] - #print "\n%d control_index_names, " % len(control_index_names) - #print control_index_names - - for j in range(current_level+1, len(control_index_names)): - #print "comparing cur %s with cin[%d] %s" % ( cur, j, control_index_names[j]) - if control_index_names[j] == cur: - skip = True - #print "SKIP %s " % cur - - # possibly substitute tile indices if necessary - if tile_index_map.has_key(cur): - approved_sub = False - sub_string = tile_index_map[cur] - #print "sub_string = ", - #print sub_string - - # approved_sub = sub_string in tile_index_names[current_level+2:] - for j in range(current_level+1, len(tile_index_names)): - if tile_index_names[j] == sub_string: - approved_sub = True - if approved_sub: - cur = sub_string - - if not skip: - order.append( cur) - #print "build_order() returning order (", - #print order - #for o in order: - # print "%s," % o, - #print ")" - return order - -def find_cur_level( stmt, idx ): - #print "find_cur_level(stmt %d, idx %s) Cur indices" % ( stmt, idx ), - - cur = chill.cur_indices(stmt) - #for c in cur[:-1]: - # print "%s," % c, - #print "%s" % cur[ -1 ] - - index = 1 # lua starts indices at 1 !! - for c in cur: - if c == idx: - #print "found it at index %d" % index - #sys.stdout.flush() - #print "in find_cur_level, returning ", - #print index - return index - index += 1 - #print "find_cur_level(), Unable to find index %s in" % idx, - #print cur - #print "in find_cur_level, returning -1" - return -1 # special meaning "it's not there" - -def chk_cur_level( stmt, idx ): - # search cur_indices for a ind at stmt - cur = chill.cur_indices(stmt) - if idx in cur: - return 1 + cur.index(idx) # lua index starts at 1 ! - return -1 - -def find_offset( cur_order, tile, control): - #print "Looking for tile '%s' and control '%s' in (" % (tile, control), - #print cur_order - #for o in cur_order: - # print "%s," % o, - #print ")" - - idx1 = -1 - idx2 = -1 - if tile in cur_order: - idx1 = 1 + cur_order.index(tile) # lua indexes from 1! - else: - print "find_offset(), unable to find tile %s in current list of indices" % tile - sys.exit(-1) - - if control in cur_order: - idx2 = 1 + cur_order.index(control) # lua indexes from 1! - else: - print "find_offset(), unable to find control %s in current list of indices" % control - sys.exit(-1) - - #print "found at level %d and %d" % ( idx2, idx1 ) - # this appears horrible - if idx2 < idx1: - return idx2-idx1+1 # bad ordering - else: - return idx2-idx1 - - - -def tile_by_index( tile_indices, sizes, index_names, final_order, tile_method): - #print "STARTING TILE BY INDEX" - #print "tile_by_index() tile_method ", - #print tile_method - #print "index_names: ", - #print index_names - - stmt = 0 # assume statement 0 - if not valid_indices( stmt, tile_indices): - print "python tile_by_index() one or more of ", - print tile_indices, - print " is not valid" - sys.exit(-1) - - if tile_method == None: - #print "CREATING tile_method = 1" - tile_method = 1 # "counted" - - tile_index_names = [] - for ti in tile_indices: - tile_index_names.append( ti ) # make a copy? - #print "tile_index_names:", - #print tile_index_names - - control_index_names = {} # a dictionary? - tile_index_map = {} - - #print "index_names: " - #print index_names - - for pair in index_names: - valid = False - control = pair[0] - name = pair[1] - #print "control %s name %s" % ( control, name ) - - if control[0] == "l" and control[1].isdigit(): - if control.endswith("_control"): - index = int(control[1: -8]) - control_index_names[index-1] = name - valid = True - - elif control.endswith("_tile"): - index = int(control[1: -5]) - #print "index %d" % index - tile_index_names[index-1] = name # ?? - tile_index_map[name] = tile_indices[index-1] - valid = True - if not valid: - print "%s is not a proper key for specifying tile or control loop indices\n" % control - - #print "control_index_names = ", - #print control_index_names - - #print "tile_index_names = ", - #print tile_index_names - - #print "before call to build_order(), tile_index_map = ", - #print tile_index_map - - - # filter out control indices (and do name substitution of unprocessed tile indices) for a given level - cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1) - - #print "returned from build_order python\n\n" - - # print("permute("..stmt..", {"..list_to_string(cur_order).."})") - #print "permute(%d, {" % stmt, - #print "cur_order = ", - #print cur_order, - #print "})" - - cur_order.insert(0, stmt) - #print cur_order - chill.permute( tuple( cur_order)) - #print "in cudaize.py, returned from C code chill.permute()\n" - - for i in range(len(tile_indices)): - cur_idx = tile_indices[i] - #print "i %d cur_idx %s calling build order ********" % (i, cur_idx) - cur_order = build_order( final_order, tile_indices, control_index_names, tile_index_map, i) - #print "cur_idx %s return from build order" % cur_idx - - # Find an offset between tile loop and control loop - # 0 = control loop one level above tile loop - # -1 = control loop two levels above tile loop - # > 0 = tile loop above control loop - # In the last case, we do two extra tile commands to get the control - # above the tile and then rely on the final permute to handle the - # rest - level = find_cur_level(stmt,cur_idx) - #print "level %d\n" % level - - offset = find_offset(cur_order, tile_index_names[i], control_index_names[i]) - #print "offset %d" % offset - - if offset <= 0: - #print "[offset<=0]1tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method ) - chill.tile7( stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method ) - #print "in cudaize.py, returned from C code chill.tile7\n" - - else: - #print "2tile(%d, %d, %d, %d, %s, %s, %d)" % (stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method ) - chill.tile7( stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method ) # regular level - - # flip and tile control loop - #print "3tile(%d, %d, %d)" % ( stmt, level+1, level+1) - chill.tile3( stmt, level+1, level+1) - - #print "4tile(%d, %d, %d)" % ( stmt, level+1, level) - chill.tile3( stmt, level+1, level) - - #print_code() - - # Do permutation based on cur_order - #print("permute based on build order calling build_order()") - cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i) - - #print("permute based on build order return from build_order()") - - # print("permute("..stmt..", {"..list_to_string(cur_order).."})") - topermute = cur_order - topermute.insert(0, stmt) - chill.permute( tuple(topermute) ) - #print "\nafter permute(), code is:" - #print_code() - -def normalize_index( index ): - #print "in cudaize.py, normalize_index( %s )" % index - stmt = 0 # assume stmt 0 - l = find_cur_level( stmt, index ) - chill.tile3( stmt, l, l ) - -def is_in_indices( stmt, idx): - cur = chill.cur_indices(stmt) - return idx in cur - -def copy_to_registers( start_loop, array_name ): - #print "\n\n****** starting copy to registers" - #sys.stdout.flush() - - stmt = 0 # assume stmt 0 - cur = chill.cur_indices(stmt) # calls C - table_Size = len(cur) - - #print "Cur indices", - #print_array(cur) - #print "\nThe table size is %d" % table_Size - #count=1 - #for c in cur: - # print "%d\t%s" % (count,c) - # count += 1 - - #print_code() - - # would be much cleaner if not translating this code from lua! - level_tx = -1 - level_ty = -1 - if is_in_indices(stmt,"tx"): - level_tx = find_cur_level(stmt,"tx") - if is_in_indices(stmt,"ty"): - level_ty = find_cur_level(stmt,"ty") - #print "level_tx %d level_ty %d" % ( level_tx, level_ty ) - #sys.stdout.flush() - - ty_lookup_idx = "" - org_level_ty = level_ty - - # UGLY logic. Lua index starts at 1, so all tests etc here are off by 1 from the lua code - # level_ty initializes to -1 , which is not a valid index, and so there is added code to - # make it not try to acccess offset -1. -1 IS a valid python array index - # to top it off, the else below can assign a NIL to ty_lookup_idx! - if level_ty != -1 and cur[level_ty] != "": - #print "IF cur[%d] = %s" % ( level_ty, cur[level_ty] ) - ty_lookup_idx = cur[level_ty] - else: - #print "ELSE ty_lookup_idx = cur[%d] = %s" % ( level_ty, cur[level_ty-1]) - ty_lookup_idx = cur[level_ty-1] - #print "ty_lookup_idx '%s'" % ty_lookup_idx - - if level_ty > -1: - #print "\ntile3(%d,%d,%d)" % (stmt,level_ty,level_tx+1) - chill.tile3(stmt,level_ty,level_tx+1) - #print_code() - - cur = chill.cur_indices(stmt) # calls C - table_Size = len(cur) - #print "Cur indices ", - #for c in cur: - # print "%s," % c, - #print "\nThe table size is %d" % len(cur) - #count=1 - #for c in cur: - # print "%d\t%s" % (count,c) - # count += 1 - #sys.stdout.flush() - - if is_in_indices(stmt,"tx"): - level_tx = find_cur_level(stmt,"tx") - if ty_lookup_idx != "": # perhaps incorrect test - if is_in_indices(stmt,ty_lookup_idx): - level_ty = find_cur_level(stmt,ty_lookup_idx) - - ty_lookup = 1 - idx_flag = -1 - # find the level of the next valid index after ty+1 - #print "\nlevel_ty %d" % level_ty - if level_ty > -1: - #print "table_Size %d" % table_Size - for num in range(-1 + level_ty+ty_lookup,table_Size): # ?? off by one? - #print "num=%d cur[num] = '%s'" % (num+1, cur[num]) # num+1 is lua index ???? - sys.stdout.flush() - if cur[num] != "": - idx_flag = find_cur_level(stmt,cur[num]) - #print "idx_flag = %d" % idx_flag - break - - #print "\n(first) I am checking all indexes after ty+1 %s" % idx_flag - #print_code() - #print "" - - how_many_levels = 1 - - #print "idx_flag = %d I will check levels starting with %d" % (idx_flag, idx_flag+1) - # lua arrays start at index 1. the next loop in lua starts at offset 0, since idx_flag can be -1 - # thus the check for "not equal nil" in lua (bad idea) - # python arrays start at 0, so will check for things that lua doesn't (?) - startat = idx_flag + 1 - if idx_flag == -1: - startat = 1 # pretend we're lua for now. TODO: fix the logic - - for ch_lev in range(startat,table_Size+1): # logic may be wrong (off by one) - #print "ch_lev %d" % ch_lev - if ch_lev <= table_Size and cur[ch_lev-1] != "": - #print "cur[%d] = '%s'" % ( ch_lev, cur[ch_lev-1] ) - how_many_levels += 1 - - #print "\nHow Many Levels %d" % how_many_levels - sys.stdout.flush() - sys.stdout.flush() - - if how_many_levels< 2: - while( idx_flag >= 0): - for num in range(level_ty+ty_lookup,table_Size+1): - #print "at top of loop, num is %d" % num - #print "cur[num] = '%s'" % cur[num-1] - if cur[num-1] != "": - idx = cur[num-1] - #print "idx '%s'" % idx - sys.stdout.flush() - curlev = find_cur_level(stmt,idx) - #print "curlev %d" % curlev - - #print "\n[COPYTOREG]tile(%d,%d,%d)"%(stmt,curlev,level_tx) - - chill.tile3(stmt, curlev, curlev) - curlev = find_cur_level(stmt,idx) - #print "curlev %d" % curlev - chill.tile3(stmt,curlev,level_tx) - #print "hehe '%s'" % cur[num-1] - - cur = chill.cur_indices(stmt) - #print "Cur indices INSIDE", - #for c in cur: - # print "%s," % c, - table_Size = len(cur) - #print "\nTable Size is: %d" % len(cur) - - level_tx = find_cur_level(stmt,"tx") - #print "\n level TX is: %d" % level_tx - level_ty = find_cur_level(stmt,ty_lookup_idx) - #print "\n level TY is: %d" %level_ty - idx_flag = -1 - #print "idx_flag = -1" - - - #- find the level of the next valid index after ty+1 - #- the following was num, which conflicts with loop we're already in, and otherwise wasn't used (?) - for num2 in range( -1 + level_ty+ty_lookup ,table_Size): # lua starts index at one - #print "num mucking num = %d" % num2 - if(cur[num2] != ""): - #print "cur[%d] = '%s'" % ( num2, cur[num2] ) - idx_flag = find_cur_level(stmt,cur[num2]) - #print("\n(second) I am checking all indexes after ty+1 %s",cur[num2]) - break - - #print "num mucked to %d idx_flag = %d" % (num, idx_flag) - - #print "at bottom of loop, num is %d" % num - - #print "done with levels" - - # this was a block comment ??? - -# for num in range(level_ty+1, table_Size+1): -# print "num %d" % num -# if cur[num-1] != "": -# idx_flag = find_cur_level(stmt,cur[num-1]) ## ugly -# print "idx_flag = %d" % idx_flag - - # change this all to reflect the real logic which is to normalize all loops inside the thread loops. -# print "change this all ...\n" -# print "level_ty+1 %d table_Size-1 %d idx_flag %d" %( level_ty+1, table_Size-1, idx_flag) -# sys.stdout.flush() -# sys.stdout.flush() - -# while level_ty+1 < (table_Size-1) and idx_flag >= 0: -# print "*** level_ty %d" % level_ty -# for num in range(level_ty+2,table_Size+1): # lua for includes second value -# print "num %d cur[num] %s" % (num, cur[num]) -# if cur[num] != "": -# idx = cur[num] -# print "idx='%s'" % idx -# #print_code() - - - - - #print "ARE WE SYNCED HERE?" - #print_code() - - # [Malik] end logic - start_level = find_cur_level(stmt, start_loop) # start_loop was passed parameter! - - # We should hold constant any block or tile loop - block_idxs = chill.block_indices() - thread_idxs = chill.thread_indices() - #print"\nblock indices are" - #for index, val in enumerate(block_idxs): - # print "%d\t%s" % ( int(index)+1 , val ) - #print"\nthread indices are" - #for index, val in enumerate(thread_idxs): - # print "%d\t%s" % ( int(index)+1 , val ) - #print "\nStart Level: %d" % start_level - - hold_constant = [] - #print("\n Now in Blocks") - for idx in block_idxs: - blocklevel = find_cur_level(stmt,idx) - if blocklevel >= start_level: - hold_constant.append(idx) - #print "\nJust inserted block %s in hold_constant" %idx - - #print("\n Now in Threads") - for idx in thread_idxs: - blocklevel = find_cur_level(stmt,idx) - if blocklevel >= start_level: - hold_constant.append(idx) - #print "\nJust inserted thread %s in hold_constant" %idx - #print "\nhold constant table is: " - #for index, val in enumerate(hold_constant): - # print "%d\t%s" % ( int(index)+1 , val ) - - #print("\nbefore datacopy pvt") - old_num_stmts = chill.num_statements() - #sys.stdout.flush() - - #print "\n[DataCopy]datacopy_privatized(%d, %s, %s, " % (stmt, start_loop, array_name), - #print hold_constant, - #print ")" - passtoC = [stmt, start_loop, array_name ] # a list - passtoC.append( len(hold_constant ) ) - for h in hold_constant: - passtoC.append( h ) - chill.datacopy_privatized( tuple( passtoC )) - sys.stdout.flush() - sys.stdout.flush() - - new_num_statements = chill.num_statements() - #print "new num statements %d" % new_num_statements - - # Unroll to the last thread level -# for stmt in range(old_num_statements, new_num_statements): -# print "unrolling statement %d" % stmt -# level = find_cur_level(stmt,thread_idxs[-1]) #get last thread level -# print "level is %d" % level -# idxs = chill.cur_indices(stmt) -# if level < len(idxs): -# chill.unroll(stmt,level+1,0) - - - -def copy_to_shared( start_loop, array_name, alignment ): - #print "\nstarting copy to shared( %s, %s, %d)" % (start_loop, array_name, alignment ) - #print "copy_to_shared( %s, %s, %d) in cudaize.py" % ( start_loop, array_name, alignment ) - stmt = 0 # assume statement 0 - - cur = chill.cur_indices(stmt) - #print "Cur indices ", - #print_array( cur ) - - start_level = find_cur_level( stmt, start_loop ) - #print "start_level %d" % start_level - - old_num_statements = chill.num_statements() - #print "old_num_statements %d" % old_num_statements - - - # Now, we give it indices for up to two dimensions for copy loop - copy_loop_idxs = ["tmp1","tmp2"] - #chill.datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs, False, 0, 1, alignment,True) - passtoC = [stmt, start_level, array_name] # a list - passtoC.append( len(copy_loop_idxs)) - for i in copy_loop_idxs: - passtoC.append(i) - passtoC.append( 0 ) # False - passtoC.append( 0 ) - passtoC.append( 1 ) - passtoC.append( alignment ) - passtoC.append( 1 ) # True - #print "\n[DataCopy]datacopy( ", - #print passtoC, - #print ")" - - #if array_name == "b": - # chill.cheat(1) - #if array_name == "c": - # chill.cheat(2) - - chill.datacopy_9arg( tuple( passtoC )) - - #print "back from datacopy_9arg\n\n\n" - #sys.stdout.flush() - - - #print "calling add_sync( %d, %s )" % ( stmt, start_loop ) - chill.add_sync( stmt, start_loop ) - #print "back from add_sync()\n\n" - - new_num_statements = chill.num_statements() - - # This is fairly CUBLAS2 specific, not sure how well it generalizes, - # but for a 2D copy, what we want to do is "normalize" the first loop - # "tmp1" then get its hard upper bound. We then want to tile it to - # make the control loop of that tile "ty". We then tile "tmp2" with a - # size of 1 and make it "tx". - - #print "fairly CUBLAS2 specific, OLD %d NEW %d" % ( old_num_statements, new_num_statements) - sys.stdout.flush() - sys.stdout.flush() - - for stmt in range(old_num_statements, new_num_statements): - #print "for stmt = %d" % stmt - level = find_cur_level( stmt, "tmp2") - #print "FOUND CUR LEVEL? level '", - #print level, - #print "'" - - #print "in loop, stmt %d level %d" % ( stmt, level ) - if level != -1: - #print "\nCopy to shared: [If was no error]\n" - find_cur_level(stmt,"tmp2") - chill.tile3( stmt, level, level ) - - #print "hard_loop_bounds( %d, %d )" % (stmt, level) - bounds = chill.hard_loop_bounds(stmt, level) - lower = bounds[0] - upper = 1+ bounds[1] - #print "lower %d upper %d" % ( lower, upper ) - - dims = chill.thread_dims() - #print "in cudaize.py copy_to_shared, dims =", - #print dims - tx = dims[0] - ty = dims[1] - #print "2-loop cleanup: lower, upper: %d, %d, tx: %d" % ( lower, upper, tx) - - level = find_cur_level(stmt,"tmp1") - #print "level %d" % level - if tx == upper and ty == 1: - #print "tx = %d upper = %d ty = %d"% (tx, upper, ty) - #print "Don't need" - - # Don't need an extra tile level, just move this loop up - second_level = find_cur_level(stmt,"tmp2") - chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted) - - else: - #print "DO need?" - if ty == 1: - new_ctrl = "tmp3" - else: - new_ctrl = "ty" - - # LOTS of commented out code here in cudaize.lua - - #print_code() - #print "\nStarting tmp2\n" - first_level = find_cur_level(stmt,"tmp1") - second_level = find_cur_level(stmt,"tmp2") - bounds = chill.hard_loop_bounds(stmt, second_level) - lower = bounds[0] - upper = 1 + bounds[1] # BROKEN? - - #print "[Malik]-loop cleanup@tmp2: lower, upper: %d, %d, tx: %d,first level:%d,second_level:%d" % ( lower, upper-1, tx, first_level, second_level) - - # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx. - #print "\n[fastest]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, second_level,1,first_level, "tx", "tx") - chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted) - #print_code() - - first_level = find_cur_level(stmt,"tmp1") - bounds = chill.hard_loop_bounds(stmt, first_level) - lower_1 = bounds[0] - upper_1 = 1 + bounds[1] - tx_level = find_cur_level(stmt,"tx") - bounds = chill.hard_loop_bounds(stmt,tx_level) - lower_tx = bounds[0] - upper_tx = 1+bounds[1] - #print "UL_1 %d %d UL_tx %d %d" % ( lower_1, upper_1-1, lower_tx, upper_tx-1) - - if int(math.ceil( float(upper_tx)/float(tx))) > 1: - #print "ceil I say" - #print "\n[Tile1]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, tx_level,tx,tx_level, "tx", "tmp1") - chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted) - #print_code() - - repeat = find_cur_level(stmt,"tx") - #print "\n[Tile1]tile(%d, %d, %d)" % (stmt, repeat, repeat) - chill.tile3(stmt, repeat, repeat) #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx")) - #print_code() - - if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"): - #print "\nagain [Tile1]tile(%d, %d, %d)" % (stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) - chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx")) - #print_code() - - #print_code() - - #print "\nStarting tmp1\n" - # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty". - chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1")) - #print_code() - - ty_level = find_cur_level(stmt,"tmp1") - bounds = chill.hard_loop_bounds(stmt,ty_level) - lower_ty = bounds[0] - upper_ty = 1 + bounds[1] - - tx_level = find_cur_level(stmt,"tx") - bounds = chill.hard_loop_bounds(stmt,tx_level) - lower_tx = bounds[0] - upper_tx = 1 + bounds[1] - - #print "[Malik]-loop cleanup@tmp1: lowerty, upperty: %d, %d, ty: %d,ty level:%d,tx_level:%d, stmt: %d" % ( lower_ty, upper_ty-1, ty, ty_level, tx_level, stmt) - - #print "before ceil" - #sys.stdout.flush() - - if(math.ceil(float(upper_ty)/float(ty)) > 1): - #print "CEIL IF" - #print "\n Inside upper_ty/ty > 1\n" - - #print "\n[Tile2]tile(%d, %d, %d,%d,%s,%s,counted)"%(stmt, ty_level,ty,ty_level, "ty", "tmp_ty") - chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted) - #print_code() - - #print "\n[Tile2-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt ,"ty"),find_cur_level(stmt,"ty")) - chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty")) - #print_code() - - cur_idxs = chill.cur_indices(stmt) - #print "\n cur indexes are ", - #print_array( cur_idxs) - #sys.stdout.flush() - - # Putting ty before any tmp_tx - idx_flag = -1 - if "tmp_tx" in cur_idxs: - idx_flag = 1 + cur_idxs.index("tmp_tx") # lua index starts at 1 - #print "\n (1) so i have found out the value of idx flag as %d" % idx_flag - #sys.stdout.flush() - - if idx_flag >= 0: - if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"): - #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) - chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) - #print_code() - - - # Now Putting ty before any tmp_ty - sys.stdout.flush() - idx_flag = -1 - if "tmp_ty" in cur_idxs: - idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1 - #print "\n IF so i have found out the value of idx flag as %d" % idx_flag - #sys.stdout.flush() - - if idx_flag >= 0: - #print "one more test" - sys.stdout.flush() - if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"): - #print "\n[Tile2-2]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) - #sys.stdout.flush() - chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) - #print_code() - - - - else: - #print "CEIL ELSE" - #print "\n[Tile3]tile(%d, %d, %d,%d,%s,%s,counted)" % (stmt, ty_level,1,ty_level, "ty", "ty") - #sys.stdout.flush() - chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted ) - #print_code() - - #print "\n[Tile3-1]tile(%d, %d, %d)"%(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) - sys.stdout.flush() - - chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1) - #print_code() - - - idx_flag = -1 - # LUA code checks to see if cur_idxs exists? it is unused except in the other clause of this is - #if(cur_idxs) then - #print "CAN NEVER GET HERE? cur_idxs" - #for num= 0,table.getn(cur_idxs) do - #if(cur[num] == "tmp_ty") then - #idx_flag = find_cur_level(stmt,cur[num]) - #break - #end - #end - print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag - if idx_flag >= 0: # can't happen - print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) - #chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty")) - - - - - - #print "\n\n *** at bottom of if in copy to shared, " - #print_code() - #print "end of if" - - else: - # copy to shared only created one level, not two, so we use a different approach (MV & TMV) - #print "\nCopy to shared: [If was error]\n" - level = find_cur_level(stmt,"tmp1") - chill.tile3(stmt, level, level) - - dims = chill.thread_dims() - #print dims - tx = dims[0] - ty = dims[1] - - bounds = chill.hard_loop_bounds(stmt, level) - lower = bounds[0] - upper = bounds[1] - - #print "bounds lower %d upper %d" % (lower, upper) - upper = upper+1 # upper bound given as <=, compare to dimensions tx which is < - if upper == tx: - #print "upper == tx" - chill.rename_index( stmt, "tmp1", "tx") - else: - #print "upper is not tx" - #print "upper %d tx %d stmt: %d level: %d" % ( upper, tx, stmt, level) - chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted) - #print_code() - - #print "stmt:%d level+1: %d" % ( stmt, level+1) - #print("TILE 7") - chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted) - #print("TILE 3") - chill.tile3( stmt, level+1, level) - #print_code() - - - if ty > 1: - #print "GOING IN" - bounds = chill.hard_loop_bounds(stmt, level+1) - lower = bounds[0] - upper = bounds[1] - #print "ty %d lower %d upper %d" % ( ty, lower, upper ) - floatdiv = float(upper)/float(ty) - bound = int(math.ceil(float(upper)/float(ty))) - #print "NOW FOR Y: upper %d ty %d stmt: %d level: %d bound: %d" % ( upper, ty, stmt, level+1, bound) - chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted) - - # Always add sync - chill.add_sync( stmt, start_loop ) - #print "ending copy to shared\n" - #sys.stdout.flush() - #print_code() - - - - - - - - - - - - - - - - - - - -def unroll_to_depth( max_depth ): - print "\n\nunroll_to_depth(%d)" % max_depth - print "SYNC UP" - sys.stdout.flush() - - cur = chill.cur_indices(0) - thread_idxs = chill.thread_indices() - guard_idx = thread_idxs[-1] # last one - - print "cur indices", - print_array(cur) - print "thread indices", - print_array(thread_idxs) - print "guard_idx = %s" % guard_idx - - #print "thread_idxs = ", - #print thread_idxs - guard_idx = thread_idxs[-1] - #print "guard_idx = %s" % guard_idx - - # HERE FIND OUT THE LOOPS WHICH ARE COMMON BETWEEN STATEMENTS - common_loops = [] - comm_loops_cnt = 0 - num_stmts = chill.num_statements() - print "num statements %d" % num_stmts - - for stmt in range(num_stmts): - sys.stdout.flush() - print "\nSTMT %d" % stmt, - cur_idxs = chill.cur_indices(stmt) - print "Current Indices:", - for c in cur_idxs[:-1]: - print "%s," % c, - print "%s" % cur_idxs[-1] # last one - sys.stdout.flush() - #print_code() - - if chk_cur_level(stmt, "tx") > 0: - - for ii in range(find_cur_level(stmt,"tx")-1): - print "ii = %d\ncur_idxs[%d] = '%s'" % (ii+1, ii+1, cur_idxs[ii]) # print to match lua - id = cur_idxs[ii] - if id not in ["bx", "by", "", "tx", "ty"]: - - print "id %s is not in the list" % id - - for stmt1 in range(stmt+1, num_stmts): - print "\nii %d stmt1 is %d" % (ii+1, stmt1) # print to match lua - cur_idxs1 = chill.cur_indices(stmt1) - print "\nstmt1 cur_idxs1 is ", - for ind in cur_idxs1[:-1]: - print "%s," % ind, - print "%s" % cur_idxs1[-1] - - print "cur level(%d, %s) = %d" % (stmt, "tx", find_cur_level(stmt,"tx") ) - sys.stdout.flush() - - endrange = find_cur_level(stmt,"tx")-1 - print "for iii=1, %d do" % endrange - sys.stdout.flush() - for iii in range(endrange): # off by one? TODO - print "stmt %d ii %d iii %d\n" % (stmt, ii+1, iii+1), - sys.stdout.flush() - - if iii >= len(cur_idxs1): - print "stmt %d ii %d iii %d cur_idxs1[%d] = NIL" % (stmt, ii+1, iii+1, iii+1, ) # print to match lua - else: - print "stmt %d ii %d iii %d cur_idxs1[%d] = '%s'" % (stmt, ii+1, iii+1, iii+1, cur_idxs1[iii]) # print to match lua - sys.stdout.flush() - - # this will still probably die - if iii < len(cur_idxs1) and [iii] not in ["bx", "by", "tx", "ty", ""]: - if cur_idxs[ii] == cur_idxs1[iii]: - print "\nfound idx:%s" % cur_idxs[ii] - common_loops.append(cur_idxs[ii]) - print "cl[%d] = '%s'" % ( comm_loops_cnt, cur_idxs[ii] ) - comm_loops_cnt = len(common_loops) - - if len(common_loops) > 0: - print "\n COMM LOOPS :TOTAL %d, and are " % comm_loops_cnt, - print common_loops, - print " this loop : %s" % common_loops[0] - else: - print "UNROLL can't unroll any loops?" - - - while True: # break at bottom of loop (repeat in lua) - old_num_statements = chill.num_statements() - print "old_num_statements %d" % old_num_statements - - for stmt in range(old_num_statements): - cur_idxs = chill.cur_indices(stmt) - print "stmt %d cur_idxs =" % stmt, - index = 0 - for i in cur_idxs: - index +=1 - if index == len(cur_idxs): - print "%s" %i - else: - print "%s," % i, - - if len(cur_idxs) > 0: - guard_level = -1 - if chk_cur_level(stmt, guard_idx) > 0: - guard_level = find_cur_level(stmt,guard_idx) - print "guard_level(sp) = %d" % guard_level - if guard_level > -1: - level = next_clean_level(cur_idxs,guard_level) - print "next clean level %d" % level - - - #print "looking at %d" % stmt - #print "comparing %d and %d in" % (guard_level, level), - #index = 0 - #for i in cur_idxs: - #index +=1 - #if index == len(cur_idxs): - # print "%s" %i - #else: - # print "%s," % i, - - # need to handle max_depth - num_unrolled = 0 - level_unroll_comm = level - level_arr = [] - - #print "before while, level = %d" % level - while level >= 0: - print "while: level = %d" % level - if num_unrolled == max_depth: - break - - print "Unrolling %d at level %d index %s" % ( stmt, level, cur_idxs[guard_level]) # ??? - level_arr.append(level) - - guard_level = find_cur_level(stmt,guard_idx) - level = next_clean_level(cur_idxs,level+1) - - print "OK, NOW WE UNROLL" - if level_unroll_comm >= 0: - level_arr.reverse() - for i,lev in enumerate(level_arr): - print "\ni=%d" % i - print "[Unroll]unroll(%d, %d, 0)" % (stmt, lev) - chill.unroll(stmt, lev, 0) - - - new_num_statements = chill.num_statements() - if old_num_statements == new_num_statements: - break # exit infinite loop - - -# all other calls to C have a routine in this file (?) -def unroll( statement, level, unroll_amount ): - chill.unroll( statement, level, unroll_amount ) - diff --git a/examples/cuda-chill/mm.c b/examples/cuda-chill/mm.c deleted file mode 100644 index 0efbeeb..0000000 --- a/examples/cuda-chill/mm.c +++ /dev/null @@ -1,10 +0,0 @@ -#define N 1024 - -void normalMM(float c[N][N], float a[N][N], float b[N][N]) { - int i, j, k; - - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - for (k = 0; k < N; k++) - c[j][i] = c[j][i] + a[k][i] * b[j][k]; -} diff --git a/examples/cuda-chill/mm.lua b/examples/cuda-chill/mm.lua deleted file mode 100644 index 5bde1b0..0000000 --- a/examples/cuda-chill/mm.lua +++ /dev/null @@ -1,38 +0,0 @@ -init("mm.c", "normalMM", 0) -dofile("cudaize.lua") -N=1024 -Ti=128 -Tj=64 -Tk=16 -Tii=16 -Tjj=16 - - - - -N=1024 - - - - - - - - - - - - - -tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k"})CU=1 - -tile_by_index({"k"},{Tk},{l1_control="kk"},{"ii","jj","kk","i","j","k"})CU=3 - -tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","kk","i","iii","j","jjj","k"},1)CU=2 - -cudaize("mm_GPU",{a=1048576,b=1048576,c=1048576},{block={"ii","jj"}, thread={"i","j"}})CU=2 -copy_to_shared("tx","a",-16) -copy_to_shared("tx","b",-16) -copy_to_registers("kk","c") ---print_code() -unroll_to_depth(2) diff --git a/examples/cuda-chill/mpeg4.c b/examples/cuda-chill/mpeg4.c deleted file mode 100755 index 7f83bf7..0000000 --- a/examples/cuda-chill/mpeg4.c +++ /dev/null @@ -1,23 +0,0 @@ -#define N1 4096 -#define N2 4096 -#define WINDOW_SIZE 16 - -void mpeg4_cpu(float result[N1][N2], float prev[N2+WINDOW_SIZE][N2+WINDOW_SIZE], float curr[WINDOW_SIZE*WINDOW_SIZE]) -{ - unsigned int i; - unsigned int j; - unsigned int k; - unsigned int l; - - for ( i = 0; i < N1; ++i) - for ( j = 0; j < N2; ++j) - for ( k = 0; k < WINDOW_SIZE; ++k) - for ( l = 0; l < WINDOW_SIZE; ++l) - result[i][j] += prev[i+k][j+l] * curr[k*WINDOW_SIZE+l]; - - - - - -} - diff --git a/examples/cuda-chill/mpeg4.lua b/examples/cuda-chill/mpeg4.lua deleted file mode 100644 index f025dc0..0000000 --- a/examples/cuda-chill/mpeg4.lua +++ /dev/null @@ -1,45 +0,0 @@ ---CUBLAS 2 MM Multiply - ---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you ---call init() and use global variables to specify procedure and loop - ---Second parameter is procedure # and third is loop # -init("mpeg4.c", "mpeg4_cpu", 0) - ---dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,copy_to_shared methods - -N=4096 -M=4096 -W=16 - ---TI 4ust be <= M ---TJ must be <=TI -Ti=32 -Tj=32 -Tii=16 -Tjj=16 -Tk=4 ---permute(0,{"j","i","k","l"}) -tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j","k","l"}) ---tile_by_index({"k","l"},{Tk*2,Tk*2},{l1_control="kk",l2_control="ll"},{"ii","jj","kk","ll","i","j","k","l"}) ---print_code() ---tile_by_index({"k","l"},{Tk,Tk},{l1_control="kk",l2_control="ll"},{"ii","jj","i","j","kk","k","ll","l"}) -tile_by_index({"i","j"},{Tii,Tjj},{l1_control="iii",l2_control="jjj"},{"ii","jj","iii","i","jjj","j","k","l"}) ---print_code() ---normalize_index("j") ---normalize_index("i") ---print_code() -cudaize("kernel_GPU",{curr=W*W,prev=(N+W)*(M+W),result=N*M},{block={"ii","jj"}, thread={"i","j"}}) ---print_code() -copy_to_shared("iii","prev",16) - -copy_to_registers("jjj","result") - ---print_code() ---copy_to_constant_no_tile("curr") -unroll_to_depth(2) -print_code() -print_space() - - diff --git a/examples/cuda-chill/mriq-fh.c b/examples/cuda-chill/mriq-fh.c deleted file mode 100755 index 1e924b7..0000000 --- a/examples/cuda-chill/mriq-fh.c +++ /dev/null @@ -1,38 +0,0 @@ -#define X 32768 -#define K 256 -struct kValues { - float Kx; - float Ky; - float Kz; - float PhiMag; -}; -extern float sin(float); -extern float cos(float); - -void mriFH_cpu(float *rPhi,float *rRho,float *iRho, float *iPhi, float *rD, float *iD, float *kx, float *ky, float *kz, float *dx, float *dy, float *dz, float *rFHref, float *iFHref) -{ - - float rfh; - float ifh; - float exp; - float cArg; - float sArg; - //float rRho[K]; - //float iRho[K]; - unsigned int k; - unsigned int x; - - - for (x = 0; x < X; ++x) { - for (k = 0; k < K; ++k) { - - exp = 2 * 3.14159 * (kx[k]* dx[x] + ky[k]* dy[x] + kz[k]* dz[x]); - cArg = cos(exp); - sArg = sin(exp); - rFHref[x] += rRho[k]* cArg - iRho[k]* sArg; - iFHref[x] += iRho[k]*cArg + rRho[k]*sArg; - } - - } -} - diff --git a/examples/cuda-chill/mriq-fh.lua b/examples/cuda-chill/mriq-fh.lua deleted file mode 100755 index 3277bac..0000000 --- a/examples/cuda-chill/mriq-fh.lua +++ /dev/null @@ -1,73 +0,0 @@ ---CUBLAS 2 MM Multiply - ---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you ---call init() and use global variables to specify procedure and loop - ---Second parameter is procedure # and third is loop # -init("mriq-fh.c", "mriFH_cpu", 0) - -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, - --copy_to_shared methods -N=32768 -M=256 -Tx=256 - - -print_code() ---permute(0,{"j","i"}) ---tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"}) -tile_by_index({"x"},{Tx},{l1_control="xx"},{"xx","x","k"}) ---tile_by_index({"x"},{16},{l1_control="xx1"},{"xx","x","xx1","k"}) ---tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) -print_code() - -normalize_index("x") ---normalize_index("i") -print_code() ---tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"}) ---print_code() ---cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}}) -cudaize("kernel_GPU",{dx=N,dy=N,dz=N,iRho=M,kx=M,ky=M,kz=M,rFHref=N,iFHref=N,rRho=M},{block={"xx"}, thread={"x"}}) ---copy_to_shared("tx","iRho",-16) ---copy_to_shared("tx","dz",1) ---copy_to_shared("tx","rRho",-16) ---copy_to_registers("tx","rFHref") ---copy_to_registers("tx","rRho") ---copy_to_registers("tx","iRho") ---copy_to_registers("tx","kx") ---copy_to_registers("tx","dx") ---copy_to_registers("tx","ky") ---copy_to_registers("tx","dy") ---copy_to_registers("tx","kz") ---copy_to_registers("tx","dz") ---copy_to_registers("tx","iFHref") ---copy_to_texture("rRho") ---copy_to_texture("kx") ---copy_to_texture("dx") ---copy_to_texture("ky") ---copy_to_texture("dy") ---copy_to_texture("kz") ---copy_to_texture("dz") ---copy_to_texture("iRho") ---print_code()--]] ---unroll(0,4,0) ---copy_to_constant_no_tile("kx") ---copy_to_constant_no_tile("ky") ---copy_to_constant_no_tile("kz") ---copy_to_constant_no_tile("rRho") ---copy_to_constant_no_tile("iRho") - ---unroll_to_depth(1) -print_code() ---[[ -copy_to_Texture("rRho") -copy_to_Texture("kx") -copy_to_Texture("dx") -copy_to_Texture("ky") -copy_to_Texture("dy") -copy_to_Texture("kz") -copy_to_Texture("dz") -copy_to_Texture("iRho") ---unroll_to_depth(2) ---]] diff --git a/examples/cuda-chill/mriq.c b/examples/cuda-chill/mriq.c deleted file mode 100644 index ba4b87c..0000000 --- a/examples/cuda-chill/mriq.c +++ /dev/null @@ -1,33 +0,0 @@ -#define N 32768 -#define M 3072 -struct kValues { - float Kx; - float Ky; - float Kz; - float PhiMag; -}; -extern float sinf(float); -extern float cosf(float); - -void -ComputeQCPU(int numK, int numX,struct kValues kVals[M],float x[N], float y[N], float z[N],float Qr[N], float Qi[N]) { - float expArg; - float cosArg; - float sinArg; - float phi; - int i; - int j; - numK = M; - numX = N; - for ( i = 0; i < M; i++) { - for ( j = 0; j < N; j++) { - expArg = 6.2831853071795864769252867665590058f * (kVals[i].Kx * x[j] +kVals[i].Ky * y[j] +kVals[i].Kz * z[j]); - cosArg = cosf(expArg); - sinArg = sinf(expArg); - phi = kVals[i].PhiMag; - Qr[j] += phi * cosArg; - Qi[j] += phi * sinArg; - } - } -} - diff --git a/examples/cuda-chill/mriq.lua b/examples/cuda-chill/mriq.lua deleted file mode 100644 index 1170111..0000000 --- a/examples/cuda-chill/mriq.lua +++ /dev/null @@ -1,55 +0,0 @@ ---CUBLAS 2 MM Multiply - ---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you ---call init() and use global variables to specify procedure and loop - ---Second parameter is procedure # and third is loop # -init("mriq.c", "ComputeQCPU", 0) - -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, - --copy_to_shared methods -N=32768 -M=3072 -TI=128 -TJ=128 - -permute(0,{"j","i"}) ---tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"}) -tile_by_index({"i"}, {TJ}, {l1_control="ii",l1_tile="i"}, {"ii", "j","i"}) -tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) ---print_code() - -normalize_index("j") -normalize_index("i") ---print_code() ---tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"}) ---print_code() -cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}}) - -copy_to_shared("tx","kVals",1) ---copy_to_shared("tx","x",1) ---copy_to_shared("tx","y",1) ---copy_to_shared("tx","z",1) - ---copy_to_texture("kVals") ---datacopy(0, 3, "kVals", {"tt","t"},false,0,1,-16,true) ---print_code() ---datacopy_privatized(0,"tx","kVals",{"tx"}) ---copy_to_registers("tx","kVals") -copy_to_registers("ii","x") -copy_to_registers("ii","y") -copy_to_registers("ii","z") -copy_to_registers("ii","Qi") -copy_to_registers("ii","Qr") ---[[datacopy_privatized(0,"tx","x",{"tx"}) -datacopy_privatized(0,"tx","y",{"tx"}) -datacopy_privatized(0,"tx","z",{"tx"}) -datacopy_privatized(0,"tx","Qi",{"tx"}) -datacopy_privatized(0,"tx","Qr",{"tx"}) - - -]]-- ---unroll(0,5,64) -print_code() ---unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels diff --git a/examples/cuda-chill/mv-shadow.c b/examples/cuda-chill/mv-shadow.c deleted file mode 100644 index 582b187..0000000 --- a/examples/cuda-chill/mv-shadow.c +++ /dev/null @@ -1,9 +0,0 @@ -#define N 1024 - -void normalMV(float c[N][N], float a[N], float b[N]) { - int i, j; - - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - a[i] = a[i] + c[j][i] * b[j]; -} diff --git a/examples/cuda-chill/mv-shadow.lua b/examples/cuda-chill/mv-shadow.lua deleted file mode 100644 index 43e8491..0000000 --- a/examples/cuda-chill/mv-shadow.lua +++ /dev/null @@ -1,65 +0,0 @@ -init("mv-shadow.c","normalMV",0) -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, - --copy_to_shared methods - -N=129 -TI=32 -TJ=64 - -N=1024 -TI=16 - - - - - - - - - - - - - - - - ---Tile the i and j loop, introducing "ii" as the control loop for the "i" ---tile, "k" for the control loop fo the "j" tile, with the final order ---of {"ii", "k", "i", "j"} -tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) ---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) ---print_code() ---Normalize indx will do a tile size of one over the loop level specified ---by the input index. This is useful to get a zero lower bound and hard ---upper bound on a loop instead of it being relative to previous loop ---levels. ---normalize_index("ii") -normalize_index("i") -print_code() - ---Cudaize now determines the grid dimentions from the loops themselves ---(the upper bounds of the block and thread loops). It also renames the ---given block and thread loops's indexes to the approviate values from ---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the ---size of the arrays to be copied in the CUDA scaffolding. -cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}}) ---print_code() - ---Does a datacopy, tile, and add_sync to get a shared memory copy - ---copy_to_shared("tx", "b", 1) ---copy_to_shared("tx", "c", -16) ---print_code() ---copy_to_texture("b") ---copy_to_texture("c") -copy_to_registers("k", "a") ---print_code() - -unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels ---copy_to_texture("b") ---print_code() ---unroll(0,5,0) ---print_code() diff --git a/examples/cuda-chill/mv.c b/examples/cuda-chill/mv.c deleted file mode 100644 index 582b187..0000000 --- a/examples/cuda-chill/mv.c +++ /dev/null @@ -1,9 +0,0 @@ -#define N 1024 - -void normalMV(float c[N][N], float a[N], float b[N]) { - int i, j; - - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - a[i] = a[i] + c[j][i] * b[j]; -} diff --git a/examples/cuda-chill/mv.lua b/examples/cuda-chill/mv.lua deleted file mode 100644 index ca54501..0000000 --- a/examples/cuda-chill/mv.lua +++ /dev/null @@ -1,65 +0,0 @@ -init("mv.c","normalMV",0) -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, - --copy_to_shared methods - -N=129 -TI=32 -TJ=64 - -N=1024 - - - - - - - - - - - - - - - - ---Tile the i and j loop, introducing "ii" as the control loop for the "i" ---tile, "k" for the control loop fo the "j" tile, with the final order ---of {"ii", "k", "i", "j"} -tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) ---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) ---print_code() ---Normalize indx will do a tile size of one over the loop level specified ---by the input index. This is useful to get a zero lower bound and hard ---upper bound on a loop instead of it being relative to previous loop ---levels. ---normalize_index("ii") -normalize_index("i") -print_code() - ---Cudaize now determines the grid dimentions from the loops themselves ---(the upper bounds of the block and thread loops). It also renames the ---given block and thread loops's indexes to the approviate values from ---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the ---size of the arrays to be copied in the CUDA scaffolding. -cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}}) - ---print_code() - ---Does a datacopy, tile, and add_sync to get a shared memory copy - ---copy_to_shared("tx", "b", 1) ---copy_to_shared("tx", "c", -16) ---print_code() ---copy_to_texture("b") ---copy_to_texture("c") -copy_to_registers("k", "a") ---print_code() - -unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels ---copy_to_texture("b") ---print_code() ---unroll(0,5,0) ---print_code() diff --git a/examples/cuda-chill/mv_try.c b/examples/cuda-chill/mv_try.c deleted file mode 100644 index 7781f3b..0000000 --- a/examples/cuda-chill/mv_try.c +++ /dev/null @@ -1,9 +0,0 @@ -#define N 4096 - -void normalMV(int n, float c[N][N], float a[N], float b[N]) { - int i, j; - - for (i = 0; i < n; i++) - for (j = 0; j < n; j++) - a[i] = a[i] + c[i][j] * b[j]; -} diff --git a/examples/cuda-chill/mv_try.lua b/examples/cuda-chill/mv_try.lua deleted file mode 100644 index db4d9ad..0000000 --- a/examples/cuda-chill/mv_try.lua +++ /dev/null @@ -1,14 +0,0 @@ -init("mv_try.c","normalMV",0) -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, - --copy_to_shared methods - -TI=96 - -N=4096 - - -tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) -cudaize("mv_GPU", {a=N, b=N, c=N*N}, - {block={"ii"}, thread={"i"}}) - -print_code() diff --git a/examples/cuda-chill/nbody.c b/examples/cuda-chill/nbody.c deleted file mode 100644 index 57899b6..0000000 --- a/examples/cuda-chill/nbody.c +++ /dev/null @@ -1,66 +0,0 @@ -#define NBODIES 16384 -#define SOFTENINGSQUARED 0.01f -#define DELTATIME 0.001f -#define DAMPING 1.0f - -#define NBLOCKSY 1 -#define NBLOCKSX (NBODIES/NTHREADSX) -#define NTHREADSY 1 -#define NTHREADSX 64 - -#define BLOCKSIZE 128 - -#define SHARED 1 -#define TIMER 1 -#define VERIFY 1 - -extern float sqrtf(float); - -void nbody_cpu(float* oldpos,float* oldpos1, float *newpos, float *oldvel, float *newvel, float *force) -{ - float r0,r1,r2; - float invDist, invDistCube, mass, invMass; - unsigned int i,j; - for(i = 0; i < NBODIES; ++i) { - //force[i*4 ] = 0; - //force[i*4+1] = 0; - //force[i*4+2] = 0; - //force[i*4+3] = 0; - for(j = 0; j < NBODIES; ++j) { - r0 = oldpos[j*4]-oldpos1[i*4]; - r1 = oldpos[j*4+1]-oldpos1[i*4+1]; - r2 = oldpos[j*4+2]-oldpos1[i*4+2]; - - invDist = 1.0/sqrtf(r0 * r0 + r1 * r1 + r2 * r2 + SOFTENINGSQUARED); - invDistCube = invDist * invDist * invDist; - mass = oldpos1[i*4+3]; - - force[i*4] = force[i*4] + r0 * mass * invDistCube; - force[i*4+1] = force[i*4+1] + r1 * mass * invDistCube; - force[i*4+2] = force[i*4+2] + r2 * mass * invDistCube; - - } - } - -/* for (i = 0; i < NBODIES; ++i) { - invMass = oldvel[4*i+3]; - - oldvel[4*i] += (force[4*i] * invMass) * DELTATIME * DAMPING; - oldvel[4*i+1] += (force[4*i+1] * invMass) * DELTATIME * DAMPING; - oldvel[4*i+2] += (force[4*i+2] * invMass) * DELTATIME * DAMPING; - - oldpos[4*i] += oldvel[4*i] * DELTATIME; - oldpos[4*i+1] += oldvel[4*i+1] * DELTATIME; - oldpos[4*i+2] += oldvel[4*i+2] * DELTATIME; - - newpos[4*i+0] = oldpos[4*i]; - newpos[4*i+1] = oldpos[4*i+1]; - newpos[4*i+2] = oldpos[4*i+2]; - newpos[4*i+3] = oldpos[4*i+3]; - - newvel[4*i+0] = oldvel[4*i]; - newvel[4*i+1] = oldvel[4*i+1]; - newvel[4*i+2] = oldvel[4*i+2]; - newvel[4*i+3] = oldvel[4*i+3]; - }*/ -} diff --git a/examples/cuda-chill/nbody.lua b/examples/cuda-chill/nbody.lua deleted file mode 100644 index 08f88a9..0000000 --- a/examples/cuda-chill/nbody.lua +++ /dev/null @@ -1,53 +0,0 @@ ---CUBLAS 2 MM Multiply - ---This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you ---call init() and use global variables to specify procedure and loop - ---Second parameter is procedure # and third is loop # -init("nbody.c", "nbody_cpu" , 0) - -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, - --copy_to_shared methods -NBODIES=16384 - - ---Tj=128 CHANGE FOR BEST..... BEST IS 64BLOCKS 128THREADS ---Ti=256 -Tj=64 -Ti=32 -Tjjj=1 -Tiii=1 -Tn=0.1 ---normalize_index("j") --- ---print_code() ---normalize_index("n") --- TILE COMMANDS ZEROOOOOOOOOOO:3 ---tile_by_index({"i","j"},{Ti,Tj},{l1_control="ii",l2_control="jj"},{"ii","jj","i","j"})--CU=-1 -tile_by_index({"i"},{Ti},{l1_control="ii"},{"ii","i","j"})--CU=-1 ---normalize_index("i") ---tile_by_index({"n"},{Tn},{l1_control="nn"},{"jj","ii","nn","j","i","n"})--CU=-1 - ---tile_by_index({"j","i"},{Tjjj,Tiii},{l1_control="jjj",l2_control="iii"},{"jj","ii","nn","jjj","j","iii","i","n"})--CU=3 ---tile_by_index({"j"}, {Tn}, {l1_control="j",l1_tile="jjj"}, {"ii", "jj", "nn","jjj","j","i","n"}) ---tile_by_index({"i"}, {Ti/2}, {l1_control="iii"}, {"ii","iii", "jj","i","j"}) ---print_code() -cudaize("kernel_GPU",{oldpos=4*NBODIES,oldpos1=4*NBODIES,oldvel=4*NBODIES,force=4*NBODIES,newpos=4*NBODIES,newvel=4*NBODIES},{block={"ii"}, thread={"i"}})--CU=3 -print_code() ---tile(0,6,6) ---copy_to_shared("tx","oldpos",-16) ---copy_to_registers("j","oldpos") ---copy_to_registers("j","oldpos1") ---copy_to_registers("j","force") - ---copy_to_texture("oldpos") ---tile(1,3,3) ---tile(2,3,3) - -print_code() ---unroll_to_depth(1) --- ---tile(2,3,3) ---unroll(2,3,0) ---unroll(0,5,0) ---print_code() diff --git a/examples/cuda-chill/tmv-shadow.c b/examples/cuda-chill/tmv-shadow.c deleted file mode 100644 index cb9ea8d..0000000 --- a/examples/cuda-chill/tmv-shadow.c +++ /dev/null @@ -1,9 +0,0 @@ -#define N 1024 - -void normalMV(float c[N][N], float a[N], float b[N]) { - int i, j; - - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - a[i] = a[i] + c[i][j] * b[j]; -} diff --git a/examples/cuda-chill/tmv-shadow.lua b/examples/cuda-chill/tmv-shadow.lua deleted file mode 100644 index 196b939..0000000 --- a/examples/cuda-chill/tmv-shadow.lua +++ /dev/null @@ -1,50 +0,0 @@ -init("tmv-shadow.c","normalMV",0) -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, - --copy_to_shared methods - -N=1024 ---N= 8209 ---N=129 -TI=64 -N=1024 -TI=32 ---tile, "k" for the control loop for the "j" tile, with the final order ---of {"ii", "k", "i", "j"} -tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) ---print_code() ---tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) - ---print_code() ---Normalize indx will do a tile size of one over the loop level specified ---by the input index. This is useful to get a zero lower bound and hard ---upper bound on a loop instead of it being relative to previous loop ---levels. ---normalize_index("i") ---print_code() - ---Cudaize now determines the grid dimentions from the loops themselves ---(the upper bounds of the block and thread loops). It also renames the ---given block and thread loops's indexes to the approviate values from ---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the ---size of the arrays to be copied in the CUDA scaffolding. -cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}}) - ---print_code() - ---Does a datacopy, tile, and add_sync to get a shared memory copy -copy_to_shared("tx", "b", 1) ---copy_to_texture("b") ---print_code() - -copy_to_shared("tx", "c", -16) ---copy_to_texture("c") ---print_code() - -copy_to_registers("k", "a") -print_code() ---unroll(0,5,0) ---unroll(0,4,0) ---unroll(2,4,16) -unroll_to_depth(1) ---print_code() diff --git a/examples/cuda-chill/tmv.c b/examples/cuda-chill/tmv.c deleted file mode 100644 index cb9ea8d..0000000 --- a/examples/cuda-chill/tmv.c +++ /dev/null @@ -1,9 +0,0 @@ -#define N 1024 - -void normalMV(float c[N][N], float a[N], float b[N]) { - int i, j; - - for (i = 0; i < N; i++) - for (j = 0; j < N; j++) - a[i] = a[i] + c[i][j] * b[j]; -} diff --git a/examples/cuda-chill/tmv.lua b/examples/cuda-chill/tmv.lua deleted file mode 100644 index 5071108..0000000 --- a/examples/cuda-chill/tmv.lua +++ /dev/null @@ -1,50 +0,0 @@ -init("tmv.c","normalMV",0) -dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, - --copy_to_shared methods - -N=1024 ---N= 8209 ---N=129 -TI=64 -N=1024 -TI=32 ---tile, "k" for the control loop for the "j" tile, with the final order ---of {"ii", "k", "i", "j"} -tile_by_index({"i","j"}, {TI,TI}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) ---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) ---print_code() ---tile_by_index({"i"}, {TI/32}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) - ---print_code() ---Normalize indx will do a tile size of one over the loop level specified ---by the input index. This is useful to get a zero lower bound and hard ---upper bound on a loop instead of it being relative to previous loop ---levels. ---normalize_index("i") ---print_code() - ---Cudaize now determines the grid dimentions from the loops themselves ---(the upper bounds of the block and thread loops). It also renames the ---given block and thread loops's indexes to the approviate values from ---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the ---size of the arrays to be copied in the CUDA scaffolding. -cudaize("tmv_GPU", {a=N, b=N, c=N*N},{block={"ii"}, thread={"i"}}) - ---print_code() - ---Does a datacopy, tile, and add_sync to get a shared memory copy -copy_to_shared("tx", "b", 1) ---copy_to_texture("b") ---print_code() - -copy_to_shared("tx", "c", -16) ---copy_to_texture("c") ---print_code() - -copy_to_registers("k", "a") -print_code() ---unroll(0,5,0) ---unroll(0,4,0) ---unroll(2,4,16) -unroll_to_depth(1) ---print_code() diff --git a/examples/fortran/README b/examples/fortran/README deleted file mode 100644 index 4f23bee..0000000 --- a/examples/fortran/README +++ /dev/null @@ -1,10 +0,0 @@ -// Manu - -1) Fortran support added to permute, tile, unroll and datacopy. Tested these w.r.t gemm.c using gemm.script. - There might be other issues (like fusion due to unroll, ...) that have not been tested. - -2) To incorporate Fortran support I had to modify certain values in omega (include/omega/omega_core/oc.h). - To solve for large number of unknowns, these values have to be reverted back. - -3) Tested the existing chill scripts using Derick's python script. - At least the existing chill scripts are not affected by the fortran related changes. diff --git a/examples/fortran/ccd.f b/examples/fortran/ccd.f deleted file mode 100644 index 12d834d..0000000 --- a/examples/fortran/ccd.f +++ /dev/null @@ -1,32 +0,0 @@ -c -c These have been separated out from ccsd_t_singles_l.F and ccsd_t_doubles_l.F -c - subroutine clean_sd_t_s1_1(h3d,h2d,h1d,p6d,p5d,p4d, - 2 triplesx,t1sub,v2sub) - IMPLICIT NONE - integer h3d,h2d,h1d,p6d,p5d,p4d - integer h3,h2,h1,p6,p5,p4 - integer N - double precision triplesx(16,16,16,16,16,16) - double precision t1sub(16,16) - double precision v2sub(16,16,16,16) - - N = 16 - - do p4=1,10 - do p5=1,10 - do p6=1,10 - do h1=1,10 - do h2=1,10 - do h3=1,10 - triplesx(h3,h2,h1,p6,p5,p4)=triplesx(h3,h2,h1,p6,p5,p4) - 1 + t1sub(p4,h1)*v2sub(h3,h2,p6,p5) - enddo - enddo - enddo - enddo - enddo - enddo - return - end - diff --git a/examples/fortran/ccd.script b/examples/fortran/ccd.script deleted file mode 100644 index c2af500..0000000 --- a/examples/fortran/ccd.script +++ /dev/null @@ -1,18 +0,0 @@ -source: ccd.f -procedure: clean_sd_t_s1_1 -format : rose -loop: 0 - - - -original() - -UN=4 - -unroll(0,5,4) -unroll(0,4,4) -unroll(0,3,4) -unroll(0,2,4) -unroll(0,1,4) - -print diff --git a/examples/fortran/gemm.f90 b/examples/fortran/gemm.f90 deleted file mode 100644 index b65bb58..0000000 --- a/examples/fortran/gemm.f90 +++ /dev/null @@ -1,58 +0,0 @@ -program matmul - - integer N,i,j,k - real*8 a(10,10), b(10,10), c(10,10), ct(10,10),mysum - - do i=1,10,1 - do j=1,10,1 - a(i,j) = i+j - b(i,j) = i-j - c(i,j) = 0.0 - ct(i,j) = 0.0 - end do - b(i,i) = 1.0; - end do - - - DO j=1,10,1 - DO k=1,10,1 - DO i=1,10,1 - c(i,j) = c(i,j)+a(i,k)*b(k,j) - end do - end do - end do - - - - call gemm(10,a,b,ct) - - mysum = 0.0 - do i=1,10,1 - do j=1,10,1 - mysum = c(i,j) - ct(i,j) - end do - end do - - if (abs(mysum) >= 0.00001) then - write (*,*) "Something wrong" - else - write (*,*) "Output matches" - end if - -end program matmul - - SUBROUTINE gemm(N,A,B,C) - INTEGER N - REAL*8 A(N,N), B(N,N), C(N,N) - - INTEGER I,J,K - - DO J=1,N,1 - DO K=1,N,1 - DO I=1,N,1 - C(I,J) = C(I,J)+A(I,K)*B(K,J) - end do - end do - end do - - END subroutine diff --git a/examples/fortran/gemm.script b/examples/fortran/gemm.script deleted file mode 100644 index 01eb859..0000000 --- a/examples/fortran/gemm.script +++ /dev/null @@ -1,30 +0,0 @@ -#matrix multiply large array size for intel machine -source: gemm.f90 -procedure: gemm -format: rose -loop: 0 - -TI = 128 -#TI = 4 -TJ = 8 -#TK = 3 -TK = 512 -UI = 2 -UJ = 2 - -permute([3,1,2]) -tile(0,2,TJ) -#print space -tile(0,2,TI) -#print space -tile(0,5,TK) -#print space - - -datacopy(0,3,A,false,-1) -#print space - -datacopy(0,4,B) -unroll(0,4,UI) -unroll(0,5,UJ) - diff --git a/examples/fortran/rose_gemm.f90 b/examples/fortran/rose_gemm.f90 deleted file mode 100644 index d150922..0000000 --- a/examples/fortran/rose_gemm.f90 +++ /dev/null @@ -1,155 +0,0 @@ -PROGRAM matmul -INTEGER :: N, i, j, k -REAL(kind=8) :: a(10,10), b(10,10), c(10,10), ct(10,10), mysum -DO i = 1, 10, 1 -DO j = 1, 10, 1 -a(i,j) = i + j -b(i,j) = i - j -c(i,j) = 0.0 -ct(i,j) = 0.0 -END DO -b(i,i) = 1.0 -END DO -DO j = 1, 10, 1 -DO k = 1, 10, 1 -DO i = 1, 10, 1 -c(i,j) = c(i,j) + a(i,k) * b(k,j) -END DO -END DO -END DO -CALL gemm(10,a,b,ct) -mysum = 0.0 -DO i = 1, 10, 1 -DO j = 1, 10, 1 -mysum = c(i,j) - ct(i,j) -END DO -END DO -IF (abs(mysum) >= 0.00001) THEN -WRITE (*, FMT=*) "Something wrong" -ELSE -WRITE (*, FMT=*) "Output matches" -END IF -END PROGRAM matmul - -SUBROUTINE gemm(N,A,B,C) -INTEGER :: t12 -INTEGER :: t10 -INTEGER :: t8 -INTEGER :: t6 -INTEGER :: t4 -INTEGER :: t2 -INTEGER :: chill_t64 -INTEGER :: chill_t63 -INTEGER :: chill_t62 -INTEGER :: chill_t61 -INTEGER :: chill_t60 -INTEGER :: chill_t59 -INTEGER :: chill_t58 -INTEGER :: chill_t57 -INTEGER :: chill_t56 -INTEGER :: chill_t55 -INTEGER :: chill_t54 -INTEGER :: chill_t53 -INTEGER :: chill_t52 -INTEGER :: chill_t51 -INTEGER :: chill_t50 -INTEGER :: chill_t49 -INTEGER :: chill_t48 -INTEGER :: chill_t47 -INTEGER :: over2 -INTEGER :: chill_t46 -INTEGER :: chill_t45 -INTEGER :: chill_t44 -INTEGER :: chill_t43 -INTEGER :: chill_t42 -INTEGER :: chill_t41 -INTEGER :: chill_t40 -INTEGER :: chill_t39 -INTEGER :: chill_t38 -INTEGER :: chill_t37 -INTEGER :: chill_t36 -INTEGER :: chill_t35 -INTEGER :: chill_t34 -INTEGER :: chill_t33 -INTEGER :: chill_t32 -INTEGER :: chill_t31 -INTEGER :: chill_t30 -INTEGER :: chill_t29 -INTEGER :: chill_t28 -INTEGER :: chill_t27 -INTEGER :: chill_t26 -INTEGER :: chill_t25 -INTEGER :: chill_t24 -INTEGER :: chill_t23 -INTEGER :: over1 -INTEGER :: chill_t22 -INTEGER :: chill_t21 -INTEGER :: chill_t20 -INTEGER :: chill_t19 -INTEGER :: chill_t18 -INTEGER :: chill_t17 -INTEGER :: chill_t16 -INTEGER :: chill_t15 -REAL(kind=8), DIMENSION(8,512) :: f_P2 -INTEGER :: chill_t14 -INTEGER :: chill_t13 -INTEGER :: chill_t12 -INTEGER :: chill_t11 -INTEGER :: chill_t10 -INTEGER :: chill_t9 -INTEGER :: chill_t8 -INTEGER :: chill_t7 -REAL(kind=8), DIMENSION(512,128) :: f_P1 -INTEGER :: chill_t1 -INTEGER :: chill_t2 -INTEGER :: chill_t4 -INTEGER :: chill_t6 -INTEGER :: chill_t5 -INTEGER :: N -REAL(kind=8) :: A(N,N), B(N,N), C(N,N) -INTEGER :: I, J, K -over1 = 0 -over2 = 0 -DO t2 = 1, N, 512 -DO t4 = 1, N, 128 -DO t6 = t2, merge(N,t2 + 511,N <= t2 + 511), 1 -DO t8 = t4, merge(t4 + 127,N,t4 + 127 <= N), 1 -f_P1(t8 - t4 + 1,t6 - t2 + 1) = A(t8,t6) -END DO -END DO -DO t6 = 1, N, 8 -DO t8 = t6, merge(N,t6 + 7,N <= t6 + 7), 1 -DO t10 = t2, merge(N,t2 + 511,N <= t2 + 511), 1 -f_P2(t10 - t2 + 1,t8 - t6 + 1) = B(t10,t8) -END DO -END DO -over1 = MOD(N,2) -DO t8 = t4, merge(-over1 + N,t4 + 126,-over1 + N <= t4 + 126), 2 -over2 = MOD(N,2) -DO t10 = t6, merge(t6 + 6,N - over2,t6 + 6 <= N - over2), 2 -DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1 -C(t8,t10) = C(t8,t10) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1) -C(t8 + 1,t10) = C(t8 + 1,t10) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1) -C(t8,t10 + 1) = C(t8,t10 + 1) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1) -C(t8 + 1,t10 + 1) = C(t8 + 1,t10 + 1) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 + 1 - t6 + 1) -END DO -END DO -IF (N - 7 <= t6 .AND. 1 <= over2) THEN -DO t12 = t2, merge(N,t2 + 511,N <= t2 + 511), 1 -C(t8,N) = C(t8,N) + f_P1(t8 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1) -C(t8 + 1,N) = C(t8 + 1,N) + f_P1(t8 + 1 - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,N - t6 + 1) -END DO -END IF -END DO -IF (N - 127 <= t4 .AND. 1 <= over1) THEN -DO t10 = t6, merge(t6 + 7,N,t6 + 7 <= N), 1 -DO t12 = t2, merge(t2 + 511,N,t2 + 511 <= N), 1 -C(N,t10) = C(N,t10) + f_P1(N - t4 + 1,t12 - t2 + 1) * f_P2(t12 - t2 + 1,t10 - t6 + 1) -END DO -END DO -END IF -END DO -END DO -END DO -END SUBROUTINE - diff --git a/graph-test.cc b/graph-test.cc deleted file mode 100644 index 3cdcbee..0000000 --- a/graph-test.cc +++ /dev/null @@ -1,148 +0,0 @@ -#include "graph.hh" - -using std::cout; -using std::endl; -template -struct A { -}; - -template struct Graph; - -int main() { - Graph<> g; - - for (int i = 0; i < 8; i++) - g.insert(); - - std::vector t; - t.push_back(Empty()); - t.push_back(Empty()); - - g.connect(0,1); - g.connect(1,4); - g.connect(4,0); - g.connect(4,5); - g.connect(1,5); - g.connect(1,2); - g.connect(2,3); - g.connect(3,2); - g.connect(2,6); - g.connect(5,6); - g.connect(6,5); - g.connect(6,7); - g.connect(3,7); - g.connect(7,7,t); - - g.insert(); - g.insert(); - g.connect(9,8); - g.connect(8,0); - - cout << "Graph #1:" << endl; - cout << g; - - std::vector > r = g.topoSort(); - - cout << "topological order: "; - int num_scc = 0; - for (int i = 0; i < r.size(); i++) { - if (i != 0) - cout << ' '; - if (r[i].size() > 1) { - cout << '('; - num_scc++; - } - for (std::set::iterator j = r[i].begin(); j != r[i].end(); j++) { - if (j != r[i].begin()) - cout << ' '; - cout << (*j+1); - } - if (r[i].size() > 1) - cout << ')'; - } - cout << endl; - cout << "total number of SCC: " << num_scc << endl; - - Graph<> g2; - - for (int i = 0; i < 6; i++) - g2.insert(); - - g2.connect(0,1); - g2.connect(0,2); - g2.connect(3,4); - g2.connect(3,5); - g2.connect(3,2); - g2.connect(5,0); - - cout << endl << "Graph #2:" << endl; - cout << g2; - - std::vector > r2 = g2.packed_topoSort(); - - cout << "packed topological order: "; - for (int i = 0; i < r2.size(); i++) { - if (i != 0) - cout << ' '; - if (r2[i].size() > 1) - cout << '('; - for (std::set::iterator j = r2[i].begin(); j != r2[i].end(); j++) { - if (j != r2[i].begin()) - cout << ' '; - cout << (*j+1); - } - if (r2[i].size() > 1) - cout << ')'; - } - cout << endl; - - Graph<> g3; - - for (int i = 0; i < 6; i++) - g3.insert(); - - g3.connect(5,2); - g3.connect(5,3); - g3.connect(5,4); - g3.connect(3,1); - g3.connect(1,0); - - cout << endl << "Graph #3:" << endl; - cout << g3; - - std::vector > r3 = g3.topoSort(); - - cout << "topological order: "; - for (int i = 0; i < r3.size(); i++) { - if (i != 0) - cout << ' '; - if (r3[i].size() > 1) - cout << '('; - for (std::set::iterator j = r3[i].begin(); j != r3[i].end(); j++) { - if (j != r3[i].begin()) - cout << ' '; - cout << (*j+1); - } - if (r3[i].size() > 1) - cout << ')'; - } - cout << endl; - - r3 = g3.packed_topoSort(); - - cout << "packed topological order: "; - for (int i = 0; i < r3.size(); i++) { - if (i != 0) - cout << ' '; - if (r3[i].size() > 1) - cout << '('; - for (std::set::iterator j = r3[i].begin(); j != r3[i].end(); j++) { - if (j != r3[i].begin()) - cout << ' '; - cout << (*j+1); - } - if (r3[i].size() > 1) - cout << ')'; - } - cout << endl; -} diff --git a/graph.hh b/graph.hh index 5d0ff66..f8471df 100644 --- a/graph.hh +++ b/graph.hh @@ -76,7 +76,8 @@ template std::ostream& operator<<(std::ostream &os, const Graph &g) { for (int i = 0; i < g.vertex.size(); i++) for (typename Graph::EdgeList::const_iterator j = g.vertex[i].second.begin(); j != g.vertex[i].second.end(); j++) { - os << "s" << i << "->" << "s" << j->first << ":"; + // os << i+1 << "->" << j->first+1 << ":"; + os << "s" << i << "->" << "s" << j->first << ":"; for (typename std::vector::const_iterator k = j->second.begin(); k != j->second.end(); k++) os << " " << *k; os << std::endl; diff --git a/include/ir_suif.hh b/include/ir_suif.hh deleted file mode 120000 index 37f4ae8..0000000 --- a/include/ir_suif.hh +++ /dev/null @@ -1 +0,0 @@ -../ir_suif.hh \ No newline at end of file diff --git a/include/ir_suif_utils.hh b/include/ir_suif_utils.hh deleted file mode 120000 index 327320d..0000000 --- a/include/ir_suif_utils.hh +++ /dev/null @@ -1 +0,0 @@ -../ir_suif_utils.hh \ No newline at end of file diff --git a/ir_cuda_rose_utils.cc b/ir_cuda_rose_utils.cc deleted file mode 100644 index e7b4c37..0000000 --- a/ir_cuda_rose_utils.cc +++ /dev/null @@ -1,191 +0,0 @@ -/***************************************************************************** - Copyright (C) 2008 University of Southern California - Copyright (C) 2009 University of Utah - All Rights Reserved. - - Purpose: - SUIF interface utilities. - - Notes: - - Update history: - 01/2006 created by Chun Chen -*****************************************************************************/ - -//#include -#include "ir_rose_utils.hh" - - -/** - * Returns the body of the for loop found by finding the first loop in - * code, and if level > 1 recursively calling on the body of the found - * loop and (level-1) - */ -SgNode* loop_body_at_level(SgNode* tnl, int level) { - SgNode *inner_nl = 0; - //Now strip out the tnl on the inner level of the for loop - //tree_node_list_iter tnli(tnl); - - if (isSgBasicBlock(tnl)) { - - SgStatementPtrList& tnli = isSgBasicBlock(tnl)->get_statements(); - - for (SgStatementPtrList::iterator it = tnli.begin(); it != tnli.end(); - it++) { - if (isSgForStatement(*it)) { - inner_nl = loop_body_at_level(isSgForStatement(*it), level); - break; - } - - } - - } - - return inner_nl; -} - -SgNode* loop_body_at_level(SgForStatement* loop, int level) { - if (level > 1) - return loop_body_at_level(loop->get_loop_body(), level - 1); - return loop->get_loop_body(); -} - -void swap_node_for_node_list(SgNode* tn, SgNode* new_tnl) { - SgStatement *s = isSgStatement(tn); - - SgStatement* p; - if (s != 0) { - p = isSgStatement(tn->get_parent()); - - if (p != 0) { - - if (isSgBasicBlock(new_tnl)) { - - /*SgStatementPtrList & list_ = - isSgBasicBlock(new_tnl)->get_statements(); - - if (isSgForStatement(p)) { - if (!isSgBasicBlock(isSgForStatement(p)->get_loop_body())) - p->replace_statement(s, isSgStatement(new_tnl)); - else { - p->insert_statement(s, list_, true); - p->remove(s); - } - } else { - p->insert_statement(s, list_, true); - p->remove(s); - } - */ - if (isSgForStatement(p)) { - if (!isSgBasicBlock(isSgForStatement(p)->get_loop_body())) - p->replace_statement(s, isSgStatement(new_tnl)); - else { - - SgStatementPtrList& list_ = - isSgBasicBlock(new_tnl)->get_statements(); - - //std::vector list; - - SgStatementPtrList::iterator it = list_.begin(); - SgStatement* begin = *it; - begin->set_parent(p); - - p->replace_statement(s, begin); - it++; - //SgStatement* stmt = first; - SgStatement* temp = begin; - for (; it != list_.end(); it++) { - (*it)->set_parent(p); - p->insert_statement(temp, *it, false); - temp = *it; - } - - } - - } else { - - - SgStatementPtrList& list_ = - isSgBasicBlock(new_tnl)->get_statements(); - - //std::vector list; - - SgStatementPtrList::iterator it = list_.begin(); - SgStatement* begin = *it; - begin->set_parent(p); - - p->replace_statement(s, begin); - it++; - //SgStatement* stmt = first; - SgStatement* temp = begin; - for (; it != list_.end(); it++) { - (*it)->set_parent(p); - p->insert_statement(temp, *it, false); - temp = *it; - } - - } - - /* SgStatement* temp = s; - - SgStatementPtrList::iterator it = list_.begin(); - p->insert_statement(temp, *it, true); - temp = *it; - p->remove_statement(s); - it++; - for (; it != list_.end(); it++) { - p->insert_statement(temp, *it, false); - temp = *it; - } - - // new_tnl->set_parent(p); - //new_tnl->get_statements(); - SgStatementPtrList& list = - isSgBasicBlock(new_tnl)->get_statements(); - - //std::vector list; - - SgStatementPtrList::iterator it = list.begin(); - SgStatement* begin = *it; - begin->set_parent(p); - - p->replace_statement(s, begin); - it++; - //SgStatement* stmt = first; - SgStatement* temp = begin; - for (; it != list.end(); it++) { - (*it)->set_parent(p); - p->insert_statement(temp, *it, false); - temp = *it; - } - */ - /* SgStatementPtrList& stmt_list = isSgBasicBlock(new_tnl)->get_statements(); - SgStatement* target = s; - - for(SgStatementPtrList::iterator it = stmt_list.begin() ; it != stmt_list.end(); it++) - { - isSgNode(*it)->set_parent(p); - p->insert_statement(isSgStateme, *it, false); - target = *it; - } - - p->remove_statement(s); - - */ - }else if(isSgIfStmt(p)) { - - if(isSgIfStmt(p)->get_true_body() == s) - isSgIfStmt(p)->set_true_body(isSgStatement(new_tnl)); - else if(isSgIfStmt(p)->get_false_body() == s) - isSgIfStmt(p)->set_false_body(isSgStatement(new_tnl)); - new_tnl->set_parent(p); - } - else { - p->replace_statement(s, isSgStatement(new_tnl)); - new_tnl->set_parent(p); - } - } - - } - // return isSgNode(p); -} diff --git a/ir_cuda_suif_utils.cc b/ir_cuda_suif_utils.cc deleted file mode 100644 index f15c190..0000000 --- a/ir_cuda_suif_utils.cc +++ /dev/null @@ -1,54 +0,0 @@ -/***************************************************************************** - Copyright (C) 2008 University of Southern California - Copyright (C) 2009 University of Utah - All Rights Reserved. - - Purpose: - SUIF interface utilities. - - Notes: - - Update history: - 01/2006 created by Chun Chen -*****************************************************************************/ - -#include -#include "ir_suif_utils.hh" - - -/** - * Returns the body of the for loop found by finding the first loop in - * code, and if level > 1 recursively calling on the body of the found - * loop and (level-1) - */ -tree_node_list* loop_body_at_level(tree_node_list* tnl, int level) -{ - tree_node_list *inner_nl = 0; - //Now strip out the tnl on the inner level of the for loop - tree_node_list_iter tnli(tnl); - while (!tnli.is_empty()) { - tree_node *node = tnli.step(); - if(node->kind() == TREE_FOR) - { - //Found the first tree_for, call sibling function - inner_nl = loop_body_at_level((tree_for*)node, level); - break; - } - } - return inner_nl; -} - -tree_node_list* loop_body_at_level(tree_for* loop, int level) -{ - if(level > 1) - return loop_body_at_level(loop->body(), level-1); - return loop->body(); -} - -tree_node_list* swap_node_for_node_list(tree_node* tn, tree_node_list* new_tnl) -{ - tree_node_list* tnl = tn->parent(); - tnl->insert_after(new_tnl, tn->list_e()); - delete tnl->remove(tn->list_e()); - return tnl; -} diff --git a/ir_cudarose.cc b/ir_cudarose.cc deleted file mode 100644 index 6b31bdd..0000000 --- a/ir_cudarose.cc +++ /dev/null @@ -1,165 +0,0 @@ -/***************************************************************************** - Copyright (C) 2009 University of Utah - All Rights Reserved. - - Purpose: - CHiLL's SUIF interface. - - Notes: - Array supports mixed pointer and array type in a single declaration. - - History: - 2/2/2011 Created by Protonu Basu. -*****************************************************************************/ - -#include -#include "ir_cudarose.hh" -#include "loop.hh" -#include "loop_cuda_rose.hh" -//#include "ir_suif_utils.hh" - -using namespace SageBuilder; -using namespace SageInterface; - -IR_cudaroseCode::IR_cudaroseCode(const char *filename, const char* proc_name) : - IR_roseCode(filename, proc_name) { - - //std::string file_suffix = StringUtility::fileNameSuffix(filename); - - //if (CommandlineProcessing::isCFileNameSuffix(file_suffix)) - //{ - std::string orig_name = StringUtility::stripPathFromFileName(filename); - std::string naked_name = StringUtility::stripFileSuffixFromFileName( - orig_name); - file->set_unparse_output_filename("rose_" + naked_name + ".cu"); - - //} - - gsym_ = root; - first_scope = firstScope; - parameter = symtab2_; - body = symtab3_; - defn = func->get_definition()->get_body(); - func_defn = func->get_definition(); -} - - - -IR_ArraySymbol *IR_cudaroseCode::CreateArraySymbol(const IR_Symbol *sym, - std::vector &size, int sharedAnnotation) { - SgType *tn; - SgVariableSymbol* vs; - if (typeid(*sym) == typeid(IR_roseScalarSymbol)) { - tn = static_cast(sym)->vs_->get_type(); - } else if (typeid(*sym) == typeid(IR_roseArraySymbol)) { - tn = static_cast(sym)->vs_->get_type(); - while (isSgArrayType(tn) || isSgPointerType(tn)) { - if (isSgArrayType(tn)) - tn = isSgArrayType(tn)->get_base_type(); - else if (isSgPointerType(tn)) - tn = isSgPointerType(tn)->get_base_type(); - else - throw ir_error( - "in CreateScalarSymbol: symbol not an array nor a pointer!"); - } - } else - throw std::bad_typeid(); - - for (int i = size.size() - 1; i >= 0; i--) - tn = buildArrayType(tn, - static_cast(size[i])->GetExpression()); - - static int rose_array_counter = 1; - std::string s = std::string("_P") + omega::to_string(rose_array_counter++); - SgVariableDeclaration* defn2 = buildVariableDeclaration( - const_cast(s.c_str()), tn); - SgInitializedNamePtrList& variables2 = defn2->get_variables(); - - SgInitializedNamePtrList::const_iterator i2 = variables2.begin(); - SgInitializedName* initializedName2 = *i2; - vs = new SgVariableSymbol(initializedName2); - - prependStatement(defn2, - isSgScopeStatement(func->get_definition()->get_body())); - - vs->set_parent(symtab_); - symtab_->insert(SgName(s.c_str()), vs); - - SgStatementPtrList* tnl5 = new SgStatementPtrList; - - (*tnl5).push_back(isSgStatement(defn2)); - - omega::CG_roseRepr* stmt = new omega::CG_roseRepr(tnl5); - - init_code_ = ocg_->StmtListAppend(init_code_, - static_cast(stmt)); - - if (sharedAnnotation == 1) - isSgNode(defn2)->setAttribute("__shared__", - new AstTextAttribute("__shared__")); - - return new IR_roseArraySymbol(this, vs); -} - -bool IR_cudaroseCode::commit_loop(Loop *loop, int loop_num) { - if (loop == NULL) - return true; - - LoopCuda *cu_loop = (LoopCuda *) loop; - SgNode *tnl = cu_loop->codegen(); - if (!tnl) - return false; - - SgStatementPtrList* new_list = NULL; - if (isSgBasicBlock(tnl)) { - new_list = new SgStatementPtrList; - for (SgStatementPtrList::iterator it = - isSgBasicBlock(tnl)->get_statements().begin(); - it != isSgBasicBlock(tnl)->get_statements().end(); it++) - (*new_list).push_back(*it); - } - - //Only thing that should be left will be the inserting of the tnl* into the loop - omega::CG_outputRepr *repr; - if (new_list == NULL) - repr = new omega::CG_roseRepr(tnl); - else - repr = new omega::CG_roseRepr(new_list); - if (cu_loop->init_code != NULL) - repr = ocg_->StmtListAppend(cu_loop->init_code->clone(), repr); - - std::vector loops = find_loops( - func->get_definition()->get_body()); - tnl = isSgNode(loops[loop_num])->get_parent(); - - if (cu_loop->setup_code != NULL) { - SgStatementPtrList* setup_tnl = - static_cast(cu_loop->setup_code)->GetList(); - - SgStatement* target = isSgStatement(loops[loop_num]); - - for (SgStatementPtrList::iterator it = (*setup_tnl).begin(); - it != (*setup_tnl).end(); it++) { - - isSgStatement(tnl)->insert_statement(target, *it, false); - isSgNode(*it)->set_parent(tnl); - target = *it; - } - - //SgStatementPtrList - // for SgStatementPtrList::it - //TODO: I think this is a hack we can undo if we have loop->codegen() - //loo->getCode(), maybe also get rid of setup and teardown... - //fix_unfinished_comment(setup_tnl, indexes_string); - //isSgStatement(tnl)->replace_statement(isSgStatement(loops[loop_num]), *setup_tnl); - isSgStatement(tnl)->remove_statement(isSgStatement(loops[loop_num])); - } - - delete repr; - - return true; -} - -IR_cudaroseCode::~IR_cudaroseCode() { -} - diff --git a/ir_cudarose.hh b/ir_cudarose.hh deleted file mode 100644 index 34e0404..0000000 --- a/ir_cudarose.hh +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef IR_CUDA_ROSE -#define IR_CUDA_ROSE - -#include -#include -#include "ir_rose.hh" -#include "loop.hh" -#include "loop_cuda_rose.hh" -#include "ir_rose_utils.hh" - - - -class IR_cudaroseCode : public IR_roseCode{ - -public: - - - IR_cudaroseCode(const char *filename, const char* proc_name); - - - - SgGlobal *gsym_; - SgScopeStatement* defn; - SgGlobal* first_scope; - SgSymbolTable* parameter; - SgSymbolTable* body; - SgFunctionDefinition* func_defn; - std::vector write_procs;//procs to write - - - IR_ArraySymbol *CreateArraySymbol(const IR_Symbol *sym, std::vector &size,int sharedAnnotation = 1); - omega::CG_outputRepr* init_code(){ return init_code_; } - bool commit_loop(Loop *loop, int loop_num); - std::vector get_loops() - { - std::vector loops = find_loops(func->get_definition()->get_body()); - return loops; - } - - ~IR_cudaroseCode(); - -}; - - -#endif - diff --git a/ir_cudasuif.cc b/ir_cudasuif.cc deleted file mode 100644 index c646e13..0000000 --- a/ir_cudasuif.cc +++ /dev/null @@ -1,144 +0,0 @@ -/***************************************************************************** - Copyright (C) 2009 University of Utah - All Rights Reserved. - - Purpose: - CHiLL's SUIF interface. - - Notes: - Array supports mixed pointer and array type in a single declaration. - - History: - 2/2/2011 Created by Protonu Basu. -*****************************************************************************/ - -#include -#include "ir_cudasuif.hh" -#include "loop.hh" -#include "loop_cuda.hh" -#include "ir_suif_utils.hh" - - -IR_cudasuifCode::IR_cudasuifCode(const char *filename, int proc_num) - :IR_suifCode(filename, proc_num) -{ - //setting up gsym_ here - fileset->reset_iter(); - gsym_ = fileset->globals(); - -} - - - -IR_ArraySymbol *IR_cudasuifCode::CreateArraySymbol(const IR_Symbol *sym, - std::vector &size, - int sharedAnnotation) -{ - type_node *tn; - - if (typeid(*sym) == typeid(IR_suifScalarSymbol)) { - tn = static_cast(sym)->vs_->type(); - } - else if (typeid(*sym) == typeid(IR_suifArraySymbol)) { - tn = static_cast(sym)->vs_->type(); - if (tn->is_modifier()) - tn = static_cast(tn)->base(); - while (tn->is_array() || tn->is_ptr()) { - if (tn->is_array()) - tn = static_cast(tn)->elem_type(); - else if (tn->is_ptr()) - tn = static_cast(tn)->ref_type(); - } - } - else - throw std::bad_typeid(); - - if (is_fortran_) - for (int i = 0; i < size.size(); i++) { - var_sym *temporary = symtab_->new_unique_var(type_s32); - init_code_ = ocg_->StmtListAppend(init_code_, ocg_->StmtListAppend(ocg_->CreateAssignment(0, new omega::CG_suifRepr(operand(temporary)), size[i]),NULL)); - - tn = new array_type(tn, array_bound(1), array_bound(temporary)); - symtab_->add_type(tn); - } - else - for (int i = size.size()-1; i >= 0; i--) { - var_sym *temporary = symtab_->new_unique_var(type_s32); - //init_code_ = ocg_->StmtListAppend(init_code_, ocg_->CreateStmtList(ocg_->CreateAssignment(0, new omega::CG_suifRepr(operand(temporary)), size[i]))); - init_code_ = ocg_->StmtListAppend(init_code_, ocg_->StmtListAppend(ocg_->CreateAssignment(0, new omega::CG_suifRepr(operand(temporary)), size[i]), NULL)); - - tn = new array_type(tn, array_bound(1), array_bound(temporary)); - symtab_->add_type(tn); - if(i == 0 && sharedAnnotation == 1){ - tn = static_cast(ocg_)->ModifyType(tn, "__shared__"); - symtab_->add_type(tn); - } - } - - static int suif_array_counter = 1; - std::string s = std::string("_P") + omega::to_string(suif_array_counter++); - var_sym *vs = new var_sym(tn, const_cast(s.c_str())); - vs->add_to_table(symtab_); - - return new IR_suifArraySymbol(this, vs); -} - - -bool IR_cudasuifCode::commit_loop(Loop *loop, int loop_num) { - if (loop == NULL) - return true; - - //Call code-gen part of any scripting routines that were run. - // internally call GetCode - // Add stuff before and after (setup, teardown - // return a tnl - LoopCuda *cu_loop = (LoopCuda *)loop; - tree_node_list *tnl = cu_loop->codegen(); - if(!tnl) - return false; - - //set up our new procs - for(int i=0; inew_procs.size(); i++) - { - printf("setting proc fse\n"); - cu_loop->new_procs[i]->set_fse(fse_); - write_procs.push_back(cu_loop->new_procs[i]); - } - - //Only thing that should be left will be the inserting of the tnl* into the loop - - omega::CG_outputRepr *repr = new omega::CG_suifRepr(tnl); - if (cu_loop->init_code != NULL) - repr = ocg_->StmtListAppend(cu_loop->init_code->clone(), repr); - - std::vector loops = find_loops(psym_->block()->body()); - tnl = loops[loop_num]->parent(); - - if (cu_loop->setup_code != NULL) { - tree_node_list *setup_tnl = static_cast(cu_loop->setup_code->clone())->GetCode(); - //TODO: I think this is a hack we can undo if we have loop->codegen() - //loo->getCode(), maybe also get rid of setup and teardown... - //fix_unfinished_comment(setup_tnl, indexes_string); - tnl->insert_before(setup_tnl, loops[loop_num]->list_e()); - } - tnl->insert_before(static_cast(repr)->GetCode(), loops[loop_num]->list_e()); - if (cu_loop->teardown_code != NULL) { - tree_node_list *setup_tnl = static_cast(cu_loop->teardown_code->clone())->GetCode(); - tnl->insert_before(setup_tnl, loops[loop_num]->list_e()); - } - - tnl->remove(loops[loop_num]->list_e()); - - delete repr; - return true; -} - -IR_cudasuifCode::~IR_cudasuifCode() -{ - for(int i=0; iis_written()) - write_procs[i]->write_proc(fse_); - write_procs[i]->flush_proc(); - } -} diff --git a/ir_cudasuif.hh b/ir_cudasuif.hh deleted file mode 100644 index 834778e..0000000 --- a/ir_cudasuif.hh +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef IR_CUDA_SUIF -#define IR_CUDA_SUIF - -#include -#include -#include "ir_suif.hh" -#include "loop.hh" -#include "loop_cuda.hh" -#include "ir_suif_utils.hh" - - - -class IR_cudasuifCode : public IR_suifCode{ - -public: - global_symtab *gsym_; - std::vector write_procs;//procs to write - - - IR_cudasuifCode(const char *filename, int proc_num); - IR_ArraySymbol *CreateArraySymbol(const IR_Symbol *sym, - std::vector &size, - int sharedAnnotation = 1); - omega::CG_outputRepr* init_code(){ return init_code_; } - bool commit_loop(Loop *loop, int loop_num); - std::vector get_loops() - { - std::vector loops = find_loops(psym_->block()->body()); - return loops; - } - ~IR_cudasuifCode(); - -}; - - -#endif diff --git a/loop.cc b/loop.cc index ce83006..0a82f7a 100644 --- a/loop.cc +++ b/loop.cc @@ -53,6 +53,7 @@ bool Loop::isInitialized() const { bool Loop::init_loop(std::vector &ir_tree, std::vector &ir_stmt) { + ir_stmt = extract_ir_stmts(ir_tree); stmt_nesting_level_.resize(ir_stmt.size()); std::vector stmt_nesting_level(ir_stmt.size()); diff --git a/loop_backup.cc b/loop_backup.cc deleted file mode 100644 index b361ed4..0000000 --- a/loop_backup.cc +++ /dev/null @@ -1,3311 +0,0 @@ -/***************************************************************************** - Copyright (C) 2008 University of Southern California - Copyright (C) 2009-2010 University of Utah - All Rights Reserved. - - Purpose: - Core loop transformation functionality. - - Notes: - "level" (starting from 1) means loop level and it corresponds to "dim" - (starting from 0) in transformed iteration space [c_1,l_1,c_2,l_2,...., - c_n,l_n,c_(n+1)], e.g., l_2 is loop level 2 in generated code, dim 3 - in transformed iteration space, and variable 4 in Omega relation. - All c's are constant numbers only and they will not show up as actual loops. - Formula: - dim = 2*level - 1 - var = dim + 1 - - History: - 10/2005 Created by Chun Chen. - 09/2009 Expand tile functionality, -chun - 10/2009 Initialize unfusible loop nest without bailing out, -chun -*****************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include "loop.hh" -#include "omegatools.hh" -#include "irtools.hh" -#include "chill_error.hh" - -using namespace omega; - -const std::string Loop::tmp_loop_var_name_prefix = std::string("_t"); -const std::string Loop::overflow_var_name_prefix = std::string("over"); - -//----------------------------------------------------------------------------- -// Class Loop -//----------------------------------------------------------------------------- - -bool Loop::init_loop(std::vector &ir_tree, std::vector &ir_stmt) { - ir_stmt = extract_ir_stmts(ir_tree); - std::vector stmt_nesting_level(ir_stmt.size()); - for (int i = 0; i < ir_stmt.size(); i++) { - ir_stmt[i]->payload = i; - int t = 0; - ir_tree_node *itn = ir_stmt[i]; - while (itn->parent != NULL) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP) - t++; - } - stmt_nesting_level[i] = t; - } - - stmt = std::vector(ir_stmt.size()); - int n_dim = -1; - int max_loc; - std::vector index; - for (int i = 0; i < ir_stmt.size(); i++) { - int max_nesting_level = -1; - int loc; - for (int j = 0; j < ir_stmt.size(); j++) - if (stmt_nesting_level[j] > max_nesting_level) { - max_nesting_level = stmt_nesting_level[j]; - loc = j; - } - - // most deeply nested statement acting as a reference point - if (n_dim == -1) { - n_dim = max_nesting_level; - max_loc = loc; - - index = std::vector(n_dim); - - ir_tree_node *itn = ir_stmt[loc]; - int cur_dim = n_dim-1; - while (itn->parent != NULL) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP) { - index[cur_dim] = static_cast(itn->content)->index()->name(); - itn->payload = cur_dim--; - } - } - } - - // align loops by names, temporary solution - ir_tree_node *itn = ir_stmt[loc]; - while (itn->parent != NULL) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == -1) { - std::string name = static_cast(itn->content)->index()->name(); - for (int j = 0; j < n_dim; j++) - if (index[j] == name) { - itn->payload = j; - break; - } - if (itn->payload == -1) - throw loop_error("no complex alignment yet"); - } - } - - // set relation variable names - Relation r(n_dim); - F_And *f_root = r.add_and(); - itn = ir_stmt[loc]; - while (itn->parent != NULL) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP) - r.name_set_var(itn->payload+1, static_cast(itn->content)->index()->name()); - } - - // extract information from loop/if structures - std::vector processed(n_dim, false); - Tuple vars_to_be_reversed; - itn = ir_stmt[loc]; - while (itn->parent != NULL) { - itn = itn->parent; - - switch (itn->content->type()) { - case IR_CONTROL_LOOP: { - IR_Loop *lp = static_cast(itn->content); - Variable_ID v = r.set_var(itn->payload+1); - int c; - - try { - c = lp->step_size(); - if (c > 0) { - CG_outputRepr *lb = lp->lower_bound(); - exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_GE, true); - CG_outputRepr *ub = lp->upper_bound(); - IR_CONDITION_TYPE cond = lp->stop_cond(); - if (cond == IR_COND_LT || cond == IR_COND_LE) - exp2formula(ir, r, f_root, freevar, ub, v, 's', cond, true); - else - throw ir_error("loop condition not supported"); - - } - else if (c < 0) { - CG_outputBuilder *ocg = ir->builder(); - CG_outputRepr *lb = lp->lower_bound(); - lb = ocg->CreateMinus(NULL, lb); - exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_GE, true); - CG_outputRepr *ub = lp->upper_bound(); - ub = ocg->CreateMinus(NULL, ub); - IR_CONDITION_TYPE cond = lp->stop_cond(); - if (cond == IR_COND_GE) - exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_LE, true); - else if (cond == IR_COND_GT) - exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_LT, true); - else - throw ir_error("loop condition not supported"); - - vars_to_be_reversed.append(lp->index()->name()); - } - else - throw ir_error("loop step size zero"); - } - catch (const ir_error &e) { - for (int i = 0; i < itn->children.size(); i++) - delete itn->children[i]; - itn->children = std::vector(); - itn->content = itn->content->convert(); - return false; - } - - if (abs(c) != 1) { - F_Exists *f_exists = f_root->add_exists(); - Variable_ID e = f_exists->declare(); - F_And *f_and = f_exists->add_and(); - Stride_Handle h = f_and->add_stride(abs(c)); - if (c > 0) - h.update_coef(e, 1); - else - h.update_coef(e, -1); - h.update_coef(v, -1); - CG_outputRepr *lb = lp->lower_bound(); - exp2formula(ir, r, f_and, freevar, lb, e, 's', IR_COND_EQ, true); - } - - processed[itn->payload] = true; - break; - } - case IR_CONTROL_IF: { - CG_outputRepr *cond = static_cast(itn->content)->condition(); - try { - if (itn->payload % 2 == 1) - exp2constraint(ir, r, f_root, freevar, cond, true); - else { - F_Not *f_not = f_root->add_not(); - F_And *f_and = f_not->add_and(); - exp2constraint(ir, r, f_and, freevar, cond, true); - } - } - catch (const ir_error &e) { - std::vector *t; - if (itn->parent == NULL) - t = &ir_tree; - else - t = &(itn->parent->children); - int id = itn->payload; - int i = t->size() - 1; - while (i >= 0) { - if ((*t)[i] == itn) { - for (int j = 0; j < itn->children.size(); j++) - delete itn->children[j]; - itn->children = std::vector(); - itn->content = itn->content->convert(); - } - else if ((*t)[i]->payload >> 1 == id >> 1) { - delete (*t)[i]; - t->erase(t->begin()+i); - } - i--; - } - return false; - } - - break; - } - default: - for (int i = 0; i < itn->children.size(); i++) - delete itn->children[i]; - itn->children = std::vector(); - itn->content = itn->content->convert(); - return false; - } - } - - // add information for missing loops - for (int j = 0; j < n_dim; j++) - if (!processed[j]) { - ir_tree_node *itn = ir_stmt[max_loc]; - while (itn->parent != NULL) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == j) - break; - } - - Variable_ID v = r.set_var(j+1); - if (loc < max_loc) { - CG_outputRepr *lb = static_cast(itn->content)->lower_bound(); - exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_EQ, true); - } - else { // loc > max_loc - CG_outputRepr *ub = static_cast(itn->content)->upper_bound(); - exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_EQ, true); - } - } - - r.setup_names(); - r.simplify(); - - // insert the statement - CG_outputBuilder *ocg = ir->builder(); - Tuple reverse_expr; - for (int j = 1; j <= vars_to_be_reversed.size(); j++) { - CG_outputRepr *repl = ocg->CreateIdent(vars_to_be_reversed[j]); - repl = ocg->CreateMinus(NULL, repl); - reverse_expr.append(repl); - } - CG_outputRepr *code = static_cast(ir_stmt[loc]->content)->extract(); - code = ocg->CreatePlaceHolder(0, code, reverse_expr, vars_to_be_reversed); - stmt[loc].code = code; - stmt[loc].IS = r; - stmt[loc].loop_level = std::vector(n_dim); - for (int i = 0; i < n_dim; i++) { - stmt[loc].loop_level[i].type = LoopLevelOriginal; - stmt[loc].loop_level[i].payload = i; - stmt[loc].loop_level[i].parallel_level = 0; - } - - stmt_nesting_level[loc] = -1; - } - - return true; -} - - - -Loop::Loop(const IR_Control *control) { - ir = const_cast(control->ir_); - init_code = NULL; - cleanup_code = NULL; - tmp_loop_var_name_counter = 1; - overflow_var_name_counter = 1; - known = Relation::True(0); - - std::vector ir_tree = build_ir_tree(control->clone(), NULL); - std::vector ir_stmt; - - while (!init_loop(ir_tree, ir_stmt)) {} - - // init the dependence graph - for (int i = 0; i < stmt.size(); i++) - dep.insert(); - - for (int i = 0; i < stmt.size(); i++) - for (int j = i; j < stmt.size(); j++) { - std::pair, std::vector > dv = test_data_dependences(ir, stmt[i].code, stmt[i].IS, stmt[j].code, stmt[j].IS, freevar); - - for (int k = 0; k < dv.first.size(); k++) - if (is_dependence_valid(ir_stmt[i], ir_stmt[j], dv.first[k], true)) - dep.connect(i, j, dv.first[k]); - else - dep.connect(j, i, dv.first[k].reverse()); - - for (int k = 0; k < dv.second.size(); k++) - if (is_dependence_valid(ir_stmt[j], ir_stmt[i], dv.second[k], false)) - dep.connect(j, i, dv.second[k]); - else - dep.connect(i, j, dv.second[k].reverse()); - } - - // cleanup the IR tree - for (int i = 0; i < ir_tree.size(); i++) - delete ir_tree[i]; - - // init dumb transformation relations e.g. [i, j] -> [ 0, i, 0, j, 0] - for (int i = 0; i < stmt.size(); i++) { - int n = stmt[i].IS.n_set(); - stmt[i].xform = Relation(n, 2*n+1); - F_And *f_root = stmt[i].xform.add_and(); - - for (int j = 1; j <= n; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(stmt[i].xform.output_var(2*j), 1); - h.update_coef(stmt[i].xform.input_var(j), -1); - } - - for (int j = 1; j <= 2*n+1; j+=2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(stmt[i].xform.output_var(j), 1); - } - stmt[i].xform.simplify(); - } - - if (stmt.size() != 0) - num_dep_dim = stmt[0].IS.n_set(); - else - num_dep_dim = 0; -} - - -Loop::~Loop() { - for (int i = 0; i < stmt.size(); i++) - if (stmt[i].code != NULL) { - stmt[i].code->clear(); - delete stmt[i].code; - } - if (init_code != NULL) { - init_code->clear(); - delete init_code; - } - if (cleanup_code != NULL) { - cleanup_code->clear(); - delete cleanup_code; - } -} - - -int Loop::get_dep_dim_of(int stmt_num, int level) const { - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invaid statement " + to_string(stmt_num)); - - if (level < 1 || level > stmt[stmt_num].loop_level.size()) - return -1; - - int trip_count = 0; - while (true) { - switch (stmt[stmt_num].loop_level[level-1].type) { - case LoopLevelOriginal: - return stmt[stmt_num].loop_level[level-1].payload; - case LoopLevelTile: - level = stmt[stmt_num].loop_level[level-1].payload; - if (level < 1) - return -1; - if (level > stmt[stmt_num].loop_level.size()) - throw loop_error("incorrect loop level information for statement " + to_string(stmt_num)); - break; - default: - throw loop_error("unknown loop level information for statement " + to_string(stmt_num)); - } - trip_count++; - if (trip_count >= stmt[stmt_num].loop_level.size()) - throw loop_error("incorrect loop level information for statement " + to_string(stmt_num)); - } -} - - -int Loop::get_last_dep_dim_before(int stmt_num, int level) const { - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invaid statement " + to_string(stmt_num)); - - if (level < 1) - return -1; - if (level > stmt[stmt_num].loop_level.size()) - level = stmt[stmt_num].loop_level.size() + 1; - - for (int i = level-1; i >= 1; i--) - if (stmt[stmt_num].loop_level[i-1].type == LoopLevelOriginal) - return stmt[stmt_num].loop_level[i-1].payload; - - return -1; -} - - -void Loop::print_internal_loop_structure() const { - for (int i = 0; i < stmt.size(); i++) { - std::vector lex = getLexicalOrder(i); - std::cout << "s" << i+1 << ": "; - for (int j = 0; j < stmt[i].loop_level.size(); j++) { - if (2*j < lex.size()) - std::cout << lex[2*j]; - switch (stmt[i].loop_level[j].type) { - case LoopLevelOriginal: - std::cout << "(dim:" << stmt[i].loop_level[j].payload << ")"; - break; - case LoopLevelTile: - std::cout << "(tile:" << stmt[i].loop_level[j].payload << ")"; - break; - default: - std::cout << "(unknown)"; - } - std::cout << ' '; - } - for (int j = 2*stmt[i].loop_level.size(); j < lex.size(); j+=2) { - std::cout << lex[j]; - if (j != lex.size()-1) - std::cout << ' '; - } - std::cout << std::endl; - } -} - - -CG_outputRepr *Loop::getCode(int effort) const { - const int m = stmt.size(); - if (m == 0) - return NULL; - const int n = stmt[0].xform.n_out(); - - Tuple ni(m); - Tuple IS(m); - Tuple xform(m); - for (int i = 0; i < m; i++) { - ni[i+1] = stmt[i].code; - IS[i+1] = stmt[i].IS; - xform[i+1] = stmt[i].xform; - } - - Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); - CG_outputBuilder *ocg = ir->builder(); - CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, effort); - - if (init_code != NULL) - repr = ocg->StmtListAppend(init_code->clone(), repr); - if (cleanup_code != NULL) - repr = ocg->StmtListAppend(repr, cleanup_code->clone()); - - return repr; -} - - -void Loop::printCode(int effort) const { - const int m = stmt.size(); - if (m == 0) - return; - const int n = stmt[0].xform.n_out(); - - Tuple IS(m); - Tuple xform(m); - for (int i = 0; i < m; i++) { - IS[i+1] = stmt[i].IS; - xform[i+1] = stmt[i].xform; - } - - Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); - std::cout << MMGenerateCode(xform, IS, known, effort); -} - - -Relation Loop::getNewIS(int stmt_num) const { - Relation result; - - if (stmt[stmt_num].xform.is_null()) { - Relation known = Extend_Set(copy(this->known), stmt[stmt_num].IS.n_set() - this->known.n_set()); - result = Intersection(copy(stmt[stmt_num].IS), known); - } - else { - Relation known = Extend_Set(copy(this->known), stmt[stmt_num].xform.n_out() - this->known.n_set()); - result = Intersection(Range(Restrict_Domain(copy(stmt[stmt_num].xform), copy(stmt[stmt_num].IS))), known); - } - - result.simplify(2, 4); - - return result; -} - -std::vector Loop::getNewIS() const { - const int m = stmt.size(); - - std::vector new_IS(m); - for (int i = 0; i < m; i++) - new_IS[i] = getNewIS(i); - - return new_IS; -} - - -void Loop::permute(const std::vector &pi) { - std::set active; - for (int i = 0; i < stmt.size(); i++) - active.insert(i); - - permute(active, pi); -} - - -void Loop::original() { - std::set active; - for (int i = 0; i < stmt.size(); i++) - active.insert(i); - setLexicalOrder(0, active); -} - - -void Loop::permute(const std::set &active, const std::vector &pi) { - if (active.size() == 0 || pi.size() == 0) - return; - - // check for sanity of parameters - int level = pi[0]; - for (int i = 1; i < pi.size(); i++) - if (pi[i] < level) - level = pi[i]; - if (level < 1) - throw std::invalid_argument("invalid permuation"); - std::vector reverse_pi(pi.size(), 0); - for (int i = 0; i < pi.size(); i++) - if (pi[i] >= level+pi.size()) - throw std::invalid_argument("invalid permutation"); - else - reverse_pi[pi[i]-level] = i+level; - for (int i = 0; i < reverse_pi.size(); i++) - if (reverse_pi[i] == 0) - throw std::invalid_argument("invalid permuation"); - int ref_stmt_num; - std::vector lex; - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - if (*i < 0 || *i >= stmt.size()) - throw std::invalid_argument("invalid statement " + to_string(*i)); - if (i == active.begin()) { - ref_stmt_num = *i; - lex = getLexicalOrder(*i); - } - else { - if (level+pi.size()-1 > stmt[*i].loop_level.size()) - throw std::invalid_argument("invalid permuation"); - std::vector lex2 = getLexicalOrder(*i); - for (int j = 0; j < 2*level-3; j+=2) - if (lex[j] != lex2[j]) - throw std::invalid_argument("statements to permute must be in the same subloop"); - for (int j = 0; j < pi.size(); j++) - if (!(stmt[*i].loop_level[level+j-1].type == stmt[ref_stmt_num].loop_level[level+j-1].type && - stmt[*i].loop_level[level+j-1].payload == stmt[ref_stmt_num].loop_level[level+j-1].payload)) - throw std::invalid_argument("permuted loops must have the same loop level types"); - } - } - - // Update transformation relations - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - int n = stmt[*i].xform.n_out(); - Relation mapping(n, n); - F_And *f_root = mapping.add_and(); - for (int j = 1; j <= n; j+= 2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(j), 1); - h.update_coef(mapping.input_var(j), -1); - } - for (int j = 0; j < pi.size(); j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(2*(level+j)), 1); - h.update_coef(mapping.input_var(2*pi[j]), -1); - } - for (int j = 1; j < level; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(2*j), 1); - h.update_coef(mapping.input_var(2*j), -1); - } - for (int j = level+pi.size(); j <= stmt[*i].loop_level.size(); j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(2*j), 1); - h.update_coef(mapping.input_var(2*j), -1); - } - - stmt[*i].xform = Composition(mapping, stmt[*i].xform); - stmt[*i].xform.simplify(); - } - - // get the permuation for dependence vectors - std::vector t; - for (int i = 0; i < pi.size(); i++) - if (stmt[ref_stmt_num].loop_level[pi[i]-1].type == LoopLevelOriginal) - t.push_back(stmt[ref_stmt_num].loop_level[pi[i]-1].payload); - int max_dep_dim = -1; - int min_dep_dim = num_dep_dim; - for (int i = 0; i < t.size(); i++) { - if (t[i] > max_dep_dim) - max_dep_dim = t[i]; - if (t[i] < min_dep_dim) - min_dep_dim = t[i]; - } - if (min_dep_dim > max_dep_dim) - return; - if (max_dep_dim - min_dep_dim + 1 != t.size()) - throw loop_error("cannot update the dependence graph after permuation"); - std::vector dep_pi(num_dep_dim); - for (int i = 0; i < min_dep_dim; i++) - dep_pi[i] = i; - for (int i = min_dep_dim; i <= max_dep_dim; i++) - dep_pi[i] = t[i-min_dep_dim]; - for (int i = max_dep_dim+1; i < num_dep_dim; i++) - dep_pi[i] = i; - - // update the dependence graph - DependenceGraph g; - for (int i = 0; i < dep.vertex.size(); i++) - g.insert(); - for (int i = 0; i < dep.vertex.size(); i++) - for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) { - if ((active.find(i) != active.end() && active.find(j->first) != active.end())) { - std::vector dv = j->second; - for (int k = 0; k < dv.size(); k++) { - switch (dv[k].type) { - case DEP_W2R: - case DEP_R2W: - case DEP_W2W: - case DEP_R2R: { - std::vector lbounds(num_dep_dim); - std::vector ubounds(num_dep_dim); - for (int d = 0; d < num_dep_dim; d++) { - lbounds[d] = dv[k].lbounds[dep_pi[d]]; - ubounds[d] = dv[k].ubounds[dep_pi[d]]; - } - dv[k].lbounds = lbounds; - dv[k].ubounds = ubounds; - break; - } - case DEP_CONTROL: { - break; - } - default: - throw loop_error("unknown dependence type"); - } - } - g.connect(i, j->first, dv); - } - else if (active.find(i) == active.end() && active.find(j->first) == active.end()) { - std::vector dv = j->second; - g.connect(i, j->first, dv); - } - else { - std::vector dv = j->second; - for (int k = 0; k < dv.size(); k++) - switch (dv[k].type) { - case DEP_W2R: - case DEP_R2W: - case DEP_W2W: - case DEP_R2R: { - for (int d = 0; d < num_dep_dim; d++) - if (dep_pi[d] != d) { - dv[k].lbounds[d] = -posInfinity; - dv[k].ubounds[d] = posInfinity; - } - break; - } - case DEP_CONTROL: - break; - default: - throw loop_error("unknown dependence type"); - } - g.connect(i, j->first, dv); - } - } - dep = g; - - // update loop level information - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - int cur_dep_dim = min_dep_dim; - std::vector new_loop_level(stmt[*i].loop_level.size()); - for (int j = 1; j <= stmt[*i].loop_level.size(); j++) - if (j >= level && j < level+pi.size()) { - switch (stmt[*i].loop_level[reverse_pi[j-level]-1].type) { - case LoopLevelOriginal: - new_loop_level[j-1].type = LoopLevelOriginal; - new_loop_level[j-1].payload = cur_dep_dim++; - new_loop_level[j-1].parallel_level = stmt[*i].loop_level[reverse_pi[j-level]-1].parallel_level; - break; - case LoopLevelTile: { - new_loop_level[j-1].type = LoopLevelTile; - int ref_level = stmt[*i].loop_level[reverse_pi[j-level]-1].payload; - if (ref_level >= level && ref_level < level+pi.size()) - new_loop_level[j-1].payload = reverse_pi[ref_level-level]; - else - new_loop_level[j-1].payload = ref_level; - new_loop_level[j-1].parallel_level = stmt[*i].loop_level[reverse_pi[j-level]-1].parallel_level; - break; - } - default: - throw loop_error("unknown loop level information for statement " + to_string(*i)); - } - } - else { - switch (stmt[*i].loop_level[j-1].type) { - case LoopLevelOriginal: - new_loop_level[j-1].type = LoopLevelOriginal; - new_loop_level[j-1].payload = stmt[*i].loop_level[j-1].payload; - new_loop_level[j-1].parallel_level = stmt[*i].loop_level[j-1].parallel_level; - break; - case LoopLevelTile: { - new_loop_level[j-1].type = LoopLevelTile; - int ref_level = stmt[*i].loop_level[j-1].payload; - if (ref_level >= level && ref_level < level+pi.size()) - new_loop_level[j-1].payload = reverse_pi[ref_level-level]; - else - new_loop_level[j-1].payload = ref_level; - new_loop_level[j-1].parallel_level = stmt[*i].loop_level[j-1].parallel_level; - break; - } - default: - throw loop_error("unknown loop level information for statement " + to_string(*i)); - } - } - stmt[*i].loop_level = new_loop_level; - } - - setLexicalOrder(2*level-2, active); -} - -std::set Loop::split(int stmt_num, int level, const Relation &cond) { - // check for sanity of parameters - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - if (level <= 0 || level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level)); - - std::set result; - int dim = 2*level-1; - std::vector lex = getLexicalOrder(stmt_num); - std::set same_loop = getStatements(lex, dim-1); - - Relation cond2 = copy(cond); - cond2.simplify(); - cond2 = EQs_to_GEQs(cond2); - Conjunct *c = cond2.single_conjunct(); - int cur_lex = lex[dim-1]; - for (GEQ_Iterator gi(c->GEQs()); gi; gi++) { - int max_level = (*gi).max_tuple_pos(); - Relation single_cond(max_level); - single_cond.and_with_GEQ(*gi); - - // TODO: should decide where to place newly created statements with - // complementary split condition from dependence graph. - bool place_after; - if (max_level == 0) - place_after = true; - else if ((*gi).get_coef(cond2.set_var(max_level)) < 0) - place_after = true; - else - place_after = false; - - // make adjacent lexical number available for new statements - if (place_after) { - lex[dim-1] = cur_lex+1; - shiftLexicalOrder(lex, dim-1, 1); - } - else { - lex[dim-1] = cur_lex-1; - shiftLexicalOrder(lex, dim-1, -1); - } - - // original statements with split condition, - // new statements with complement of split condition - int old_num_stmt = stmt.size(); - std::map what_stmt_num; - apply_xform(same_loop); - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - int n = stmt[*i].IS.n_set(); - Relation part1, part2; - if (max_level > n) { - part1 = copy(stmt[*i].IS); - part2 = Relation::False(0); - } - else { - part1 = Intersection(copy(stmt[*i].IS), Extend_Set(copy(single_cond), n-max_level)); - part2 = Intersection(copy(stmt[*i].IS), Extend_Set(Complement(copy(single_cond)), n-max_level)); - } - - stmt[*i].IS = part1; - - if (Intersection(copy(part2), Extend_Set(copy(this->known), n-this->known.n_set())).is_upper_bound_satisfiable()) { - Statement new_stmt; - new_stmt.code = stmt[*i].code->clone(); - new_stmt.IS = part2; - new_stmt.xform = copy(stmt[*i].xform); - if (place_after) - assign_const(new_stmt.xform, dim-1, cur_lex+1); - else - assign_const(new_stmt.xform, dim-1, cur_lex-1); - new_stmt.loop_level = stmt[*i].loop_level; - stmt.push_back(new_stmt); - dep.insert(); - what_stmt_num[*i] = stmt.size() - 1; - if (*i == stmt_num) - result.insert(stmt.size() - 1); - } - } - - // update dependence graph - int dep_dim = get_dep_dim_of(stmt_num, level); - for (int i = 0; i < old_num_stmt; i++) { - std::vector > > D; - - for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) { - if (same_loop.find(i) != same_loop.end()) { - if (same_loop.find(j->first) != same_loop.end()) { - if (what_stmt_num.find(i) != what_stmt_num.end() && what_stmt_num.find(j->first) != what_stmt_num.end()) - dep.connect(what_stmt_num[i], what_stmt_num[j->first], j->second); - if (place_after && what_stmt_num.find(j->first) != what_stmt_num.end()) { - std::vector dvs; - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.is_data_dependence() && dep_dim != -1) { - dv.lbounds[dep_dim] = -posInfinity; - dv.ubounds[dep_dim] = posInfinity; - } - dvs.push_back(dv); - } - if (dvs.size() > 0) - D.push_back(std::make_pair(what_stmt_num[j->first], dvs)); - } - else if (!place_after && what_stmt_num.find(i) != what_stmt_num.end()) { - std::vector dvs; - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.is_data_dependence() && dep_dim != -1) { - dv.lbounds[dep_dim] = -posInfinity; - dv.ubounds[dep_dim] = posInfinity; - } - dvs.push_back(dv); - } - if (dvs.size() > 0) - dep.connect(what_stmt_num[i], j->first, dvs); - - } - } - else { - if (what_stmt_num.find(i) != what_stmt_num.end()) - dep.connect(what_stmt_num[i], j->first, j->second); - } - } - else if (same_loop.find(j->first) != same_loop.end()) { - if (what_stmt_num.find(j->first) != what_stmt_num.end()) - D.push_back(std::make_pair(what_stmt_num[j->first], j->second)); - } - } - - for (int j = 0; j < D.size(); j++) - dep.connect(i, D[j].first, D[j].second); - } - } - - return result; -} - - - -void Loop::tile(int stmt_num, int level, int tile_size, int outer_level, TilingMethodType method, int alignment_offset, int alignment_multiple) { - // check for sanity of parameters - if (tile_size < 0) - throw std::invalid_argument("invalid tile size"); - if (alignment_multiple < 1 || alignment_offset < 0) - throw std::invalid_argument("invalid alignment for tile"); - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - if (level <= 0) - throw std::invalid_argument("invalid loop level " + to_string(level)); - if (level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("there is no loop level " + to_string(level) + " for statement " + to_string(stmt_num)); - if (outer_level <= 0 || outer_level > level) - throw std::invalid_argument("invalid tile controlling loop level " + to_string(outer_level)); - - int dim = 2*level-1; - int outer_dim = 2*outer_level-1; - std::vector lex = getLexicalOrder(stmt_num); - std::set same_tiled_loop = getStatements(lex, dim-1); - std::set same_tile_controlling_loop = getStatements(lex, outer_dim-1); - - // special case for no tiling - if (tile_size == 0) { - for (std::set::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) { - Relation r(stmt[*i].xform.n_out(),stmt[*i].xform.n_out()+2); - F_And *f_root = r.add_and(); - for (int j = 1; j <= 2*outer_level-1; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.input_var(j), 1); - h.update_coef(r.output_var(j), -1); - } - EQ_Handle h1 = f_root->add_EQ(); - h1.update_coef(r.output_var(2*outer_level), 1); - EQ_Handle h2 = f_root->add_EQ(); - h2.update_coef(r.output_var(2*outer_level+1), 1); - for (int j = 2*outer_level; j <= stmt[*i].xform.n_out(); j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.input_var(j), 1); - h.update_coef(r.output_var(j+2), -1); - } - - stmt[*i].xform = Composition(copy(r), stmt[*i].xform); - } - } - // normal tiling - else { - std::set private_stmt; - for (std::set::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) { -// if (same_tiled_loop.find(*i) == same_tiled_loop.end() && !is_single_iteration(getNewIS(*i), dim)) -// same_tiled_loop.insert(*i); - - // should test dim's value directly but it is ok for now -// if (same_tiled_loop.find(*i) == same_tiled_loop.end() && get_const(stmt[*i].xform, dim+1, Output_Var) == posInfinity) - if (same_tiled_loop.find(*i) == same_tiled_loop.end() && overflow.find(*i) != overflow.end()) - private_stmt.insert(*i); - } - - - // extract the union of the iteration space to be considered - Relation hull; - { - Tuple r_list; - Tuple r_mask; - - for (std::set::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) - if (private_stmt.find(*i) == private_stmt.end()) { - Relation r = project_onto_levels(getNewIS(*i), dim+1, true); - for (int j = outer_dim; j < dim; j++) - r = Project(r, j+1, Set_Var); - for (int j = 0; j < outer_dim; j += 2) - r = Project(r, j+1, Set_Var); - r_list.append(r); - r_mask.append(1); - } - - hull = Hull(r_list, r_mask, 1, true); - } - - // extract the bound of the dimension to be tiled - Relation bound = get_loop_bound(hull, dim); - if (!bound.has_single_conjunct()) { - // further simplify the bound - hull = Approximate(hull); - bound = get_loop_bound(hull, dim); - - int i = outer_dim - 2; - while (!bound.has_single_conjunct() && i >= 0) { - hull = Project(hull, i+1, Set_Var); - bound = get_loop_bound(hull, dim); - i -= 2; - } - - if (!bound.has_single_conjunct()) - throw loop_error("cannot handle tile bounds"); - } - - // separate lower and upper bounds - std::vector lb_list, ub_list; - { - Conjunct *c = bound.query_DNF()->single_conjunct(); - for (GEQ_Iterator gi(c->GEQs()); gi; gi++) { - int coef = (*gi).get_coef(bound.set_var(dim+1)); - if (coef < 0) - ub_list.push_back(*gi); - else if (coef > 0) - lb_list.push_back(*gi); - } - } - if (lb_list.size() == 0) - throw loop_error("unable to calculate tile controlling loop lower bound"); - if (ub_list.size() == 0) - throw loop_error("unable to calculate tile controlling loop upper bound"); - - // find the simplest lower bound for StridedTile or simplest iteration count for CountedTile - int simplest_lb = 0, simplest_ub = 0; - if (method == StridedTile) { - int best_cost = INT_MAX; - for (int i = 0; i < lb_list.size(); i++) { - int cost = 0; - for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - cost += 5; - break; - } - case Global_Var: { - cost += 2; - break; - } - default: - cost += 15; - break; - } - } - - if (cost < best_cost) { - best_cost = cost; - simplest_lb = i; - } - } - } - else if (method == CountedTile) { - std::map s1, s2, s3; - int best_cost = INT_MAX; - for (int i = 0; i < lb_list.size(); i++) - for (int j = 0; j < ub_list.size(); j++) { - int cost = 0; - - for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - s1[(*ci).var] += (*ci).coef; - break; - } - case Global_Var: { - s2[(*ci).var] += (*ci).coef; - break; - } - case Exists_Var: - case Wildcard_Var: { - s3[(*ci).var] += (*ci).coef; - break; - } - default: - cost = INT_MAX-2; - break; - } - } - - for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - s1[(*ci).var] += (*ci).coef; - break; - } - case Global_Var: { - s2[(*ci).var] += (*ci).coef; - break; - } - case Exists_Var: - case Wildcard_Var: { - s3[(*ci).var] += (*ci).coef; - break; - } - default: - if (cost == INT_MAX-2) - cost = INT_MAX-1; - else - cost = INT_MAX-3; - break; - } - } - - if (cost == 0) { - for (std::map::iterator k = s1.begin(); k != s1.end(); k++) - if ((*k).second != 0) - cost += 5; - for (std::map::iterator k = s2.begin(); k != s2.end(); k++) - if ((*k).second != 0) - cost += 2; - for (std::map::iterator k = s3.begin(); k != s3.end(); k++) - if ((*k).second != 0) - cost += 15; - } - - if (cost < best_cost) { - best_cost = cost; - simplest_lb = i; - simplest_ub = j; - } - } - } - - // prepare the new transformation relations - for (std::set::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) { - Relation r(stmt[*i].xform.n_out(), stmt[*i].xform.n_out()+2); - F_And *f_root = r.add_and(); - for (int j = 0; j < outer_dim-1; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.output_var(j+1), 1); - h.update_coef(r.input_var(j+1), -1); - } - - for (int j = outer_dim-1; j < stmt[*i].xform.n_out(); j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.output_var(j+3), 1); - h.update_coef(r.input_var(j+1), -1); - } - - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.output_var(outer_dim), 1); - h.update_const(-lex[outer_dim-1]); - - stmt[*i].xform = Composition(r, stmt[*i].xform); - } - - // add tiling constraints. - for (std::set::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) { - F_And *f_super_root = stmt[*i].xform.and_with_and(); - F_Exists *f_exists = f_super_root->add_exists(); - F_And *f_root = f_exists->add_and(); - - // create a lower bound variable for easy formula creation later - Variable_ID aligned_lb; - { - Variable_ID lb = f_exists->declare(); - coef_t coef = lb_list[simplest_lb].get_coef(bound.set_var(dim+1)); - if (coef == 1) { // e.g. if i >= m+5, then LB = m+5 - EQ_Handle h = f_root->add_EQ(); - h.update_coef(lb, 1); - for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - int pos = (*ci).var->get_position(); - if (pos != dim + 1) - h.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef); - break; - } - case Global_Var: { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = stmt[*i].xform.get_local(g); - else - v = stmt[*i].xform.get_local(g, (*ci).var->function_of()); - h.update_coef(v, (*ci).coef); - break; - } - default: - throw loop_error("cannot handle tile bounds"); - } - } - h.update_const(lb_list[simplest_lb].get_const()); - } - else { // e.g. if 2i >= m+5, then m+5 <= 2*LB < m+5+2 - GEQ_Handle h1 = f_root->add_GEQ(); - GEQ_Handle h2 = f_root->add_GEQ(); - for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - int pos = (*ci).var->get_position(); - if (pos == dim + 1) { - h1.update_coef(lb, (*ci).coef); - h2.update_coef(lb, -(*ci).coef); - } - else { - h1.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef); - h2.update_coef(stmt[*i].xform.output_var(pos), -(*ci).coef); - } - break; - } - case Global_Var: { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = stmt[*i].xform.get_local(g); - else - v = stmt[*i].xform.get_local(g, (*ci).var->function_of()); - h1.update_coef(v, (*ci).coef); - h2.update_coef(v, -(*ci).coef); - break; - } - default: - throw loop_error("cannot handle tile bounds"); - } - } - h1.update_const(lb_list[simplest_lb].get_const()); - h2.update_const(-lb_list[simplest_lb].get_const()); - h2.update_const(coef-1); - } - - Variable_ID offset_lb; - if (alignment_offset == 0) - offset_lb = lb; - else { - EQ_Handle h = f_root->add_EQ(); - offset_lb = f_exists->declare(); - h.update_coef(offset_lb, 1); - h.update_coef(lb, -1); - h.update_const(alignment_offset); - } - - if (alignment_multiple == 1) { // trivial - aligned_lb = offset_lb; - } - else { // e.g. to align at 4, aligned_lb = 4*alpha && LB-4 < 4*alpha <= LB - aligned_lb = f_exists->declare(); - Variable_ID e = f_exists->declare(); - - EQ_Handle h = f_root->add_EQ(); - h.update_coef(aligned_lb, 1); - h.update_coef(e, -alignment_multiple); - - GEQ_Handle h1 = f_root->add_GEQ(); - GEQ_Handle h2 = f_root->add_GEQ(); - h1.update_coef(e, alignment_multiple); - h2.update_coef(e, -alignment_multiple); - h1.update_coef(offset_lb, -1); - h2.update_coef(offset_lb, 1); - h1.update_const(alignment_multiple-1); - } - } - - // create an upper bound variable for easy formula creation later - Variable_ID ub = f_exists->declare(); - { - coef_t coef = -ub_list[simplest_ub].get_coef(bound.set_var(dim+1)); - if (coef == 1) { // e.g. if i <= m+5, then UB = m+5 - EQ_Handle h = f_root->add_EQ(); - h.update_coef(ub, -1); - for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - int pos = (*ci).var->get_position(); - if (pos != dim + 1) - h.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef); - break; - } - case Global_Var: { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = stmt[*i].xform.get_local(g); - else - v = stmt[*i].xform.get_local(g, (*ci).var->function_of()); - h.update_coef(v, (*ci).coef); - break; - } - default: - throw loop_error("cannot handle tile bounds"); - } - } - h.update_const(ub_list[simplest_ub].get_const()); - } - else { // e.g. if 2i <= m+5, then m+5-2 < 2*UB <= m+5 - GEQ_Handle h1 = f_root->add_GEQ(); - GEQ_Handle h2 = f_root->add_GEQ(); - for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - int pos = (*ci).var->get_position(); - if (pos == dim + 1) { - h1.update_coef(ub, -(*ci).coef); - h2.update_coef(ub, (*ci).coef); - } - else { - h1.update_coef(stmt[*i].xform.output_var(pos), -(*ci).coef); - h2.update_coef(stmt[*i].xform.output_var(pos), (*ci).coef); - } - break; - } - case Global_Var: { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = stmt[*i].xform.get_local(g); - else - v = stmt[*i].xform.get_local(g, (*ci).var->function_of()); - h1.update_coef(v, -(*ci).coef); - h2.update_coef(v, (*ci).coef); - break; - } - default: - throw loop_error("cannot handle tile bounds"); - } - } - h1.update_const(-ub_list[simplest_ub].get_const()); - h2.update_const(ub_list[simplest_ub].get_const()); - h1.update_const(coef-1); - } - } - - // insert tile controlling loop constraints - if (method == StridedTile) { // e.g. ii = LB + 32 * alpha && alpha >= 0 - Variable_ID e = f_exists->declare(); - GEQ_Handle h1 = f_root->add_GEQ(); - h1.update_coef(e, 1); - - EQ_Handle h2 = f_root->add_EQ(); - h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), 1); - h2.update_coef(e, -tile_size); - h2.update_coef(aligned_lb, -1); - } - else if (method == CountedTile) { // e.g. 0 <= ii < ceiling((UB-LB+1)/32) - GEQ_Handle h1 = f_root->add_GEQ(); - h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), 1); - - GEQ_Handle h2 = f_root->add_GEQ(); - h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), -tile_size); - h2.update_coef(aligned_lb, -1); - h2.update_coef(ub, 1); - } - - // special care for private statements like overflow assignment - if (private_stmt.find(*i) != private_stmt.end()) { // e.g. ii <= UB - GEQ_Handle h = f_root->add_GEQ(); - h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); - h.update_coef(ub, 1); - } - // if (private_stmt.find(*i) != private_stmt.end()) { - // if (stmt[*i].xform.n_out() > dim+3) { // e.g. ii <= UB && i = ii - // GEQ_Handle h = f_root->add_GEQ(); - // h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); - // h.update_coef(ub, 1); - - // stmt[*i].xform = Project(stmt[*i].xform, dim+3, Output_Var); - // f_root = stmt[*i].xform.and_with_and(); - // EQ_Handle h1 = f_root->add_EQ(); - // h1.update_coef(stmt[*i].xform.output_var(dim+3), 1); - // h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); - // } - // else if (method == StridedTile) { // e.g. ii <= UB since i does not exist - // GEQ_Handle h = f_root->add_GEQ(); - // h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); - // h.update_coef(ub, 1); - // } - // } - - // restrict original loop index inside the tile - else { - if (method == StridedTile) { // e.g. ii <= i < ii + tile_size - GEQ_Handle h1 = f_root->add_GEQ(); - h1.update_coef(stmt[*i].xform.output_var(dim+3), 1); - h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); - - GEQ_Handle h2 = f_root->add_GEQ(); - h2.update_coef(stmt[*i].xform.output_var(dim+3), -1); - h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), 1); - h2.update_const(tile_size-1); - } - else if (method == CountedTile) { // e.g. LB+32*ii <= i < LB+32*ii+tile_size - GEQ_Handle h1 = f_root->add_GEQ(); - h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -tile_size); - h1.update_coef(stmt[*i].xform.output_var(dim+3), 1); - h1.update_coef(aligned_lb, -1); - - GEQ_Handle h2 = f_root->add_GEQ(); - h2.update_coef(stmt[*i].xform.output_var(outer_dim+1), tile_size); - h2.update_coef(stmt[*i].xform.output_var(dim+3), -1); - h2.update_const(tile_size-1); - h2.update_coef(aligned_lb, 1); - } - } - } - } - - // update loop level information - for (std::set::iterator i = same_tile_controlling_loop.begin(); i != same_tile_controlling_loop.end(); i++) { - for (int j = 1; j <= stmt[*i].loop_level.size(); j++) - switch (stmt[*i].loop_level[j-1].type) { - case LoopLevelOriginal: - break; - case LoopLevelTile: - if (stmt[*i].loop_level[j-1].payload >= outer_level) - stmt[*i].loop_level[j-1].payload++; - break; - default: - throw loop_error("unknown loop level type for statement " + to_string(*i)); - } - - LoopLevel ll; - ll.type = LoopLevelTile; - ll.payload = level+1; - ll.parallel_level = 0; - stmt[*i].loop_level.insert(stmt[*i].loop_level.begin()+(outer_level-1), ll); - } -} - - - -std::set Loop::unroll(int stmt_num, int level, int unroll_amount) { - // check for sanity of parameters - if (unroll_amount < 0) - throw std::invalid_argument("invalid unroll amount " + to_string(unroll_amount)); - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - if (level <= 0 || level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level)); - - int dim = 2*level - 1; - std::vector lex = getLexicalOrder(stmt_num); - std::set same_loop = getStatements(lex, dim-1); - - // nothing to do - if (unroll_amount == 1) - return std::set(); - - // extract the intersection of the iteration space to be considered - Relation hull = Relation::True(level); - apply_xform(same_loop); - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - if (stmt[*i].IS.is_upper_bound_satisfiable()) { - Relation mapping(stmt[*i].IS.n_set(), level); - F_And *f_root = mapping.add_and(); - for (int j = 1; j <= level; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.input_var(j), 1); - h.update_coef(mapping.output_var(j), -1); - } - hull = Intersection(hull, Range(Restrict_Domain(mapping, copy(stmt[*i].IS)))); - hull.simplify(2, 4); - } - } - for (int i = 1; i <= level; i++) { - std::string name = tmp_loop_var_name_prefix + to_string(i); - hull.name_set_var(i, name); - } - hull.setup_names(); - - // extract the exact loop bound of the dimension to be unrolled - if (is_single_loop_iteration(hull, level, this->known)) - return std::set(); - Relation bound = get_loop_bound(hull, level, this->known); - if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology()) - throw loop_error("unable to extract loop bound for unrolling"); - - // extract the loop stride - EQ_Handle stride_eq; - int stride = 1; - { - bool simple_stride = true; - int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(level), stride_eq, simple_stride); - if (strides > 1) - throw loop_error("too many strides"); - else if (strides == 1) { - int sign = stride_eq.get_coef(bound.set_var(level)); - Constr_Vars_Iter it(stride_eq, true); - stride = abs((*it).coef/sign); - } - } - - // separate lower and upper bounds - std::vector lb_list, ub_list; - { - Conjunct *c = bound.query_DNF()->single_conjunct(); - for (GEQ_Iterator gi(c->GEQs()); gi; gi++) { - int coef = (*gi).get_coef(bound.set_var(level)); - if (coef < 0) - ub_list.push_back(*gi); - else if (coef > 0) - lb_list.push_back(*gi); - } - } - - // simplify overflow expression for each pair of upper and lower bounds - std::vector > > overflow_table(lb_list.size(), std::vector >(ub_list.size(), std::map())); - bool is_overflow_simplifiable = true; - for (int i = 0; i < lb_list.size(); i++) { - if (!is_overflow_simplifiable) - break; - - for (int j = 0; j < ub_list.size(); j++) { - // lower bound or upper bound has non-unit coefficient, can't simplify - if (ub_list[j].get_coef(bound.set_var(level)) != -1 || lb_list[i].get_coef(bound.set_var(level)) != 1) { - is_overflow_simplifiable = false; - break; - } - - for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) { - switch((*ci).var->kind()) { - case Input_Var: - { - if ((*ci).var != bound.set_var(level)) - overflow_table[i][j][(*ci).var] += (*ci).coef; - - break; - } - case Global_Var: - { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = bound.get_local(g); - else - v = bound.get_local(g, (*ci).var->function_of()); - overflow_table[i][j][(*ci).var] += (*ci).coef; - break; - } - default: - throw loop_error("failed to calculate overflow amount"); - } - } - overflow_table[i][j][NULL] += ub_list[j].get_const(); - - for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) { - switch((*ci).var->kind()) { - case Input_Var: - { - if ((*ci).var != bound.set_var(level)) { - overflow_table[i][j][(*ci).var] += (*ci).coef; - if (overflow_table[i][j][(*ci).var] == 0) - overflow_table[i][j].erase(overflow_table[i][j].find((*ci).var)); - } - break; - } - case Global_Var: - { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = bound.get_local(g); - else - v = bound.get_local(g, (*ci).var->function_of()); - overflow_table[i][j][(*ci).var] += (*ci).coef; - if (overflow_table[i][j][(*ci).var] == 0) - overflow_table[i][j].erase(overflow_table[i][j].find((*ci).var)); - break; - } - default: - throw loop_error("failed to calculate overflow amount"); - } - } - overflow_table[i][j][NULL] += lb_list[i].get_const(); - - overflow_table[i][j][NULL] += stride; - if (unroll_amount == 0 || (overflow_table[i][j].size() == 1 && overflow_table[i][j][NULL]/stride < unroll_amount)) - unroll_amount = overflow_table[i][j][NULL]/stride; - } - } - - // loop iteration count can't be determined, bail out gracefully - if (unroll_amount == 0) - return std::set(); - - // further simply overflow calculation using coefficients' modular - if (is_overflow_simplifiable) { - for (int i = 0; i < lb_list.size(); i++) - for (int j = 0; j < ub_list.size(); j++) - if (stride == 1) { - for (std::map::iterator k = overflow_table[i][j].begin(); k != overflow_table[i][j].end(); ) - if ((*k).first != NULL) { - int t = int_mod_hat((*k).second, unroll_amount); - if (t == 0) { - overflow_table[i][j].erase(k++); - } - else { - int t2 = hull.query_variable_mod((*k).first, unroll_amount); - if (t2 != INT_MAX) { - overflow_table[i][j][NULL] += t * t2; - overflow_table[i][j].erase(k++); - } - else { - (*k).second = t; - k++; - } - } - } - else - k++; - - overflow_table[i][j][NULL] = int_mod_hat(overflow_table[i][j][NULL], unroll_amount); - - // Since we don't have MODULO instruction in SUIF yet (only MOD), make all coef positive in the final formula - for (std::map::iterator k = overflow_table[i][j].begin(); k != overflow_table[i][j].end(); k++) - if ((*k).second < 0) - (*k).second += unroll_amount; - } - } - - - // build overflow statement - CG_outputBuilder *ocg = ir->builder(); - CG_outputRepr *overflow_code = NULL; - Relation cond_upper(level), cond_lower(level); - Relation overflow_constraint(0); - F_And *overflow_constraint_root = overflow_constraint.add_and(); - std::vector over_var_list; - if (is_overflow_simplifiable && lb_list.size() == 1) { - for (int i = 0; i < ub_list.size(); i++) { - if (overflow_table[0][i].size() == 1) { - // upper splitting condition - GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]); - h.update_const(((overflow_table[0][i][NULL]/stride)%unroll_amount) * -stride); - } - else { - // upper splitting condition - std::string over_name = overflow_var_name_prefix + to_string(overflow_var_name_counter++); - Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name); - over_var_list.push_back(over_free_var); - GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]); - h.update_coef(cond_upper.get_local(over_free_var), -stride); - - // insert constraint 0 <= overflow < unroll_amount - Variable_ID v = overflow_constraint.get_local(over_free_var); - GEQ_Handle h1 = overflow_constraint_root->add_GEQ(); - h1.update_coef(v, 1); - GEQ_Handle h2 = overflow_constraint_root->add_GEQ(); - h2.update_coef(v, -1); - h2.update_const(unroll_amount-1); - - // create overflow assignment - bound.setup_names(); - CG_outputRepr *rhs = NULL; - for (std::map::iterator j = overflow_table[0][i].begin(); j != overflow_table[0][i].end(); j++) - if ((*j).first != NULL) { - CG_outputRepr *t = ocg->CreateIdent((*j).first->name()); - if ((*j).second != 1) - t = ocg->CreateTimes(ocg->CreateInt((*j).second), t); - rhs = ocg->CreatePlus(rhs, t); - } - else - if ((*j).second != 0) - rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second)); - - if (stride != 1) - rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride)); - rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount)); - - CG_outputRepr *lhs = ocg->CreateIdent(over_name); - init_code = ocg->StmtListAppend(init_code, ocg->CreateAssignment(0, lhs, ocg->CreateInt(0))); - lhs = ocg->CreateIdent(over_name); - overflow_code = ocg->StmtListAppend(overflow_code, ocg->CreateAssignment(0, lhs, rhs)); - } - } - - // lower splitting condition - GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[0]); - } - else if (is_overflow_simplifiable && ub_list.size() == 1) { - for (int i = 0; i < lb_list.size(); i++) { - - if (overflow_table[i][0].size() == 1) { - // lower splitting condition - GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]); - h.update_const(overflow_table[i][0][NULL] * -stride); - } - else { - // lower splitting condition - std::string over_name = overflow_var_name_prefix + to_string(overflow_var_name_counter++); - Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name); - over_var_list.push_back(over_free_var); - GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]); - h.update_coef(cond_lower.get_local(over_free_var), -stride); - - // insert constraint 0 <= overflow < unroll_amount - Variable_ID v = overflow_constraint.get_local(over_free_var); - GEQ_Handle h1 = overflow_constraint_root->add_GEQ(); - h1.update_coef(v, 1); - GEQ_Handle h2 = overflow_constraint_root->add_GEQ(); - h2.update_coef(v, -1); - h2.update_const(unroll_amount-1); - - // create overflow assignment - bound.setup_names(); - CG_outputRepr *rhs = NULL; - for (std::map::iterator j = overflow_table[0][i].begin(); j != overflow_table[0][i].end(); j++) - if ((*j).first != NULL) { - CG_outputRepr *t = ocg->CreateIdent((*j).first->name()); - if ((*j).second != 1) - t = ocg->CreateTimes(ocg->CreateInt((*j).second), t); - rhs = ocg->CreatePlus(rhs, t); - } - else - if ((*j).second != 0) - rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second)); - - if (stride != 1) - rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride)); - rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount)); - - CG_outputRepr *lhs = ocg->CreateIdent(over_name); - init_code = ocg->StmtListAppend(init_code, ocg->CreateAssignment(0, lhs, ocg->CreateInt(0))); - lhs = ocg->CreateIdent(over_name); - overflow_code = ocg->StmtListAppend(overflow_code, ocg->CreateAssignment(0, lhs, rhs)); - } - } - - // upper splitting condition - GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[0]); - } - else { - std::string over_name = overflow_var_name_prefix + to_string(overflow_var_name_counter++); - Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name); - over_var_list.push_back(over_free_var); - - Tuple lb_repr_list, ub_repr_list; - for (int i = 0; i < lb_list.size(); i++) { - //lb_repr_list.append(outputLBasRepr(ocg, lb_list[i], bound, bound.set_var(dim+1), stride, stride_eq, Relation::True(bound.n_set()), std::vector(bound.n_set(), NULL))); - lb_repr_list.append(outputLBasRepr(ocg, lb_list[i], bound, bound.set_var(dim+1), stride, stride_eq, Relation::True(bound.n_set()), std::vector(bound.n_set()))); - GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]); - } - for (int i = 0; i < ub_list.size(); i++) { - //ub_repr_list.append(outputUBasRepr(ocg, ub_list[i], bound, bound.set_var(dim+1), stride, stride_eq, std::vector(bound.n_set(), NULL))); - ub_repr_list.append(outputUBasRepr(ocg, ub_list[i], bound, bound.set_var(dim+1), stride, stride_eq, std::vector(bound.n_set()))); - GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]); - h.update_coef(cond_upper.get_local(over_free_var), -stride); - } - - CG_outputRepr *lbRepr, *ubRepr; - if (lb_repr_list.size() > 1) - lbRepr = ocg->CreateInvoke("max", lb_repr_list); - else if (lb_repr_list.size() == 1) - lbRepr = lb_repr_list[1]; - - if (ub_repr_list.size() > 1) - ubRepr = ocg->CreateInvoke("min", ub_repr_list); - else if (ub_repr_list.size() == 1) - ubRepr = ub_repr_list[1]; - - // create overflow assignment - bound.setup_names(); - CG_outputRepr *rhs = ocg->CreatePlus(ocg->CreateMinus(ubRepr, lbRepr), ocg->CreateInt(1)); - if (stride != 1) - rhs = ocg->CreateIntegerDivide(rhs, ocg->CreateInt(stride)); - rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount)); - CG_outputRepr *lhs = ocg->CreateIdent(over_name); - init_code = ocg->StmtListAppend(init_code, ocg->CreateAssignment(0, lhs, ocg->CreateInt(0))); - lhs = ocg->CreateIdent(over_name); - overflow_code = ocg->CreateAssignment(0, lhs, rhs); - - // insert constraint 0 <= overflow < unroll_amount - Variable_ID v = overflow_constraint.get_local(over_free_var); - GEQ_Handle h1 = overflow_constraint_root->add_GEQ(); - h1.update_coef(v, 1); - GEQ_Handle h2 = overflow_constraint_root->add_GEQ(); - h2.update_coef(v, -1); - h2.update_const(unroll_amount-1); - } - - // insert overflow statement - int overflow_stmt_num = -1; - if (overflow_code != NULL) { - // build iteration space for overflow statement - Relation mapping(level, level-1); - F_And *f_root = mapping.add_and(); - for (int i = 1; i < level; i++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(i), 1); - h.update_coef(mapping.input_var(i), -1); - } - Relation overflow_IS = Range(Restrict_Domain(mapping, copy(hull))); - for (int i = 1; i < level; i++) - overflow_IS.name_set_var(i, hull.set_var(i)->name()); - overflow_IS.setup_names(); - - // build dumb transformation relation for overflow statement - Relation overflow_xform(level-1, 2*(level-1)+1); - f_root = overflow_xform.add_and(); - for (int i = 1; i <= level-1; i++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(overflow_xform.output_var(2*i), 1); - h.update_coef(overflow_xform.input_var(i), -1); - - h = f_root->add_EQ(); - h.update_coef(overflow_xform.output_var(2*i-1), 1); - h.update_const(-lex[2*i-2]); - } - EQ_Handle h = f_root->add_EQ(); - h.update_coef(overflow_xform.output_var(2*(level-1)+1), 1); - h.update_const(-lex[2*(level-1)]); - - shiftLexicalOrder(lex, dim-1, 1); - Statement overflow_stmt; - overflow_stmt.code = overflow_code; - overflow_stmt.IS = overflow_IS; - overflow_stmt.xform = overflow_xform; - overflow_stmt.loop_level = std::vector(level-1); - for (int i = 0; i < level-1; i++) { - overflow_stmt.loop_level[i].type = stmt[stmt_num].loop_level[i].type; - if (stmt[stmt_num].loop_level[i].type == LoopLevelTile && - stmt[stmt_num].loop_level[i].payload >= level) - overflow_stmt.loop_level[i].payload = -1; - else - overflow_stmt.loop_level[i].payload = stmt[stmt_num].loop_level[i].payload; - overflow_stmt.loop_level[i].parallel_level = stmt[stmt_num].loop_level[i].parallel_level; - } - stmt.push_back(overflow_stmt); - dep.insert(); - overflow_stmt_num = stmt.size() - 1; - overflow[overflow_stmt_num] = over_var_list; - - // update the global known information on overflow variable - this->known = Intersection(this->known, Extend_Set(copy(overflow_constraint), this->known.n_set()-overflow_constraint.n_set())); - - // update dependence graph - DependenceVector dv; - dv.type = DEP_CONTROL; - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) - dep.connect(overflow_stmt_num, *i, dv); - dv.type = DEP_W2W; - { - IR_ScalarSymbol *overflow_sym = NULL; - std::vector scalars = ir->FindScalarRef(overflow_code); - for (int i = scalars.size()-1; i >=0; i--) - if (scalars[i]->is_write()) { - overflow_sym = scalars[i]->symbol(); - break; - } - for (int i = scalars.size()-1; i >=0; i--) - delete scalars[i]; - dv.sym = overflow_sym; - } - dv.lbounds = std::vector(num_dep_dim, 0); - dv.ubounds = std::vector(num_dep_dim, 0); - int dep_dim = get_last_dep_dim_before(stmt_num, level); - for (int i = dep_dim + 1; i < num_dep_dim; i++) { - dv.lbounds[i] = -posInfinity; - dv.ubounds[i] = posInfinity; - } - for (int i = 0; i <= dep_dim; i++) { - if (i != 0) { - dv.lbounds[i-1] = 0; - dv.ubounds[i-1] = 0; - } - dv.lbounds[i] = 1; - dv.ubounds[i] = posInfinity; - dep.connect(overflow_stmt_num, overflow_stmt_num, dv); - } - } - - // split the loop so it can be fully unrolled - std::set result = split(stmt_num, level, cond_upper); - std::set result2 = split(stmt_num, level, cond_lower); - for (std::set::iterator i = result2.begin(); i != result2.end(); i++) - result.insert(*i); - - // check if unrolled statements can be trivially lumped together as one statement - bool can_be_lumped = true; - if (can_be_lumped) { - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) - if (*i != stmt_num) { - if (stmt[*i].loop_level.size() != stmt[stmt_num].loop_level.size()) { - can_be_lumped = false; - break; - } - for (int j = 0; j < stmt[stmt_num].loop_level.size(); j++) - if (!(stmt[*i].loop_level[j].type == stmt[stmt_num].loop_level[j].type && - stmt[*i].loop_level[j].payload == stmt[stmt_num].loop_level[j].payload)) { - can_be_lumped = false; - break; - } - if (!can_be_lumped) - break; - std::vector lex2 = getLexicalOrder(*i); - for (int j = 2*level; j < lex.size()-1; j+=2) - if (lex[j] != lex2[j]) { - can_be_lumped = false; - break; - } - if (!can_be_lumped) - break; - } - } - if (can_be_lumped) { - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) - if (is_inner_loop_depend_on_level(stmt[*i].IS, level, known)) { - can_be_lumped = false; - break; - } - } - if (can_be_lumped) { - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) - if (*i != stmt_num) { - if (!(Must_Be_Subset(copy(stmt[*i].IS), copy(stmt[stmt_num].IS)) && Must_Be_Subset(copy(stmt[stmt_num].IS), copy(stmt[*i].IS)))) { - can_be_lumped = false; - break; - } - } - } - if (can_be_lumped) { - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - for (DependenceGraph::EdgeList::iterator j = dep.vertex[*i].second.begin(); j != dep.vertex[*i].second.end(); j++) - if (same_loop.find(j->first) != same_loop.end()) { - for (int k = 0; k < j->second.size(); k++) - if (j->second[k].type == DEP_CONTROL || j->second[k].type == DEP_UNKNOWN) { - can_be_lumped = false; - break; - } - if (!can_be_lumped) - break; - } - if (!can_be_lumped) - break; - } - } - - - // add strides to original statements - // for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) - // add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride); - - - // std::vector depending_overflow_var; - // for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - // add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride); - // if (overflow.find(*i) != overflow.end()) { - // // TO DO: It should check whether overflow vaiable depends on - // // this loop index and by how much. This step is important if - // // you want to unroll loops in arbitrary order. - // depending_overflow_var.insert(depending_overflow_var.end(), overflow[*i].begin(), overflow[*i].end()); - - // continue; - // } - // } - - - -// std::map > pending; -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { -// add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride); - -// if (overflow.find(*i) != overflow.end()) { -// // TO DO: It should check whether overflow vaiable depends on -// // this loop index and by how much. This step is important if -// // you want to unroll loops in arbitrary order. -// depending_overflow_var.insert(depending_overflow_var.end(), overflow[*i].begin(), overflow[*i].end()); - -// continue; -// } - -// // create copy for each unroll amount -// for (int j = 1; j < unroll_amount; j++) { -// Tuple funcList; -// Tuple loop_vars; -// loop_vars.append(stmt[*i].IS.set_var((dim+1)/2)->name()); -// funcList.append(ocg->CreatePlus(ocg->CreateIdent(stmt[*i].IS.set_var(level)->name()), ocg->CreateInt(j*stride))); -// CG_outputRepr *code = ocg->CreatePlaceHolder(0, stmt[*i].code->clone(), funcList, loop_vars); - -// // prepare the new statment to insert -// Statement unrolled_stmt; -// unrolled_stmt.IS = copy(stmt[*i].IS); -// // adjust_loop_bound(unrolled_stmt.IS, (dim-1)/2, j); -// unrolled_stmt.xform = copy(stmt[*i].xform); -// unrolled_stmt.code = code; -// unrolled_stmt.loop_level = stmt[*i].loop_level; -// pending[*i].push_back(unrolled_stmt); -// } -// } - -// // adjust iteration space due to loop bounds depending on this loop -// // index and affected overflow variables -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { -// for (int j = 0; j < pending[*i].size(); j++) { -// adjust_loop_bound(pending[*i][j].IS, (dim-1)/2, j+1, depending_overflow_var); -// //pending[*i][j].IS = Intersection(pending[*i][j].IS, Extend_Set(copy(this->known), pending[*i][j].IS.n_set() - this->known.n_set())); -// } -// } - - // insert unrolled statements - int old_num_stmt = stmt.size(); - if (!can_be_lumped) { - std::map > what_stmt_num; - - for (int j = 1; j < unroll_amount; j++) { - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - Statement new_stmt; - - Tuple funcList; - Tuple loop_vars; - loop_vars.append(stmt[*i].IS.set_var(level)->name()); - funcList.append(ocg->CreatePlus(ocg->CreateIdent(stmt[*i].IS.set_var(level)->name()), ocg->CreateInt(j*stride))); - new_stmt.code = ocg->CreatePlaceHolder(0, stmt[*i].code->clone(), funcList, loop_vars); - - new_stmt.IS = adjust_loop_bound(stmt[*i].IS, level, j * stride); - add_loop_stride(new_stmt.IS, bound, level-1, unroll_amount * stride); - - new_stmt.xform = copy(stmt[*i].xform); - new_stmt.loop_level = stmt[*i].loop_level; - stmt.push_back(new_stmt); - dep.insert(); - what_stmt_num[*i].push_back(stmt.size() - 1); - } - } - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) - add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride); - - - // update dependence graph - if (stmt[stmt_num].loop_level[level-1].type == LoopLevelOriginal) { - int dep_dim = stmt[stmt_num].loop_level[level-1].payload; - int new_stride = unroll_amount * stride; - for (int i = 0; i < old_num_stmt; i++) { - std::vector > D; - - for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); ) { - if (same_loop.find(i) != same_loop.end()) { - if (same_loop.find(j->first) != same_loop.end()) { - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.type == DEP_CONTROL || dv.type == DEP_UNKNOWN) { - D.push_back(std::make_pair(j->first, dv)); - for (int kk = 0; kk < unroll_amount - 1; kk++) - if (what_stmt_num[i][kk] != -1 && what_stmt_num[j->first][kk] != -1) - dep.connect(what_stmt_num[i][kk], what_stmt_num[j->first][kk], dv); - } - else { - coef_t lb = dv.lbounds[dep_dim]; - coef_t ub = dv.ubounds[dep_dim]; - if (ub == lb && int_mod(lb, static_cast(new_stride)) == 0) { - D.push_back(std::make_pair(j->first, dv)); - for (int kk = 0; kk < unroll_amount - 1; kk++) - if (what_stmt_num[i][kk] != -1 && what_stmt_num[j->first][kk] != -1) - dep.connect(what_stmt_num[i][kk], what_stmt_num[j->first][kk], dv); - } - else if (lb == -posInfinity && ub == posInfinity) { - D.push_back(std::make_pair(j->first, dv)); - for (int kk = 0; kk < unroll_amount; kk++) - if (kk == 0) - D.push_back(std::make_pair(j->first, dv)); - else if (what_stmt_num[j->first][kk-1] != -1) - D.push_back(std::make_pair(what_stmt_num[j->first][kk-1], dv)); - for (int t = 0; t < unroll_amount - 1; t++) - if (what_stmt_num[i][t] != -1) - for (int kk = 0; kk < unroll_amount; kk++) - if (kk == 0) - dep.connect(what_stmt_num[i][t], j->first, dv); - else if (what_stmt_num[j->first][kk-1] != -1) - dep.connect(what_stmt_num[i][t], what_stmt_num[j->first][kk-1], dv); - } - else { - for (int kk = 0; kk < unroll_amount; kk++) { - if (lb != -posInfinity) { - if (kk * stride < int_mod(lb, static_cast(new_stride))) - dv.lbounds[dep_dim] = floor(static_cast(lb)/new_stride) * new_stride + new_stride; - else - dv.lbounds[dep_dim] = floor(static_cast(lb)/new_stride) * new_stride; - } - if (ub != posInfinity) { - if (kk * stride > int_mod(ub, static_cast(new_stride))) - dv.ubounds[dep_dim] = floor(static_cast(ub)/new_stride) * new_stride - new_stride; - else - dv.ubounds[dep_dim] = floor(static_cast(ub)/new_stride) * new_stride; - } - if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) { - if (kk == 0) - D.push_back(std::make_pair(j->first, dv)); - else if (what_stmt_num[j->first][kk-1] != -1) - D.push_back(std::make_pair(what_stmt_num[j->first][kk-1], dv)); - } - } - for (int t = 0; t < unroll_amount-1; t++) - if (what_stmt_num[i][t] != -1) - for (int kk = 0; kk < unroll_amount; kk++) { - if (lb != -posInfinity) { - if (kk * stride < int_mod(lb+t+1, static_cast(new_stride))) - dv.lbounds[dep_dim] = floor(static_cast(lb+(t+1)*stride)/new_stride) * new_stride + new_stride; - else - dv.lbounds[dep_dim] = floor(static_cast(lb+(t+1)*stride)/new_stride) * new_stride; - } - if (ub != posInfinity) { - if (kk * stride > int_mod(ub+t+1, static_cast(new_stride))) - dv.ubounds[dep_dim] = floor(static_cast(ub+(t+1)*stride)/new_stride) * new_stride - new_stride; - else - dv.ubounds[dep_dim] = floor(static_cast(ub+(t+1)*stride)/new_stride) * new_stride; - } - if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) { - if (kk == 0) - dep.connect(what_stmt_num[i][t], j->first, dv); - else if (what_stmt_num[j->first][kk-1] != -1) - dep.connect(what_stmt_num[i][t], what_stmt_num[j->first][kk-1], dv); - } - } - } - } - } - - dep.vertex[i].second.erase(j++); - } - else { - for (int kk = 0; kk < unroll_amount - 1; kk++) - if (what_stmt_num[i][kk] != -1) - dep.connect(what_stmt_num[i][kk], j->first, j->second); - - j++; - } - } - else { - if (same_loop.find(j->first) != same_loop.end()) - for (int k = 0; k < j->second.size(); k++) - for (int kk = 0; kk < unroll_amount - 1; kk++) - if (what_stmt_num[j->first][kk] != -1) - D.push_back(std::make_pair(what_stmt_num[j->first][kk], j->second[k])); - j++; - } - } - - for (int j = 0; j < D.size(); j++) - dep.connect(i, D[j].first, D[j].second); - } - } - - // reset lexical order for the unrolled loop body - std::set new_same_loop; - for (std::map >::iterator i = what_stmt_num.begin(); i != what_stmt_num.end(); i++) { - new_same_loop.insert(i->first); - for (int j = 0; j < i->second.size(); j++) - new_same_loop.insert(i->second[j]); - } - setLexicalOrder(dim+1, new_same_loop); - } - else { - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) - add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride); - - int max_level = stmt[stmt_num].loop_level.size(); - std::vector > stmt_order; - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) - stmt_order.push_back(std::make_pair(get_const(stmt[*i].xform, 2*max_level, Output_Var), *i)); - sort(stmt_order.begin(), stmt_order.end()); - - Statement new_stmt; - new_stmt.code = NULL; - for (int j = 1; j < unroll_amount; j++) - for (int i = 0; i < stmt_order.size(); i++) { - Tuple funcList; - Tuple loop_vars; - loop_vars.append(stmt[stmt_order[i].second].IS.set_var(level)->name()); - funcList.append(ocg->CreatePlus(ocg->CreateIdent(stmt[stmt_order[i].second].IS.set_var(level)->name()), ocg->CreateInt(j*stride))); - CG_outputRepr *code = ocg->CreatePlaceHolder(0, stmt[stmt_order[i].second].code->clone(), funcList, loop_vars); - new_stmt.code = ocg->StmtListAppend(new_stmt.code, code); - } - - new_stmt.IS = copy(stmt[stmt_num].IS); - new_stmt.xform = copy(stmt[stmt_num].xform); - assign_const(new_stmt.xform, 2*max_level, stmt_order[stmt_order.size()-1].first+1); - new_stmt.loop_level = stmt[stmt_num].loop_level; - stmt.push_back(new_stmt); - dep.insert(); - - // update dependence graph - if (stmt[stmt_num].loop_level[level-1].type == LoopLevelOriginal) { - int dep_dim = stmt[stmt_num].loop_level[level-1].payload; - int new_stride = unroll_amount * stride; - for (int i = 0; i < old_num_stmt; i++) { - std::vector > > D; - - for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); ) { - if (same_loop.find(i) != same_loop.end()) { - if (same_loop.find(j->first) != same_loop.end()) { - std::vector dvs11, dvs12, dvs22, dvs21; - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.type == DEP_CONTROL || dv.type == DEP_UNKNOWN) { - if (i == j->first) { - dvs11.push_back(dv); - dvs22.push_back(dv); - } - else - throw loop_error("unrolled statements lumped together illegally"); - } - else { - coef_t lb = dv.lbounds[dep_dim]; - coef_t ub = dv.ubounds[dep_dim]; - if (ub == lb && int_mod(lb, static_cast(new_stride)) == 0) { - dvs11.push_back(dv); - dvs22.push_back(dv); - } - else { - if (lb != -posInfinity) - dv.lbounds[dep_dim] = ceil(static_cast(lb)/new_stride) * new_stride; - if (ub != posInfinity) - dv.ubounds[dep_dim] = floor(static_cast(ub)/new_stride) * new_stride; - if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) - dvs11.push_back(dv); - - if (lb != -posInfinity) - dv.lbounds[dep_dim] = ceil(static_cast(lb)/new_stride) * new_stride; - if (ub != posInfinity) - dv.ubounds[dep_dim] = ceil(static_cast(ub)/new_stride) * new_stride; - if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) - dvs21.push_back(dv); - - if (lb != -posInfinity) - dv.lbounds[dep_dim] = floor(static_cast(lb)/new_stride) * new_stride; - if (ub != posInfinity) - dv.ubounds[dep_dim] = floor(static_cast(ub-stride)/new_stride) * new_stride; - if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) - dvs12.push_back(dv); - - if (lb != -posInfinity) - dv.lbounds[dep_dim] = floor(static_cast(lb)/new_stride) * new_stride; - if (ub != posInfinity) - dv.ubounds[dep_dim] = ceil(static_cast(ub-stride)/new_stride) * new_stride; - if (dv.ubounds[dep_dim] >= dv.lbounds[dep_dim]) - dvs22.push_back(dv); - } - } - } - if (dvs11.size() > 0) - D.push_back(std::make_pair(i, dvs11)); - if (dvs22.size() > 0) - dep.connect(old_num_stmt, old_num_stmt, dvs22); - if (dvs12.size() > 0) - D.push_back(std::make_pair(old_num_stmt, dvs12)); - if (dvs21.size() > 0) - dep.connect(old_num_stmt, i, dvs21); - - dep.vertex[i].second.erase(j++); - } - else { - dep.connect(old_num_stmt, j->first, j->second); - j++; - } - } - else { - if (same_loop.find(j->first) != same_loop.end()) - D.push_back(std::make_pair(old_num_stmt, j->second)); - j++; - } - } - - for (int j = 0; j < D.size(); j++) - dep.connect(i, D[j].first, D[j].second); - } - } - } - - return result; -} - - -std::vector Loop::getLexicalOrder(int stmt_num) const { - assert(stmt_num < stmt.size()); - - const int n = stmt[stmt_num].xform.n_out(); - std::vector lex(n,0); - - for (int i = 0; i < n; i += 2) - lex[i] = get_const(stmt[stmt_num].xform, i, Output_Var); - - return lex; -} - -std::set Loop::getStatements(const std::vector &lex, int dim) const { - const int m = stmt.size(); - - std::set same_loops; - for (int i = 0; i < m; i++) { - if (dim < 0) - same_loops.insert(i); - else { - std::vector a_lex = getLexicalOrder(i); - int j; - for (j = 0; j <= dim; j+=2) - if (lex[j] != a_lex[j]) - break; - if (j > dim) - same_loops.insert(i); - } - } - - return same_loops; -} - - -void Loop::shiftLexicalOrder(const std::vector &lex, int dim, int amount) { - const int m = stmt.size(); - - if (amount == 0) - return; - - for (int i = 0; i < m; i++) { - std::vector lex2 = getLexicalOrder(i); - - bool need_shift = true; - - for (int j = 0; j < dim; j++) - if (lex2[j] != lex[j]) { - need_shift = false; - break; - } - - if (!need_shift) - continue; - - if (amount > 0) { - if (lex2[dim] < lex[dim]) - continue; - } - else if (amount < 0) { - if (lex2[dim] > lex[dim]) - continue; - } - - assign_const(stmt[i].xform, dim, lex2[dim] + amount); - } -} - - -void Loop::setLexicalOrder(int dim, const std::set &active, int starting_order) { - if (active.size() == 0) - return; - - // check for sanity of parameters - if (dim < 0 || dim % 2 != 0) - throw std::invalid_argument("invalid constant loop level to set lexicographical order"); - std::vector lex; - int ref_stmt_num; - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - if ((*i) < 0 || (*i) >= stmt.size()) - throw std::invalid_argument("invalid statement number " + to_string(*i)); - if (dim >= stmt[*i].xform.n_out()) - throw std::invalid_argument("invalid constant loop level to set lexicographical order"); - if (i == active.begin()) { - lex = getLexicalOrder(*i); - ref_stmt_num = *i; - } - else { - std::vector lex2 = getLexicalOrder(*i); - for (int j = 0; j < dim; j+=2) - if (lex[j] != lex2[j]) - throw std::invalid_argument("statements are not in the same sub loop nest"); - } - } - - // sepearate statements by current loop level types - int level = (dim+2)/2; - std::map, std::set > active_by_level_type; - std::set active_by_no_level; - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - if (level > stmt[*i].loop_level.size()) - active_by_no_level.insert(*i); - else - active_by_level_type[std::make_pair(stmt[*i].loop_level[level-1].type, stmt[*i].loop_level[level-1].payload)].insert(*i); - } - - // further separate statements due to control dependences - std::vector > active_by_level_type_splitted; - for (std::map, std::set >::iterator i = active_by_level_type.begin(); i != active_by_level_type.end(); i++) - active_by_level_type_splitted.push_back(i->second); - for (std::set::iterator i = active_by_no_level.begin(); i != active_by_no_level.end(); i++) - for (int j = active_by_level_type_splitted.size() - 1; j >= 0; j--) { - std::set controlled, not_controlled; - for (std::set::iterator k = active_by_level_type_splitted[j].begin(); k != active_by_level_type_splitted[j].end(); k++) { - std::vector dvs = dep.getEdge(*i, *k); - bool is_controlled = false; - for (int kk = 0; kk < dvs.size(); kk++) - if (dvs[kk].type = DEP_CONTROL) { - is_controlled = true; - break; - } - if (is_controlled) - controlled.insert(*k); - else - not_controlled.insert(*k); - } - if (controlled.size() != 0 && not_controlled.size() != 0) { - active_by_level_type_splitted.erase(active_by_level_type_splitted.begin() + j); - active_by_level_type_splitted.push_back(controlled); - active_by_level_type_splitted.push_back(not_controlled); - } - } - - // set lexical order separating loops with different loop types first - if (active_by_level_type_splitted.size() + active_by_no_level.size() > 1) { - int dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1; - - Graph, Empty> g; - for (std::vector >::iterator i = active_by_level_type_splitted.begin(); i != active_by_level_type_splitted.end(); i++) - g.insert(*i); - for (std::set::iterator i = active_by_no_level.begin(); i != active_by_no_level.end(); i++) { - std::set t; - t.insert(*i); - g.insert(t); - } - for (int i = 0; i < g.vertex.size(); i++) - for (int j = i+1; j < g.vertex.size(); j++) { - bool connected = false; - for (std::set::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) { - for (std::set::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) { - std::vector dvs = dep.getEdge(*ii, *jj); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() || - (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) { - g.connect(i, j); - connected = true; - break; - } - if (connected) - break; - } - if (connected) - break; - } - connected = false; - for (std::set::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) { - for (std::set::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) { - std::vector dvs = dep.getEdge(*jj, *ii); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() || - (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) { - g.connect(j, i); - connected = true; - break; - } - if (connected) - break; - } - if (connected) - break; - } - } - - std::vector > s = g.topoSort(); - if (s.size() != g.vertex.size()) - throw loop_error("cannot separate statements with different loop types at loop level " + to_string(level)); - - // assign lexical order - int order = starting_order; - for (int i = 0; i < s.size(); i++) { - std::set &cur_scc = g.vertex[*(s[i].begin())].first; - int sz = cur_scc.size(); - if (sz == 1) { - int cur_stmt = *(cur_scc.begin()); - assign_const(stmt[cur_stmt].xform, dim, order); - for (int j = dim+2; j < stmt[cur_stmt].xform.n_out(); j+=2) - assign_const(stmt[cur_stmt].xform, j, 0); - order++; - } - else { - setLexicalOrder(dim, cur_scc, order); - order += sz; - } - } - } - // set lexical order seperating single iteration statements and loops - else { - std::set true_singles; - std::set nonsingles; - std::map > fake_singles; - - // sort out statements that do not require loops - for(std::set::iterator i = active.begin(); i != active.end(); i++) { - Relation cur_IS = getNewIS(*i); - if (is_single_iteration(cur_IS, dim+1)) { - bool is_all_single = true; - for (int j = dim+3; j < stmt[*i].xform.n_out(); j+=2) - if (!is_single_iteration(cur_IS, j)) { - is_all_single = false; - break; - } - if (is_all_single) - true_singles.insert(*i); - else { - try { - fake_singles[get_const(cur_IS, dim+1, Set_Var)].insert(*i); - } - catch (const std::exception &e) { - fake_singles[posInfinity].insert(*i); - } - } - } - else - nonsingles.insert(*i); - } - - // split nonsingles forcibly according to negative dependences present (loop unfusible) - int dep_dim = get_dep_dim_of(ref_stmt_num, level); - Graph g2; - for (std::set::iterator i = nonsingles.begin(); i != nonsingles.end(); i++) - g2.insert(*i); - for (int i = 0; i < g2.vertex.size(); i++) - for (int j = i+1; j < g2.vertex.size(); j++) { - std::vector dvs = dep.getEdge(g2.vertex[i].first, g2.vertex[j].first); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() || - (dvs[k].is_data_dependence() && dvs[k].has_negative_been_carried_at(dep_dim))) { - g2.connect(i, j); - break; - } - dvs = dep.getEdge(g2.vertex[j].first, g2.vertex[i].first); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() || - (dvs[k].is_data_dependence() && dvs[k].has_negative_been_carried_at(dep_dim))) { - g2.connect(j, i); - break; - } - } - - std::vector > s2 = g2.packed_topoSort(); - - std::vector > splitted_nonsingles; - for (int i = 0; i < s2.size(); i++) { - std::set cur_scc; - for (std::set::iterator j = s2[i].begin(); j != s2[i].end(); j++) - cur_scc.insert(g2.vertex[*j].first); - splitted_nonsingles.push_back(cur_scc); - } - - // convert to dependence graph for grouped statements - dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1; - Graph, Empty> g; - for (std::set::iterator i = true_singles.begin(); i != true_singles.end(); i++) { - std::set t; - t.insert(*i); - g.insert(t); - } - for (int i = 0; i < splitted_nonsingles.size(); i++) { - g.insert(splitted_nonsingles[i]); - } - for (std::map >::iterator i = fake_singles.begin(); i != fake_singles.end(); i++) - g.insert((*i).second); - - for (int i = 0; i < g.vertex.size(); i++) - for (int j = i + 1; j < g.vertex.size(); j++) { - bool connected = false; - for (std::set::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) { - for (std::set::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) { - std::vector dvs = dep.getEdge(*ii, *jj); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() || - (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) { - g.connect(i, j); - connected = true; - break; - } - if (connected) - break; - } - if (connected) - break; - } - connected = false; - for (std::set::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) { - for (std::set::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) { - std::vector dvs = dep.getEdge(*jj, *ii); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() || - (dvs[k].is_data_dependence() && !dvs[k].has_been_carried_before(dep_dim))) { - g.connect(j, i); - connected = true; - break; - } - if (connected) - break; - } - if (connected) - break; - } - } - - // topological sort according to chun's permute algorithm - std::vector > s = g.topoSort(); - - // assign lexical order - int order = starting_order; - for (int i = 0; i < s.size(); i++) { - // translate each SCC into original statements - std::set cur_scc; - for (std::set::iterator j = s[i].begin(); j != s[i].end(); j++) - copy(g.vertex[*j].first.begin(), g.vertex[*j].first.end(), inserter(cur_scc, cur_scc.begin())); - - // now assign the constant - for(std::set::iterator j = cur_scc.begin(); j != cur_scc.end(); j++) - assign_const(stmt[*j].xform, dim, order); - - if (cur_scc.size() > 1) - setLexicalOrder(dim+2, cur_scc); - else if (cur_scc.size() == 1) { - int cur_stmt =*(cur_scc.begin()); - for (int j = dim+2; j < stmt[cur_stmt].xform.n_out(); j+=2) - assign_const(stmt[cur_stmt].xform, j, 0); - } - - if (cur_scc.size() > 0) - order++; - } - } -} - - -void Loop::apply_xform() { - std::set active; - for (int i = 0; i < stmt.size(); i++) - active.insert(i); - apply_xform(active); -} - - -void Loop::apply_xform(int stmt_num) { - std::set active; - active.insert(stmt_num); - apply_xform(active); -} - - -void Loop::apply_xform(std::set &active) { - int max_n = 0; - - CG_outputBuilder *ocg = ir->builder(); - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - int n = stmt[*i].loop_level.size(); - if (n > max_n) - max_n = n; - - std::vector lex = getLexicalOrder(*i); - - Relation mapping(2*n+1, n); - F_And *f_root = mapping.add_and(); - for (int j = 1; j <= n; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(j), 1); - h.update_coef(mapping.input_var(2*j), -1); - } - mapping = Composition(mapping, stmt[*i].xform); - mapping.simplify(); - - // match omega input/output variables to variable names in the code - for (int j = 1; j <= stmt[*i].IS.n_set(); j++) - mapping.name_input_var(j, stmt[*i].IS.set_var(j)->name()); - for (int j = 1; j <= n; j++) - mapping.name_output_var(j, tmp_loop_var_name_prefix + to_string(tmp_loop_var_name_counter+j-1)); - mapping.setup_names(); - - Relation known = Extend_Set(copy(this->known), mapping.n_out() - this->known.n_set()); - //stmt[*i].code = outputStatement(ocg, stmt[*i].code, 0, mapping, known, std::vector(mapping.n_out(), NULL)); - stmt[*i].code = outputStatement(ocg, stmt[*i].code, 0, mapping, known, std::vector(mapping.n_out())); - stmt[*i].IS = Range(Restrict_Domain(mapping, stmt[*i].IS)); - stmt[*i].IS.simplify(); - - // replace original transformation relation with straight 1-1 mapping - mapping = Relation(n, 2*n+1); - f_root = mapping.add_and(); - for (int j = 1; j <= n; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(2*j), 1); - h.update_coef(mapping.input_var(j), -1); - } - for (int j = 1; j <= 2*n+1; j+=2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(j), 1); - h.update_const(-lex[j-1]); - } - stmt[*i].xform = mapping; - } - - tmp_loop_var_name_counter += max_n; -} - - -void Loop::addKnown(const Relation &cond) { - int n1 = this->known.n_set(); - - Relation r = copy(cond); - int n2 = r.n_set(); - - if (n1 < n2) - this->known = Extend_Set(this->known, n2-n1); - else if (n1 > n2) - r = Extend_Set(r, n1-n2); - - this->known = Intersection(this->known, r); -} - - -bool Loop::nonsingular(const std::vector > &T) { - if (stmt.size() == 0) - return true; - - // check for sanity of parameters - for (int i = 0; i < stmt.size(); i++) { - if (stmt[i].loop_level.size() != num_dep_dim) - throw std::invalid_argument("nonsingular loop transformations must be applied to original perfect loop nest"); - for (int j = 0; j < stmt[i].loop_level.size(); j++) - if (stmt[i].loop_level[j].type != LoopLevelOriginal) - throw std::invalid_argument("nonsingular loop transformations must be applied to original perfect loop nest"); - } - if (T.size() != num_dep_dim) - throw std::invalid_argument("invalid transformation matrix"); - for (int i = 0; i < stmt.size(); i++) - if (T[i].size() != num_dep_dim + 1 && T[i].size() != num_dep_dim) - throw std::invalid_argument("invalid transformation matrix"); - - // build relation from matrix - Relation mapping(2*num_dep_dim+1, 2*num_dep_dim+1); - F_And *f_root = mapping.add_and(); - for (int i = 0; i < num_dep_dim; i++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(2*(i+1)), -1); - for (int j = 0; j < num_dep_dim; j++) - if (T[i][j] != 0) - h.update_coef(mapping.input_var(2*(j+1)), T[i][j]); - if (T[i].size() == num_dep_dim+1) - h.update_const(T[i][num_dep_dim]); - } - for (int i = 1; i <= 2*num_dep_dim+1; i+=2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(i), -1); - h.update_coef(mapping.input_var(i), 1); - } - - // update transformation relations - for (int i = 0; i < stmt.size(); i++) - stmt[i].xform = Composition(copy(mapping), stmt[i].xform); - - // update dependence graph - for (int i = 0; i < dep.vertex.size(); i++) - for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) { - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - switch (dv.type) { - case DEP_W2R: - case DEP_R2W: - case DEP_W2W: - case DEP_R2R: { - std::vector lbounds(num_dep_dim), ubounds(num_dep_dim); - for (int p = 0; p < num_dep_dim; p++) { - coef_t lb = 0; - coef_t ub = 0; - for (int q = 0; q < num_dep_dim; q++) { - if (T[p][q] > 0) { - if (lb == -posInfinity || dv.lbounds[q] == -posInfinity) - lb = -posInfinity; - else - lb += T[p][q] * dv.lbounds[q]; - if (ub == posInfinity || dv.ubounds[q] == posInfinity) - ub = posInfinity; - else - ub += T[p][q] * dv.ubounds[q]; - } - else if (T[p][q] < 0) { - if (lb == -posInfinity || dv.ubounds[q] == posInfinity) - lb = -posInfinity; - else - lb += T[p][q] * dv.ubounds[q]; - if (ub == posInfinity || dv.lbounds[q] == -posInfinity) - ub = posInfinity; - else - ub += T[p][q] * dv.lbounds[q]; - } - } - if (T[p].size() == num_dep_dim+1) { - if (lb != -posInfinity) - lb += T[p][num_dep_dim]; - if (ub != posInfinity) - ub += T[p][num_dep_dim]; - } - lbounds[p] = lb; - ubounds[p] = ub; - } - dv.lbounds = lbounds; - dv.ubounds = ubounds; - - break; - } - default: - ; - } - } - j->second = dvs; - } - - // set constant loop values - std::set active; - for (int i = 0; i < stmt.size(); i++) - active.insert(i); - setLexicalOrder(0, active); - - return true; -} - - -void Loop::skew(const std::set &stmt_nums, int level, const std::vector &skew_amount) { - if (stmt_nums.size() == 0) - return; - - // check for sanity of parameters - int ref_stmt_num = *(stmt_nums.begin()); - for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) { - if (*i < 0 || *i >= stmt.size()) - throw std::invalid_argument("invalid statement number " + to_string(*i)); - if (level < 1 || level > stmt[*i].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level)); - for (int j = stmt[*i].loop_level.size(); j < skew_amount.size(); j++) - if (skew_amount[j] != 0) - throw std::invalid_argument("invalid skewing formula"); - } - - // set trasformation relations - for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) { - int n = stmt[*i].xform.n_out(); - Relation r(n,n); - F_And *f_root = r.add_and(); - for (int j = 1; j <= n; j++) - if (j != 2*level) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.input_var(j), 1); - h.update_coef(r.output_var(j), -1); - } - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.output_var(2*level), -1); - for (int j = 0; j < skew_amount.size(); j++) - if (skew_amount[j] != 0) - h.update_coef(r.input_var(2*(j+1)), skew_amount[j]); - - stmt[*i].xform = Composition(r, stmt[*i].xform); - stmt[*i].xform.simplify(); - } - - // update dependence graph - if (stmt[ref_stmt_num].loop_level[level-1].type == LoopLevelOriginal) { - int dep_dim = stmt[ref_stmt_num].loop_level[level-1].payload; - for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) - for (DependenceGraph::EdgeList::iterator j = dep.vertex[*i].second.begin(); j != dep.vertex[*i].second.end(); j++) - if (stmt_nums.find(j->first) != stmt_nums.end()) { - // dependence between skewed statements - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - if (dv.is_data_dependence()) { - coef_t lb = 0; - coef_t ub = 0; - for (int kk = 0; kk < skew_amount.size(); kk++) { - int cur_dep_dim = get_dep_dim_of(*i, kk+1); - if (skew_amount[kk] > 0) { - if (lb != -posInfinity && - stmt[*i].loop_level[kk].type == LoopLevelOriginal && - dv.lbounds[cur_dep_dim] != -posInfinity) - lb += skew_amount[kk] * dv.lbounds[cur_dep_dim]; - else { - if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0)) - lb = -posInfinity; - } - if (ub != posInfinity && - stmt[*i].loop_level[kk].type == LoopLevelOriginal && - dv.ubounds[cur_dep_dim] != posInfinity) - ub += skew_amount[kk] * dv.ubounds[cur_dep_dim]; - else { - if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0)) - ub = posInfinity; - } - } - else if (skew_amount[kk] < 0) { - if (lb != -posInfinity && - stmt[*i].loop_level[kk].type == LoopLevelOriginal && - dv.ubounds[cur_dep_dim] != posInfinity) - lb += skew_amount[kk] * dv.ubounds[cur_dep_dim]; - else { - if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0)) - lb = -posInfinity; - } - if (ub != posInfinity && - stmt[*i].loop_level[kk].type == LoopLevelOriginal && - dv.lbounds[cur_dep_dim] != -posInfinity) - ub += skew_amount[kk] * dv.lbounds[cur_dep_dim]; - else { - if (cur_dep_dim != -1 && !(dv.lbounds[cur_dep_dim] == 0 && dv.ubounds[cur_dep_dim] == 0)) - ub = posInfinity; - } - } - } - dv.lbounds[dep_dim] = lb; - dv.ubounds[dep_dim] = ub; - } - } - j->second = dvs; - } - else { - // dependence from skewed statement to unskewed statement becomes jumbled, - // put distance value at skewed dimension to unknown - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - if (dv.is_data_dependence()) { - dv.lbounds[dep_dim] = -posInfinity; - dv.ubounds[dep_dim] = posInfinity; - } - } - j->second = dvs; - } - for (int i = 0; i < dep.vertex.size(); i++) - if (stmt_nums.find(i) == stmt_nums.end()) - for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) - if (stmt_nums.find(j->first) != stmt_nums.end()) { - // dependence from unskewed statement to skewed statement becomes jumbled, - // put distance value at skewed dimension to unknown - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - if (dv.is_data_dependence()) { - dv.lbounds[dep_dim] = -posInfinity; - dv.ubounds[dep_dim] = posInfinity; - } - } - j->second = dvs; - } - } -} - - -void Loop::shift(const std::set &stmt_nums, int level, int shift_amount) { - if (stmt_nums.size() == 0) - return; - - // check for sanity of parameters - int ref_stmt_num = *(stmt_nums.begin()); - for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) { - if (*i < 0 || *i >= stmt.size()) - throw std::invalid_argument("invalid statement number " + to_string(*i)); - if (level < 1 || level > stmt[*i].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level)); - } - - // do nothing - if (shift_amount == 0) - return; - - // set trasformation relations - for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) { - int n = stmt[*i].xform.n_out(); - - Relation r(n, n); - F_And *f_root = r.add_and(); - for (int j = 1; j <= n; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.input_var(j), 1); - h.update_coef(r.output_var(j), -1); - if (j == 2*level) - h.update_const(shift_amount); - } - - stmt[*i].xform = Composition(r, stmt[*i].xform); - stmt[*i].xform.simplify(); - } - - // update dependence graph - if (stmt[ref_stmt_num].loop_level[level-1].type == LoopLevelOriginal) { - int dep_dim = stmt[ref_stmt_num].loop_level[level-1].payload; - for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) - for (DependenceGraph::EdgeList::iterator j = dep.vertex[*i].second.begin(); j != dep.vertex[*i].second.end(); j++) - if (stmt_nums.find(j->first) == stmt_nums.end()) { - // dependence from shifted statement to unshifted statement - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - if (dv.is_data_dependence()) { - if (dv.lbounds[dep_dim] != -posInfinity) - dv.lbounds[dep_dim] -= shift_amount; - if (dv.ubounds[dep_dim] != posInfinity) - dv.ubounds[dep_dim] -= shift_amount; - } - } - j->second = dvs; - } - for (int i = 0; i < dep.vertex.size(); i++) - if (stmt_nums.find(i) == stmt_nums.end()) - for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) - if (stmt_nums.find(j->first) != stmt_nums.end()) { - // dependence from unshifted statement to shifted statement - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - if (dv.is_data_dependence()) { - if (dv.lbounds[dep_dim] != -posInfinity) - dv.lbounds[dep_dim] += shift_amount; - if (dv.ubounds[dep_dim] != posInfinity) - dv.ubounds[dep_dim] += shift_amount; - } - } - j->second = dvs; - } - } -} - - - -// bool Loop::fuse(const std::set &stmt_nums, int level) { -// if (stmt_nums.size() == 0 || stmt_nums.size() == 1) -// return true; -// int dim = 2*level-1; - -// // check for sanity of parameters -// std::vector ref_lex; -// for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) { -// if (*i < 0 || *i >= stmt.size()) -// throw std::invalid_argument("invalid statement number " + to_string(*i)); -// if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2) -// throw std::invalid_argument("invalid loop level " + to_string(level)); -// if (ref_lex.size() == 0) -// ref_lex = getLexicalOrder(*i); -// else { -// std::vector lex = getLexicalOrder(*i); -// for (int j = 0; j < dim-1; j+=2) -// if (lex[j] != ref_lex[j]) -// throw std::invalid_argument("statements for fusion must be in the same level-" + to_string(level-1) + " subloop"); -// } -// } - -// // collect lexicographical order values from to-be-fused statements -// std::set lex_values; -// for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) { -// std::vector lex = getLexicalOrder(*i); -// lex_values.insert(lex[dim-1]); -// } -// if (lex_values.size() == 1) -// return true; - -// // negative dependence would prevent fusion -// int dep_dim = xform_index[dim].first; -// for (std::set::iterator i = lex_values.begin(); i != lex_values.end(); i++) { -// ref_lex[dim-1] = *i; -// std::set a = getStatements(ref_lex, dim-1); -// std::set::iterator j = i; -// j++; -// for (; j != lex_values.end(); j++) { -// ref_lex[dim-1] = *j; -// std::set b = getStatements(ref_lex, dim-1); -// for (std::set::iterator ii = a.begin(); ii != a.end(); ii++) -// for (std::set::iterator jj = b.begin(); jj != b.end(); jj++) { -// std::vector dvs; -// dvs = dep.getEdge(*ii, *jj); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim)) -// throw loop_error("loop error: statements " + to_string(*ii) + " and " + to_string(*jj) + " cannot be fused together due to negative dependence"); -// dvs = dep.getEdge(*jj, *ii); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim)) -// throw loop_error("loop error: statements " + to_string(*jj) + " and " + to_string(*ii) + " cannot be fused together due to negative dependence"); -// } -// } -// } - -// // collect all other lexicographical order values from the subloop -// // enclosing these to-be-fused loops -// std::set same_loop = getStatements(ref_lex, dim-3); -// std::set other_lex_values; -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { -// std::vector lex = getLexicalOrder(*i); -// if (lex_values.find(lex[dim-1]) == lex_values.end()) -// other_lex_values.insert(lex[dim-1]); -// } - -// // update to-be-fused loops due to dependence cycle -// Graph, Empty> g; -// { -// std::set t; -// for (std::set::iterator i = lex_values.begin(); i != lex_values.end(); i++) { -// ref_lex[dim-1] = *i; -// std::set t2 = getStatements(ref_lex, dim-1); -// std::set_union(t.begin(), t.end(), t2.begin(), t2.end(), inserter(t, t.begin())); -// } -// g.insert(t); -// } -// for (std::set::iterator i = other_lex_values.begin(); i != other_lex_values.end(); i++) { -// ref_lex[dim-1] = *i; -// std::set t = getStatements(ref_lex, dim-1); -// g.insert(t); -// } -// for (int i = 0; i < g.vertex.size(); i++) -// for (int j = i+1; j < g.vertex.size(); j++) -// for (std::set::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) -// for (std::set::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) { -// std::vector dvs; -// dvs = dep.getEdge(*ii, *jj); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g.connect(i, j); -// break; -// } -// dvs = dep.getEdge(*jj, *ii); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g.connect(j, i); -// break; -// } -// } -// std::vector > s = g.topoSort(); -// int fused_lex_value = 0; -// for (int i = 0; i < s.size(); i++) -// if (s[i].find(0) != s[i].end()) { -// // now add additional lexicographical order values -// for (std::set::iterator j = s[i].begin(); j != s[i].end(); j++) -// if (*j != 0) { -// int stmt = *(g.vertex[*j].first.begin()); -// std::vector lex = getLexicalOrder(stmt); -// lex_values.insert(lex[dim-1]); -// } - -// if (s.size() > 1) { -// if (i == 0) { -// int min_lex_value; -// for (std::set::iterator j = s[i+1].begin(); j != s[i+1].end(); j++) { -// int stmt = *(g.vertex[*j].first.begin()); -// std::vector lex = getLexicalOrder(stmt); -// if (j == s[i+1].begin()) -// min_lex_value = lex[dim-1]; -// else if (lex[dim-1] < min_lex_value) -// min_lex_value = lex[dim-1]; -// } -// fused_lex_value = min_lex_value - 1; -// } -// else { -// int max_lex_value; -// for (std::set::iterator j = s[i-1].begin(); j != s[i-1].end(); j++) { -// int stmt = *(g.vertex[*j].first.begin()); -// std::vector lex = getLexicalOrder(stmt); -// if (j == s[i-1].begin()) -// max_lex_value = lex[dim-1]; -// else if (lex[dim-1] > max_lex_value) -// max_lex_value = lex[dim-1]; -// } -// fused_lex_value = max_lex_value + 1; -// } -// } - -// break; -// } - -// // sort the newly updated to-be-fused lexicographical order values -// std::vector ordered_lex_values; -// for (std::set::iterator i = lex_values.begin(); i != lex_values.end(); i++) -// ordered_lex_values.push_back(*i); -// std::sort(ordered_lex_values.begin(), ordered_lex_values.end()); - -// // make sure internal loops inside to-be-fused loops have the same -// // lexicographical order before and after fusion -// std::vector > inside_lex_range(ordered_lex_values.size()); -// for (int i = 0; i < ordered_lex_values.size(); i++) { -// ref_lex[dim-1] = ordered_lex_values[i]; -// std::set the_stmts = getStatements(ref_lex, dim-1); -// std::set::iterator j = the_stmts.begin(); -// std::vector lex = getLexicalOrder(*j); -// int min_inside_lex_value = lex[dim+1]; -// int max_inside_lex_value = lex[dim+1]; -// j++; -// for (; j != the_stmts.end(); j++) { -// std::vector lex = getLexicalOrder(*j); -// if (lex[dim+1] < min_inside_lex_value) -// min_inside_lex_value = lex[dim+1]; -// if (lex[dim+1] > max_inside_lex_value) -// max_inside_lex_value = lex[dim+1]; -// } -// inside_lex_range[i].first = min_inside_lex_value; -// inside_lex_range[i].second = max_inside_lex_value; -// } -// for (int i = 1; i < ordered_lex_values.size(); i++) -// if (inside_lex_range[i].first <= inside_lex_range[i-1].second) { -// int shift_lex_value = inside_lex_range[i-1].second - inside_lex_range[i].first + 1; -// ref_lex[dim-1] = ordered_lex_values[i]; -// ref_lex[dim+1] = inside_lex_range[i].first; -// shiftLexicalOrder(ref_lex, dim+1, shift_lex_value); -// inside_lex_range[i].first += shift_lex_value; -// inside_lex_range[i].second += shift_lex_value; -// } - -// // set lexicographical order for fused loops -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { -// std::vector lex = getLexicalOrder(*i); -// if (lex_values.find(lex[dim-1]) != lex_values.end()) -// assign_const(stmt[*i].xform, dim-1, fused_lex_value); -// } - -// // no need to update dependence graph -// ; - -// return true; -// } - - -// bool Loop::distribute(const std::set &stmt_nums, int level) { -// if (stmt_nums.size() == 0 || stmt_nums.size() == 1) -// return true; -// int dim = 2*level-1; - -// // check for sanity of parameters -// std::vector ref_lex; -// for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) { -// if (*i < 0 || *i >= stmt.size()) -// throw std::invalid_argument("invalid statement number " + to_string(*i)); -// if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2) -// throw std::invalid_argument("invalid loop level " + to_string(level)); -// if (ref_lex.size() == 0) -// ref_lex = getLexicalOrder(*i); -// else { -// std::vector lex = getLexicalOrder(*i); -// for (int j = 0; j <= dim-1; j+=2) -// if (lex[j] != ref_lex[j]) -// throw std::invalid_argument("statements for distribution must be in the same level-" + to_string(level) + " subloop"); -// } -// } - -// // find SCC in the to-be-distributed loop -// int dep_dim = xform_index[dim].first; -// std::set same_loop = getStatements(ref_lex, dim-1); -// Graph g; -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) -// g.insert(*i); -// for (int i = 0; i < g.vertex.size(); i++) -// for (int j = i+1; j < g.vertex.size(); j++) { -// std::vector dvs; -// dvs = dep.getEdge(g.vertex[i].first, g.vertex[j].first); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g.connect(i, j); -// break; -// } -// dvs = dep.getEdge(g.vertex[j].first, g.vertex[i].first); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g.connect(j, i); -// break; -// } -// } -// std::vector > s = g.topoSort(); - -// // find statements that cannot be distributed due to dependence cycle -// Graph, Empty> g2; -// for (int i = 0; i < s.size(); i++) { -// std::set t; -// for (std::set::iterator j = s[i].begin(); j != s[i].end(); j++) -// if (stmt_nums.find(g.vertex[*j].first) != stmt_nums.end()) -// t.insert(g.vertex[*j].first); -// if (!t.empty()) -// g2.insert(t); -// } -// for (int i = 0; i < g2.vertex.size(); i++) -// for (int j = i+1; j < g2.vertex.size(); j++) -// for (std::set::iterator ii = g2.vertex[i].first.begin(); ii != g2.vertex[i].first.end(); ii++) -// for (std::set::iterator jj = g2.vertex[j].first.begin(); jj != g2.vertex[j].first.end(); jj++) { -// std::vector dvs; -// dvs = dep.getEdge(*ii, *jj); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g2.connect(i, j); -// break; -// } -// dvs = dep.getEdge(*jj, *ii); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g2.connect(j, i); -// break; -// } -// } -// std::vector > s2 = g2.topoSort(); - -// // nothing to distribute -// if (s2.size() == 1) -// throw loop_error("loop error: no statement can be distributed due to dependence cycle"); - -// std::vector > s3; -// for (int i = 0; i < s2.size(); i++) { -// std::set t; -// for (std::set::iterator j = s2[i].begin(); j != s2[i].end(); j++) -// std::set_union(t.begin(), t.end(), g2.vertex[*j].first.begin(), g2.vertex[*j].first.end(), inserter(t, t.begin())); -// s3.push_back(t); -// } - -// // associate other affected statements with the right distributed statements -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) -// if (stmt_nums.find(*i) == stmt_nums.end()) { -// bool is_inserted = false; -// int potential_insertion_point = 0; -// for (int j = 0; j < s3.size(); j++) { -// for (std::set::iterator k = s3[j].begin(); k != s3[j].end(); k++) { -// std::vector dvs; -// dvs = dep.getEdge(*i, *k); -// for (int kk = 0; kk < dvs.size(); kk++) -// if (dvs[kk].isCarried(dep_dim)) { -// s3[j].insert(*i); -// is_inserted = true; -// break; -// } -// dvs = dep.getEdge(*k, *i); -// for (int kk = 0; kk < dvs.size(); kk++) -// if (dvs[kk].isCarried(dep_dim)) -// potential_insertion_point = j; -// } -// if (is_inserted) -// break; -// } - -// if (!is_inserted) -// s3[potential_insertion_point].insert(*i); -// } - -// // set lexicographical order after distribution -// int order = ref_lex[dim-1]; -// shiftLexicalOrder(ref_lex, dim-1, s3.size()-1); -// for (std::vector >::iterator i = s3.begin(); i != s3.end(); i++) { -// for (std::set::iterator j = (*i).begin(); j != (*i).end(); j++) -// assign_const(stmt[*j].xform, dim-1, order); -// order++; -// } - -// // no need to update dependence graph -// ; - -// return true; -// } - - - - - - - - diff --git a/loop_cuda.cc b/loop_cuda.cc deleted file mode 100644 index a23990d..0000000 --- a/loop_cuda.cc +++ /dev/null @@ -1,2123 +0,0 @@ -/***************************************************************************** - Copyright (C) 2009 University of Utah - All Rights Reserved. - - Purpose: - Cudaize methods - - Notes: - - History: - 1/7/10 Created by Gabe Rudy by migrating code from loop.cc - 31/1/11 Modified by Protonu Basu -*****************************************************************************/ - -#include -#include -#include -#include -#include "loop_cuda.hh" -#include "loop.hh" -#include -#include -#include "omegatools.hh" -#include "ir_cudasuif.hh" -#include "ir_suif.hh" -#include "ir_suif_utils.hh" -#include "chill_error.hh" -#include - -using namespace omega; -char *k_cuda_texture_memory; //protonu--added to track texture memory type -char *k_cuda_constant_memory; //protonu--added to track constant memory type -//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type -extern char *omega::k_ocg_comment; - - -static int cudaDebug; -class CudaStaticInit{ public: CudaStaticInit(){ cudaDebug=0; //Change this to 1 for debug -}}; -static CudaStaticInit junkInitInstance__; - - - -std::string& upcase(std::string& s) -{ - for(int i=0; i& curOrder){ - if(!cudaDebug) return; - for(int i=0; i0) - printf(","); - printf("%s", curOrder[i].c_str()); - } - printf("\n"); -} - -void printVS(const std::vector& curOrder){ - //if(!cudaDebug) return; - for(int i=0; i0) - printf(","); - printf("%s", curOrder[i].c_str()); - } - printf("\n"); -} - -LoopCuda::~LoopCuda() { - const int m = stmt.size(); - for (int i = 0; i < m; i++) - stmt[i].code->clear(); -} - -bool LoopCuda::symbolExists(std::string s){ - if(symtab->lookup_sym(s.c_str(), SYM_VAR, false)) - return true; - if(globals->lookup_sym(s.c_str(), SYM_VAR, false)) - return true; - for(int i=0; iub_list(); - tree_node_list_iter upli(ub); - while(!upli.is_empty()){ - tree_node *node = upli.step(); - if(node->kind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr) - { - in_rrr* ins = (in_rrr*)((tree_instr*)node)->instr(); - //expect the structure: cpy( _ = min(grab_me, _)) - if(ins->opcode() == io_cpy && ins->src1_op().is_instr()){ - ins = (in_rrr*)ins->src1_op().instr(); - if(ins->opcode() == io_min){ - tree_node_list* tnl = new tree_node_list; - tnl->append(if_node(symtab, fold_sle(operand(bound_sym), ins->src1_op().instr()->clone()), then_part)); - return tnl; - } - } - } - } - return then_part; //Failed to go to proper loop level -} - -/** - * This would be better if it was done by a CHiLL xformation instead of at codegen - * - * state: - * for(...) - * for(...) - * cur_body - * stmt1 - * - * stm1 is in-between two loops that are going to be reduced. The - * solution is to put stmt1 at the end of cur_body but conditionally run - * in on the last step of the for loop. - * - * A CHiLL command that would work better: - * - * for(...) - * stmt0 - * for(for i=0; i - * for(...) - * for(for i=0; i findCommentedFors(const char* index, tree_node_list* tnl){ - std::vector result; - - tree_node_list_iter iter(tnl); - bool next_loop_ok = false; - while (!iter.is_empty()) { - tree_node *tn = iter.step(); - if (tn->kind() == TREE_INSTR && ((tree_instr*)tn)->instr()->opcode() == io_mrk) - { - instruction* inst = ((tree_instr*)tn)->instr(); - std::string comment; - if ((inst->peek_annote(k_ocg_comment) != NULL)) - { - immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment)); - immed_list_iter data_iter(data); - if(!data_iter.is_empty()){ - immed first_immed = data_iter.step(); - if(first_immed.kind() == im_string) - comment = first_immed.string(); - } - } - if(comment.find("~cuda~") != std::string::npos - && comment.find("preferredIdx: ") != std::string::npos){ - std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos); - if(idx.find(" ") != std::string::npos) - idx = idx.substr(0,idx.find(" ")); - if(strcmp(idx.c_str(),index) == 0) - next_loop_ok = true; - } - } - if (tn->kind() == TREE_FOR){ - if(next_loop_ok){ - //printf("found loop %s\n", static_cast(tn)->index()->name()); - result.push_back(static_cast(tn)); - } - else{ - //printf("looking down for loop %s\n", static_cast(tn)->index()->name()); - std::vector t = findCommentedFors(index, static_cast(tn)->body()); - std::copy(t.begin(), t.end(), back_inserter(result)); - } - next_loop_ok = false; - } - if (tn->kind() == TREE_IF) { - //printf("looking down if\n"); - tree_if *tni = static_cast(tn); - std::vector t = findCommentedFors(index, tni->then_part()); - std::copy(t.begin(), t.end(), back_inserter(result)); - } - } - - return result; -} - -tree_node_list* forReduce(tree_for* loop, var_sym* reduceIndex, proc_symtab* proc_syms) -{ - //We did the replacements all at once with recursiveFindPreferedIdxs - //replacements r; - //r.oldsyms.append(loop->index()); - //r.newsyms.append(reduceIndex); - //tree_for* new_loop = (tree_for*)loop->clone_helper(&r, true); - tree_for* new_loop = loop; - - //return body one loops in - tree_node_list* tnl = loop_body_at_level(new_loop, 1); - //wrap in conditional if necessary - tnl = wrapInIfFromMinBound(tnl, new_loop, proc_syms, reduceIndex); - return tnl; -} - -void recursiveFindRefs(tree_node_list* code, proc_symtab* proc_syms, replacements* r) -{ - if(code->parent() && code->scope()->is_block()) - ((block_symtab*)code->scope())->find_exposed_refs(proc_syms, r); - tree_node_list_iter tnli(code); - while (!tnli.is_empty()) { - tree_node *node = tnli.step(); - //printf("node kind: %d\n", node->kind()); - if(node->is_instr()) - { - tree_instr* t_instr = (tree_instr*)node; - t_instr->find_exposed_refs(proc_syms, r); - } - if(node->is_block()){ - recursiveFindRefs(static_cast(node)->body(), proc_syms, r); - } - else if(node->is_for()){ - tree_for* tn_for = static_cast(node); - //Find refs in statemetns and body - tn_for->find_exposed_refs(proc_syms, r); - //recursiveFindRefs(tn_for->body(), proc_syms, r); - } - } -} - -tree_node_list* recursiveFindReplacePreferedIdxs(tree_node_list* code, proc_symtab* proc_syms, - proc_sym* cudaSync, func_type* unkown_func, - std::map& loop_idxs) -{ - tree_node_list* tnl = new tree_node_list; - tree_node_list_iter tnli(code); - var_sym* idxSym=0; - bool sync = false; - std::vector r1; - std::vector r2; - while (!tnli.is_empty()) { - tree_node *node = tnli.step(); - //printf("node kind: %d\n", node->kind()); - if(node->is_instr()) - { - if(((tree_instr*)node)->instr()->format() == inf_rrr){ - in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr(); - if(inst->opcode() == io_mrk){ - std::string comment; - if ((inst->peek_annote(k_ocg_comment) != NULL)) - { - immed_list *data = (immed_list *)(inst->peek_annote(k_ocg_comment)); - immed_list_iter data_iter(data); - if(!data_iter.is_empty()){ - immed first_immed = data_iter.step(); - if(first_immed.kind() == im_string) - comment = first_immed.string(); - } - } - if(comment.find("~cuda~") != std::string::npos - && comment.find("preferredIdx: ") != std::string::npos){ - std::string idx = comment.substr(comment.find("preferredIdx: ")+14,std::string::npos); - if(idx.find(" ") != std::string::npos) - idx = idx.substr(0,idx.find(" ")); - //printf("sym_tab preferred index: %s\n", idx.c_str()); - if(loop_idxs.find(idx) != loop_idxs.end()) - idxSym = loop_idxs.find(idx)->second; - //Get the proc variable sybol for this preferred index - if(idxSym == 0){ - idxSym = (var_sym*)proc_syms->lookup_sym(idx.c_str(), SYM_VAR, false); - //printf("idx not found: lookup %p\n", idxSym); - if(!idxSym){ - idxSym = new var_sym(type_s32, (char*)idx.c_str()); - proc_syms->add_sym(idxSym); - //printf("idx created and inserted\n"); - } - //Now insert into our map for future - loop_idxs.insert(make_pair(idx, idxSym)); - } - //See if we have a sync as well - if(comment.find("sync") != std::string::npos){ - //printf("Inserting sync after current block\n"); - sync = true; - } - } - } - } - tnl->append(node); - } - else if(node->is_block()){ - tree_block* b = static_cast(node); - b->set_body(recursiveFindReplacePreferedIdxs(b->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(b); - } - else if(node->is_for()){ - tree_for* tn_for = static_cast(node); - if(idxSym){ - //Replace the current tn_for's index variable with idxSym - //printf("replacing sym %s -> %s\n", tn_for->index()->name(), idxSym->name()); - replacements r; - r.oldsyms.append(tn_for->index()); - r.newsyms.append(idxSym); - tree_for* new_loop = (tree_for*)tn_for->clone_helper(&r, true); - idxSym = 0; //Reset for more loops in this tnl - new_loop->set_body(recursiveFindReplacePreferedIdxs(new_loop->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(new_loop); - - if(sync){ - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaSync))), 0); - tnl->append(new tree_instr(the_call)); - //tnl->print(); - sync = true; - } - }else{ - tn_for->set_body(recursiveFindReplacePreferedIdxs(tn_for->body(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(tn_for); - } - }else if (node->kind() == TREE_IF) { - tree_if *tni = static_cast(node); - tni->set_then_part(recursiveFindReplacePreferedIdxs(tni->then_part(), proc_syms, cudaSync, unkown_func, loop_idxs)); - tnl->append(tni); - } - } - //Do this after the loop to not screw up the pointer interator - /* - for(int i=0; i array references -// loop_idxs -> map for when we encounter a loop with a different preferredIndex -// dim_vars -> out param, fills with var_sym pair for 2D array dimentions (messy stuff) -tree_node_list* swapVarReferences(tree_node_list* code, replacements* r, CG_suifBuilder *ocg, - std::map& loop_vars, - proc_symtab *proc_syms, - std::vector< std::pair >& dim_vars) -{ - //Iterate over every expression, looking up each variable and type - //reference used and possibly replacing it or adding it to our symbol - //table - // - //We use the built-in cloning helper methods to seriously help us with this! - - //Need to do a recursive mark - recursiveFindRefs(code, proc_syms, r); - - - //We can't rely on type_node->clone() to do the heavy lifting when the - //old type is a two dimentional array with variable upper bounds as - //that requires creating and saveing variable references to the upper - //bounds. So we do one pass over the oldtypes doing this type of - //conversion, putting results in the fixed_types map for a second pass - //to pick up. - std::map fixed_types; //array_types needing their upper bound installed - type_node_list_iter tlip(&r->oldtypes); - while(!tlip.is_empty()) - { - type_node* old_tn = tlip.step(); - type_node* new_tn = 0; - type_node* base_type = old_tn; - std::vector< std::pair > variable_upper_bouneds; - if(old_tn->is_ptr()){ - while (base_type->is_array() || base_type->is_ptr()) { - if (base_type->is_array()){ - array_bound ub = ((array_type*)base_type)->upper_bound(); - if(ub.is_variable()){ - var_sym* old_ub = (var_sym*)ub.variable(); - var_sym *new_ub = proc_syms->new_unique_var(type_s32); - dim_vars.push_back(std::pair(old_ub, new_ub)); - variable_upper_bouneds.push_back( std::pair(new_ub, base_type) ); - } - base_type = static_cast(base_type)->elem_type(); - } - else if (base_type->is_ptr()) - base_type = static_cast(base_type)->ref_type(); - } - } - for (int i = variable_upper_bouneds.size()-1; i >= 0; i--) { - var_sym *var_ub = variable_upper_bouneds[i].first; - type_node* old_tn = variable_upper_bouneds[i].second; - if(new_tn == 0) - new_tn = new array_type(base_type, array_bound(1), array_bound(var_ub)); - else - new_tn = new array_type(new_tn, array_bound(1), array_bound(var_ub)); - proc_syms->add_type(new_tn); - fixed_types.insert(std::pair(old_tn, new_tn)); - } - if(new_tn){ - if(old_tn->is_ptr()){ - new_tn = new ptr_type(new_tn); - proc_syms->add_type(new_tn); - } - fixed_types.insert(std::pair(old_tn, new_tn)); - } - } - - //Quickly look for modifiers on our our array types (__shared__ float [][]) - type_node_list_iter tliq(&r->oldtypes); - while(!tliq.is_empty()) - { - type_node* old_tn = tliq.step(); - if(old_tn->is_modifier()){ - type_node* base_type = static_cast(old_tn)->base(); - if(fixed_types.find(base_type) != fixed_types.end()){ - type_node* fixed_base = (*fixed_types.find(base_type)).second; - //printf("Fix modifier with fixed base\n"); - //This should work to copy over the annotations, but apparently doesn't work so well - type_node* new_tn = new modifier_type(static_cast(old_tn)->op(), fixed_base); - old_tn->copy_annotes(new_tn); - fixed_types.insert(std::pair(old_tn, new_tn)); - } - } - } - - //Run through the types and create entries in r->newtypes but don't install - type_node_list_iter tli(&r->oldtypes); - while(!tli.is_empty()) - { - type_node* old_tn = tli.step(); - type_node* new_tn = 0; - - //If we recorded this as fixed by our special case, use that type - //instead of cloning. - if(fixed_types.find(old_tn) != fixed_types.end()){ - new_tn = (*fixed_types.find(old_tn)).second; - //printf("Reusing fixed typ %u: ", new_tn->type_id()); - }else{ - new_tn = old_tn->clone(); - //printf("Cloning type %u: ", old_tn->type_id()); - } - new_tn = proc_syms->install_type(new_tn); - - //Ok, there is a weird case where an array type that has var_sym as - //their upper bounds can't be covered fully in this loop or the - //var_sym loop, so we need special code. - /* - if(old_tn->op() == TYPE_PTR && ((ptr_type*)old_tn)->ref_type()->op() == TYPE_ARRAY){ - array_type* outer_array = (array_type*)((ptr_type*)old_tn)->ref_type(); - array_bound ub = outer_array->upper_bound(); - if(ub.is_variable()){ - var_sym* old_ub = (var_sym*)ub.variable(); - var_sym* new_ub = (var_sym*)((array_type*)((ptr_type*)new_tn)->ref_type())->upper_bound().variable(); - //r->oldsyms.append(old_ub); - fix_ub.insert(std::pair(old_ub, (array_type*)((ptr_type*)new_tn)->ref_type())); - dim_vars.push_back(std::pair(old_ub, new_ub)); - printf("array var_sym: %p\n", new_ub); - } - if(outer_array->elem_type()->op() == TYPE_ARRAY) - { - array_type* inner_array = (array_type*)outer_array->elem_type(); - array_bound ub = inner_array->upper_bound(); - if(ub.is_variable()){ - var_sym* old_ub = (var_sym*)ub.variable(); - var_sym* new_ub = (var_sym*)((array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())->upper_bound().variable(); - dim_vars.push_back(std::pair(old_ub, new_ub)); - printf("array var_sym: %p\n", new_ub); - //r->oldsyms.append(old_ub); - fix_ub.insert(std::pair(old_ub, (array_type*)((array_type*)((ptr_type*)new_tn)->ref_type())->elem_type())); - } - } - } - */ - r->newtypes.append(new_tn); - } - - //printf("proc_syms symbol run through\n"); - //proc_syms->print(); - - //Run through the syms creating new copies - sym_node_list_iter snli(&r->oldsyms); - while(!snli.is_empty()) - { - sym_node *old_sn = snli.step(); - - if(loop_vars.count(std::string(old_sn->name())) > 0) - { - r->newsyms.append(loop_vars[std::string(old_sn->name())]); - //printf("def exists: %s\n", old_sn->name()); - }else{ - sym_node *new_sn = old_sn->copy(); - if(new_sn->is_var()){ - var_sym* var = (var_sym*)new_sn; - type_node* new_type = var->type()->clone_helper(r); - - //TODO: Have a tagged list of variables to make shared - //Make local 2D arrays __shared__ - if(new_type->op() == TYPE_ARRAY && ((array_type*)new_type)->elem_type()->op() == TYPE_ARRAY){ - //protonu--changes suggested by Malik - //printf("Adding __shared__ annotation to : %s\n", new_sn->name()); - //new_type = ocg->ModifyType(new_type, "__shared__"); - //proc_syms->add_type(new_type); - } - var->set_type(new_type); - } - proc_syms->add_sym(new_sn); - r->newsyms.append(new_sn); - //printf("def new: %s\n", new_sn->name()); - } - } - - //printf("proc_syms var runthrough\n"); - //proc_syms->print(); - return code->clone_helper(r); -} - -bool LoopCuda::validIndexes(int stmt, const std::vector& idxs){ - for(int i=0; i array_dims, - std::vector blockIdxs, std::vector threadIdxs) -{ - int stmt_num = 0; - if(cudaDebug){ - printf("cudaize_v2(%s, {", kernel_name.c_str()); - //for( - printf("}, blocks={"); printVs(blockIdxs); printf("}, thread={"); printVs(threadIdxs); printf("})\n"); - } - - this->array_dims = array_dims; - if(!validIndexes(stmt_num, blockIdxs)){ - throw std::runtime_error("One of the indexes in the block list was not " - "found in the current set of indexes."); - } - if(!validIndexes(stmt_num, threadIdxs)){ - throw std::runtime_error("One of the indexes in the thread list was not " - "found in the current set of indexes."); - } - if(blockIdxs.size() ==0) - throw std::runtime_error("Cudaize: Need at least one block dimention"); - int block_level=0; - //Now, we will determine the actual size (if possible, otherwise - //complain) for the block dimentions and thread dimentions based on our - //indexes and the relations for our stmt; - for(int i=0; iarray_dims = array_dims; - cu_kernel_name = kernel_name.c_str(); - -} - -tree_node_list* LoopCuda::cudaize_codegen_v2() -{ - //printf("cudaize codegen V2\n"); - CG_suifBuilder *ocg = dynamic_cast(ir->builder()); - if(!ocg) return false; - - //protonu--adding an annote to track texture memory type - ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE); - ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE); - int tex_mem_on = 0; - int cons_mem_on = 0; - - - - CG_outputRepr* repr; - std::vector arrayVars; - std::vector localScopedVars; - - std::vector ro_refs; - std::vector wo_refs; - std::set uniqueRefs; - std::set uniqueWoRefs; - //protonu--let's try a much simpler approach of a map instead - //we also keep a map for constant memories - std::maptex_ref_map; - std::mapcons_ref_map; - - for(int j=0; j refs = ir->FindArrayRef(stmt[j].code); - for (int i = 0; i < refs.size(); i++) - { - //printf("ref %s wo %d\n", static_cast(refs[i]->name()), refs[i]->is_write()); - var_sym* var = symtab->lookup_var((char*)refs[i]->name().c_str(),false); - //If the array is not a parameter, then it's a local array and we - //want to recreate it as a stack variable in the kernel as opposed to - //passing it in. - if(!var->is_param()) - continue; - if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end()) - { - uniqueRefs.insert(refs[i]->name()); - if(refs[i]->is_write()){ - uniqueWoRefs.insert(refs[i]->name()); - wo_refs.push_back(refs[i]); - } - else - ro_refs.push_back(refs[i]); - } - if (refs[i]->is_write() && uniqueWoRefs.find(refs[i]->name()) == uniqueWoRefs.end()){ - uniqueWoRefs.insert(refs[i]->name()); - wo_refs.push_back(refs[i]); - //printf("adding %s to wo\n", static_cast(refs[i]->name())); - } - } - } - - // printf("reading from array "); - // for(int i=0; iname().c_str()); - // printf("and writting to array "); - // for(int i=0; iname().c_str()); - // printf("\n"); - - const char* gridName = "dimGrid"; - const char* blockName = "dimBlock"; - - //TODO: Could allow for array_dims_vars to be a mapping from array - //references to to variable names that define their length. - var_sym* dim1 = 0; - var_sym* dim2 = 0; - - for(int i=0; iname(); - outArray = symtab->lookup_var((char*)name.c_str(),false); - - VarDefs v; - v.size_2d = -1; - char buf[32]; - snprintf(buf, 32, "devO%dPtr", i+1); - v.name = buf; - if(outArray->type()->is_ptr()) - if(((ptr_type *)(outArray->type()))->ref_type()->is_array()) - v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type(); - else - v.type = ((ptr_type *)(outArray->type()))->ref_type(); - else - v.type = type_f32; - v.tex_mapped = false; - v.cons_mapped = false; - v.original_name = wo_refs[i]->name(); - //Size of the array = dim1 * dim2 * num bytes of our array type - - //If our input array is 2D (non-linearized), we want the actual - //dimentions of the array - CG_outputRepr* size; - //Lookup in array_dims - std::map::iterator it = array_dims.find(name.c_str()); - if(outArray->type()->is_ptr() && outArray->type()->ref_type(0)->is_array()) - { - array_type* t = (array_type*)outArray->type()->ref_type(0); - v.size_2d = t->upper_bound().constant()+1; - printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)wo_refs[i]->name().c_str()); - size = ocg->CreateInt(v.size_2d * v.size_2d); - }else if(it != array_dims.end()){ - int ref_size = it->second; - v.var_ref_size = ref_size; - size = ocg->CreateInt(ref_size); - } - else{ - if(dim1){ - size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)), - new CG_suifRepr(operand(dim2))); - }else{ - char buf[1024]; - sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a " - "detectable size or specififed dimentions", name.c_str()); - throw std::runtime_error(buf); - } - } - v.size_expr = operand(static_cast(ocg->CreateTimes( - size, - ocg->CreateInt(v.type->size()/8)))->GetExpression()); - v.in_data = 0; - v.out_data = outArray; - //Check for in ro_refs and remove it at this point - std::vector::iterator it_; - for(it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++) - { - if((*it_)->name() == wo_refs[i]->name()){ - break; - } - } - if(it_ != ro_refs.end()) - { - v.in_data = outArray; - ro_refs.erase(it_); - } - - arrayVars.push_back(v); - - } - - //protonu-- assuming that all texture mapped memories were originally read only mems - //there should be safety checks for that, will implement those later - - int cs_ref_size = 0; - - for(int i=0; iname(); - inArray = symtab->lookup_var((char*)name.c_str(),false); - VarDefs v; - v.size_2d = -1; - char buf[32]; - snprintf(buf, 32, "devI%dPtr", i+1); - v.name = buf; - if(inArray->type()->is_ptr()) - if(((ptr_type *)(inArray->type()))->ref_type()->is_array()) - v.type = ((array_type *)(((ptr_type *)(inArray->type()))->ref_type()))->elem_type(); - else - v.type = ((ptr_type *)(inArray->type()))->ref_type(); - else - v.type = type_f32; - v.tex_mapped = false; - v.cons_mapped = false; - v.original_name = ro_refs[i]->name(); - if ( texture != NULL) - v.tex_mapped = (texture->is_array_tex_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars - if (v.tex_mapped){ - printf("this variable %s is mapped to texture memory", name.c_str()); - } - if ( constant_mem != NULL) - v.cons_mapped = (constant_mem->is_array_cons_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars - if (v.cons_mapped){ - printf("this variable %s is mapped to constant memory", name.c_str()); - } - - //Size of the array = dim1 * dim2 * num bytes of our array type - - //If our input array is 2D (non-linearized), we want the actual - //dimentions of the array (as it might be less than cu_n - CG_outputRepr* size; - //Lookup in array_dims - std::map::iterator it = array_dims.find(name.c_str()); - int ref_size = 0; - if(inArray->type()->is_ptr() && inArray->type()->ref_type(0)->is_array()) - { - array_type* t = (array_type*)inArray->type()->ref_type(0); - v.size_2d = t->upper_bound().constant()+1; - printf("Detected 2D array sized of %d for %s\n", v.size_2d, (char*)ro_refs[i]->name().c_str()); - size = ocg->CreateInt(v.size_2d * v.size_2d); - }else if(it != array_dims.end()){ - ref_size = it->second; - v.var_ref_size = ref_size; - size = ocg->CreateInt(ref_size); - }else{ - if(dim1){ - size = ocg->CreateTimes(new CG_suifRepr(operand(dim1)), - new CG_suifRepr(operand(dim2))); - }else{ - char buf[1024]; - sprintf(buf, "CudaizeCodeGen: Array reference %s does not have a " - "detectable size or specififed dimentions", name.c_str()); - throw std::runtime_error(buf); - } - } - - - - v.size_expr = operand(static_cast(ocg->CreateTimes( - size, - ocg->CreateInt(v.type->size()/8)))->GetExpression()); - - v.in_data = inArray; - v.out_data = 0; - arrayVars.push_back(v); - } - - - if(arrayVars.size() < 2) - { - fprintf(stderr, "cudaize error: Did not find two arrays being accessed\n"); - return false; - } - - //protonu--debugging tool--the printf statement - //tex_mem_on signals use of tex mem - for(int i=0; iinstall_type(unkown_func); - func_type* void_func = new func_type(type_void); //function on unkown args that returns a void - void_func = (func_type*)globals->install_type(void_func); - func_type* float_func = new func_type(type_f32); //function on unkown args that returns a float - float_func = (func_type*)globals->install_type(float_func); - - type_node* result = ocg->ModifyType(type_void, "__global__"); - result = globals->install_type(result); - func_type* kernel_type = new func_type(result); //function returns a '__global__ void' - - int numArgs = arrayVars.size() + (dim1 ? 2 : 0) + localScopedVars.size(); - //protonu--need to account for texture memory here, reduce the #args - if( tex_mem_on ) numArgs -= tex_mem_on; - if( cons_mem_on ) numArgs -= cons_mem_on; - kernel_type->set_num_args(numArgs); - int argCount = 0; - for(int i=0; itype()->clone(); - else - fptr = arrayVars[i].out_data->type()->clone(); - //protonu--skip this for texture mems - if( arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true ) - kernel_type->set_arg_type(argCount++, fptr); - } - if(dim1){ - kernel_type->set_arg_type(argCount++, type_s32); //width x height dimentions - kernel_type->set_arg_type(argCount++, type_s32); - } - kernel_type = (func_type*)globals->install_type(kernel_type); - - proc_sym* cudaMalloc = globals->new_proc(unkown_func, src_c, "cudaMalloc"); - proc_sym* cudaMemcpy = globals->new_proc(unkown_func, src_c, "cudaMemcpy"); - proc_sym* cudaFree = globals->new_proc(unkown_func, src_c, "cudaFree"); - proc_sym* cudaSync = globals->new_proc(void_func, src_c, "__syncthreads"); - proc_sym* cudaBind = globals->new_proc(unkown_func, src_c, "cudaBindTexture"); - proc_sym* cudaMemcpySym = globals->new_proc(unkown_func, src_c, "cudaMemcpyToSymbol"); - - - //protonu-removing Gabe's function, introducing mine, this is pretty cosmetic - //proc_sym* cudaFetch = globals->new_proc(float_func, src_c, "tex1Dfetch"); - proc_sym* tex1D = globals->new_proc(float_func, src_c, "tex1Dfetch"); - - var_sym *cudaMemcpyHostToDevice = new var_sym(type_s32, "cudaMemcpyHostToDevice"); - var_sym *cudaMemcpyDeviceToHost = new var_sym(type_s32, "cudaMemcpyDeviceToHost"); - cudaMemcpyDeviceToHost->set_param(); - cudaMemcpyHostToDevice->set_param(); - globals->add_sym(cudaMemcpyHostToDevice); - globals->add_sym(cudaMemcpyDeviceToHost); - - //protonu--adding the bool tex_mem to the structure struct_type - //to bypass the re-naming of struct texture, this is a hack fix - struct_type* texType = new struct_type(TYPE_GROUP, 0, "texture", 0, true); - immed_list *iml_tex = new immed_list; - iml_tex->append(immed("texture memory")); - texType->append_annote(k_cuda_texture_memory, iml_tex); - //protonu--end my changes - texType = (struct_type*)globals->install_type(texType); - //protonu--should register the locals later on - //when we do the bind operation - //var_sym* texRef = new var_sym(texType, "texRef"); - //globals->add_sym(texRef); - - //Add our mallocs (and input array memcpys) - for(int i=0; iclone()); - //protonu--temporary change - type_node* fptr = new ptr_type(arrayVars[i].type); - fptr = symtab->install_type(fptr); - var_sym *dvs = new var_sym(fptr, const_cast( - arrayVars[i].name.c_str())); - dvs->set_addr_taken(); - symtab->add_sym(dvs); - - //cudaMalloc args - //protonu--no cudaMalloc required for constant memory - tree_node_list* tnl = new tree_node_list; - if(arrayVars[i].cons_mapped != true ) - { - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMalloc))), 2); - the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to()->ptr_to(), operand(), immed(dvs)))); - the_call->set_argument(1, arrayVars[i].size_expr); - - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - } - if(arrayVars[i].in_data) - { - //cudaMemcpy args - //protonu-- no cudaMemcpy required for constant memory - if ( arrayVars[i].cons_mapped != true ) - { - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpy))), 4); - the_call->set_argument(0, operand(dvs)); - the_call->set_argument(1, operand(arrayVars[i].in_data)); - the_call->set_argument(2, arrayVars[i].size_expr.clone()); - the_call->set_argument(3, operand(cudaMemcpyHostToDevice)); - - tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - } - - //protonu--check if the arrayvar is tex mapped - if(arrayVars[i].tex_mapped == true) - { - //Need a texture reference variable - char buf[32]; - snprintf(buf, 32, "tex%dRef", i+1); - arrayVars[i].secondName = buf; - - var_sym* texRef = new var_sym(texType, buf); - //printf("\n putting in %s\n", arrayVars[i].original_name.c_str()); - tex_ref_map[arrayVars[i].original_name] = texRef; - globals->add_sym(texRef); - //protonu--added the above two lines - - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaBind))), 4); - in_ldc *ins = new in_ldc(type_s32, operand(), immed(0)); - the_call->set_argument(0, operand(ins)); - the_call->set_argument(1, operand(texRef));//protonu--change to add the new sym - the_call->set_argument(2, operand(dvs)); - the_call->set_argument(3, arrayVars[i].size_expr.clone()); - - tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - } - - //protonu--if arrayvar is mapped to constant memory - if(arrayVars[i].cons_mapped == true) - { - char buf[32]; - snprintf(buf, 32, "cs%dRef", i+1); - //arrayVars[i].secondName = buf; - array_bound low (0); - array_bound high (arrayVars[i].var_ref_size -1); - array_type *arr = new array_type(arrayVars[i].type,low, high); - type_node* cons_arr = ocg->ModifyType(arr, "__device__ __constant__"); - cons_arr = globals->install_type(cons_arr); - var_sym* consRef = new var_sym(cons_arr, buf); - cons_ref_map[arrayVars[i].original_name] = consRef; - globals->add_sym(consRef); - - - - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaMemcpySym))), 3); - the_call->set_argument(0, operand(new in_ldc(type_void->ptr_to(), operand(), immed(consRef)))); - the_call->set_argument(1, operand(arrayVars[i].in_data)); - the_call->set_argument(2, arrayVars[i].size_expr.clone()); - - tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, - new CG_suifRepr(tnl)); - - } - } - } - - //Build dimGrid dim3 variables based on loop dimentions and ti/tj - char blockD1[120]; - char blockD2[120]; - if(dim1){ - snprintf(blockD1, 120, "%s/%d", dim1->name(), cu_tx); - snprintf(blockD2, 120, "%s/%d", dim2->name(), cu_ty); - }else{ - snprintf(blockD1, 120, "%d", cu_bx); - snprintf(blockD2, 120, "%d", cu_by); - //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx); - //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty); - } - repr = ocg->CreateDim3(immed((char*)gridName), - immed(blockD1), - immed(blockD2)); - setup_code = ocg->StmtListAppend(setup_code, repr); - - repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx),immed(cu_ty)); - - if(cu_tz > 1) - repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty), immed(cu_tz)); - else - repr = ocg->CreateDim3(immed((char*)blockName), immed(cu_tx), immed(cu_ty)); - setup_code = ocg->StmtListAppend(setup_code, repr); - - //call kernel function with name loop_name - //like: transpose_k<<>>(devOPtr, devIPtr , width, height); - char dims[120]; - snprintf(dims,120,"<<<%s,%s>>>",gridName, blockName); - immed_list *iml = new immed_list; - iml->append(immed((char*)cu_kernel_name.c_str())); - iml->append(immed(dims)); - //printf("%s %s\n", static_cast(cu_kernel_name), dims); - for(int i=0; i= 0) - { - snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d, - const_cast(arrayVars[i].name.c_str())); - //printf("%d %s\n", i, dims); - iml->append(immed(dims)); - }else{ - //printf("%d %s\n", i, static_cast(arrayVars[i].name)); - iml->append(immed(const_cast( - arrayVars[i].name.c_str()))); - } - } - if(dim1){ - iml->append(immed(dim1)); - iml->append(immed(dim2)); - } - repr = ocg->CreateKernel(iml);//kernel call - setup_code = ocg->StmtListAppend(setup_code, repr); - - //cuda free variables - for(int i=0; iptr_to(), operand(), immed(cudaMemcpy))), 4); - the_call->set_argument(0, operand(arrayVars[i].out_data)); - the_call->set_argument(1, operand(symtab->lookup_var(const_cast( - arrayVars[i].name.c_str())))); - the_call->set_argument(2, arrayVars[i].size_expr.clone()); - the_call->set_argument(3, operand(cudaMemcpyDeviceToHost)); - - tree_node_list* tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - teardown_code = ocg->StmtListAppend(teardown_code, - new CG_suifRepr(tnl)); - } - - in_cal *the_call = - new in_cal(type_s32, operand(), operand(new in_ldc(unkown_func->ptr_to(), operand(), immed(cudaFree))), 1); - the_call->set_argument(0, operand(symtab->lookup_var(const_cast( - arrayVars[i].name.c_str())))); - - tree_node_list* tnl = new tree_node_list; - tnl->append(new tree_instr(the_call)); - teardown_code = ocg->StmtListAppend(teardown_code, - new CG_suifRepr(tnl)); - } - - // --------------- - // BUILD THE KERNEL - // --------------- - - //Extract out kernel body - tree_node_list* code = getCode(); - //Get rid of wrapper if that original() added - if(code->head()->contents->kind() == TREE_IF) - { - tree_if* ifn = (tree_if*)code->head()->contents; - code = ifn->then_part(); - } - - //Create kernel function body - proc_sym *new_psym = globals->new_proc(kernel_type, src_c, (char*)cu_kernel_name.c_str()); - proc_symtab *new_proc_syms = new proc_symtab(new_psym->name()); - globals->add_child(new_proc_syms); - - //Add Params - std::map loop_vars; - //In-Out arrays - type_node* fptr; - for(int i=0; itype()->clone(); - fptr = arrayVars[i].in_data->type(); - else - //fptr = arrayVars[i].out_data->type()->clone(); - fptr = arrayVars[i].out_data->type(); - fptr = new_proc_syms->install_type(fptr); - std::string name = arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name(); - var_sym* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name()); - //protonu--adding a check to ensure that texture memories are not passed in as arguments - if(arrayVars[i].tex_mapped != true && arrayVars[i].cons_mapped !=true ) - { - sym->set_param(); - new_proc_syms->params()->append(sym); - new_proc_syms->add_sym(sym);//protonu--added to suppress the addition of the redundant var in the kernel - } - if (arrayVars[i].cons_mapped == true) - { - sym->set_param(); - new_proc_syms->add_sym(sym); - } - //printf("inserting name: %s\n", static_cast(name)); - loop_vars.insert(std::pair(std::string(name), sym)); - } - - if(dim1) - { - //Array dimentions - var_sym* kdim1 = new var_sym(dim1->type(), dim1->name()); - kdim1->set_param(); - new_proc_syms->add_sym(kdim1); - loop_vars.insert(std::pair(std::string(dim1->name()), kdim1)); - var_sym* kdim2 = new var_sym(dim2->type(), dim2->name()); - kdim2->set_param(); - new_proc_syms->add_sym(kdim2); - loop_vars.insert(std::pair(std::string(dim2->name()), kdim2)); - new_proc_syms->params()->append(kdim1); - new_proc_syms->params()->append(kdim2); - } - //Put block and thread implicit variables into scope - std::vector index_syms; - /* Currently we don't use the block dimentions - var_sym* blockDim_x = new var_sym(type_s32, "blockDim.x"); - blockDim_x->set_param(); - new_proc_syms->add_sym(blockDim_x); - var_sym* blockDim_y = new var_sym(type_s32, "blockDim.y"); - blockDim_y->set_param(); - new_proc_syms->add_sym(blockDim_y); - */ - if(cu_bx > 1){ - var_sym* blockIdx_x = new var_sym(type_s32, "blockIdx.x"); - blockIdx_x->set_param(); - new_proc_syms->add_sym(blockIdx_x); - index_syms.push_back(blockIdx_x); - } - if(cu_by > 1){ - var_sym* blockIdx_y = new var_sym(type_s32, "blockIdx.y"); - blockIdx_y->set_param(); - new_proc_syms->add_sym(blockIdx_y); - index_syms.push_back(blockIdx_y); - } - if(cu_tx > 1){ - var_sym* threadIdx_x = new var_sym(type_s32, "threadIdx.x"); - threadIdx_x->set_param(); - new_proc_syms->add_sym(threadIdx_x); - index_syms.push_back(threadIdx_x); - } - if(cu_ty > 1){ - var_sym* threadIdx_y = new var_sym(type_s32, "threadIdx.y"); - threadIdx_y->set_param(); - new_proc_syms->add_sym(threadIdx_y); - index_syms.push_back(threadIdx_y); - } - - if(cu_tz > 1){ - var_sym* threadIdx_z = new var_sym(type_s32, "threadIdx.z"); - threadIdx_z->set_param(); - new_proc_syms->add_sym(threadIdx_z); - index_syms.push_back(threadIdx_z); - } - - //Figure out which loop variables will be our thread and block dimention variables - std::vector loop_syms; - //Get our indexes - std::vector indexes;// = get_loop_indexes(code,cu_num_reduce); - int threadsPos=0; - if(cu_bx > 1) - indexes.push_back("bx"); - if(cu_by > 1) - indexes.push_back("by"); - if(cu_tx > 1){ - threadsPos = indexes.size(); - indexes.push_back("tx"); - } - if(cu_ty > 1) - indexes.push_back("ty"); - if(cu_tz > 1) - indexes.push_back("tz"); - for(int i=0; iadd_sym(loop_syms[i]); - //loop_vars.insert(std::pair(std::string(indexes[i]), loop_syms[i])); - } - - //Generate this code - //int bx = blockIdx.x - //int by = blockIdx.y - //int tx = threadIdx.x - //int ty = threadIdx.y - CG_outputRepr *body=NULL; - for(int i=0; iStmtListAppend(body, ocg->CreateStmtList( - // ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))))); - body = ocg->StmtListAppend(body, ocg->StmtListAppend( - ocg->CreateAssignment(0, lhs, new CG_suifRepr(operand(index_syms[i]))), NULL)); - } - - //Get our inital code prepped for loop reduction. First we need to swap - //out internal SUIF variable references to point to the new local - //function symbol table. - std::map loop_idxs; //map from idx names to their new syms - std::vector< std::pair > dim_vars; //pair is of var_sym (for 2D array size initializations) - replacements r; - tree_node_list* swapped = swapVarReferences(code, &r, ocg, loop_vars, new_proc_syms, dim_vars); - //printf("\n code before recursiveFindReplacePreferedIdxs :\n"); - //swapped->print(); - swapped = recursiveFindReplacePreferedIdxs(swapped, new_proc_syms, cudaSync, void_func, loop_idxs);//in-place swapping - //printf("\n code after recursiveFindReplacePreferedIdxs :\n"); - //swapped->print(); - - for(int i=0; i tfs = findCommentedFors(indexes[i], swapped); - for(int k=0; kprint(); - swap_node_for_node_list(tfs[k], newBlock); - //printf("AFTER SWAP\n"); newBlock->print(); - } - } - //printf("AFTER REDUCE\n"); swapped->print(); - - if(static_cast(ir)->init_code()){ - tree_node_list* orig_init_code = static_cast(static_cast(ir)->init_code())->GetCode(); - for(int i=0; ikind() == TREE_INSTR && ((tree_instr*)node)->instr()->format() == inf_rrr) - { - in_rrr* inst = (in_rrr*)((tree_instr*)node)->instr(); - //expect the structure: cpy( _ = min(grab_me, _)) - if(inst->opcode() == io_cpy && inst->dst_op().is_symbol()){ - //printf("looking at instruction: "); - //inst->print(); - var_sym* dest = inst->dst_op().symbol(); - if(dest == dim_vars[i].first) - { - if(inst->src1_op().is_instr() && inst->src1_op().instr()->format() == inf_ldc){ - value = ((in_ldc*)inst->src1_op().instr())->value().integer(); - } - } - } - } - } - if(value < 0){ - fprintf(stderr, "ERROR: Could not find initializing statement for variable used in upper_bound of array type"); - } - CG_outputRepr *lhs = new CG_suifRepr(operand(dim_vars[i].second)); - //body = ocg->StmtListAppend(body, ocg->CreateStmtList(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)))); - body = ocg->StmtListAppend(body, ocg->StmtListAppend(ocg->CreateAssignment(0, lhs, ocg->CreateInt(value)), NULL)); - } - } - - - body = ocg->StmtListAppend(body, new CG_suifRepr(swapped)); - - //protonu--lets try creating our function definiton here - var_sym *tsym = NULL; - - - std::vector refs = ir->FindArrayRef(body); - for(int i=0; iis_array_tex_mapped(refs[i]->name().c_str())) - { - //protonu--our new tex lookup function - in_cal *tex_lookup = - new in_cal(type_f32, operand(), operand(new in_ldc(float_func->ptr_to(), operand(), immed(tex1D))), 2); - - //printf("name of the array to be mapped is %s\n", refs[i]->name().c_str()); - tsym = tex_ref_map[refs[i]->name()]; - tex_lookup->set_argument(0, operand(tsym)); - - - int array_dims = ((IR_suifArrayRef *)refs[i])->ia_->dims(); - - if (array_dims == 1){ - tex_lookup->set_argument(1, ((IR_suifArrayRef *)refs[i])->ia_->index(0).clone()); - }else if (array_dims > 2) { - printf(" \n we don't handle more than 2D arrays mapped to textures yet\n"); - }else if (array_dims == 2) { - - IR_ArraySymbol *sym = refs[i]->symbol(); - CG_outputRepr *sz = sym->size(1); - delete sym; // free the wrapper object only - // find the builder ocg - CG_outputRepr *expr = ocg->CreateTimes(sz->clone(),refs[i]->index(0)); - delete sz; // free the wrapper object only - expr = ocg->CreatePlus(expr, refs[i]->index(1)); - // expr holds the 1D access expression and take it out - tex_lookup->set_argument(1, ((CG_suifRepr *)expr)->GetExpression()); - } - - //using chun's function to replace the array look up with the function call - ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(tex_lookup))); - } - - } - - - tsym = NULL; - //protonu--now let's try what we did above for constant memory - for(int i=0; iis_array_cons_mapped(refs[i]->name().c_str())) - { - - //printf("name of the array to be cons mapped is %s\n", refs[i]->name().c_str()); - tsym = cons_ref_map[refs[i]->name()]; - //we should create a IR_SuifArray here - IR_ArraySymbol *ar_sym = new IR_suifArraySymbol(ir,tsym); - std::vector ar_index; - ar_index.push_back(((IR_suifArrayRef *)refs[i])->index(0)); - IR_ArrayRef *ar_ref = ((IR_suifCode *)ir)->CreateArrayRef(ar_sym, ar_index); - //using chun's function to replace the array look up with the function call - ((IR_suifCode *)ir)->ReplaceExpression(refs[i] , new CG_suifRepr(operand(((IR_suifArrayRef *)ar_ref)->ia_))); - - } - } - - - tree_proc *new_body = new tree_proc(static_cast(body)->GetCode(), new_proc_syms); - //globals->add_child(new_proc_syms); - new_psym->set_block(new_body); - new_procs.push_back(new_psym); - - return swapped; -} - -//Order taking out dummy variables -std::vector cleanOrder(std::vector idxNames){ - std::vector results; - for(int j=0; j& curOrder) -{ - //printf("curOrder: "); - //printVs(curOrder); - //printf("idxNames: "); - //printVS(idxNames[stmt]); - std::vector cIdxNames = cleanOrder(idxNames[stmt]); - bool same=true; - std::vector pi; - for(int i=0; i &pi) -{ -// check for sanity of parameters - if (stmt_num >= stmt.size() || stmt_num < 0) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - const int n = stmt[stmt_num].xform.n_out(); - if (pi.size() > (n-1)/2) - throw std::invalid_argument("iteration space dimensionality does not match permute dimensionality"); - int first_level = 0; - int last_level = 0; - for (int i = 0; i < pi.size(); i++) { - if (pi[i] > (n-1)/2 || pi[i] <= 0) - throw std::invalid_argument("invalid loop level " + to_string(pi[i]) + " in permuation"); - - if (pi[i] != i+1) { - if (first_level == 0) - first_level = i+1; - last_level = i+1; - } - } - if (first_level == 0) - return true; - - std::vector lex = getLexicalOrder(stmt_num); - std::set active = getStatements(lex, 2*first_level-2); - Loop::permute(active, pi); -} - - -void LoopCuda::tile_cuda(int stmt, int level, int outer_level) -{ - tile_cuda(stmt,level,1,outer_level,"","",CountedTile); -} -void LoopCuda::tile_cuda(int level, int tile_size, int outer_level, std::string idxName, - std::string ctrlName, TilingMethodType method){ - tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method); -} - -void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level, std::string idxName, - std::string ctrlName, TilingMethodType method){ - //Do regular tile but then update the index and control loop variable - //names as well as the idxName to reflect the current state of things. - //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level); - //printf("idxNames before: "); - //printVS(idxNames[stmt]); - - tile(stmt, level, tile_size, outer_level, method); - - if(idxName.size()) - idxNames[stmt][level-1] = idxName.c_str(); - if(tile_size == 1){ - //potentially rearrange loops - if(outer_level < level){ - std::string tmp = idxNames[stmt][level-1]; - for(int i=level-1; i>outer_level-1; i--){ - if(i-1 >= 0) - idxNames[stmt][i] = idxNames[stmt][i-1]; - } - idxNames[stmt][outer_level-1] = tmp; - } - //TODO: even with a tile size of one, you need a insert (of a dummy loop) - idxNames[stmt].insert(idxNames[stmt].begin()+(level),""); - }else{ - if(!ctrlName.size()) - throw std::runtime_error("No ctrl loop name for tile"); - //insert - idxNames[stmt].insert(idxNames[stmt].begin()+(outer_level-1),ctrlName.c_str()); - } - - //printf("idxNames after: "); - //printVS(idxNames[stmt]); -} - - -bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level, const std::string &array_name, const std::vector &privatized_levels, bool allow_extra_read , int fastest_changing_dimension , int padding_stride , int padding_alignment , bool cuda_shared) -{ - int old_stmts =stmt.size(); - //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared); - if(cuda_shared) - datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 1); - else - datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, 0); - - - //Adjust idxNames to reflect updated state - std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); - int new_stmts = stmt.size(); - for(int i=old_stmts; i idxs; - - - //protonu-making sure the vector of nonSplitLevels grows along with - //the statement structure - stmt_nonSplitLevels.push_back(omega::Tuple()); - - //Indexes up to level will be the same - for(int j=0; j new_idxs, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, bool cuda_shared) -{ - - int old_stmts =stmt.size(); - //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared); - if(cuda_shared) - datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 1); - else - datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment, 0); - //Adjust idxNames to reflect updated state - std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); - int new_stmts = stmt.size(); - for(int i=old_stmts; i idxs; - - //protonu-making sure the vector of nonSplitLevels grows along with - //the statement structure - stmt_nonSplitLevels.push_back(omega::Tuple()); - - //protonu--lets dump out the code from each statement here - //printf("\n dumping statement :%d", i); - //stmt[i].code->Dump(); - - //Indexes up to level will be the same - for(int j=0; j lex = getLexicalOrder(stmt_num); - std::set same_loop = getStatements(lex, dim-1); - - level = nonDummyLevel(stmt_num,level); - //printf("unrolling %d at level %d\n", stmt_num,level); - - //protonu--using the new version of unroll, which returns - //a set of ints instead of a bool. To keep Gabe's logic - //I'll check the size of the set, if it's 0 return true - //bool b= unroll(stmt_num, level, unroll_amount); - std::set b_set= unroll(stmt_num, level, unroll_amount); - bool b = false; - if (b_set.size() == 0) b = true; - //end--protonu - - //Adjust idxNames to reflect updated state - std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); - std::vector origSource = idxNames[stmt_num];; - //Drop index names at level - if(unroll_amount == 0){ - //For all statements that were in this unroll together, drop index name for unrolled level - idxNames[stmt_num][level-1] = ""; - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - //printf("in same loop as %d is %d\n", stmt_num, (*i)); - //idxNames[(*i)][level-1] = ""; - idxNames[(*i)] = idxNames[stmt_num]; - } - } - - lex = getLexicalOrder(stmt_num); - same_loop = getStatements(lex, dim-1); - - bool same_as_source = false; - int new_stmts = stmt.size(); - for(int i=old_stmts; i()); - - - //We expect that new statements have a constant for the variable in - //stmt[i].IS at level (as seen with print_with_subs), otherwise there - //will be a for loop at level and idxNames should match stmt's - //idxNames pre-unrolled - Relation IS = stmt[i].IS; - //Ok, if you know how the hell to get anything out of a Relation, you - //should probably be able to do this more elegantly. But for now, I'm - //hacking it. - std::string s = IS.print_with_subs_to_string(); - //s looks looks like - //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128} - //where level == 5, you see a integer in the input set - - //If that's not an integer and this is the first new statement, then - //we think codegen will have a loop at that level. It's not perfect, - //not sure if it can be determined without round-tripping to codegen. - int sIdx = 0; - int eIdx = 0; - for(int j=0; j 0){ - eIdx = s.find("]"); - int tmp = s.find(",",sIdx+1); - if(tmp > 0 && tmp < eIdx) - eIdx = tmp; //", before ]" - if(eIdx > 0){ - sIdx++; - std::string var = s.substr(sIdx,eIdx-sIdx); - //printf("%s\n", s.c_str()); - //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str()); - if(atoi(var.c_str()) == 0 && i ==old_stmts){ - //TODO:Maybe do see if this new statement would be in the same - //group as the original and if it would, don't say - //same_as_source - if(same_loop.find(i) == same_loop.end()){ - printf("stmt %d level %d, newly created unroll statement should have same level indexes as source\n", i, level); - same_as_source = true; - } - } - } - } - - - //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1); - if(same_as_source) - idxNames.push_back(origSource); - else - idxNames.push_back(idxNames[stmt_num]); - } - - return b; -} - -void LoopCuda::copy_to_texture(const char *array_name) -{ - //protonu--placeholder for now - //set the bool for using cuda memory as true - //in a vector of strings, put the names of arrays to tex mapped - if ( !texture ) - texture = new texture_memory_mapping(true, array_name); - else - texture->add(array_name); - - -} - - -void LoopCuda::copy_to_constant(const char *array_name) -{ - //protonu--placeholder for now - //set the bool for using cuda memory as true - //in a vector of strings, put the names of arrays to tex mapped - if ( !constant_mem ) - constant_mem = new constant_memory_mapping(true, array_name); - else - constant_mem->add(array_name); -} - -//protonu--moving this from Loop -tree_node_list* LoopCuda::codegen() -{ - if(code_gen_flags & GenCudaizeV2) - return cudaize_codegen_v2(); - //Do other flagged codegen methods, return plain vanilla generated code - return getCode(); -} - -//These three are in Omega code_gen.cc and are used as a massive hack to -//get out some info from MMGenerateCode. Yea for nasty side-effects. -namespace omega{ - extern int checkLoopLevel; - extern int stmtForLoopCheck; - extern int upperBoundForLevel; - extern int lowerBoundForLevel; -} - - -void LoopCuda::extractCudaUB(int stmt_num, int level, int &outUpperBound, int &outLowerBound){ - // check for sanity of parameters - const int m = stmt.size(); - if (stmt_num >= m || stmt_num < 0) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - const int n = stmt[stmt_num].xform.n_out(); - if (level > (n-1)/2 || level <= 0) - throw std::invalid_argument("invalid loop level " + to_string(level)); - - int dim = 2*level-1; - - std::vector lex = getLexicalOrder(stmt_num); - std::set same_loop = getStatements(lex, dim-1); - - // extract the intersection of the iteration space to be considered - Relation hull; - { - hull = Relation::True(n); - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - hull = Intersection(hull, project_onto_levels(getNewIS(*i), dim+1, true)); - hull.simplify(2, 4); - } - - for (int i = 2; i <= dim+1; i+=2) { - //std::string name = std::string("_t") + to_string(t_counter++); - std::string name = std::string("_t") + to_string(tmp_loop_var_name_counter++); - hull.name_set_var(i, name); - } - hull.setup_names(); - } - - // extract the exact loop bound of the dimension to be unrolled - if (is_single_iteration(hull, dim)){ - throw std::runtime_error("No loop availabe at level to extract upper bound."); - } - Relation bound = get_loop_bound(hull, dim); - if (!bound.has_single_conjunct() || !bound.is_satisfiable() || bound.is_tautology()) - throw loop_error("loop error: unable to extract loop bound for cudaize"); - - // extract the loop stride - EQ_Handle stride_eq; - int stride = 1; - { - bool simple_stride = true; - int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(dim+1), stride_eq, simple_stride); - if (strides > 1) - throw loop_error("loop error: too many strides"); - else if (strides == 1) { - int sign = stride_eq.get_coef(bound.set_var(dim+1)); -// assert(sign == 1 || sign == -1); - Constr_Vars_Iter it(stride_eq, true); - stride = abs((*it).coef/sign); - } - } - if(stride != 1){ - char buf[1024]; - sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d", level, stride); - throw std::runtime_error(buf); - } - - //Use code generation system to build tell us our bound information. We - //need a hard upper bound a 0 lower bound. - - checkLoopLevel = level*2; - stmtForLoopCheck = stmt_num; - upperBoundForLevel = -1; - lowerBoundForLevel = -1; - printCode(1,false); - checkLoopLevel = 0; - - outUpperBound = upperBoundForLevel; - outLowerBound = lowerBoundForLevel; - return; -} - - -void LoopCuda::printCode(int effort, bool actuallyPrint) const { - const int m = stmt.size(); - if (m == 0) - return; - const int n = stmt[0].xform.n_out(); - - - - Tuple IS(m); - Tuple xform(m); - Tuple nonSplitLevels(m); - for (int i = 0; i < m; i++) { - IS[i+1] = stmt[i].IS; - xform[i+1] = stmt[i].xform; - nonSplitLevels[i+1] = stmt_nonSplitLevels[i]; - //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; - } - - Tuple< Tuple > idxTupleNames; - if(useIdxNames){ - for(int i=0; i idxs; - for(int j=0; jknown), n - this->known.n_set()); - CG_stringBuilder *ocg = new CG_stringBuilder(); - Tuple nameInfo; - for (int i = 1; i <= m; i++) - nameInfo.append(new CG_stringRepr("s" + to_string(i))); - CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort); - if(actuallyPrint) - std::cout << GetString(repr); -/* - for (int i = 1; i <= m; i++) - delete nameInfo[i]; -*/ - - delete ocg; -} - - - -void LoopCuda::printRuntimeInfo() const { - for(int i=0; i(stmt[i].code)->GetCode()->print_expr(); - } -} - -void LoopCuda::printIndexes() const { - for(int i=0; i0) - printf(","); - printf("%s", idxNames[i][j].c_str()); - } - printf("\n"); - } -} - -tree_node_list* LoopCuda::getCode(int effort) const { - const int m = stmt.size(); - if (m == 0) - return new tree_node_list; - const int n = stmt[0].xform.n_out(); - - - - Tuple ni(m); - Tuple IS(m); - Tuple xform(m); - Tuple< IntTuple > nonSplitLevels(m); - for (int i = 0; i < m; i++) { - ni[i+1] = stmt[i].code; - IS[i+1] = stmt[i].IS; - xform[i+1] = stmt[i].xform; - nonSplitLevels[i+1] = stmt_nonSplitLevels[i]; - //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; - } - - - Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); -#ifdef DEBUG -// std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, known, effort)); -#endif - Tuple< Tuple > idxTupleNames; - if(useIdxNames){ - for(int i=0; i idxs; - for(int j=0; jbuilder(); - CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, nonSplitLevels, syncs, idxTupleNames, effort); - - //CG_outputRepr *overflow_initialization = ocg->CreateStmtList(); - //protonu--using the new function CG_suifBuilder::StmtListAppend - CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL); - for (std::map >::const_iterator i = overflow.begin(); i != overflow.end(); i++) - for (std::vector::const_iterator j = i->second.begin(); j != i->second.end(); j++) - //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)))); - overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->StmtListAppend(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)), NULL)); - - repr = ocg->StmtListAppend(overflow_initialization, repr); - tree_node_list *tnl = static_cast(repr)->GetCode(); - - delete repr; - /* - for (int i = 1; i <= m; i++) - delete ni[i]; - */ - - return tnl; -} - - -//protonu--adding constructors for the new derived class -LoopCuda::LoopCuda():Loop(), code_gen_flags(GenInit){} - -LoopCuda::LoopCuda(IR_Control *irc, int loop_num) - :Loop(irc) -{ - setup_code = NULL; - teardown_code = NULL; - code_gen_flags = 0; - cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1; - cu_num_reduce = 0; - cu_mode = GlobalMem; - texture = NULL; - constant_mem = NULL; - - int m=stmt.size(); - //printf("\n the size of stmt(initially) is: %d\n", stmt.size()); - for(int i=0; i()); - - - //protonu--setting up - //proc_symtab *symtab - //global_symtab *globals - - globals = ((IR_cudasuifCode *)ir)->gsym_ ; - std::vector tf = ((IR_cudasuifCode *)ir)->get_loops(); - - symtab = tf[loop_num]->proc()->block()->proc_syms(); - - std::vector deepest = find_deepest_loops(tf[loop_num]); - - for (int i = 0; i < deepest.size(); i++){ - index.push_back(deepest[i]->index()->name()); //reflects original code index names - } - - for(int i=0; i< stmt.size(); i++) - idxNames.push_back(index); //refects prefered index names (used as handles in cudaize v2) - useIdxNames=false; - -} - diff --git a/loop_cuda_rose.cc b/loop_cuda_rose.cc deleted file mode 100644 index c5633ee..0000000 --- a/loop_cuda_rose.cc +++ /dev/null @@ -1,3734 +0,0 @@ -/***************************************************************************** - Copyright (C) 2009 University of Utah - All Rights Reserved. - - Purpose: - Cudaize methods - - Notes: - - History: - 1/7/10 Created by Gabe Rudy by migrating code from loop.cc - 31/1/11 Modified by Protonu Basu -*****************************************************************************/ -#define TRANSFORMATION_FILE_INFO Sg_File_Info::generateDefaultFileInfoForTransformationNode() -#include -#include -#include -#include -#include "loop_cuda_rose.hh" -#include "loop.hh" -#include -//#include -#include "omegatools.hh" -#include "ir_cudarose.hh" -#include "ir_rose.hh" -#include "ir_rose_utils.hh" -#include "chill_error.hh" -#include -#include "Outliner.hh" -//#define DEBUG -using namespace omega; -using namespace SageBuilder; -using namespace SageInterface; -//using namespace Outliner; -//using namespace ASTtools; -char *k_cuda_texture_memory; //protonu--added to track texture memory type -//extern char *omega::k_cuda_texture_memory; //protonu--added to track texture memory type -extern char *omega::k_ocg_comment; - -static int cudaDebug; -class CudaStaticInit { -public: - CudaStaticInit() { - cudaDebug = 0; //Change this to 1 for debug - } -}; -static CudaStaticInit junkInitInstance__; - -std::string& upcase(std::string& s) { - for (int i = 0; i < s.size(); i++) - s[i] = toupper(s[i]); - return s; -} - -void printVs(const std::vector& curOrder) { - if (!cudaDebug) return; - for (int i = 0; i < curOrder.size(); i++) { - if (i > 0) - printf(","); - printf("%s", curOrder[i].c_str()); - } - printf("\n"); -} - -void printVS(const std::vector& curOrder) { - if(!cudaDebug) return; - for (int i = 0; i < curOrder.size(); i++) { - if (i > 0) - printf(","); - printf("%s", curOrder[i].c_str()); - } - printf("\n"); -} - -LoopCuda::~LoopCuda() { - const int m = stmt.size(); - for (int i = 0; i < m; i++) - stmt[i].code->clear(); -} - -bool LoopCuda::symbolExists(std::string s) { - - if (body_symtab->find_variable(SgName(s.c_str())) - || parameter_symtab->find_variable(SgName(s.c_str()))) - return true; - if (globals->lookup_variable_symbol(SgName(s.c_str()))) - return true; - for (int i = 0; i < idxNames.size(); i++) - for (int j = 0; j < idxNames[i].size(); j++) - if (strcmp(idxNames[i][j].c_str(), s.c_str()) == 0) - return true; - return false; -} - -void LoopCuda::addSync(int stmt_num, std::string idxName) { - //we store these and code-gen inserts sync to omega comments where stmt - //in loop that has idxName being generated - syncs.push_back(make_pair(stmt_num, idxName)); -} - -void LoopCuda::renameIndex(int stmt_num, std::string idx, std::string newName) { - int level = findCurLevel(stmt_num, idx); - if (idxNames.size() <= stmt_num || idxNames[stmt_num].size() < level) - throw std::runtime_error("Invalid statment number of index"); - idxNames[stmt_num][level - 1] = newName.c_str(); -} - -enum Type { - Int -}; - -SgNode* wrapInIfFromMinBound(SgNode* then_part, SgForStatement* loop, - SgScopeStatement* symtab, SgVariableSymbol* bound_sym) { - // CG_roseBuilder *ocg = new CG_roseBuilder( - - SgBinaryOp* test_expr = isSgBinaryOp(loop->get_test_expr()); - SgExpression* upperBound; - SgExpression* conditional; - upperBound = test_expr->get_rhs_operand(); - CG_outputRepr *ifstmt; - - SgCallExpression *call; - if (call = isSgCallExpression(upperBound)) - if (isSgVarRefExp(call->get_function())->get_symbol()->get_name().getString() - == "__rose_lt") { - SgExprListExp* arg_list = call->get_args(); - SgExpression *if_bound = *(arg_list->get_expressions().begin()); - /*This relies on the minimum expression being the rhs operand of - * the min instruction. - */ - SgIfStmt *ifstmt = buildIfStmt( - buildLessOrEqualOp(buildVarRefExp(bound_sym), if_bound), - isSgStatement(then_part), NULL); - return isSgNode(ifstmt); - - } - -/* if (isSgConditionalExp(upperBound)) { - conditional = isSgConditionalExp(upperBound)->get_conditional_exp(); - - if (isSgBinaryOp(conditional)) { - SgBinaryOp* binop = isSgBinaryOp(conditional); - - if (isSgLessThanOp(binop) || isSgLessOrEqualOp(binop)) { - SgIfStmt *ifstmt = buildIfStmt( - buildLessOrEqualOp(buildVarRefExp(bound_sym), - test_expr), isSgStatement(then_part), NULL); - return isSgNode(ifstmt); - } - - } - - } -*/ - return then_part; -} - -/** - * This would be better if it was done by a CHiLL xformation instead of at codegen - * - * state: - * for(...) - * for(...) - * cur_body - * stmt1 - * - * stm1 is in-between two loops that are going to be reduced. The - * solution is to put stmt1 at the end of cur_body but conditionally run - * in on the last step of the for loop. - * - * A CHiLL command that would work better: - * - * for(...) - * stmt0 - * for(for i=0; i - * for(...) - * for(for i=0; i findCommentedFors(const char* index, SgNode* tnl) { - std::vector result; - bool next_loop_ok = false; - - if (isSgBasicBlock(tnl)) { - - SgStatementPtrList& list = isSgBasicBlock(tnl)->get_statements(); - - for (SgStatementPtrList::iterator it = list.begin(); it != list.end(); - it++) { - std::vector t = findCommentedFors(index, - isSgNode(*it)); - std::copy(t.begin(), t.end(), back_inserter(result)); - } - } else if (isSgForStatement(tnl)) { - - AstTextAttribute* att = - (AstTextAttribute*) (isSgNode(tnl)->getAttribute( - "omega_comment")); - std::string comment = att->toString(); - - if (comment.find("~cuda~") != std::string::npos - && comment.find("preferredIdx: ") != std::string::npos) { - std::string idx = comment.substr( - comment.find("preferredIdx: ") + 14, std::string::npos); - if (idx.find(" ") != std::string::npos) - idx = idx.substr(0, idx.find(" ")); - if (strcmp(idx.c_str(), index) == 0) - next_loop_ok = true; - } - - if (next_loop_ok) { - //printf("found loop %s\n", static_cast(tn)->index()->name()); - result.push_back(isSgForStatement(tnl)); - } else { - //printf("looking down for loop %s\n", static_cast(tn)->index()->name()); - std::vector t = findCommentedFors(index, - isSgForStatement(tnl)->get_loop_body()); - std::copy(t.begin(), t.end(), back_inserter(result)); - } - next_loop_ok = false; - } else if (isSgIfStmt(tnl)) { - //printf("looking down if\n"); - SgIfStmt *tni = isSgIfStmt(tnl); - std::vector t = findCommentedFors(index, - tni->get_true_body()); - std::copy(t.begin(), t.end(), back_inserter(result)); - } - - return result; -} - -SgNode* forReduce(SgForStatement* loop, SgVariableSymbol* reduceIndex, - SgScopeStatement* body_syms) { - //We did the replacements all at once with recursiveFindPreferedIdxs - //replacements r; - //r.oldsyms.append(loop->index()); - //r.newsyms.append(reduceIndex); - //tree_for* new_loop = (tree_for*)loop->clone_helper(&r, true); - SgForStatement* new_loop = loop; - - //return body one loops in - SgNode* tnl = loop_body_at_level(new_loop, 1); - //wrap in conditional if necessary - tnl = wrapInIfFromMinBound(tnl, new_loop, body_syms, reduceIndex); - return tnl; -} - -void recursiveFindRefs(SgNode* code, std::set& syms, - SgFunctionDefinition* def) { - - SgStatement* s = isSgStatement(code); - // L = {symbols defined within 's'}, local variables declared within 's' - ASTtools::VarSymSet_t L; - ASTtools::collectDefdVarSyms(s, L); - //dump (L, "L = "); - - // U = {symbols used within 's'} - ASTtools::VarSymSet_t U; - ASTtools::collectRefdVarSyms(s, U); - //dump (U, "U = "); - - // U - L = {symbols used within 's' but not defined in 's'} - // variable references to non-local-declared variables - ASTtools::VarSymSet_t diff_U_L; - set_difference(U.begin(), U.end(), L.begin(), L.end(), - inserter(diff_U_L, diff_U_L.begin())); - //dump (diff_U_L, "U - L = "); - - // Q = {symbols defined within the function surrounding 's' that are - // visible at 's'}, including function parameters - ASTtools::VarSymSet_t Q; - ASTtools::collectLocalVisibleVarSyms(def->get_declaration(), s, Q); -// dump (Q, "Q = "); - - // (U - L) \cap Q = {variables that need to be passed as parameters - // to the outlined function} - // a sub set of variables that are not globally visible (no need to pass at all) - // It excludes the variables with a scope between global and the enclosing function - set_intersection(diff_U_L.begin(), diff_U_L.end(), Q.begin(), Q.end(), - inserter(syms, syms.begin())); - - /* std::vector scalars; - //SgNode *tnl = static_cast(repr)->GetCode(); - SgStatement* stmt; - SgExpression* exp; - if (tnl != NULL) { - if(stmt = isSgStatement(tnl)){ - if(isSgBasicBlock(stmt)){ - SgStatementPtrList& stmts = isSgBasicBlock(stmt)->get_statements(); - for(int i =0; i < stmts.size(); i++){ - //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgNode(stmts[i])); - std::vector a = recursiveFindRefs(isSgNode(stmts[i])); - //delete r; - std::copy(a.begin(), a.end(), back_inserter(scalars)); - } - - } - else if(isSgForStatement(stmt)){ - - SgForStatement *tnf = isSgForStatement(stmt); - //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgStatement(tnf->get_loop_body())); - std::vector a = recursiveFindRefs(isSgNode(tnf->get_loop_body())); - //delete r; - std::copy(a.begin(), a.end(), back_inserter(scalars)); - } - else if(isSgFortranDo(stmt)){ - SgFortranDo *tfortran = isSgFortranDo(stmt); - omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgStatement(tfortran->get_body())); - std::vector a = recursiveFindRefs(r); - delete r; - std::copy(a.begin(), a.end(), back_inserter(scalars)); - } - - else if(isSgIfStmt(stmt) ){ - SgIfStmt* tni = isSgIfStmt(stmt); - //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgNode(tni->get_conditional())); - std::vector a = recursiveFindRefs(isSgNode(tni->get_conditional())); - //delete r; - std::copy(a.begin(), a.end(), back_inserter(scalars)); - //r = new omega::CG_roseRepr(isSgNode(tni->get_true_body())); - a = recursiveFindRefs(isSgNode(tni->get_true_body())); - //delete r; - std::copy(a.begin(), a.end(), back_inserter(scalars)); - //r = new omega::CG_roseRepr(isSgNode(tni->get_false_body())); - a = recursiveFindRefs(isSgNode(tni->get_false_body())); - //delete r; - std::copy(a.begin(), a.end(), back_inserter(scalars)); - } - else if(isSgExprStatement(stmt)) { - //omega::CG_roseRepr *r = new omega::CG_roseRepr(isSgExpression(isSgExprStatement(stmt)->get_expression())); - std::vector a = recursiveFindRefs(isSgNode(isSgExprStatement(stmt)->get_expression())); - //delete r; - std::copy(a.begin(), a.end(), back_inserter(scalars)); - - } - } - } - else{ - SgExpression* op = isSgExpression(tnl); - if(isSgVarRefExp(op)){ - - scalars.push_back(isSgVarRefExp(op)->get_symbol()); - - } - else if( isSgAssignOp(op)){ - //omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgAssignOp(op)->get_lhs_operand()); - std::vector a1 = recursiveFindRefs(isSgNode(isSgAssignOp(op)->get_lhs_operand())); - //delete r1; - std::copy(a1.begin(), a1.end(), back_inserter(scalars)); - //omega::CG_roseRepr *r2 = new omega::CG_roseRepr(isSgAssignOp(op)->get_rhs_operand()); - std::vector a2 = recursiveFindRefs(isSgNode(isSgAssignOp(op)->get_rhs_operand())); - //delete r2; - std::copy(a2.begin(), a2.end(), back_inserter(scalars)); - - } - else if(isSgBinaryOp(op)){ - // omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgBinaryOp(op)->get_lhs_operand()); - std::vector a1 = recursiveFindRefs(isSgNode(isSgBinaryOp(op)->get_lhs_operand())); - //delete r1; - std::copy(a1.begin(), a1.end(), back_inserter(scalars)); - //omega::CG_roseRepr *r2 = new omega::CG_roseRepr(isSgBinaryOp(op)->get_rhs_operand()); - std::vector a2 = recursiveFindRefs((isSgBinaryOp(op)->get_rhs_operand())); - //delete r2; - std::copy(a2.begin(), a2.end(), back_inserter(scalars)); - } - else if(isSgUnaryOp(op)){ - //omega::CG_roseRepr *r1 = new omega::CG_roseRepr(isSgUnaryOp(op)->get_operand()); - std::vector a1 = recursiveFindRefs(isSgNode(isSgUnaryOp(op)->get_operand())); - //delete r1; - std::copy(a1.begin(), a1.end(), back_inserter(scalars)); - } - - } - return scalars; - - - */ - -} - -SgNode* recursiveFindReplacePreferedIdxs(SgNode* code, SgSymbolTable* body_syms, - SgSymbolTable* param_syms, SgScopeStatement* body, - std::map& loop_idxs, - SgGlobal* globalscope, bool sync = false) { - //tree_node_list* tnl = new tree_node_list; - //tree_node_list_iter tnli(code); - SgVariableSymbol* idxSym = 0; - std::vector r1; - std::vector r2; - SgNode* tnli; - SgNode* tnli1; - SgNode* tnli2; - SgBasicBlock * clone; - - if (isSgForStatement(code)) { - AstTextAttribute* att = - (AstTextAttribute*) (isSgNode(code)->getAttribute( - "omega_comment")); - - std::string comment; - if (att != NULL) - comment = att->toString(); - - if (comment.find("~cuda~") != std::string::npos - && comment.find("preferredIdx: ") != std::string::npos) { - std::string idx = comment.substr( - comment.find("preferredIdx: ") + 14, std::string::npos); - if (idx.find(" ") != std::string::npos) - idx = idx.substr(0, idx.find(" ")); - if (loop_idxs.find(idx) != loop_idxs.end()) - idxSym = loop_idxs.find(idx)->second; - //Get the proc variable sybol for this preferred index - if (idxSym == 0) { - idxSym = body_syms->find_variable(idx.c_str()); - if (!idxSym) - idxSym = param_syms->find_variable(idx.c_str()); - //printf("idx not found: lookup %p\n", idxSym); - if (!idxSym) { - SgVariableDeclaration* defn = buildVariableDeclaration( - SgName((char*) idx.c_str()), buildIntType()); - //idxSym = new var_sym(type_s32, (char*)idx.c_str()); - SgInitializedNamePtrList& variables = defn->get_variables(); - SgInitializedNamePtrList::const_iterator i = - variables.begin(); - SgInitializedName* initializedName = *i; - SgVariableSymbol* vs = new SgVariableSymbol( - initializedName); - prependStatement(defn, body); - vs->set_parent(body_syms); - body_syms->insert(SgName((char*) idx.c_str()), vs); - idxSym = vs; - //printf("idx created and inserted\n"); - } - //Now insert into our map for future - if (cudaDebug) - std::cout << idx << "\n\n"; - loop_idxs.insert(make_pair(idx, idxSym)); - } - //See if we have a sync as well - if (comment.find("sync") != std::string::npos) { - //printf("Inserting sync after current block\n"); - sync = true; - } - - } - if (idxSym) { - SgForInitStatement* list = - isSgForStatement(code)->get_for_init_stmt(); - SgStatementPtrList& initStatements = list->get_init_stmt(); - SgStatementPtrList::const_iterator j = initStatements.begin(); - const SgVariableSymbol* index; - - if (SgExprStatement *expr = isSgExprStatement(*j)) - if (SgAssignOp* op = isSgAssignOp(expr->get_expression())) - if (SgVarRefExp* var_ref = isSgVarRefExp( - op->get_lhs_operand())) - index = var_ref->get_symbol(); - - std::vector array = substitute(code, index, NULL, - isSgNode(body_syms)); - - for (int j = 0; j < array.size(); j++) - array[j]->set_symbol(idxSym); - } - - SgStatement* body_ = isSgStatement( - recursiveFindReplacePreferedIdxs( - isSgNode((isSgForStatement(code)->get_loop_body())), - body_syms, param_syms, body, loop_idxs, globalscope)); - - omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code); - omega::CG_outputRepr* block = tnl->clone(); - tnli = static_cast(block)->GetCode(); - - isSgForStatement(tnli)->set_loop_body(body_); - body_->set_parent(tnli); - - if (idxSym) { - SgForInitStatement* list = - isSgForStatement(tnli)->get_for_init_stmt(); - SgStatementPtrList& initStatements = list->get_init_stmt(); - SgStatementPtrList::const_iterator j = initStatements.begin(); - const SgVariableSymbol* index; - - if (SgExprStatement *expr = isSgExprStatement(*j)) - if (SgAssignOp* op = isSgAssignOp(expr->get_expression())) - if (SgVarRefExp* var_ref = isSgVarRefExp( - op->get_lhs_operand())) - index = var_ref->get_symbol(); - - std::vector array = substitute(tnli, index, NULL, - isSgNode(body_syms)); - - for (int j = 0; j < array.size(); j++) - array[j]->set_symbol(idxSym); - } - // std::cout << isSgNode(body_)->unparseToString() << "\n\n"; - if (att != NULL) - tnli->setAttribute("omega_comment", att); - - if (sync) { - SgName name_syncthreads("__syncthreads"); - SgFunctionSymbol * syncthreads_symbol = - globalscope->lookup_function_symbol(name_syncthreads); - - // Create a call to __syncthreads(): - SgFunctionCallExp * syncthreads_call = buildFunctionCallExp( - syncthreads_symbol, buildExprListExp()); - - SgExprStatement* stmt = buildExprStatement(syncthreads_call); - - /* if (SgBasicBlock* bb = isSgBasicBlock( - isSgForStatement(code)->get_loop_body())) - appendStatement(isSgStatement(stmt), bb); - - else if (SgStatement* ss = isSgStatement( - isSgForStatement(code)->get_loop_body())) { - SgBasicBlock* bb2 = buildBasicBlock(); - - isSgNode(ss)->set_parent(bb2); - appendStatement(ss, bb2); - - appendStatement(isSgStatement(stmt), bb2); - isSgNode(stmt)->set_parent(bb2); - isSgForStatement(code)->set_loop_body(bb2); - isSgNode(bb2)->set_parent(code); - } - */ - - SgBasicBlock* bb2 = buildBasicBlock(); - - bb2->append_statement(isSgStatement(tnli)); - bb2->append_statement(stmt); - /* SgNode* parent = code->get_parent(); - if(!isSgStatement(parent)) - throw loop_error("Parent not a statement"); - - if(isSgForStatement(parent)){ - if(SgStatement *ss = isSgForStatement(isSgForStatement(parent)->get_loop_body())){ - omega::CG_roseRepr * tnl = new omega::CG_roseRepr(ss); - omega::CG_outputRepr* block= tnl->clone(); - - SgNode *new_ss = static_cast(block)->GetCode(); - SgBasicBlock* bb2 = buildBasicBlock(); - - isSgNode(new_ss)->set_parent(bb2); - appendStatement(isSgStatement(new_ss), bb2); - appendStatement(isSgStatement(stmt), bb2); - isSgNode(stmt)->set_parent(bb2); - - isSgStatement(parent)->replace_statement_from_basicBlock(ss, isSgStatement(bb2)); - - }else if(isSgBasicBlock(isSgForStatement(parent)->get_loop_body())) - isSgStatement(isSgForStatement(parent)->get_loop_body())->insert_statement(isSgStatement(code), stmt, false); - else - throw loop_error("parent statement type undefined!!"); - - } - else if(isSgBasicBlock(parent)) - isSgStatement(parent)->insert_statement(isSgStatement(code), stmt, false); - else - throw loop_error("parent statement type undefined!!"); - - //tnl->print(); - * - * - */ - sync = true; - return isSgNode(bb2); - - } else - return tnli; - } else if (isSgIfStmt(code)) { - SgStatement* body_ = isSgStatement( - recursiveFindReplacePreferedIdxs( - isSgNode((isSgIfStmt(code)->get_true_body())), - body_syms, param_syms, body, loop_idxs, globalscope)); - - omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code); - omega::CG_outputRepr* block = tnl->clone(); - tnli = static_cast(block)->GetCode(); - - isSgIfStmt(tnli)->set_true_body(body_); - - if ((isSgIfStmt(code)->get_false_body())) - isSgIfStmt(tnli)->set_false_body( - isSgStatement( - recursiveFindReplacePreferedIdxs( - isSgNode( - (isSgIfStmt(code)->get_false_body())), - body_syms, param_syms, body, loop_idxs, - globalscope))); - - return tnli; - } else if (isSgStatement(code) && !isSgBasicBlock(code)) { - omega::CG_roseRepr * tnl = new omega::CG_roseRepr(code); - omega::CG_outputRepr* block = tnl->clone(); - tnli = static_cast(block)->GetCode(); - - return tnli; - - } else if (isSgBasicBlock(code)) { - SgStatementPtrList& tnl = isSgBasicBlock(code)->get_statements(); - - SgStatementPtrList::iterator temp; - clone = buildBasicBlock(); - bool sync_found = false; - for (SgStatementPtrList::const_iterator it = tnl.begin(); - it != tnl.end(); it++) { - - if (isSgForStatement(*it)) { - AstTextAttribute* att = - (AstTextAttribute*) (isSgNode(*it)->getAttribute( - "omega_comment")); - - std::string comment; - if (att != NULL) - comment = att->toString(); - - if (comment.find("~cuda~") != std::string::npos - && comment.find("preferredIdx: ") - != std::string::npos) { - std::string idx = comment.substr( - comment.find("preferredIdx: ") + 14, - std::string::npos); - if (idx.find(" ") != std::string::npos) - idx = idx.substr(0, idx.find(" ")); - //printf("sym_tab preferred index: %s\n", idx.c_str()); - if (loop_idxs.find(idx) != loop_idxs.end()) - idxSym = loop_idxs.find(idx)->second; - //Get the proc variable sybol for this preferred index - if (idxSym == 0) { - idxSym = body_syms->find_variable(idx.c_str()); - if (!idxSym) - idxSym = param_syms->find_variable(idx.c_str()); - //printf("idx not found: lookup %p\n", idxSym); - if (!idxSym) { - SgVariableDeclaration* defn = - buildVariableDeclaration( - SgName((char*) idx.c_str()), - buildIntType()); - //idxSym = new var_sym(type_s32, (char*)idx.c_str()); - SgInitializedNamePtrList& variables = - defn->get_variables(); - SgInitializedNamePtrList::const_iterator i = - variables.begin(); - SgInitializedName* initializedName = *i; - SgVariableSymbol* vs = new SgVariableSymbol( - initializedName); - prependStatement(defn, body); - vs->set_parent(body_syms); - body_syms->insert(SgName((char*) idx.c_str()), vs); - //printf("idx created and inserted\n"); - idxSym = vs; - } - //Now insert into our map for future - if (cudaDebug) - std::cout << idx << "\n\n"; - loop_idxs.insert(make_pair(idx, idxSym)); - - } - //See if we have a sync as well - if (comment.find("sync") != std::string::npos) { - //printf("Inserting sync after current block\n"); - sync = true; - } - - } - if (idxSym) { - SgForInitStatement* list = - isSgForStatement(*it)->get_for_init_stmt(); - SgStatementPtrList& initStatements = list->get_init_stmt(); - SgStatementPtrList::const_iterator j = - initStatements.begin(); - const SgVariableSymbol* index; - - if (SgExprStatement *expr = isSgExprStatement(*j)) - if (SgAssignOp* op = isSgAssignOp( - expr->get_expression())) - if (SgVarRefExp* var_ref = isSgVarRefExp( - op->get_lhs_operand())) - index = var_ref->get_symbol(); - - std::vector array = substitute(*it, index, - NULL, isSgNode(body_syms)); - - for (int j = 0; j < array.size(); j++) - array[j]->set_symbol(idxSym); - - } - - SgStatement* body_ = - isSgStatement( - recursiveFindReplacePreferedIdxs( - isSgNode( - (isSgForStatement(*it)->get_loop_body())), - body_syms, param_syms, body, loop_idxs, - globalscope)); - - omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it); - omega::CG_outputRepr* block = tnl->clone(); - tnli = - static_cast(block)->GetCode(); - - isSgForStatement(tnli)->set_loop_body(body_); - body_->set_parent(tnli); - if (idxSym) { - SgForInitStatement* list = - isSgForStatement(tnli)->get_for_init_stmt(); - SgStatementPtrList& initStatements = list->get_init_stmt(); - SgStatementPtrList::const_iterator j = - initStatements.begin(); - const SgVariableSymbol* index; - - if (SgExprStatement *expr = isSgExprStatement(*j)) - if (SgAssignOp* op = isSgAssignOp( - expr->get_expression())) - if (SgVarRefExp* var_ref = isSgVarRefExp( - op->get_lhs_operand())) - index = var_ref->get_symbol(); - - std::vector array = substitute(tnli, index, - NULL, isSgNode(body_syms)); - - for (int j = 0; j < array.size(); j++) - array[j]->set_symbol(idxSym); - } - idxSym = 0; - // std::cout << isSgNode(body_)->unparseToString() << "\n\n"; - if (att != NULL) - tnli->setAttribute("omega_comment", att); - clone->append_statement(isSgStatement(tnli)); - if (sync) { - SgName name_syncthreads("__syncthreads"); - SgFunctionSymbol * syncthreads_symbol = - globalscope->lookup_function_symbol( - name_syncthreads); - - // Create a call to __syncthreads(): - SgFunctionCallExp * syncthreads_call = buildFunctionCallExp( - syncthreads_symbol, buildExprListExp()); - - SgExprStatement* stmt = buildExprStatement( - syncthreads_call); - - /* if (SgBasicBlock* bb = isSgBasicBlock( - isSgForStatement(code)->get_loop_body())) - appendStatement(isSgStatement(stmt), bb); - - else if (SgStatement* ss = isSgStatement( - isSgForStatement(code)->get_loop_body())) { - SgBasicBlock* bb2 = buildBasicBlock(); - - isSgNode(ss)->set_parent(bb2); - appendStatement(ss, bb2); - - appendStatement(isSgStatement(stmt), bb2); - isSgNode(stmt)->set_parent(bb2); - isSgForStatement(code)->set_loop_body(bb2); - isSgNode(bb2)->set_parent(code); - } - */ - - //SgBasicBlock* bb2 = buildBasicBlock(); - clone->append_statement(stmt); - /* SgNode* parent = code->get_parent(); - if(!isSgStatement(parent)) - throw loop_error("Parent not a statement"); - - if(isSgForStatement(parent)){ - if(SgStatement *ss = isSgForStatement(isSgForStatement(parent)->get_loop_body())){ - omega::CG_roseRepr * tnl = new omega::CG_roseRepr(ss); - omega::CG_outputRepr* block= tnl->clone(); - - SgNode *new_ss = static_cast(block)->GetCode(); - SgBasicBlock* bb2 = buildBasicBlock(); - - isSgNode(new_ss)->set_parent(bb2); - appendStatement(isSgStatement(new_ss), bb2); - appendStatement(isSgStatement(stmt), bb2); - isSgNode(stmt)->set_parent(bb2); - - isSgStatement(parent)->replace_statement_from_basicBlock(ss, isSgStatement(bb2)); - - }else if(isSgBasicBlock(isSgForStatement(parent)->get_loop_body())) - isSgStatement(isSgForStatement(parent)->get_loop_body())->insert_statement(isSgStatement(code), stmt, false); - else - throw loop_error("parent statement type undefined!!"); - - } - else if(isSgBasicBlock(parent)) - isSgStatement(parent)->insert_statement(isSgStatement(code), stmt, false); - else - throw loop_error("parent statement type undefined!!"); - - //tnl->print(); - * - * - */ - sync = true; - // return isSgNode(bb2); - - } - - // return tnli; - } else if (isSgIfStmt(*it)) { - SgStatement* body_ = isSgStatement( - recursiveFindReplacePreferedIdxs( - isSgNode((isSgIfStmt(*it)->get_true_body())), - body_syms, param_syms, body, loop_idxs, - globalscope)); - - omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it); - omega::CG_outputRepr* block = tnl->clone(); - tnli1 = - static_cast(block)->GetCode(); - - isSgIfStmt(tnli1)->set_true_body(body_); - - if ((isSgIfStmt(*it)->get_false_body())) - isSgIfStmt(tnli1)->set_false_body( - isSgStatement( - recursiveFindReplacePreferedIdxs( - isSgNode( - (isSgIfStmt(*it)->get_false_body())), - body_syms, param_syms, body, - loop_idxs, globalscope))); - - clone->append_statement(isSgStatement(tnli1)); - //return tnli; - } else if (isSgStatement(*it)) { - omega::CG_roseRepr * tnl = new omega::CG_roseRepr(*it); - omega::CG_outputRepr* block = tnl->clone(); - tnli2 = - static_cast(block)->GetCode(); - - clone->append_statement(isSgStatement(tnli2)); - //return tnli; - - } - } - - return isSgNode(clone); - - } - - /* if (!isSgBasicBlock( - recursiveFindReplacePreferedIdxs(isSgNode(*it), body_syms, - param_syms, body, loop_idxs, globalscope))) { - SgStatement *to_push = isSgStatement( - recursiveFindReplacePreferedIdxs(isSgNode(*it), - body_syms, param_syms, body, loop_idxs, - globalscope, sync)); - clone->append_statement(to_push); - - if ((sync_found) && isSgForStatement(to_push)) { - SgName name_syncthreads("__syncthreads"); - SgFunctionSymbol * syncthreads_symbol = - globalscope->lookup_function_symbol( - name_syncthreads); - - // Create a call to __syncthreads(): - SgFunctionCallExp * syncthreads_call = buildFunctionCallExp( - syncthreads_symbol, buildExprListExp()); - - SgExprStatement* stmt = buildExprStatement( - syncthreads_call); - - clone->append_statement(isSgStatement(stmt)); - } - // std::cout<unparseToString()<<"\n\n"; - } else { - - SgStatementPtrList& tnl2 = isSgBasicBlock( - recursiveFindReplacePreferedIdxs(isSgNode(*it), - body_syms, param_syms, body, loop_idxs, - globalscope))->get_statements(); - for (SgStatementPtrList::const_iterator it2 = tnl2.begin(); - it2 != tnl2.end(); it2++) { - clone->append_statement(*it2); - - sync_found = true; - // std::cout<unparseToString()<<"\n\n"; - } - } - - } - return isSgNode(clone); - } - */ -// return tnl; -} - -// loop_vars -> array references -// loop_idxs -> map for when we encounter a loop with a different preferredIndex -// dim_vars -> out param, fills with var_sym pair for 2D array dimentions (messy stuff) -SgNode* swapVarReferences(SgNode* code, - std::set& syms, SgSymbolTable* param, - SgSymbolTable* body, SgScopeStatement* body_stmt) { - //Iterate over every expression, looking up each variable and type - //reference used and possibly replacing it or adding it to our symbol - //table - // - //We use the built-in cloning helper methods to seriously help us with this! - - //Need to do a recursive mark - - std::set::iterator myIterator; - for (myIterator = syms.begin(); myIterator != syms.end(); myIterator++) { - SgName var_name = (*myIterator)->get_name(); - std::string x = var_name.getString(); - - if ((param->find_variable(var_name) == NULL) - && (body->find_variable(var_name) == NULL)) { - SgInitializedName* decl = (*myIterator)->get_declaration(); - - SgVariableSymbol* dvs = new SgVariableSymbol(decl); - SgVariableDeclaration* var_decl = buildVariableDeclaration( - dvs->get_name(), dvs->get_type()); - - AstTextAttribute* att = (AstTextAttribute*) (isSgNode( - decl->get_declaration())->getAttribute("__shared__")); - if (isSgNode(decl->get_declaration())->attributeExists( - "__shared__")) - var_decl->get_declarationModifier().get_storageModifier().setCudaShared(); - - appendStatement(var_decl, body_stmt); - - dvs->set_parent(body); - body->insert(var_name, dvs); - } - - std::vector array = substitute(code, *myIterator, NULL, - isSgNode(body)); - - SgVariableSymbol* var = (SgVariableSymbol*) (*myIterator); - for (int j = 0; j < array.size(); j++) - array[j]->set_symbol(var); - } - - return code; -} - -bool LoopCuda::validIndexes(int stmt, const std::vector& idxs) { - for (int i = 0; i < idxs.size(); i++) { - bool found = false; - for (int j = 0; j < idxNames[stmt].size(); j++) { - if (strcmp(idxNames[stmt][j].c_str(), idxs[i].c_str()) == 0) { - found = true; - } - } - if (!found) { - return false; - } - } - return true; -} - -bool LoopCuda::cudaize_v2(std::string kernel_name, - std::map array_dims, - std::vector blockIdxs, - std::vector threadIdxs) { - CG_outputBuilder *ocg = ir->builder(); - int stmt_num = 0; - if (cudaDebug) { - printf("cudaize_v2(%s, {", kernel_name.c_str()); - //for( - printf("}, blocks={"); - printVs(blockIdxs); - printf("}, thread={"); - printVs(threadIdxs); - printf("})\n"); - } - - this->array_dims = array_dims; - if (!validIndexes(stmt_num, blockIdxs)) { - throw std::runtime_error("One of the indexes in the block list was not " - "found in the current set of indexes."); - } - if (!validIndexes(stmt_num, threadIdxs)) { - throw std::runtime_error( - "One of the indexes in the thread list was not " - "found in the current set of indexes."); - } - if (blockIdxs.size() == 0) - throw std::runtime_error("Cudaize: Need at least one block dimention"); - int block_level = 0; - //Now, we will determine the actual size (if possible, otherwise - //complain) for the block dimentions and thread dimentions based on our - //indexes and the relations for our stmt; - for (int i = 0; i < blockIdxs.size(); i++) { - int level = findCurLevel(stmt_num, blockIdxs[i]); - int ub, lb; - CG_outputRepr* ubrepr = extractCudaUB(stmt_num, level, ub, lb); - if (lb != 0) { - //attempt to "normalize" the loop with an in-place tile and then re-check our bounds - if (cudaDebug) - printf( - "Cudaize: doing tile at level %d to try and normalize lower bounds\n", - level); - tile(stmt_num, level, 1, level, CountedTile); - idxNames[stmt_num].insert(idxNames[stmt_num].begin() + (level), ""); //TODO: possibly handle this for all sibling stmts - ubrepr = extractCudaUB(stmt_num, level, ub, lb); - } - if (lb != 0) { - char buf[1024]; - sprintf(buf, - "Cudaize: Loop at level %d does not have 0 as it's lower bound", - level); - throw std::runtime_error(buf); - } - if (ub < 0) { - char buf[1024]; - sprintf(buf, - "Cudaize: Loop at level %d does not have a hard upper bound", - level); - //Anand: Commenting out error indication for lack of constant upper bound - //throw std::runtime_error(buf); - } - if (cudaDebug) - printf("block idx %s level %d lb: %d ub %d\n", blockIdxs[i].c_str(), - level, lb, ub); - if (i == 0) { - block_level = level; - if (ubrepr == NULL) { - cu_bx = ub + 1; - cu_bx_repr = NULL; - } else { - cu_bx = 0; - cu_bx_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1)); - } - idxNames[stmt_num][level - 1] = "bx"; - } else if (i == 1) { - if (ubrepr == NULL) { - cu_by = ub + 1; - cu_by_repr = NULL; - } else { - cu_by = 0; - cu_by_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1)); - } - idxNames[stmt_num][level - 1] = "by"; - } - } - if (!cu_by && !cu_by_repr) - block_level = 0; - int thread_level1 = 0; - int thread_level2 = 0; - for (int i = 0; i < threadIdxs.size(); i++) { - int level = findCurLevel(stmt_num, threadIdxs[i]); - int ub, lb; - CG_outputRepr* ubrepr = extractCudaUB(stmt_num, level, ub, lb); - if (lb != 0) { - //attempt to "normalize" the loop with an in-place tile and then re-check our bounds - if (cudaDebug) - printf( - "Cudaize: doing tile at level %d to try and normalize lower bounds\n", - level); - tile(stmt_num, level, 1, level, CountedTile); - idxNames[stmt_num].insert(idxNames[stmt_num].begin() + (level), ""); - ubrepr = extractCudaUB(stmt_num, level, ub, lb); - } - if (lb != 0) { - char buf[1024]; - sprintf(buf, - "Cudaize: Loop at level %d does not have 0 as it's lower bound", - level); - throw std::runtime_error(buf); - } - if (ub < 0) { - char buf[1024]; - sprintf(buf, - "Cudaize: Loop at level %d does not have a hard upper bound", - level); - //Anand: Commenting out error indication for lack of constant upper bound - //throw std::runtime_error(buf); - } - - if (cudaDebug) - printf("thread idx %s level %d lb: %d ub %d\n", - threadIdxs[i].c_str(), level, lb, ub); - if (i == 0) { - thread_level1 = level; - if (ubrepr == NULL) { - cu_tx = ub + 1; - cu_tx_repr = NULL; - } else { - cu_tx = 0; - cu_tx_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1)); - } - idxNames[stmt_num][level - 1] = "tx"; - } else if (i == 1) { - thread_level2 = level; - if (ubrepr == NULL) { - cu_ty = ub + 1; - cu_ty_repr = NULL; - } else { - cu_ty = 0; - cu_ty_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1)); - } - idxNames[stmt_num][level - 1] = "ty"; - } else if (i == 2) { - if (ubrepr == NULL) { - cu_tz = ub + 1; - cu_tz_repr = NULL; - } else { - cu_tz = 0; - cu_tz_repr = ocg->CreatePlus(ubrepr, ocg->CreateInt(1)); - } - idxNames[stmt_num][level - 1] = "tz"; - } - } - if (!cu_ty && !cu_ty_repr) - thread_level1 = 0; - if (!cu_tz && !cu_tz_repr) - thread_level2 = 0; - - //Make changes to nonsplitlevels - const int m = stmt.size(); - for (int i = 0; i < m; i++) { - if (block_level) { - //stmt[i].nonSplitLevels.append((block_level)*2); - stmt_nonSplitLevels[i].push_back((block_level) * 2); - } - if (thread_level1) { - //stmt[i].nonSplitLevels.append((thread_level1)*2); - stmt_nonSplitLevels[i].push_back((thread_level1) * 2); - } - if (thread_level2) { - //stmt[i].nonSplitLevels.append((thread_level1)*2); - stmt_nonSplitLevels[i].push_back((thread_level1) * 2); - } - } - - if (cudaDebug) { - printf("Codegen: current names: "); - printVS(idxNames[stmt_num]); - } - //Set codegen flag - code_gen_flags |= GenCudaizeV2; - - //Save array dimention sizes - this->array_dims = array_dims; - cu_kernel_name = kernel_name.c_str(); - -} - -/* - * setupConstantVar - * handles constant variable declaration - * and adds a global constant variable - * parameters: - * constant - the constant_memory_mapping object for this loop - * arr_def - the VarDefs object for the mapped variable - * globals - Rose Global variables - * i - an index to keep new variable names unique - * symtab - global symbol table - */ -static void setupConstantVar(constant_memory_mapping* constant, VarDefs* arr_def, SgGlobal* globals, int i, SgSymbolTable* symtab) { - char* buf1 = new char[32]; - snprintf(buf1, 32, "cs%dRef", i+1); - arr_def->secondName = buf1; - - char buf2[64]; - snprintf(buf2, 64, "__device__ __constant__ float"); - - SgVariableDeclaration* consvar_decl = buildVariableDeclaration( - SgName(std::string(buf1)), buildArrayType( - buildOpaqueType(SgName(buf2),globals), - arr_def->size_expr)); - SgInitializedNamePtrList& variables = consvar_decl->get_variables(); - SgInitializedNamePtrList::const_iterator j = variables.begin(); - SgInitializedName* initializedName = *j; - SgVariableSymbol* consvar_sym = new SgVariableSymbol(initializedName); - prependStatement(consvar_decl, globals); - - consvar_sym->set_parent(symtab); - symtab->insert(SgName(std::string(buf1)), consvar_sym); - - constant->set_mapped_symbol(arr_def->original_name.c_str(), consvar_sym); - constant->set_vardef(arr_def->original_name.c_str(), arr_def); -} - -/* - * cudaBindConstantVar - * allocs a variable to constant memory - * constant - the constant mapping object - * arr_def - the VarDefs abject - * globals - global symbol table - * stmt_list - the GPU functions' statement list - */ -static void cudaBindConstantVar(constant_memory_mapping* constant, VarDefs* arr_def, SgGlobal* globals, SgStatementPtrList* stmt_list) { - SgName cudaMemcpyToSymbol_name("cudaMemcpyToSymbol"); - SgFunctionDeclaration* cudaMemcpyToSymbol_decl = buildNondefiningFunctionDeclaration( - cudaMemcpyToSymbol_name, buildVoidType(), buildFunctionParameterList(), globals); - SgExprListExp* args = buildExprListExp(); - args->append_expression(buildCastExp(constant->get_mapped_symbol_exp(arr_def->original_name.c_str()), - buildPointerType(buildVoidType()))); - args->append_expression(buildVarRefExp(arr_def->in_data)); - args->append_expression(arr_def->size_expr); - stmt_list->push_back(buildExprStatement( - buildFunctionCallExp(buildFunctionRefExp(cudaMemcpyToSymbol_decl), args))); -} - -static void consmapArrayRefs(constant_memory_mapping* constant, std::vector* refs, SgGlobal* globals, IR_Code* ir, CG_roseBuilder* ocg) { - // if constant mapping is not being used, ignore this function - if(constant == NULL) return; - for(int i = 0; i < refs->size(); i++) { - IR_ArrayRef* aref = (*refs)[i]; - if(constant->is_array_mapped(aref->name().c_str())) { - // get array reference dimensions - int dims = aref->symbol()->n_dim(); - if(dims > 2) { - printf(" \n CHiLL does not handle constant memory mapping for more than 2D arrays.\n"); - return; - } - - SgExpression* varexp = constant->get_mapped_symbol_exp(aref->name().c_str()); - SgExpression* index_exp; - // build index expression - if(dims == 1) { - index_exp = static_cast(aref->index(0)->clone())->GetExpression(); - } - if(dims == 2) { - VarDefs* arr_def = constant->get_vardef(aref->name().c_str()); - CG_outputRepr* i0 = aref->index(0)->clone(); - CG_outputRepr* i1 = aref->index(1)->clone(); - CG_outputRepr* sz = new CG_roseRepr(buildIntVal(arr_def->size_multi_dim[0])); - CG_outputRepr* exp = ocg->CreatePlus(ocg->CreateTimes(sz->clone(), i0), i1); - index_exp = static_cast(exp->clone())->GetExpression(); - } - ir->ReplaceExpression(aref, new CG_roseRepr(buildPntrArrRefExp(varexp, index_exp))); - } - } -} - -/* - * setupTexmappingVar - * handles texture variable declaration - * and adds a global texture object - * parameters: - * texture - the texture_memory_mapping object - * arr_def - the VarDefs object for the mapped variable - * globals - Rose Global variables - * i - an index to keep the new variable names unique - * devptr_sym - the devptr that the original variable is associated with - * symtab - GPU function symbol table - */ -static void setupTexmappingVar(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals, int i, SgVariableSymbol* devptr_sym, SgSymbolTable* symtab) { - char* buf1 = new char[32]; - snprintf(buf1, 32, "tex%dRef", i+1); - arr_def->secondName = buf1; - - char buf2[64]; - // single-dimensional - snprintf(buf2, 64, "texture", 1); - // multi-dimensional - // snprintf(buf2, 64, "texture", (int)(arr_def->size_multi_dim.size())); //*/ - - SgVariableDeclaration* texvar_decl = buildVariableDeclaration(SgName(std::string(buf1)), buildOpaqueType(buf2, globals)); - - SgInitializedNamePtrList& variables = texvar_decl->get_variables(); - SgInitializedNamePtrList::const_iterator j = variables.begin(); - SgInitializedName* initializedName = *j; - SgVariableSymbol* texvar_sym = new SgVariableSymbol(initializedName); - prependStatement(texvar_decl, globals); - - texvar_sym->set_parent(symtab); - symtab->insert(SgName(buf1), texvar_sym); - - texture->set_mapped_symbol(arr_def->original_name.c_str(), texvar_sym); - texture->set_devptr_symbol(arr_def->original_name.c_str(), devptr_sym); - texture->set_vardef(arr_def->original_name.c_str(), arr_def); -} - - -/* - * One dimensional version of cudaBindTexture - * see cudaBindTexture for details - */ -static SgFunctionCallExp* cudaBindTexture1D(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals) { - SgName cudaBindTexture_name("cudaBindTexture"); - SgFunctionDeclaration* cudaBindTexture_decl = buildNondefiningFunctionDeclaration( - cudaBindTexture_name, buildVoidType(), buildFunctionParameterList(), globals); - - SgExprListExp* args = buildExprListExp(); - args->append_expression(buildIntVal(0)); - args->append_expression(texture->get_mapped_symbol_exp(arr_def->original_name.c_str())); - args->append_expression(texture->get_devptr_symbol_exp(arr_def->original_name.c_str())); - args->append_expression(arr_def->size_expr); - return buildFunctionCallExp(buildFunctionRefExp(cudaBindTexture_decl), args); -} - -/* - * Two dimensional version of cudaBindTexture - * see cudaBindTexture for details - */ -//static SgFunctionCallExp* cudaBindTexture2D(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals) { -// SgName cudaBindTexture_name("cudaBindTexture2D"); -// SgFunctionDeclaration* cudaBindTexture_decl = buildNondefiningFunctionDeclaration( -// cudaBindTexture_name, buildVoidType(), buildFunctionParameterList(), globals); -// -// SgExprListExp* args = buildExprListExp(); -// args->append_expression(buildIntVal(0)); -// args->append_expression(texture->get_tex_mapped_symbol_exp(arr_def->original_name.c_str())); -// args->append_expression(texture->get_devptr_symbol_exp(arr_def->original_name.c_str())); -// args->append_expression(buildIntVal(texture->get_dim_length(arr_def->original_name.c_str(), 0))); -// args->append_expression(buildIntVal(texture->get_dim_length(arr_def->original_name.c_str(), 1))); -// args->append_expression(arr_def->size_expr); -// return buildFunctionCallExp(buildFunctionRefExp(cudaBindTexture_decl), args); -//} - -/* - * cudaBindTexture - * binds a variable to a texture - * parameters: - * texture - the texture mapping object - * arr_def - the VarDefs object - * globals - global symbol table - * stmt_list - the GPU functions' statement list - * notes: - * only supports binding 1D textures, may need to consider cudaBindTexture2D for 2D textures - */ -static void cudaBindTexture(texture_memory_mapping* texture, VarDefs* arr_def, SgGlobal* globals, SgStatementPtrList* stmt_list) { - //int dims = (int)(arr_def->size_multi_dim.size()); - //int dims = texture->get_dims(arr_def->original_name.c_str()); - //if(dims == 1) - stmt_list->push_back( - buildExprStatement(cudaBindTexture1D(texture, arr_def, globals))); - //if(dims == 2) - // stmt_list->push_back( - // buildExprStatement(cudaBindTexture2D(texture, arr_def, globals))); -} - -/* - * texmapArrayRefs - * maps array reference expresions of texture mapped variables to the tex1D function - * parameters: - * texture - the texture mapping object - * refs - a list of all array read operations - * globals - global symbol table - * ir - handles IR_Code operations - * ocg - handles CG_roseBuilder operations -**/ -static void texmapArrayRefs(texture_memory_mapping* texture, std::vector* refs, SgGlobal* globals, IR_Code* ir, CG_roseBuilder *ocg) { - // if texture mapping is not being used, ignore this function - if(texture == NULL) return; - for(int i = 0; i < refs->size(); i++) { - IR_ArrayRef* aref = (*refs)[i]; - if(texture->is_array_mapped(aref->name().c_str())) { - - // get array dimensions - VarDefs* arr_def = texture->get_vardef(aref->name().c_str()); - int dims = aref->symbol()->n_dim(); - if(dims > 2) { - printf(" \n CHiLL does not handle texture mapping for more than 2D arrays.\n"); - // TODO throw some sort of error. or handle in texture_copy function - return; - } - - // build texture lookup function declaration - char texNDfetch_strName[16]; - sprintf(texNDfetch_strName, "tex%dDfetch", 1); // for now, only support tex1Dfetch - //sprintf(texNDfetch_strName, "tex%dDfetch", dims); - SgFunctionDeclaration* fetch_decl = buildNondefiningFunctionDeclaration( - SgName(texNDfetch_strName), buildFloatType(), buildFunctionParameterList(), globals); - - // build args - SgExprListExp* args = buildExprListExp(); - args->append_expression(texture->get_mapped_symbol_exp(aref->name().c_str())); - - // set indexing args - //for(int i = 0; i < dims; i++) { - // args->append_expression((static_cast(aref->index(i)->clone()))->GetExpression()); - //} - if(dims == 1) { - args->append_expression(static_cast(aref->index(0)->clone())->GetExpression()); - } - else if(dims == 2) { - CG_outputRepr* i0 = aref->index(0)->clone(); - CG_outputRepr* i1 = aref->index(1)->clone(); - CG_outputRepr* sz = new CG_roseRepr(buildIntVal(arr_def->size_multi_dim[0])); - CG_outputRepr* expr = ocg->CreatePlus(ocg->CreateTimes(sz->clone(), i0), i1); - args->append_expression(static_cast(expr->clone())->GetExpression()); - } - - // build function call and replace original array ref - SgFunctionCallExp* fetch_call = buildFunctionCallExp(buildFunctionRefExp(fetch_decl), args); - ir->ReplaceExpression(aref, new CG_roseRepr(fetch_call)); - } - } -} - -SgNode* LoopCuda::cudaize_codegen_v2() { - if(cudaDebug) - printf("cudaize codegen V2\n"); - CG_roseBuilder *ocg = dynamic_cast(ir->builder()); - if (!ocg) - return false; - - //protonu--adding an annote to track texture memory type - //ANNOTE(k_cuda_texture_memory, "cuda texture memory", TRUE); - //ANNOTE(k_cuda_constant_memory, "cuda constant memory", TRUE); - int tex_mem_on = 0; - int cons_mem_on = 0; - - - - CG_outputRepr* repr; - std::vector arrayVars; - std::vector localScopedVars; - - std::vector ro_refs; - std::vector wo_refs; - std::set uniqueRefs; - std::set uniqueWoRefs; - std::set syms; - std::set psyms; - std::set pdSyms; - SgStatementPtrList* replacement_list = new SgStatementPtrList; - - for (int j = 0; j < stmt.size(); j++) { - std::vector refs = ir->FindArrayRef(stmt[j].code); - for (int i = 0; i < refs.size(); i++) { - //printf("ref %s wo %d\n", static_cast(refs[i]->name()), refs[i]->is_write()); - SgVariableSymbol* var = body_symtab->find_variable( - SgName((char*) refs[i]->name().c_str())); - SgVariableSymbol* var2 = parameter_symtab->find_variable( - SgName((char*) refs[i]->name().c_str())); - - //If the array is not a parameter, then it's a local array and we - //want to recreate it as a stack variable in the kernel as opposed to - //passing it in. - if (var != NULL) { - //anand-- needs modification, if variable is parameter it wont be part of the - // block's symbol table but the functiond definition's symbol table - - continue; - } - if (uniqueRefs.find(refs[i]->name()) == uniqueRefs.end()) { - - uniqueRefs.insert(refs[i]->name()); - if (refs[i]->is_write()) { - uniqueWoRefs.insert(refs[i]->name()); - wo_refs.push_back(refs[i]); - } else - ro_refs.push_back(refs[i]); - } - if (refs[i]->is_write() - && uniqueWoRefs.find(refs[i]->name()) - == uniqueWoRefs.end()) { - uniqueWoRefs.insert(refs[i]->name()); - wo_refs.push_back(refs[i]); - //printf("adding %s to wo\n", static_cast(refs[i]->name())); - } - pdSyms.insert((const SgVariableSymbol*) var2); - } - } - - if (cudaDebug) { - printf("reading from array "); - for (int i = 0; i < ro_refs.size(); i++) - printf("'%s' ", ro_refs[i]->name().c_str()); - printf("and writing to array "); - for (int i = 0; i < wo_refs.size(); i++) - printf("'%s' ", wo_refs[i]->name().c_str()); - printf("\n"); - } - const char* gridName = "dimGrid"; - const char* blockName = "dimBlock"; - - //TODO: Could allow for array_dims_vars to be a mapping from array - //references to to variable names that define their length. - SgVariableSymbol* dim1 = 0; - SgVariableSymbol* dim2 = 0; - - for (int i = 0; i < wo_refs.size(); i++) { - //TODO: Currently assume all arrays are floats of one or two dimentions - SgVariableSymbol* outArray = 0; - std::string name = wo_refs[i]->name(); - outArray = body_symtab->find_variable(SgName((char*) name.c_str())); - int size_n_d; - if (outArray == NULL) - outArray = parameter_symtab->find_variable( - SgName((char*) name.c_str())); - - VarDefs v; - v.size_multi_dim = std::vector(); - char buf[32]; - snprintf(buf, 32, "devO%dPtr", i + 1); - v.name = buf; - if (isSgPointerType(outArray->get_type())) { - if (isSgArrayType( - isSgNode( - isSgPointerType(outArray->get_type())->get_base_type()))) { - // v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type(); - SgType* t = - isSgPointerType(outArray->get_type())->get_base_type(); - /* SgExprListExp* dimList = t->get_dim_info(); - SgExpressionPtrList::iterator j= dimList->get_expressions().begin(); - SgExpression* expr=NULL; - for (; j != dimList->get_expressions().end(); j++) - expr = *j; - */ - while (isSgArrayType(t)) - t = isSgArrayType(t)->get_base_type(); - - if (!isSgType(t)) { - char buf[1024]; - sprintf(buf, "CudaizeCodeGen: Array type undetected!"); - throw std::runtime_error(buf); - - } - - v.type = t; - } else - v.type = isSgPointerType(outArray->get_type())->get_base_type(); - } else if (isSgArrayType(outArray->get_type())) { - if (isSgArrayType( - isSgNode( - isSgArrayType(outArray->get_type())->get_base_type()))) { - // v.type = ((array_type *)(((ptr_type *)(outArray->type()))->ref_type()))->elem_type(); - SgType* t = - isSgArrayType(outArray->get_type())->get_base_type(); - /* SgExprListExp* dimList = t->get_dim_info(); - SgExpressionPtrList::iterator j= dimList->get_expressions().begin(); - SgExpression* expr=NULL; - for (; j != dimList->get_expressions().end(); j++) - expr = *j; - */ - while (isSgArrayType(t)) - t = isSgArrayType(t)->get_base_type(); - - if (!isSgType(t)) { - char buf[1024]; - sprintf(buf, "CudaizeCodeGen: Array type undetected!"); - throw std::runtime_error(buf); - - } - - v.type = t; - } else - v.type = isSgArrayType(outArray->get_type())->get_base_type(); - } else - v.type = buildFloatType(); - v.tex_mapped = false; - v.cons_mapped = false; - v.original_name = wo_refs[i]->name(); - //Size of the array = dim1 * dim2 * num bytes of our array type - - //If our input array is 2D (non-linearized), we want the actual - //dimentions of the array - CG_outputRepr* size; - //Lookup in array_dims - std::map::iterator it = array_dims.find(name.c_str()); - if (isSgPointerType(outArray->get_type()) - && isSgArrayType( - isSgNode( - isSgPointerType(outArray->get_type())->get_base_type()))) { - SgType* t = isSgPointerType(outArray->get_type())->get_base_type(); - /* SgExprListExp* dimList = t->get_dim_info(); - SgExpressionPtrList::iterator j= dimList->get_expressions().begin(); - SgExpression* expr=NULL; - for (; j != dimList->get_expressions().end(); j++) - expr = *j; - */ - if (isSgIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedLongVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgAddOp(isSgArrayType(t)->get_index())) { - SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index()); - - SgExpression *lhs = op_add->get_lhs_operand(); - SgExpression *rhs = op_add->get_rhs_operand(); - - if (isSgIntVal(lhs)) - size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value()); - else if (isSgUnsignedIntVal(lhs)) - size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value() - + (int) isSgUnsignedIntVal(rhs)->get_value(); - else if (isSgUnsignedLongVal(lhs)) - size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongLongIntVal(lhs)) - size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - size_n_d = (int) (isSgLongIntVal(lhs)->get_value() - + isSgLongIntVal(rhs)->get_value()); - else if (isSgUnsignedLongLongIntVal(lhs)) - size_n_d = - (int) (isSgUnsignedLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongLongIntVal(rhs)->get_value()); - - } - t = isSgArrayType(t)->get_base_type(); - while (isSgArrayType(t)) { - int dim; - if (isSgIntVal(isSgArrayType(t)->get_index())) - dim = - (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedLongVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgAddOp(isSgArrayType(t)->get_index())) { - SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index()); - - SgExpression *lhs = op_add->get_lhs_operand(); - SgExpression *rhs = op_add->get_rhs_operand(); - - if (isSgIntVal(lhs)) - dim = (int) isSgIntVal(lhs)->get_value() - + (int) (isSgIntVal(rhs)->get_value()); - else if (isSgUnsignedIntVal(lhs)) - dim = (int) isSgUnsignedIntVal(lhs)->get_value() - + (int) isSgUnsignedIntVal(rhs)->get_value(); - else if (isSgUnsignedLongVal(lhs)) - dim = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - dim = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongLongIntVal(lhs)) - dim = (int) (isSgLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - dim = (int) (isSgLongIntVal(lhs)->get_value() - + isSgLongIntVal(rhs)->get_value()); - else if (isSgUnsignedLongLongIntVal(lhs)) - dim = - (int) (isSgUnsignedLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongLongIntVal(rhs)->get_value()); - - } - size_n_d *= dim; - v.size_multi_dim.push_back(dim); - t = isSgArrayType(t)->get_base_type(); - } - //v.size_2d = (int) (isSgIntVal(t->get_index())->get_value()); - - if (cudaDebug) - printf("Detected Multi-dimensional array sized of %d for %s\n", - size_n_d, (char*) wo_refs[i]->name().c_str()); - size = ocg->CreateInt(size_n_d); - } else if (isSgArrayType(outArray->get_type()) - && isSgArrayType( - isSgNode( - isSgArrayType(outArray->get_type())->get_base_type()))) { - SgType* t = outArray->get_type(); - /* SgExprListExp* dimList = t->get_dim_info(); - SgExpressionPtrList::iterator j= dimList->get_expressions().begin(); - SgExpression* expr=NULL; - for (; j != dimList->get_expressions().end(); j++) - expr = *j; - */ - - if (isSgIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedLongVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgAddOp(isSgArrayType(t)->get_index())) { - SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index()); - - SgExpression *lhs = op_add->get_lhs_operand(); - SgExpression *rhs = op_add->get_rhs_operand(); - - if (isSgIntVal(lhs)) - size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value()); - else if (isSgUnsignedIntVal(lhs)) - size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value() - + (int) isSgUnsignedIntVal(rhs)->get_value(); - else if (isSgUnsignedLongVal(lhs)) - size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongLongIntVal(lhs)) - size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - size_n_d = (int) (isSgLongIntVal(lhs)->get_value() - + isSgLongIntVal(rhs)->get_value()); - else if (isSgUnsignedLongLongIntVal(lhs)) - size_n_d = - (int) (isSgUnsignedLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongLongIntVal(rhs)->get_value()); - - } - t = isSgArrayType(t)->get_base_type(); - while (isSgArrayType(t)) { - int dim; - if (isSgIntVal(isSgArrayType(t)->get_index())) - dim = - (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedLongVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgAddOp(isSgArrayType(t)->get_index())) { - SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index()); - - SgExpression *lhs = op_add->get_lhs_operand(); - SgExpression *rhs = op_add->get_rhs_operand(); - - if (isSgIntVal(lhs)) - dim = (int) isSgIntVal(lhs)->get_value() - + (int) (isSgIntVal(rhs)->get_value()); - else if (isSgUnsignedIntVal(lhs)) - dim = (int) isSgUnsignedIntVal(lhs)->get_value() - + (int) isSgUnsignedIntVal(rhs)->get_value(); - else if (isSgUnsignedLongVal(lhs)) - dim = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - dim = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongLongIntVal(lhs)) - dim = (int) (isSgLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - dim = (int) (isSgLongIntVal(lhs)->get_value() - + isSgLongIntVal(rhs)->get_value()); - else if (isSgUnsignedLongLongIntVal(lhs)) - dim = - (int) (isSgUnsignedLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongLongIntVal(rhs)->get_value()); - - } - size_n_d *= dim; - v.size_multi_dim.push_back(dim); - t = isSgArrayType(t)->get_base_type(); - } - - //v.size_2d = (int) (isSgIntVal(t->get_index())->get_value()); - - if (cudaDebug) - printf("Detected Multi-Dimensional array sized of %d for %s\n", - size_n_d, (char*) wo_refs[i]->name().c_str()); - size = ocg->CreateInt(size_n_d); - } else if (it != array_dims.end()) { - int ref_size = it->second; - //size = - // ocg->CreateInt( - // isSgIntVal( - // isSgArrayType(outArray->get_type())->get_index())->get_value()); - //v.size_2d = isSgArrayType(outArray->get_type())->get_rank(); - //v.var_ref_size = ref_size; - size = ocg->CreateInt(ref_size); - - } else { - if (dim1) { - size = ocg->CreateTimes( - new CG_roseRepr(isSgExpression(buildVarRefExp(dim1))), - new CG_roseRepr(isSgExpression(buildVarRefExp(dim2)))); - } else { - char buf[1024]; - sprintf(buf, - "CudaizeCodeGen: Array reference %s does not have a " - "detectable size or specififed dimentions", - name.c_str()); - throw std::runtime_error(buf); - } - } - - v.size_expr = - static_cast(ocg->CreateTimes(size, - new omega::CG_roseRepr( - isSgExpression(buildSizeOfOp(v.type)))))->GetExpression(); - - v.in_data = 0; - v.out_data = outArray; - //Check for in ro_refs and remove it at this point - std::vector::iterator it_; - for (it_ = ro_refs.begin(); it_ != ro_refs.end(); it_++) { - if ((*it_)->name() == wo_refs[i]->name()) { - break; - } - } - if (it_ != ro_refs.end()) { - v.in_data = outArray; - ro_refs.erase(it_); - } - - arrayVars.push_back(v); - - } - - //protonu-- assuming that all texture mapped memories were originally read only mems - //there should be safety checks for that, will implement those later - - for (int i = 0; i < ro_refs.size(); i++) { - SgVariableSymbol* inArray = 0; - std::string name = ro_refs[i]->name(); - inArray = body_symtab->find_variable(SgName((char*) name.c_str())); - if (inArray == NULL) - inArray = parameter_symtab->find_variable( - SgName((char*) name.c_str())); - - VarDefs v; - v.size_multi_dim = std::vector(); - char buf[32]; - snprintf(buf, 32, "devI%dPtr", i + 1); - v.name = buf; - int size_n_d; - if (isSgPointerType(inArray->get_type())) { - if (isSgArrayType( - isSgNode( - isSgPointerType(inArray->get_type())->get_base_type()))) { - - SgType* t = - isSgPointerType(inArray->get_type())->get_base_type(); - - while (isSgArrayType(t)) - t = isSgArrayType(t)->get_base_type(); - - if (!isSgType(t)) { - char buf[1024]; - sprintf(buf, "CudaizeCodeGen: Array type undetected!"); - throw std::runtime_error(buf); - - } - v.type = t; - } else - v.type = isSgPointerType(inArray->get_type())->get_base_type(); - } else if (isSgArrayType(inArray->get_type())) { - if (isSgArrayType( - isSgNode( - isSgArrayType(inArray->get_type())->get_base_type()))) { - - SgType* t = inArray->get_type(); - while (isSgArrayType(t)) - t = isSgArrayType(t)->get_base_type(); - - if (!isSgType(t)) { - char buf[1024]; - sprintf(buf, "CudaizeCodeGen: Array type undetected!"); - throw std::runtime_error(buf); - - } - v.type = t; - } else - v.type = isSgArrayType(inArray->get_type())->get_base_type(); - } - - else - v.type = buildFloatType(); - - v.tex_mapped = false; - v.cons_mapped = false; - v.original_name = ro_refs[i]->name(); - - //derick -- adding texture and constant mapping - if ( texture != NULL) - v.tex_mapped = (texture->is_array_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars - if (v.tex_mapped){ - printf("this variable %s is mapped to texture memory", name.c_str()); - } - //derick -- this is commented out until constant memory is implemeted - if ( constant_mem != NULL) - v.cons_mapped = (constant_mem->is_array_mapped(name.c_str()))? true:false; //protonu-track tex mapped vars - if (v.cons_mapped){ - printf("this variable %s is mapped to constant memory", name.c_str()); - } - - //Size of the array = dim1 * dim2 * num bytes of our array type - //If our input array is 2D (non-linearized), we want the actual - //dimentions of the array (as it might be less than cu_n - CG_outputRepr* size; - //Lookup in array_dims - std::map::iterator it = array_dims.find(name.c_str()); - if (isSgPointerType(inArray->get_type()) - && isSgArrayType( - isSgPointerType(inArray->get_type())->get_base_type())) { - //SgArrayType* t = isSgArrayType(isSgArrayType(inArray->get_type())->get_base_type()); - //v.size_2d = t->get_rank(); - SgType* t = isSgPointerType(inArray->get_type())->get_base_type(); - /* SgExprListExp* dimList = t->get_dim_info(); - SgExpressionPtrList::iterator j= dimList->get_expressions().begin(); - SgExpression* expr=NULL; - for (; j != dimList->get_expressions().end(); j++) - expr = *j; - */ - //v.size_2d = 1; - if (isSgIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedLongVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgAddOp(isSgArrayType(t)->get_index())) { - SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index()); - - SgExpression *lhs = op_add->get_lhs_operand(); - SgExpression *rhs = op_add->get_rhs_operand(); - - if (isSgIntVal(lhs)) - size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value()); - else if (isSgUnsignedIntVal(lhs)) - size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value() - + (int) isSgUnsignedIntVal(rhs)->get_value(); - else if (isSgUnsignedLongVal(lhs)) - size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongLongIntVal(lhs)) - size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - size_n_d = (int) (isSgLongIntVal(lhs)->get_value() - + isSgLongIntVal(rhs)->get_value()); - else if (isSgUnsignedLongLongIntVal(lhs)) - size_n_d = - (int) (isSgUnsignedLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongLongIntVal(rhs)->get_value()); - - } - t = isSgArrayType(t)->get_base_type(); - while (isSgArrayType(t)) { - int dim; - if (isSgIntVal(isSgArrayType(t)->get_index())) - dim = - (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedLongVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgAddOp(isSgArrayType(t)->get_index())) { - SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index()); - - SgExpression *lhs = op_add->get_lhs_operand(); - SgExpression *rhs = op_add->get_rhs_operand(); - - if (isSgIntVal(lhs)) - dim = (int) isSgIntVal(lhs)->get_value() - + (int) (isSgIntVal(rhs)->get_value()); - else if (isSgUnsignedIntVal(lhs)) - dim = (int) isSgUnsignedIntVal(lhs)->get_value() - + (int) isSgUnsignedIntVal(rhs)->get_value(); - else if (isSgUnsignedLongVal(lhs)) - dim = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - dim = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongLongIntVal(lhs)) - dim = (int) (isSgLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - dim = (int) (isSgLongIntVal(lhs)->get_value() - + isSgLongIntVal(rhs)->get_value()); - else if (isSgUnsignedLongLongIntVal(lhs)) - dim = - (int) (isSgUnsignedLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongLongIntVal(rhs)->get_value()); - - } - size_n_d *= dim; - v.size_multi_dim.push_back(dim); - t = isSgArrayType(t)->get_base_type(); - } - if (cudaDebug) - printf("Detected Multi-dimensional array sized of %d for %s\n", - size_n_d, (char*) ro_refs[i]->name().c_str()); - size = ocg->CreateInt(size_n_d); - } else if (isSgArrayType(inArray->get_type()) - && isSgArrayType( - isSgArrayType(inArray->get_type())->get_base_type())) { - //SgArrayType* t = isSgArrayType(isSgArrayType(inArray->get_type())->get_base_type()); - //v.size_2d = t->get_rank(); - SgType* t = inArray->get_type(); - /* SgExprListExp* dimList = t->get_dim_info(); - SgExpressionPtrList::iterator j= dimList->get_expressions().begin(); - SgExpression* expr=NULL; - for (; j != dimList->get_expressions().end(); j++) - expr = *j; - */ - - if (isSgIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedLongVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = - (int) (isSgLongIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongLongIntVal(isSgArrayType(t)->get_index())) - size_n_d = (int) (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgAddOp(isSgArrayType(t)->get_index())) { - SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index()); - - SgExpression *lhs = op_add->get_lhs_operand(); - SgExpression *rhs = op_add->get_rhs_operand(); - - if (isSgIntVal(lhs)) - size_n_d = (int) isSgIntVal(lhs)->get_value() + (int) (isSgIntVal(rhs)->get_value()); - else if (isSgUnsignedIntVal(lhs)) - size_n_d = (int) isSgUnsignedIntVal(lhs)->get_value() - + (int) isSgUnsignedIntVal(rhs)->get_value(); - else if (isSgUnsignedLongVal(lhs)) - size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - size_n_d = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongLongIntVal(lhs)) - size_n_d = (int) (isSgLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - size_n_d = (int) (isSgLongIntVal(lhs)->get_value() - + isSgLongIntVal(rhs)->get_value()); - else if (isSgUnsignedLongLongIntVal(lhs)) - size_n_d = - (int) (isSgUnsignedLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongLongIntVal(rhs)->get_value()); - - } - t = isSgArrayType(t)->get_base_type(); - while (isSgArrayType(t)) { - int dim; - if (isSgIntVal(isSgArrayType(t)->get_index())) - dim = - (int) (isSgIntVal(isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedLongVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgLongIntVal(isSgArrayType(t)->get_index())) - dim = (int) (isSgLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())) - dim = (int) (isSgUnsignedLongLongIntVal( - isSgArrayType(t)->get_index())->get_value()); - else if (isSgAddOp(isSgArrayType(t)->get_index())) { - SgAddOp *op_add = isSgAddOp(isSgArrayType(t)->get_index()); - - SgExpression *lhs = op_add->get_lhs_operand(); - SgExpression *rhs = op_add->get_rhs_operand(); - - if (isSgIntVal(lhs)) - dim = (int) isSgIntVal(lhs)->get_value() - + (int) (isSgIntVal(rhs)->get_value()); - else if (isSgUnsignedIntVal(lhs)) - dim = (int) isSgUnsignedIntVal(lhs)->get_value() - + (int) isSgUnsignedIntVal(rhs)->get_value(); - else if (isSgUnsignedLongVal(lhs)) - dim = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - dim = (int) (isSgUnsignedLongVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongLongIntVal(lhs)) - dim = (int) (isSgLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongVal(rhs)->get_value()); - else if (isSgLongIntVal(lhs)) - dim = (int) (isSgLongIntVal(lhs)->get_value() - + isSgLongIntVal(rhs)->get_value()); - else if (isSgUnsignedLongLongIntVal(lhs)) - dim = - (int) (isSgUnsignedLongLongIntVal(lhs)->get_value() - + isSgUnsignedLongLongIntVal(rhs)->get_value()); - - } - size_n_d *= dim; - v.size_multi_dim.push_back(dim); - t = isSgArrayType(t)->get_base_type(); - } - if (cudaDebug) - printf("Detected Multi-Dimensional array sized of %d for %s\n", - size_n_d, (char*) ro_refs[i]->name().c_str()); - size = ocg->CreateInt(size_n_d); - } - - else if (it != array_dims.end()) { - int ref_size = it->second; - // v.var_ref_size = ref_size; - size = ocg->CreateInt(ref_size); - } else { - if (dim1) { - size = ocg->CreateTimes( - new CG_roseRepr(isSgExpression(buildVarRefExp(dim1))), - new CG_roseRepr(isSgExpression(buildVarRefExp(dim2)))); - } else { - char buf[1024]; - sprintf(buf, - "CudaizeCodeGen: Array reference %s does not have a " - "detectable size or specififed dimentions", - name.c_str()); - throw std::runtime_error(buf); - } - } - v.size_expr = - static_cast(ocg->CreateTimes(size, - new omega::CG_roseRepr( - isSgExpression(buildSizeOfOp(v.type)))))->GetExpression(); - - v.in_data = inArray; - v.out_data = 0; - arrayVars.push_back(v); - } - - if (arrayVars.size() < 2) { - fprintf(stderr, - "cudaize error: Did not find two arrays being accessed\n"); - return false; - } - - //protonu--debugging tool--the printf statement - //tex_mem_on signals use of tex mem - /* derick -- texmapping near malloc mcopy - for(int i=0; iStmtListAppend(setup_code, new CG_roseRepr(tnl)); - } - else { - SgVariableDeclaration* defn = buildVariableDeclaration( - SgName(arrayVars[i].name.c_str()), - buildPointerType(arrayVars[i].type)); - SgInitializedNamePtrList& variables = defn->get_variables(); - SgInitializedNamePtrList::const_iterator j = variables.begin(); - SgInitializedName* initializedName = *j; - SgVariableSymbol* dvs = new SgVariableSymbol(initializedName); - prependStatement(defn, func_body); - - dvs->set_parent(body_symtab); - body_symtab->insert(SgName(arrayVars[i].name.c_str()), dvs); - -// SgVariableSymbol* dvs = body_symtab->find_variable(SgName(arrayVars[i].name.c_str())); - - // if(dvs == NULL) - // dvs = parameter_symtab->find_variable(SgName(arrayVars[i].name.c_str())); - - //cudaMalloc args - // SgBasicBlock* block = buildBasicBlock(); - SgName name_cuda_malloc("cudaMalloc"); - SgFunctionDeclaration * decl_cuda_malloc = - buildNondefiningFunctionDeclaration(name_cuda_malloc, - buildVoidType(), buildFunctionParameterList(), globals); - - SgName name_cuda_copy("cudaMemcpy"); - SgFunctionDeclaration * decl_cuda_copy = - buildNondefiningFunctionDeclaration(name_cuda_copy, - buildVoidType(), buildFunctionParameterList(), globals); - - SgExprListExp* args = buildExprListExp(); - args->append_expression( - buildCastExp(buildAddressOfOp(buildVarRefExp(dvs)), - buildPointerType(buildPointerType(buildVoidType())))); - args->append_expression(arrayVars[i].size_expr); - -// decl_cuda_malloc->get_parameterList()->append_arg - SgFunctionCallExp *the_call = buildFunctionCallExp( - buildFunctionRefExp(decl_cuda_malloc), args); - - SgExprStatement* stmt = buildExprStatement(the_call); - - // (*replacement_list).push_back (stmt); - - SgStatementPtrList* tnl = new SgStatementPtrList; - (*tnl).push_back(stmt); - setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl)); - if (arrayVars[i].in_data) { - - SgExprListExp * cuda_copy_in_args = buildExprListExp(); - cuda_copy_in_args->append_expression( - isSgExpression(buildVarRefExp(dvs))); - cuda_copy_in_args->append_expression( - isSgExpression(buildVarRefExp(arrayVars[i].in_data))); - CG_roseRepr* size_exp = new CG_roseRepr(arrayVars[i].size_expr); - cuda_copy_in_args->append_expression( - static_cast(size_exp->clone())->GetExpression()); - cuda_copy_in_args->append_expression( - buildOpaqueVarRefExp("cudaMemcpyHostToDevice", globals)); - -// cuda_copy_in_args->append_expression( -// new SgVarRefExp(sourceLocation, ) -// ); - SgFunctionCallExp * cuda_copy_in_func_call = buildFunctionCallExp( - buildFunctionRefExp(decl_cuda_copy), cuda_copy_in_args); - - SgExprStatement* stmt = buildExprStatement(cuda_copy_in_func_call); - - SgStatementPtrList *tnl = new SgStatementPtrList; - (*tnl).push_back(stmt); - setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl)); - - if(arrayVars[i].tex_mapped) { - setupTexmappingVar(texture, &arrayVars[i], globals, i, dvs, symtab); - SgStatementPtrList *tnl = new SgStatementPtrList; - cudaBindTexture(texture, &arrayVars[i], globals, tnl); - setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl)); - } - } - } - } - - //Build dimGrid dim3 variables based on loop dimentions and ti/tj - char blockD1[120]; - char blockD2[120]; - if (dim1) { - snprintf(blockD1, 120, "%s/%d", - dim1->get_declaration()->get_name().getString().c_str(), cu_tx); - snprintf(blockD2, 120, "%s/%d", - dim2->get_declaration()->get_name().getString().c_str(), cu_ty); - } else { - snprintf(blockD1, 120, "%d", cu_bx); - snprintf(blockD2, 120, "%d", cu_by); - //snprintf(blockD1, 120, "%d/%d", cu_nx, cu_tx); - //snprintf(blockD2, 120, "%d/%d", cu_ny, cu_ty); - } - - SgInitializedName* arg1 = buildInitializedName("i", buildIntType()); - SgInitializedName* arg2 = buildInitializedName("j", buildIntType()); - SgInitializedName* arg3 = buildInitializedName("k", buildIntType()); - SgName type_name("dim3"); - //SgClassSymbol * type_symbol = globalScope->lookup_class_symbol(type_name); - - //ROSE_ASSERT(type_symbol != NULL); - - //SgClassDeclaration * dim3classdecl = isSgClassDeclaration( - // type_symbol->get_declaration()); - - SgFunctionDeclaration * funcdecl = buildNondefiningFunctionDeclaration( - SgName("dim3"), buildOpaqueType("dim3", globalScope), - //isSgType(dim3classdecl->get_type()), - buildFunctionParameterList(arg1, arg2, arg3), globalScope); - - if (cu_bx && cu_by) - repr = ocg->CreateDim3((const char*) gridName, ocg->CreateInt(cu_bx), - ocg->CreateInt(cu_by)); - else if (cu_bx_repr && cu_by_repr) - repr = ocg->CreateDim3((const char*) gridName, cu_bx_repr, cu_by_repr); - else if (cu_bx_repr) - repr = ocg->CreateDim3((const char*) gridName, cu_bx_repr, - ocg->CreateInt(1)); - setup_code = ocg->StmtListAppend(setup_code, repr); - //SgStatementPtrList* dimList = static_cast(repr)->GetList(); - - //for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++) - // (*replacement_list).push_back (*it); - - // repr = ocg->CreateDim3((const char*)blockName, cu_tx,cu_ty); - - if (cu_tz > 1 || cu_tz_repr) { - - if (cu_tx && cu_ty && cu_tz) - repr = ocg->CreateDim3((char*) blockName, ocg->CreateInt(cu_tx), - ocg->CreateInt(cu_ty), ocg->CreateInt(cu_tz)); - else if (cu_tx_repr && cu_ty_repr && cu_tz_repr) - repr = ocg->CreateDim3((char*) blockName, cu_tx_repr, cu_ty_repr, - cu_tz_repr); - // SgStatementPtrList* dimList = static_cast(repr)->GetList(); - - // for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++) - // (*replacement_list).push_back (*it); - - } else { - if (cu_tx && cu_ty) - repr = ocg->CreateDim3((char*) blockName, ocg->CreateInt(cu_tx), - ocg->CreateInt(cu_ty)); - else if (cu_tx_repr && cu_ty_repr) - repr = ocg->CreateDim3((char*) blockName, cu_tx_repr, cu_ty_repr); - //SgStatementPtrList* dimList = static_cast(repr)->GetList(); - - //for(SgStatementPtrList::iterator it = (*dimList).begin(); it != (*dimList).end(); it++) - // (*replacement_list).push_back (*it); - - } - - setup_code = ocg->StmtListAppend(setup_code, repr); - - SgCudaKernelExecConfig* config = new SgCudaKernelExecConfig( - buildVarRefExp(gridName), buildVarRefExp(blockName), NULL, NULL); - //SgCudaKernelExecConfig* config = new SgCudaKernelExecConfig(buildIntVal(cu_bx), , NULL, NULL); - SgExprListExp* iml = new SgExprListExp(); - SgCastExp* dim_s; - - //Creating Kernel function - SgBasicBlock* bb = new SgBasicBlock(TRANSFORMATION_FILE_INFO); - SgFunctionDefinition* kernel_defn = new SgFunctionDefinition( - TRANSFORMATION_FILE_INFO, bb); - SgFunctionDeclaration* kernel_decl_ = new SgFunctionDeclaration( - TRANSFORMATION_FILE_INFO, SgName((char*)cu_kernel_name.c_str()),buildFunctionType(buildVoidType(), buildFunctionParameterList()), kernel_defn); - SgFunctionDeclaration* kernel_decl = new SgFunctionDeclaration( - TRANSFORMATION_FILE_INFO, SgName((char*)cu_kernel_name.c_str()),buildFunctionType(buildVoidType(), buildFunctionParameterList()), kernel_defn); - - //((kernel_decl->get_declarationModifier()).get_storageModifier()).setStatic(); - - kernel_decl->set_definingDeclaration(kernel_decl); - kernel_defn->set_parent(kernel_decl); - bb->set_parent(kernel_defn); - bb->set_endOfConstruct(TRANSFORMATION_FILE_INFO); - bb->get_endOfConstruct()->set_parent(bb); - - //SgFunctionSymbol* functionSymbol = new SgFunctionSymbol(kernel_decl_); - //globals->insert_symbol(SgName((char*) cu_kernel_name.c_str()), - // functionSymbol); - SgFunctionSymbol* functionSymbol2 = new SgFunctionSymbol(kernel_decl); - - globals->insert_symbol(SgName((char*) cu_kernel_name.c_str()), - functionSymbol2); - - kernel_decl_->set_parent(globals); - - kernel_decl_->set_scope(globals); - - kernel_decl_->setForward(); - - globals->prepend_declaration(kernel_decl_); - - kernel_decl->set_endOfConstruct(TRANSFORMATION_FILE_INFO); - kernel_decl->get_endOfConstruct()->set_parent(kernel_decl); - - kernel_decl->set_parent(globals); - kernel_decl->set_scope(globals); - - kernel_decl->get_definition()->set_endOfConstruct(TRANSFORMATION_FILE_INFO); - kernel_decl->get_definition()->get_endOfConstruct()->set_parent( - kernel_decl->get_definition()); - - globals->append_statement(kernel_decl); - - //printf("%s %s\n", static_cast(cu_kernel_name), dims); - //--derick - kernel function parameters - for (int i = 0; i < arrayVars.size(); i++) - //Throw in a type cast if our kernel takes 2D array notation - //like (float(*) [1024]) - { - //protonu--throwing in another hack to stop the caller from passing tex mapped - //vars to the kernel. - if (arrayVars[i].tex_mapped == true || arrayVars[i].cons_mapped) - continue; - if (!(arrayVars[i].size_multi_dim.empty())) { - //snprintf(dims,120,"(float(*) [%d])%s", arrayVars[i].size_2d, - // const_cast(arrayVars[i].name.c_str())); - - SgType* t = arrayVars[i].type; - for (int k = arrayVars[i].size_multi_dim.size() - 1; k >= 0; k--) { - t = buildArrayType(t, - buildIntVal(arrayVars[i].size_multi_dim[k])); - } - SgVariableSymbol* temp = body_symtab->find_variable( - SgName((char*) arrayVars[i].name.c_str())); - if (temp == NULL) - temp = parameter_symtab->find_variable( - SgName((char*) arrayVars[i].name.c_str())); - - dim_s = buildCastExp(buildVarRefExp(temp), buildPointerType(t), - SgCastExp::e_C_style_cast); - - //printf("%d %s\n", i, dims); - iml->append_expression(dim_s); - - SgInitializedName* id = buildInitializedName( - (char*) arrayVars[i].original_name.c_str(), - buildPointerType(t)); - kernel_decl->get_parameterList()->append_arg(id); - kernel_decl_->get_parameterList()->append_arg(id); - id->set_file_info(TRANSFORMATION_FILE_INFO); - - // DQ (9/8/2007): We now test this, so it has to be set explicitly. - id->set_scope(kernel_decl->get_definition()); - - // DQ (9/8/2007): Need to add variable symbol to global scope! - //printf ("Fixing up the symbol table in scope = %p = %s for SgInitializedName = %p = %s \n",globalScope,globalScope->class_name().c_str(),var1_init_name,var1_init_name->get_name().str()); - SgVariableSymbol *var_symbol = new SgVariableSymbol(id); - kernel_decl->get_definition()->insert_symbol(id->get_name(), - var_symbol); - - // if(kernel_decl->get_definition()->get_symbol_table()->find((const) id) == NULL) - - } else { - //printf("%d %s\n", i, static_cast(arrayVars[i].name)); - SgVariableSymbol* temp = body_symtab->find_variable( - SgName((char*) arrayVars[i].name.c_str())); - if (temp == NULL) - temp = parameter_symtab->find_variable( - SgName((char*) arrayVars[i].name.c_str())); - iml->append_expression(buildVarRefExp(temp)); - SgInitializedName* id = buildInitializedName( - (char*) arrayVars[i].original_name.c_str(), - buildPointerType(arrayVars[i].type)); - kernel_decl->get_parameterList()->append_arg(id); - kernel_decl_->get_parameterList()->append_arg(id); - id->set_file_info(TRANSFORMATION_FILE_INFO); - - // DQ (9/8/2007): We now test this, so it has to be set explicitly. - id->set_scope(kernel_decl->get_definition()); - - // DQ (9/8/2007): Need to add variable symbol to global scope! - //printf ("Fixing up the symbol table in scope = %p = %s for SgInitializedName = %p = %s \n"$ - SgVariableSymbol *var_symbol = new SgVariableSymbol(id); - kernel_decl->get_definition()->insert_symbol(id->get_name(), - var_symbol); - - } - - } - if (dim1) { - iml->append_expression(buildVarRefExp(dim1)); - SgInitializedName* id = buildInitializedName( - dim1->get_name().getString().c_str(), dim1->get_type()); - kernel_decl->get_parameterList()->append_arg(id); - - iml->append_expression(buildVarRefExp(dim2)); - SgInitializedName* id2 = buildInitializedName( - dim2->get_name().getString().c_str(), dim2->get_type()); - - kernel_decl->get_parameterList()->append_arg(id); - kernel_decl_->get_parameterList()->append_arg(id); - } - - kernel_decl->get_functionModifier().setCudaKernel(); - kernel_decl_->get_functionModifier().setCudaKernel(); - SgCudaKernelCallExp * cuda_call_site = new SgCudaKernelCallExp( - TRANSFORMATION_FILE_INFO, buildFunctionRefExp(kernel_decl), iml,config); - - // SgStatementPtrList *tnl2 = new SgStatementPtrList; - - (*replacement_list).push_back(buildExprStatement(cuda_call_site)); - - setup_code = ocg->StmtListAppend(setup_code, - new CG_roseRepr(replacement_list)); - - //cuda free variables - for (int i = 0; i < arrayVars.size(); i++) { - if (arrayVars[i].out_data) { - - SgName name_cuda_copy("cudaMemcpy"); - SgFunctionDeclaration * decl_cuda_copyout = - buildNondefiningFunctionDeclaration(name_cuda_copy, - buildVoidType(), buildFunctionParameterList(), - globals); - - SgExprListExp* args = buildExprListExp(); - SgExprListExp * cuda_copy_out_args = buildExprListExp(); - cuda_copy_out_args->append_expression( - isSgExpression(buildVarRefExp(arrayVars[i].out_data))); - cuda_copy_out_args->append_expression( - isSgExpression(buildVarRefExp(arrayVars[i].name))); - CG_roseRepr* size_exp = new CG_roseRepr(arrayVars[i].size_expr); - cuda_copy_out_args->append_expression( - static_cast(size_exp->clone())->GetExpression()); - cuda_copy_out_args->append_expression( - buildOpaqueVarRefExp("cudaMemcpyDeviceToHost", globals)); - -// cuda_copy_in_args->append_expression( -// new SgVarRefExp(sourceLocation, ) -// ); - SgFunctionCallExp * cuda_copy_out_func_call = buildFunctionCallExp( - buildFunctionRefExp(decl_cuda_copyout), cuda_copy_out_args); - - SgFunctionCallExp *the_call = buildFunctionCallExp( - buildFunctionRefExp(decl_cuda_copyout), cuda_copy_out_args); - - SgExprStatement* stmt = buildExprStatement(the_call); - - SgStatementPtrList* tnl3 = new SgStatementPtrList; - - (*tnl3).push_back(stmt); - - // tree_node_list* tnl = new tree_node_list; - // tnl->append(new tree_instr(the_call)); - setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl3)); - - } - if(!arrayVars[i].cons_mapped) { - SgName name_cuda_free("cudaFree"); - SgFunctionDeclaration * decl_cuda_free = - buildNondefiningFunctionDeclaration(name_cuda_free, - buildVoidType(), buildFunctionParameterList(), globals); - - SgExprListExp* args3 = buildExprListExp(); - - SgVariableSymbol* tmp = body_symtab->find_variable( - SgName(arrayVars[i].name.c_str())); - if (tmp == NULL) - tmp = parameter_symtab->find_variable( - SgName(arrayVars[i].name.c_str())); - - args3->append_expression(buildVarRefExp(tmp)); - - SgFunctionCallExp *the_call2 = buildFunctionCallExp( - buildFunctionRefExp(decl_cuda_free), args3); - - SgExprStatement* stmt2 = buildExprStatement(the_call2); - - SgStatementPtrList* tnl4 = new SgStatementPtrList; - - (*tnl4).push_back(stmt2); - //(*replacement_list).push_back (stmt2); - - setup_code = ocg->StmtListAppend(setup_code, new CG_roseRepr(tnl4)); - } - } - - // --------------- - // BUILD THE KERNEL - // --------------- - - //Extract out kernel body - SgNode* code = getCode(); - //Create kernel function body - //Add Params - std::map loop_vars; - //In-Out arrays - for (int i = 0; i < arrayVars.size(); i++) { - /* if(arrayVars[i].in_data) - fptr = arrayVars[i].in_data->type()->clone(); - else - fptr = arrayVars[i].out_data->type()->clone(); - */ - - // fptr = new_proc_syms->install_type(fptr); - std::string name = - arrayVars[i].in_data ? - arrayVars[i].in_data->get_declaration()->get_name().getString() : - arrayVars[i].out_data->get_declaration()->get_name().getString(); - //SgVariableSymbol* sym = new var_sym(fptr, arrayVars[i].in_data ? arrayVars[i].in_data->name() : arrayVars[i].out_data->name()); - - SgVariableSymbol* sym = - kernel_decl->get_definition()->get_symbol_table()->find_variable( - (const char*) name.c_str()); - /* SgVariableDeclaration* defn = buildVariableDeclaration(SgName(name.c_str()), buildFloatType()); - SgInitializedNamePtrList& variables = defn->get_variables(); - SgInitializedNamePtrList::const_iterator i = variables.begin(); - SgInitializedName* initializedName = *i; - SgVariableSymbol* sym = new SgVariableSymbol(initializedName); - prependStatement(defn, isSgScopeStatement(root_)); - - vs->set_parent(symtab2_); - symtab2_->insert(SgName(_s.c_str()), vs); - */ - - if (sym != NULL) - loop_vars.insert( - std::pair(std::string(name), - sym)); - } - - //Figure out which loop variables will be our thread and block dimention variables - std::vector loop_syms; - //Get our indexes - std::vector indexes; // = get_loop_indexes(code,cu_num_reduce); - int threadsPos = 0; - - CG_outputRepr *body = NULL; - SgFunctionDefinition* func_d = func_definition; - //std::vector symbols = recursiveFindRefs(code); - - SgName name_sync("__syncthreads"); - SgFunctionDeclaration * decl_sync = buildNondefiningFunctionDeclaration( - name_sync, buildVoidType(), buildFunctionParameterList(), - globalScope); - - recursiveFindRefs(code, syms, func_d); - - //SgFunctionDeclaration* func = Outliner::generateFunction (code, (char*)cu_kernel_name.c_str(), syms, pdSyms, psyms, NULL, globalScope); - - if (cu_bx > 1 || cu_bx_repr) { - indexes.push_back("bx"); - SgName type_name("blockIdx.x"); - SgClassSymbol * type_symbol = globalScope->lookup_class_symbol( - type_name); - SgVariableDeclaration * var_decl = buildVariableDeclaration("bx", - buildIntType(), NULL, - isSgScopeStatement(kernel_decl->get_definition()->get_body())); - SgStatementPtrList *tnl = new SgStatementPtrList; - // (*tnl).push_back(isSgStatement(var_decl)); - appendStatement(var_decl, kernel_decl->get_definition()->get_body()); - - SgVariableSymbol* bx = - kernel_decl->get_definition()->get_body()->lookup_variable_symbol( - SgName("bx")); - SgStatement* assign = isSgStatement( - buildAssignStatement(buildVarRefExp(bx), - buildOpaqueVarRefExp("blockIdx.x", - kernel_decl->get_definition()->get_body()))); - (*tnl).push_back(assign); - // body = ocg->StmtListAppend(body, - // new CG_roseRepr(tnl)); - appendStatement(assign, kernel_decl->get_definition()->get_body()); - - } - if (cu_by > 1 || cu_by_repr) { - indexes.push_back("by"); - SgName type_name("blockIdx.y"); - SgClassSymbol * type_symbol = globalScope->lookup_class_symbol( - type_name); - SgVariableDeclaration * var_decl = buildVariableDeclaration("by", - buildIntType(), NULL, - isSgScopeStatement(kernel_decl->get_definition()->get_body())); - // SgStatementPtrList *tnl = new SgStatementPtrList; - // (*tnl).push_back(isSgStatement(var_decl)); - appendStatement(var_decl, kernel_decl->get_definition()->get_body()); - - SgVariableSymbol* by = - kernel_decl->get_definition()->get_body()->lookup_variable_symbol( - SgName("by")); - SgStatement* assign = isSgStatement( - buildAssignStatement(buildVarRefExp(by), - buildOpaqueVarRefExp("blockIdx.y", - kernel_decl->get_definition()->get_body()))); - //(*tnl).push_back(assign); - // body = ocg->StmtListAppend(body, - // new CG_roseRepr(tnl)); - appendStatement(assign, kernel_decl->get_definition()->get_body()); - - } - if (cu_tx_repr || cu_tx > 1) { - threadsPos = indexes.size(); - indexes.push_back("tx"); - SgName type_name("threadIdx.x"); - SgClassSymbol * type_symbol = globalScope->lookup_class_symbol( - type_name); - SgVariableDeclaration * var_decl = buildVariableDeclaration("tx", - buildIntType(), NULL, - isSgScopeStatement(kernel_decl->get_definition()->get_body())); - // SgStatementPtrList *tnl = new SgStatementPtrList; - // (*tnl).push_back(isSgStatement(var_decl)); - appendStatement(var_decl, kernel_decl->get_definition()->get_body()); - - SgVariableSymbol* tx = - kernel_decl->get_definition()->get_body()->lookup_variable_symbol( - SgName("tx")); - SgStatement* assign = isSgStatement( - buildAssignStatement(buildVarRefExp(tx), - buildOpaqueVarRefExp("threadIdx.x", - kernel_decl->get_definition()->get_body()))); - //(*tnl).push_back(assign); - // body = ocg->StmtListAppend(body, - // new CG_roseRepr(tnl)); - appendStatement(assign, kernel_decl->get_definition()->get_body()); - - } - if (cu_ty_repr || cu_ty > 1) { - indexes.push_back("ty"); - SgName type_name("threadIdx.y"); - SgClassSymbol * type_symbol = globalScope->lookup_class_symbol( - type_name); - SgVariableDeclaration * var_decl = buildVariableDeclaration("ty", - buildIntType(), NULL, - isSgScopeStatement(kernel_decl->get_definition()->get_body())); - appendStatement(var_decl, kernel_decl->get_definition()->get_body()); - - // SgStatementPtrList *tnl = new SgStatementPtrList; - // (*tnl).push_back(isSgStatement(var_decl)); - SgVariableSymbol* ty = - kernel_decl->get_definition()->get_body()->lookup_variable_symbol( - SgName("ty")); - SgStatement* assign = isSgStatement( - buildAssignStatement(buildVarRefExp(ty), - buildOpaqueVarRefExp("threadIdx.y", - kernel_decl->get_definition()->get_body()))); - // (*tnl).push_back(assign); - // body = ocg->StmtListAppend(body, - // new CG_roseRepr(tnl)); - appendStatement(assign, kernel_decl->get_definition()->get_body()); - - } - if (cu_tz_repr || cu_tz > 1) { - indexes.push_back("tz"); - SgName type_name("threadIdx.z"); - SgClassSymbol * type_symbol = globalScope->lookup_class_symbol( - type_name); - SgVariableDeclaration * var_decl = buildVariableDeclaration("tz", - buildIntType(), NULL, - isSgScopeStatement(kernel_decl->get_definition()->get_body())); - // SgStatementPtrList *tnl = new SgStatementPtrList; - // (*tnl).push_back(isSgStatement(var_decl)); - appendStatement(var_decl, kernel_decl->get_definition()->get_body()); - - SgVariableSymbol* tz = - kernel_decl->get_definition()->get_body()->lookup_variable_symbol( - "tz"); - SgStatement* assign = isSgStatement( - buildAssignStatement(buildVarRefExp(tz), - buildOpaqueVarRefExp("threadIdx.z", - kernel_decl->get_definition()->get_body()))); - // (*tnl).push_back(assign); - // body = ocg->StmtListAppend(body, - // new CG_roseRepr(tnl)); - appendStatement(assign, kernel_decl->get_definition()->get_body()); - - } - - std::map loop_idxs; //map from idx names to their new syms - - SgNode* swapped_ = swapVarReferences(code, syms, - kernel_decl->get_definition()->get_symbol_table(), - kernel_decl->get_definition()->get_body()->get_symbol_table(), - kernel_decl->get_definition()->get_body()); - - //std::cout << swapped_->unparseToString() << std::endl << std::endl; - - SgNode *swapped = recursiveFindReplacePreferedIdxs(swapped_, - kernel_decl->get_definition()->get_body()->get_symbol_table(), - kernel_decl->get_definition()->get_symbol_table(), - kernel_decl->get_definition()->get_body(), loop_idxs, globalScope); //in-place swapping - //swapped->print(); - - if (!isSgBasicBlock(swapped)) { - appendStatement(isSgStatement(swapped), - kernel_decl->get_definition()->get_body()); - swapped->set_parent( - isSgNode(kernel_decl->get_definition()->get_body())); - } else { - - for (SgStatementPtrList::iterator it = - isSgBasicBlock(swapped)->get_statements().begin(); - it != isSgBasicBlock(swapped)->get_statements().end(); it++) { - appendStatement(*it, kernel_decl->get_definition()->get_body()); - (*it)->set_parent( - isSgNode(kernel_decl->get_definition()->get_body())); - - } - - } - - for (int i = 0; i < indexes.size(); i++) { - std::vector tfs = findCommentedFors(indexes[i], - swapped); - for (int k = 0; k < tfs.size(); k++) { - //printf("replacing %p tfs for index %s\n", tfs[k], indexes[i]); - SgNode* newBlock = forReduce(tfs[k], loop_idxs[indexes[i]], - kernel_decl->get_definition()); - //newBlock->print(); - swap_node_for_node_list(tfs[k], newBlock); - //printf("AFTER SWAP\n"); newBlock->print(); - } - } - - //--derick replace array refs of texture mapped vars here - body = new CG_roseRepr(kernel_decl->get_definition()->get_body()); - std::vector refs = ir->FindArrayRef(body); - texmapArrayRefs(texture, &refs, globals, ir, ocg); - // do the same for constant mapped vars - consmapArrayRefs(constant_mem, &refs, globals, ir, ocg); - - return swapped; -} - -//Order taking out dummy variables -std::vector cleanOrder(std::vector idxNames) { - std::vector results; - for (int j = 0; j < idxNames.size(); j++) { - if (idxNames[j].length() != 0) - results.push_back(idxNames[j]); - } - return results; -} - -//First non-dummy level in ascending order -int LoopCuda::nonDummyLevel(int stmt, int level) { - //level comes in 1-basd and should leave 1-based - for (int j = level - 1; j < idxNames[stmt].size(); j++) { - if (idxNames[stmt][j].length() != 0) { - //printf("found non dummy level of %d with idx: %s when searching for %d\n", j+1, (const char*) idxNames[stmt][j], level); - return j + 1; - } - } - char buf[128]; - sprintf(buf, "%d", level); - throw std::runtime_error( - std::string("Unable to find a non-dummy level starting from ") - + std::string(buf)); -} - -int LoopCuda::findCurLevel(int stmt, std::string idx) { - for (int j = 0; j < idxNames[stmt].size(); j++) { - if (strcmp(idxNames[stmt][j].c_str(), idx.c_str()) == 0) - return j + 1; - } - throw std::runtime_error( - std::string("Unable to find index ") + idx - + std::string(" in current list of indexes")); -} - -void LoopCuda::permute_cuda(int stmt, - const std::vector& curOrder) { - //printf("curOrder: "); - //printVs(curOrder); - //printf("idxNames: "); - //printVS(idxNames[stmt]); - std::vector cIdxNames = cleanOrder(idxNames[stmt]); - bool same = true; - std::vector pi; - for (int i = 0; i < curOrder.size(); i++) { - bool found = false; - for (int j = 0; j < cIdxNames.size(); j++) { - if (strcmp(cIdxNames[j].c_str(), curOrder[i].c_str()) == 0) { - pi.push_back(j + 1); - found = true; - if (j != i) - same = false; - } - } - if (!found) { - throw std::runtime_error( - "One of the indexes in the permute order were not " - "found in the current set of indexes."); - } - } - for (int i = curOrder.size(); i < cIdxNames.size(); i++) { - pi.push_back(i); - } - if (same) - return; - permute(stmt, pi); - //Set old indexe names as new - for (int i = 0; i < curOrder.size(); i++) { - idxNames[stmt][i] = curOrder[i].c_str(); //what about sibling stmts? - } -} - -bool LoopCuda::permute(int stmt_num, const std::vector &pi) { -// check for sanity of parameters - if (stmt_num >= stmt.size() || stmt_num < 0) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - const int n = stmt[stmt_num].xform.n_out(); - if (pi.size() > (n - 1) / 2) - throw std::invalid_argument( - "iteration space dimensionality does not match permute dimensionality"); - int first_level = 0; - int last_level = 0; - for (int i = 0; i < pi.size(); i++) { - if (pi[i] > (n - 1) / 2 || pi[i] <= 0) - throw std::invalid_argument( - "invalid loop level " + to_string(pi[i]) - + " in permuation"); - - if (pi[i] != i + 1) { - if (first_level == 0) - first_level = i + 1; - last_level = i + 1; - } - } - if (first_level == 0) - return true; - - std::vector lex = getLexicalOrder(stmt_num); - std::set active = getStatements(lex, 2 * first_level - 2); - Loop::permute(active, pi); -} - -void LoopCuda::tile_cuda(int stmt, int level, int outer_level) { - tile_cuda(stmt, level, 1, outer_level, "", "", CountedTile); -} -void LoopCuda::tile_cuda(int level, int tile_size, int outer_level, - std::string idxName, std::string ctrlName, TilingMethodType method) { - tile_cuda(0, level, tile_size, outer_level, idxName, ctrlName, method); -} - -void LoopCuda::tile_cuda(int stmt, int level, int tile_size, int outer_level, - std::string idxName, std::string ctrlName, TilingMethodType method) { - //Do regular tile but then update the index and control loop variable - //names as well as the idxName to reflect the current state of things. - //printf("tile(%d,%d,%d,%d)\n", stmt, level, tile_size, outer_level); - //printf("idxNames before: "); - //printVS(idxNames[stmt]); - - tile(stmt, level, tile_size, outer_level, method); - - if (idxName.size()) - idxNames[stmt][level - 1] = idxName.c_str(); - if (tile_size == 1) { - //potentially rearrange loops - if (outer_level < level) { - std::string tmp = idxNames[stmt][level - 1]; - for (int i = level - 1; i > outer_level - 1; i--) { - if (i - 1 >= 0) - idxNames[stmt][i] = idxNames[stmt][i - 1]; - } - idxNames[stmt][outer_level - 1] = tmp; - } - //TODO: even with a tile size of one, you need a insert (of a dummy loop) - idxNames[stmt].insert(idxNames[stmt].begin() + (level), ""); - } else { - if (!ctrlName.size()) - throw std::runtime_error("No ctrl loop name for tile"); - //insert - idxNames[stmt].insert(idxNames[stmt].begin() + (outer_level - 1), - ctrlName.c_str()); - } - - //printf("idxNames after: "); - //printVS(idxNames[stmt]); -} - -bool LoopCuda::datacopy_privatized_cuda(int stmt_num, int level, - const std::string &array_name, - const std::vector &privatized_levels, bool allow_extra_read, - int fastest_changing_dimension, int padding_stride, - int padding_alignment, bool cuda_shared) { - int old_stmts = stmt.size(); - // printf("before datacopy_privatized:\n"); - printIS(); - //datacopy_privatized(stmt_num, level, array_name, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, cuda_shared); - if (cuda_shared) - datacopy_privatized(stmt_num, level, array_name, privatized_levels, - allow_extra_read, fastest_changing_dimension, padding_stride, - padding_alignment, 1); - else - datacopy_privatized(stmt_num, level, array_name, privatized_levels, - allow_extra_read, fastest_changing_dimension, padding_stride, - padding_alignment, 0); - // printf("after datacopy_privatized:\n"); - printIS(); - - //Adjust idxNames to reflect updated state - std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); - int new_stmts = stmt.size(); - for (int i = old_stmts; i < new_stmts; i++) { - //printf("fixing up statement %d\n", i); - std::vector idxs; - - //protonu-making sure the vector of nonSplitLevels grows along with - //the statement structure - stmt_nonSplitLevels.push_back(std::vector()); - - //Indexes up to level will be the same - for (int j = 0; j < level - 1; j++) - idxs.push_back(cIdxNames[j]); - - //Expect privatized_levels to match - for (int j = 0; j < privatized_levels.size(); j++) - idxs.push_back(cIdxNames[privatized_levels[j] - 1]);//level is one-based - - //all further levels should match order they are in originally - if (privatized_levels.size()) { - int last_privatized = privatized_levels.back(); - int top_level = last_privatized - + (stmt[i].IS.n_set() - idxs.size()); - //printf("last privatized_levels: %d top_level: %d\n", last_privatized, top_level); - for (int j = last_privatized; j < top_level; j++) { - idxs.push_back(cIdxNames[j]); - //printf("pushing back: %s\n", (const char*)cIdxNames[j]); - } - } - idxNames.push_back(idxs); - } -} - -bool LoopCuda::datacopy_cuda(int stmt_num, int level, - const std::string &array_name, - const std::vector new_idxs, - bool allow_extra_read, int fastest_changing_dimension, - int padding_stride, int padding_alignment, bool cuda_shared) { - - int old_stmts = stmt.size(); - //datacopy(stmt_num,level,array_name,allow_extra_read,fastest_changing_dimension,padding_stride,padding_alignment,cuda_shared); - // printf("before datacopy:\n"); - // printIS(); - if (cuda_shared) - datacopy(stmt_num, level, array_name, allow_extra_read, - fastest_changing_dimension, padding_stride, padding_alignment, - 1); - else - datacopy(stmt_num, level, array_name, allow_extra_read, - fastest_changing_dimension, padding_stride, padding_alignment, - 0); - // printf("after datacopy:\n"); - printIS(); - - //Adjust idxNames to reflect updated state - std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); - int new_stmts = stmt.size(); - for (int i = old_stmts; i < new_stmts; i++) { - //printf("fixing up statement %d\n", i); - std::vector idxs; - - //protonu-making sure the vector of nonSplitLevels grows along with - //the statement structure - stmt_nonSplitLevels.push_back(std::vector()); - - //Indexes up to level will be the same - for (int j = 0; j < level - 1; j++) - idxs.push_back(cIdxNames[j]); - - //all further levels should get names from new_idxs - int top_level = stmt[i].IS.n_set(); - //printf("top_level: %d level: %d\n", top_level, level); - if (new_idxs.size() < top_level - level + 1) - throw std::runtime_error( - "Need more new index names for new datacopy loop levels"); - - for (int j = level - 1; j < top_level; j++) { - idxs.push_back(new_idxs[j - level + 1].c_str()); - //printf("pushing back: %s\n", new_idxs[j-level+1].c_str()); - } - idxNames.push_back(idxs); - } -} - -bool LoopCuda::unroll_cuda(int stmt_num, int level, int unroll_amount) { - int old_stmts = stmt.size(); - //bool b= unroll(stmt_num, , unroll_amount); - - int dim = 2 * level - 1; - std::vector lex = getLexicalOrder(stmt_num); - std::set same_loop = getStatements(lex, dim - 1); - - level = nonDummyLevel(stmt_num, level); - //printf("unrolling %d at level %d\n", stmt_num,level); - - //protonu--using the new version of unroll, which returns - //a set of ints instead of a bool. To keep Gabe's logic - //I'll check the size of the set, if it's 0 return true - //bool b= unroll(stmt_num, level, unroll_amount); - std::set b_set = unroll(stmt_num, level, unroll_amount, idxNames); - bool b = false; - if (b_set.size() == 0) - b = true; - //end--protonu - - //Adjust idxNames to reflect updated state - std::vector cIdxNames = cleanOrder(idxNames[stmt_num]); - std::vector origSource = idxNames[stmt_num]; - ; - //Drop index names at level - if (unroll_amount == 0) { - //For all statements that were in this unroll together, drop index name for unrolled level - idxNames[stmt_num][level - 1] = ""; - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) { - //printf("in same loop as %d is %d\n", stmt_num, (*i)); - //idxNames[(*i)][level-1] = ""; - idxNames[(*i)] = idxNames[stmt_num]; - } - } - - lex = getLexicalOrder(stmt_num); - same_loop = getStatements(lex, dim - 1); - - bool same_as_source = false; - int new_stmts = stmt.size(); - for (int i = old_stmts; i < new_stmts; i++) { - //Check whether we had a sync for the statement we are unrolling, if - //so, propogate that to newly created statements so that if they are - //in a different loop structure, they will also get a syncthreads - int size = syncs.size(); - for (int j = 0; j < size; j++) { - if (syncs[j].first == stmt_num) - syncs.push_back(make_pair(i, syncs[j].second)); - } - - //protonu-making sure the vector of nonSplitLevels grows along with - //the statement structure - stmt_nonSplitLevels.push_back(std::vector()); - - //We expect that new statements have a constant for the variable in - //stmt[i].IS at level (as seen with print_with_subs), otherwise there - //will be a for loop at level and idxNames should match stmt's - //idxNames pre-unrolled - Relation IS = stmt[i].IS; - //Ok, if you know how the hell to get anything out of a Relation, you - //should probably be able to do this more elegantly. But for now, I'm - //hacking it. - std::string s = IS.print_with_subs_to_string(); - //s looks looks like - //{[_t49,8,_t51,_t52,128]: 0 <= _t52 <= 3 && 0 <= _t51 <= 15 && 0 <= _t49 && 64_t49+16_t52+_t51 <= 128} - //where level == 5, you see a integer in the input set - - //If that's not an integer and this is the first new statement, then - //we think codegen will have a loop at that level. It's not perfect, - //not sure if it can be determined without round-tripping to codegen. - int sIdx = 0; - int eIdx = 0; - for (int j = 0; j < level - 1; j++) { - sIdx = s.find(",", sIdx + 1); - if (sIdx < 0) - break; - } - if (sIdx > 0) { - eIdx = s.find("]"); - int tmp = s.find(",", sIdx + 1); - if (tmp > 0 && tmp < eIdx) - eIdx = tmp; //", before ]" - if (eIdx > 0) { - sIdx++; - std::string var = s.substr(sIdx, eIdx - sIdx); - //printf("%s\n", s.c_str()); - //printf("set var for stmt %d at level %d is %s\n", i, level, var.c_str()); - if (atoi(var.c_str()) == 0 && i == old_stmts) { - //TODO:Maybe do see if this new statement would be in the same - //group as the original and if it would, don't say - //same_as_source - if (same_loop.find(i) == same_loop.end()) { - printf( - "stmt %d level %d, newly created unroll statement should have same level indexes as source\n", - i, level); - same_as_source = true; - } - } - } - } - - //printf("fixing up statement %d n_set %d with %d levels\n", i, stmt[i].IS.n_set(), level-1); - if (same_as_source) - idxNames.push_back(origSource); - else - idxNames.push_back(idxNames[stmt_num]); - } - - return b; -} - -void LoopCuda::copy_to_texture(const char *array_name) { - //protonu--placeholder for now - //set the bool for using cuda memory as true - //in a vector of strings, put the names of arrays to tex mapped - if (!texture) - texture = new texture_memory_mapping(true, array_name); - else - texture->add(array_name); - -} - -//void LoopCuda::copy_to_texture_2d(const char *array_name, int width, int height) { -// if (!texture) -// texture = new texture_memory_mapping(true, array_name, width, height); -// else -// texture->add(array_name, width, height); -//} - -void LoopCuda::copy_to_constant(const char *array_name) { - if(!constant_mem) - constant_mem = new constant_memory_mapping(true, array_name); - else - constant_mem->add(array_name); -} - -//protonu--moving this from Loop -SgNode* LoopCuda::codegen() { - if (code_gen_flags & GenCudaizeV2) - return cudaize_codegen_v2(); - //Do other flagged codegen methods, return plain vanilla generated code - return getCode(); -} - -//These three are in Omega code_gen.cc and are used as a massive hack to -//get out some info from MMGenerateCode. Yea for nasty side-effects. -namespace omega { - extern int checkLoopLevel; - extern int stmtForLoopCheck; - extern int upperBoundForLevel; - extern int lowerBoundForLevel; -} - -CG_outputRepr* LoopCuda::extractCudaUB(int stmt_num, int level, - int &outUpperBound, int &outLowerBound) { - // check for sanity of parameters - const int m = stmt.size(); - if (stmt_num >= m || stmt_num < 0) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - const int n = stmt[stmt_num].xform.n_out(); - if (level > (n - 1) / 2 || level <= 0) - throw std::invalid_argument("invalid loop level " + to_string(level)); - - int dim = 2 * level - 1; - - std::vector lex = getLexicalOrder(stmt_num); - std::set same_loop = getStatements(lex, dim - 1); - - // extract the intersection of the iteration space to be considered - Relation hull; - { - hull = Relation::True(n); - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) { - Relation r = getNewIS(*i); - for (int j = dim + 2; j <= r.n_set(); j++) - r = Project(r, r.set_var(j)); - hull = Intersection(hull, r); - hull.simplify(2, 4); - } - - for (int i = 2; i <= dim + 1; i += 2) { - //std::string name = std::string("_t") + to_string(t_counter++); - std::string name = std::string("_t") - + to_string(tmp_loop_var_name_counter++); - hull.name_set_var(i, name); - } - hull.setup_names(); - } - - // extract the exact loop bound of the dimension to be unrolled - if (is_single_iteration(hull, dim)) { - throw std::runtime_error( - "No loop availabe at level to extract upper bound."); - } - Relation bound = get_loop_bound(hull, dim); - if (!bound.has_single_conjunct() || !bound.is_satisfiable() - || bound.is_tautology()) - throw loop_error( - "loop error: unable to extract loop bound for cudaize"); - - // extract the loop stride - EQ_Handle stride_eq; - /*int stride = 1; - { - bool simple_stride = true; - int strides = countStrides(bound.query_DNF()->single_conjunct(), - bound.set_var(dim + 1), stride_eq, simple_stride); - if (strides > 1) - throw loop_error("loop error: too many strides"); - else if (strides == 1) { - int sign = stride_eq.get_coef(bound.set_var(dim + 1)); - // assert(sign == 1 || sign == -1); - Constr_Vars_Iter it(stride_eq, true); - stride = abs((*it).coef / sign); - } - } - */ - int stride = 1; - { - - coef_t stride; - std::pair result = find_simplest_stride(bound, - bound.set_var(dim + 1)); - if (result.second == NULL) - stride = 1; - else - stride = abs(result.first.get_coef(result.second)) - / gcd(abs(result.first.get_coef(result.second)), - abs(result.first.get_coef(bound.set_var(dim + 1)))); - - if (stride > 1) - throw loop_error("loop error: too many strides"); - /*else if (stride == 1) { - int sign = result.first.get_coef(bound.set_var(dim+1)); - assert(sign == 1 || sign == -1); - } */ - } - - if (stride != 1) { - char buf[1024]; - sprintf(buf, "Cudaize: Loop at level %d has non-one stride of %d", - level, stride); - throw std::runtime_error(buf); - } - - //Use code generation system to build tell us our bound information. We - //need a hard upper bound a 0 lower bound. - - checkLoopLevel = level * 2; - stmtForLoopCheck = stmt_num; - upperBoundForLevel = -1; - lowerBoundForLevel = -1; - printCode(1, false); - checkLoopLevel = 0; - - outUpperBound = upperBoundForLevel; - outLowerBound = lowerBoundForLevel; - - if (outUpperBound == -1) { - - CG_result* temp = last_compute_cgr_; - - while (temp) { - CG_loop * loop; - if (loop = dynamic_cast(temp)) { - if (loop->level_ == 2 * level) { - Relation bound = copy(loop->bounds_); - Variable_ID v = bound.set_var(2 * level); - for (GEQ_Iterator e( - const_cast(bound).single_conjunct()->GEQs()); - e; e++) { - if ((*e).get_coef(v) < 0 - && (*e).is_const_except_for_global(v)) - return output_upper_bound_repr(ir->builder(), *e, v, - bound, - std::vector >( - bound.n_set(), - std::make_pair( - static_cast(NULL), - 0))); - } - } - if (loop->level_ > 2 * level) - break; - else - temp = loop->body_; - } else - break; - } - } - - return NULL; -} - -void LoopCuda::printCode(int effort, bool actuallyPrint) const { - const int m = stmt.size(); - if (m == 0) - return; - const int n = stmt[0].xform.n_out(); - - /*or (int i = 0; i < m; i++) { - IS[i + 1] = stmt[i].IS; - xform[i + 1] = stmt[i].xform; - - //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; - } - */ - - // invalidate saved codegen computation - if (last_compute_cgr_ != NULL) { - delete last_compute_cgr_; - last_compute_cgr_ = NULL; - } - - if (last_compute_cg_ != NULL) { - delete last_compute_cg_; - last_compute_cg_ = NULL; - } - - //Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); - /*CG_stringBuilder *ocg = new CG_stringBuilder(); - Tuple nameInfo; - for (int i = 1; i <= m; i++) - nameInfo.append(new CG_stringRepr("s" + to_string(i))); - */ - - // -- replacing MMGenerateCode - // -- formally CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort); - // -- in the future, these if statements need to be cleaned up. - // -- something like check_lastComputeCG might be a decent protected member function - // -- and/or something that returns a std::vector that also checks last_compute_cg_ - //if (last_compute_cg_ == NULL) { - std::vector IS(m); - std::vector xforms(m); - std::vector > nonSplitLevels(m); - - /* std::vector < std::vector > idxTupleNames; - if (useIdxNames) { - for (int i = 0; i < idxNames.size(); i++) { - Tuple idxs; - for (int j = 0; j < idxNames[i].size(); j++) - idxs.append(idxNames[i][j]); - idxTupleNames.append(idxs); - } - } - */ - for (int i = 0; i < m; i++) { - IS[i] = stmt[i].IS; - xforms[i] = stmt[i].xform; - nonSplitLevels[i] = stmt_nonSplitLevels[i]; - } - Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); - - last_compute_cg_ = new CodeGen(xforms, IS, known, nonSplitLevels, idxNames, - syncs); - - delete last_compute_cgr_; // this was just done above? - last_compute_cgr_ = NULL; - //} - - if (last_compute_cgr_ == NULL || last_compute_effort_ != effort) { - delete last_compute_cgr_; - last_compute_cgr_ = last_compute_cg_->buildAST(effort); - last_compute_effort_ = effort; - } - - //std::vector stmts(m); - //for (int i = 0; i < m; i++) - // stmts[i] = stmt[i].code; - //CG_outputRepr* repr = last_compute_cgr_->printRepr(ocg, stmts); - // -- end replacing MMGenerateCode - std::string repr = last_compute_cgr_->printString(); - - if (actuallyPrint) - std::cout << repr << std::endl; - //std::cout << static_cast(repr)->GetString(); - /* - for (int i = 1; i <= m; i++) - delete nameInfo[i]; - */ - - //delete ocg; -} - -void LoopCuda::printRuntimeInfo() const { - for (int i = 0; i < stmt.size(); i++) { - Relation IS = stmt[i].IS; - Relation xform = stmt[i].xform; - printf("stmt[%d]\n", i); - printf("IS\n"); - IS.print_with_subs(); - - printf("xform[%d]\n", i); - xform.print_with_subs(); - - } -} - -void LoopCuda::printIndexes() const { - for (int i = 0; i < stmt.size(); i++) { - printf("stmt %d nset %d ", i, stmt[i].IS.n_set()); - - for (int j = 0; j < idxNames[i].size(); j++) { - if (j > 0) - printf(","); - printf("%s", idxNames[i][j].c_str()); - } - printf("\n"); - } -} - -SgNode* LoopCuda::getCode(int effort) const { - const int m = stmt.size(); - if (m == 0) - return new SgNode; - const int n = stmt[0].xform.n_out(); - /* - Tuple ni(m); - Tuple < Relation > IS(m); - Tuple < Relation > xform(m); - vector < vector > nonSplitLevels(m); - for (int i = 0; i < m; i++) { - ni[i + 1] = stmt[i].code; - IS[i + 1] = stmt[i].IS; - xform[i + 1] = stmt[i].xform; - nonSplitLevels[i + 1] = stmt_nonSplitLevels[i]; - - //nonSplitLevels[i+1] = stmt[i].nonSplitLevels; - } - */ - //Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); -//#ifdef DEBUG -//#endif - //std::cout << GetString(MMGenerateCode(new CG_stringBuilder(), xform, IS, ni, known, - // nonSplitLevels, syncs, idxTupleNames, effort)); - if (last_compute_cgr_ != NULL) { - delete last_compute_cgr_; - last_compute_cgr_ = NULL; - } - - if (last_compute_cg_ != NULL) { - delete last_compute_cg_; - last_compute_cg_ = NULL; - } - - CG_outputBuilder *ocg = ir->builder(); - // -- replacing MMGenerateCode - // -- formally CG_outputRepr* repr = MMGenerateCode(ocg, xform, IS, nameInfo, known, nonSplitLevels, syncs, idxTupleNames, effort); - // -- in the future, these if statements need to be cleaned up. - // -- something like check_lastComputeCG might be a decent protected member function - // -- and/or something that returns a std::vector that also checks last_compute_cg_ - //if (last_compute_cg_ == NULL) { - std::vector IS(m); - std::vector xforms(m); - std::vector > nonSplitLevels(m); - for (int i = 0; i < m; i++) { - IS[i] = stmt[i].IS; - xforms[i] = stmt[i].xform; - nonSplitLevels[i] = stmt_nonSplitLevels[i]; - } - - /*std::vector < std::vector > idxTupleNames; - if (useIdxNames) { - for (int i = 0; i < idxNames.size(); i++) { - std::vector idxs; - for (int j = 0; j < idxNames[i].size(); j++) - idxs.push_back(idxNames[i][j]); - idxTupleNames.push_back(idxs); - } - } - */ - Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); - - last_compute_cg_ = new CodeGen(xforms, IS, known, nonSplitLevels, idxNames, - syncs); - delete last_compute_cgr_; - last_compute_cgr_ = NULL; - //} - - if (last_compute_cgr_ == NULL || last_compute_effort_ != effort) { - delete last_compute_cgr_; - last_compute_cgr_ = last_compute_cg_->buildAST(effort); - last_compute_effort_ = effort; - } - - std::vector stmts(m); - for (int i = 0; i < m; i++) - stmts[i] = stmt[i].code; - CG_outputRepr* repr = last_compute_cgr_->printRepr(ocg, stmts); - // -- end replacing MMGenerateCode - - //CG_outputRepr *overflow_initialization = ocg->CreateStmtList(); - CG_outputRepr *overflow_initialization = ocg->StmtListAppend(NULL, NULL); - for (std::map >::const_iterator i = - overflow.begin(); i != overflow.end(); i++) - for (std::vector::const_iterator j = i->second.begin(); - j != i->second.end(); j++) - //overflow_initialization = ocg->StmtListAppend(overflow_initialization, ocg->CreateStmtList(ocg->CreateAssignment(0, ocg->CreateIdent((*j)->base_name()), ocg->CreateInt(0)))); - overflow_initialization = ocg->StmtListAppend( - overflow_initialization, - ocg->StmtListAppend( - ocg->CreateAssignment(0, - ocg->CreateIdent((*j)->base_name()), - ocg->CreateInt(0)), NULL)); - - repr = ocg->StmtListAppend(overflow_initialization, repr); - SgNode *tnl = static_cast(repr)->GetCode(); - SgStatementPtrList *list = static_cast(repr)->GetList(); - - if (tnl != NULL) - return tnl; - else if (tnl == NULL && list != NULL) { - SgBasicBlock* bb2 = buildBasicBlock(); - - for (SgStatementPtrList::iterator it = (*list).begin(); - it != (*list).end(); it++) - bb2->append_statement(*it); - - tnl = isSgNode(bb2); - } else - throw loop_error("codegen failed"); - - delete repr; - /* - for (int i = 1; i <= m; i++) - delete ni[i]; - */ - return tnl; - -} - -//protonu--adding constructors for the new derived class -LoopCuda::LoopCuda() : - Loop(), code_gen_flags(GenInit) { -} - -LoopCuda::LoopCuda(IR_Control *irc, int loop_num) : - Loop(irc) { - setup_code = NULL; - teardown_code = NULL; - code_gen_flags = 0; - cu_bx = cu_by = cu_tx = cu_ty = cu_tz = 1; - cu_bx_repr = NULL; - cu_tx_repr = NULL; - cu_by_repr = NULL; - cu_ty_repr = NULL; - cu_tz_repr = NULL; - - cu_num_reduce = 0; - cu_mode = GlobalMem; - texture = NULL; - constant_mem = NULL; - - int m = stmt.size(); - //printf("\n the size of stmt(initially) is: %d\n", stmt.size()); - for (int i = 0; i < m; i++) - stmt_nonSplitLevels.push_back(std::vector()); - - globals = ((IR_cudaroseCode *) ir)->gsym_; - globalScope = ((IR_cudaroseCode *) ir)->first_scope; - parameter_symtab = ((IR_cudaroseCode *) ir)->parameter; - body_symtab = ((IR_cudaroseCode *) ir)->body; - func_body = ((IR_cudaroseCode *) ir)->defn; - func_definition = ((IR_cudaroseCode *) ir)->func_defn; - std::vector tf = ((IR_cudaroseCode *) ir)->get_loops(); - - symtab = tf[loop_num]->get_symbol_table(); - - std::vector deepest = find_deepest_loops( - isSgNode(tf[loop_num])); - - for (int i = 0; i < deepest.size(); i++) { - SgVariableSymbol* vs; - SgForInitStatement* list = deepest[i]->get_for_init_stmt(); - SgStatementPtrList& initStatements = list->get_init_stmt(); - SgStatementPtrList::const_iterator j = initStatements.begin(); - if (SgExprStatement *expr = isSgExprStatement(*j)) - if (SgAssignOp* op = isSgAssignOp(expr->get_expression())) - if (SgVarRefExp* var_ref = isSgVarRefExp(op->get_lhs_operand())) - vs = var_ref->get_symbol(); - - index.push_back(vs->get_name().getString().c_str()); //reflects original code index names - } - - for (int i = 0; i < stmt.size(); i++) - idxNames.push_back(index); //refects prefered index names (used as handles in cudaize v2) - useIdxNames = false; - -} - -void LoopCuda::printIS() { - if (!cudaDebug) return; - int k = stmt.size(); - for (int i = 0; i < k; i++) { - printf(" printing statement:%d\n", i); - stmt[i].IS.print(); - } -} - diff --git a/loop_modified.cc b/loop_modified.cc deleted file mode 100644 index 9686f6d..0000000 --- a/loop_modified.cc +++ /dev/null @@ -1,4234 +0,0 @@ -/***************************************************************************** - Copyright (C) 2008 University of Southern California - Copyright (C) 2009-2010 University of Utah - All Rights Reserved. - - Purpose: - Core loop transformation functionality. - - Notes: - "level" (starting from 1) means loop level and it corresponds to "dim" - (starting from 0) in transformed iteration space [c_1,l_1,c_2,l_2,...., - c_n,l_n,c_(n+1)], e.g., l_2 is loop level 2 in generated code, dim 3 - in transformed iteration space, and variable 4 in Omega relation. - All c's are constant numbers only and they will not show up as actual loops. - Formula: - dim = 2*level - 1 - var = dim + 1 - - History: - 10/2005 Created by Chun Chen. - 09/2009 Expand tile functionality, -chun - 10/2009 Initialize unfusible loop nest without bailing out, -chun -*****************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include "loop.hh" -#include "omegatools.hh" -#include "irtools.hh" -#include "chill_error.hh" -#include -using namespace omega; - -const std::string Loop::tmp_loop_var_name_prefix = std::string("_t"); -const std::string Loop::overflow_var_name_prefix = std::string("over"); - -//----------------------------------------------------------------------------- -// Class Loop -//----------------------------------------------------------------------------- - -bool Loop::init_loop(std::vector &ir_tree, - std::vector &ir_stmt) { - ir_stmt = extract_ir_stmts(ir_tree); - stmt_nesting_level_.resize(ir_stmt.size()); - std::vector stmt_nesting_level(ir_stmt.size()); - for (int i = 0; i < ir_stmt.size(); i++) { - ir_stmt[i]->payload = i; - int t = 0; - ir_tree_node *itn = ir_stmt[i]; - while (itn->parent != NULL) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP) - t++; - } - stmt_nesting_level_[i] = t; - stmt_nesting_level[i] = t; - } - - stmt = std::vector(ir_stmt.size()); - int n_dim = -1; - int max_loc; - //std::vector index; - for (int i = 0; i < ir_stmt.size(); i++) { - int max_nesting_level = -1; - int loc; - for (int j = 0; j < ir_stmt.size(); j++) - if (stmt_nesting_level[j] > max_nesting_level) { - max_nesting_level = stmt_nesting_level[j]; - loc = j; - } - - // most deeply nested statement acting as a reference point - if (n_dim == -1) { - n_dim = max_nesting_level; - max_loc = loc; - - index = std::vector(n_dim); - - ir_tree_node *itn = ir_stmt[loc]; - int cur_dim = n_dim - 1; - while (itn->parent != NULL) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP) { - index[cur_dim] = - static_cast(itn->content)->index()->name(); - itn->payload = cur_dim--; - } - } - } - - // align loops by names, temporary solution - ir_tree_node *itn = ir_stmt[loc]; - int depth = stmt_nesting_level_[loc] - 1; - /* while (itn->parent != NULL) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == -1) { - std::string name = static_cast(itn->content)->index()->name(); - for (int j = 0; j < n_dim; j++) - if (index[j] == name) { - itn->payload = j; - break; - } - if (itn->payload == -1) - throw loop_error("no complex alignment yet"); - } - } - */ - for (int t = depth; t >= 0; t--) { - int y = t; - ir_tree_node *itn = ir_stmt[loc]; - - while ((itn->parent != NULL) && (y >= 0)) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP) - y--; - } - - if (itn->content->type() == IR_CONTROL_LOOP && itn->payload == -1) { - CG_outputBuilder *ocg = ir_->builder(); - - itn->payload = depth - t; - - CG_outputRepr *code = - static_cast(ir_stmt[loc]->content)->extract(); - - Tuple index_expr; - Tuple old_index; - CG_outputRepr *repl = ocg->CreateIdent(index[itn->payload]); - index_expr.append(repl); - old_index.append( - static_cast(itn->content)->index()->name()); - - code = ocg->CreatePlaceHolder(0, code, index_expr, old_index); - replace.insert(std::pair(loc, code)); - //stmt[loc].code = code; - - } - } - - // set relation variable names - Relation r(n_dim); - F_And *f_root = r.add_and(); - itn = ir_stmt[loc]; - int temp_depth = depth; - while (itn->parent != NULL) { - - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP) { - r.name_set_var(itn->payload + 1, index[temp_depth]); - - temp_depth--; - } - //static_cast(itn->content)->index()->name()); - } - - /*while (itn->parent != NULL) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP) - r.name_set_var(itn->payload+1, static_cast(itn->content)->index()->name()); - }*/ - - // extract information from loop/if structures - std::vector processed(n_dim, false); - Tuple vars_to_be_reversed; - itn = ir_stmt[loc]; - while (itn->parent != NULL) { - itn = itn->parent; - - switch (itn->content->type()) { - case IR_CONTROL_LOOP: { - IR_Loop *lp = static_cast(itn->content); - Variable_ID v = r.set_var(itn->payload + 1); - int c; - - try { - c = lp->step_size(); - if (c > 0) { - CG_outputRepr *lb = lp->lower_bound(); - exp2formula(ir, r, f_root, freevar, lb, v, 's', - IR_COND_GE, true); - CG_outputRepr *ub = lp->upper_bound(); - IR_CONDITION_TYPE cond = lp->stop_cond(); - if (cond == IR_COND_LT || cond == IR_COND_LE) - exp2formula(ir, r, f_root, freevar, ub, v, 's', - cond, true); - else - throw ir_error("loop condition not supported"); - - } else if (c < 0) { - CG_outputBuilder *ocg = ir->builder(); - CG_outputRepr *lb = lp->lower_bound(); - lb = ocg->CreateMinus(NULL, lb); - exp2formula(ir, r, f_root, freevar, lb, v, 's', - IR_COND_GE, true); - CG_outputRepr *ub = lp->upper_bound(); - ub = ocg->CreateMinus(NULL, ub); - IR_CONDITION_TYPE cond = lp->stop_cond(); - if (cond == IR_COND_GE) - exp2formula(ir, r, f_root, freevar, ub, v, 's', - IR_COND_LE, true); - else if (cond == IR_COND_GT) - exp2formula(ir, r, f_root, freevar, ub, v, 's', - IR_COND_LT, true); - else - throw ir_error("loop condition not supported"); - - vars_to_be_reversed.append(lp->index()->name()); - } else - throw ir_error("loop step size zero"); - } catch (const ir_error &e) { - for (int i = 0; i < itn->children.size(); i++) - delete itn->children[i]; - itn->children = std::vector(); - itn->content = itn->content->convert(); - return false; - } - - if (abs(c) != 1) { - F_Exists *f_exists = f_root->add_exists(); - Variable_ID e = f_exists->declare(); - F_And *f_and = f_exists->add_and(); - Stride_Handle h = f_and->add_stride(abs(c)); - if (c > 0) - h.update_coef(e, 1); - else - h.update_coef(e, -1); - h.update_coef(v, -1); - CG_outputRepr *lb = lp->lower_bound(); - exp2formula(ir, r, f_and, freevar, lb, e, 's', IR_COND_EQ, - true); - } - - processed[itn->payload] = true; - break; - } - case IR_CONTROL_IF: { - CG_outputRepr *cond = - static_cast(itn->content)->condition(); - try { - if (itn->payload % 2 == 1) - exp2constraint(ir, r, f_root, freevar, cond, true); - else { - F_Not *f_not = f_root->add_not(); - F_And *f_and = f_not->add_and(); - exp2constraint(ir, r, f_and, freevar, cond, true); - } - } catch (const ir_error &e) { - std::vector *t; - if (itn->parent == NULL) - t = &ir_tree; - else - t = &(itn->parent->children); - int id = itn->payload; - int i = t->size() - 1; - while (i >= 0) { - if ((*t)[i] == itn) { - for (int j = 0; j < itn->children.size(); j++) - delete itn->children[j]; - itn->children = std::vector(); - itn->content = itn->content->convert(); - } else if ((*t)[i]->payload >> 1 == id >> 1) { - delete (*t)[i]; - t->erase(t->begin() + i); - } - i--; - } - return false; - } - - break; - } - default: - for (int i = 0; i < itn->children.size(); i++) - delete itn->children[i]; - itn->children = std::vector(); - itn->content = itn->content->convert(); - return false; - } - } - - // add information for missing loops - for (int j = 0; j < n_dim; j++) - if (!processed[j]) { - ir_tree_node *itn = ir_stmt[max_loc]; - while (itn->parent != NULL) { - itn = itn->parent; - if (itn->content->type() == IR_CONTROL_LOOP - && itn->payload == j) - break; - } - - Variable_ID v = r.set_var(j + 1); - if (loc < max_loc) { - CG_outputRepr *lb = - static_cast(itn->content)->lower_bound(); - exp2formula(ir, r, f_root, freevar, lb, v, 's', IR_COND_EQ, - true); - } else { // loc > max_loc - CG_outputRepr *ub = - static_cast(itn->content)->upper_bound(); - exp2formula(ir, r, f_root, freevar, ub, v, 's', IR_COND_EQ, - true); - } - } - - r.setup_names(); - r.simplify(); - - // insert the statement - CG_outputBuilder *ocg = ir->builder(); - Tuple reverse_expr; - for (int j = 1; j <= vars_to_be_reversed.size(); j++) { - CG_outputRepr *repl = ocg->CreateIdent(vars_to_be_reversed[j]); - repl = ocg->CreateMinus(NULL, repl); - reverse_expr.append(repl); - } - CG_outputRepr *code = - static_cast(ir_stmt[loc]->content)->original(); - code = ocg->CreatePlaceHolder(0, code, reverse_expr, - vars_to_be_reversed); - stmt[loc].code = code; - stmt[loc].IS = r; - stmt[loc].loop_level = std::vector(n_dim); - for (int i = 0; i < n_dim; i++) { - stmt[loc].loop_level[i].type = LoopLevelOriginal; - stmt[loc].loop_level[i].payload = i; - stmt[loc].loop_level[i].parallel_level = 0; - } - - stmt_nesting_level[loc] = -1; - } - - return true; -} - -Loop::Loop(const IR_Control *control) { - ir = const_cast(control->ir_); - init_code = NULL; - cleanup_code = NULL; - tmp_loop_var_name_counter = 1; - overflow_var_name_counter = 1; - known = Relation::True(0); - - std::vector ir_tree = build_ir_tree(control->clone(), NULL); - std::vector ir_stmt; - - while (!init_loop(ir_tree, ir_stmt)) { - } - - // init the dependence graph - for (int i = 0; i < stmt.size(); i++) - dep.insert(); - - for (int i = 0; i < stmt.size(); i++) - for (int j = i; j < stmt.size(); j++) { - std::pair, - std::vector > dv = test_data_dependences( - ir_, stmt[i].code, stmt[i].IS, stmt[j].code, stmt[j].IS, - freevar, index, stmt_nesting_level_[i], - stmt_nesting_level[j]); - - for (int k = 0; k < dv.first.size(); k++) { - if (is_dependence_valid(ir_stmt[i], ir_stmt[j], dv.first[k], - true)) - dep.connect(i, j, dv.first[k]); - else { - dep.connect(j, i, dv.first[k].reverse()); - } - - } - for (int k = 0; k < dv.second.size(); k++) - if (is_dependence_valid(ir_stmt[j], ir_stmt[i], dv.second[k], - false)) - dep.connect(j, i, dv.second[k]); - else { - dep.connect(i, j, dv.second[k].reverse()); - } - // std::pair, - // std::vector > dv_ = test_data_dependences( - - } - - for (int i = 0; i < stmt.size(); i++) { - std::map::iterator it = replace.find(i); - - if (it != replace.end()) - stmt[i].code = (it->second)->clone(); - else - stmt[i].code = stmt[i].code->clone(); - } - - // cleanup the IR tree - for (int i = 0; i < ir_tree.size(); i++) - delete ir_tree[i]; - - // init dumb transformation relations e.g. [i, j] -> [ 0, i, 0, j, 0] - for (int i = 0; i < stmt.size(); i++) { - int n = stmt[i].IS.n_set(); - stmt[i].xform = Relation(n, 2 * n + 1); - F_And *f_root = stmt[i].xform.add_and(); - - for (int j = 1; j <= n; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(stmt[i].xform.output_var(2 * j), 1); - h.update_coef(stmt[i].xform.input_var(j), -1); - } - - for (int j = 1; j <= 2 * n + 1; j += 2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(stmt[i].xform.output_var(j), 1); - } - stmt[i].xform.simplify(); - } - - if (stmt.size() != 0) - num_dep_dim = stmt[0].IS.n_set(); - else - num_dep_dim = 0; -} - -Loop::~Loop() { - for (int i = 0; i < stmt.size(); i++) - if (stmt[i].code != NULL) { - stmt[i].code->clear(); - delete stmt[i].code; - } - if (init_code != NULL) { - init_code->clear(); - delete init_code; - } - if (cleanup_code != NULL) { - cleanup_code->clear(); - delete cleanup_code; - } -} - -int Loop::get_dep_dim_of(int stmt_num, int level) const { - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invaid statement " + to_string(stmt_num)); - - if (level < 1 || level > stmt[stmt_num].loop_level.size()) - return -1; - - int trip_count = 0; - while (true) { - switch (stmt[stmt_num].loop_level[level - 1].type) { - case LoopLevelOriginal: - return stmt[stmt_num].loop_level[level - 1].payload; - case LoopLevelTile: - level = stmt[stmt_num].loop_level[level - 1].payload; - if (level < 1) - return -1; - if (level > stmt[stmt_num].loop_level.size()) - throw loop_error( - "incorrect loop level information for statement " - + to_string(stmt_num)); - break; - default: - throw loop_error( - "unknown loop level information for statement " - + to_string(stmt_num)); - } - trip_count++; - if (trip_count >= stmt[stmt_num].loop_level.size()) - throw loop_error( - "incorrect loop level information for statement " - + to_string(stmt_num)); - } -} - -int Loop::get_last_dep_dim_before(int stmt_num, int level) const { - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invaid statement " + to_string(stmt_num)); - - if (level < 1) - return -1; - if (level > stmt[stmt_num].loop_level.size()) - level = stmt[stmt_num].loop_level.size() + 1; - - for (int i = level - 1; i >= 1; i--) - if (stmt[stmt_num].loop_level[i - 1].type == LoopLevelOriginal) - return stmt[stmt_num].loop_level[i - 1].payload; - - return -1; -} - -void Loop::print_internal_loop_structure() const { - for (int i = 0; i < stmt.size(); i++) { - std::vector lex = getLexicalOrder(i); - std::cout << "s" << i + 1 << ": "; - for (int j = 0; j < stmt[i].loop_level.size(); j++) { - if (2 * j < lex.size()) - std::cout << lex[2 * j]; - switch (stmt[i].loop_level[j].type) { - case LoopLevelOriginal: - std::cout << "(dim:" << stmt[i].loop_level[j].payload << ")"; - break; - case LoopLevelTile: - std::cout << "(tile:" << stmt[i].loop_level[j].payload << ")"; - break; - default: - std::cout << "(unknown)"; - } - std::cout << ' '; - } - for (int j = 2 * stmt[i].loop_level.size(); j < lex.size(); j += 2) { - std::cout << lex[j]; - if (j != lex.size() - 1) - std::cout << ' '; - } - std::cout << std::endl; - } -} - -CG_outputRepr *Loop::getCode(int effort) const { - const int m = stmt.size(); - if (m == 0) - return NULL; - const int n = stmt[0].xform.n_out(); - - Tuple ni(m); - Tuple < Relation > IS(m); - Tuple < Relation > xform(m); - for (int i = 0; i < m; i++) { - ni[i + 1] = stmt[i].code; - IS[i + 1] = stmt[i].IS; - xform[i + 1] = stmt[i].xform; - } - - Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); - CG_outputBuilder *ocg = ir->builder(); - CG_outputRepr *repr = MMGenerateCode(ocg, xform, IS, ni, known, effort); - - if (init_code != NULL) - repr = ocg->StmtListAppend(init_code->clone(), repr); - if (cleanup_code != NULL) - repr = ocg->StmtListAppend(repr, cleanup_code->clone()); - - return repr; -} - -void Loop::printCode(int effort) const { - const int m = stmt.size(); - if (m == 0) - return; - const int n = stmt[0].xform.n_out(); - - Tuple < Relation > IS(m); - Tuple < Relation > xform(m); - for (int i = 0; i < m; i++) { - IS[i + 1] = stmt[i].IS; - xform[i + 1] = stmt[i].xform; - } - - Relation known = Extend_Set(copy(this->known), n - this->known.n_set()); - std::cout << MMGenerateCode(xform, IS, known, effort); -} - -Relation Loop::getNewIS(int stmt_num) const { - Relation result; - - if (stmt[stmt_num].xform.is_null()) { - Relation known = Extend_Set(copy(this->known), - stmt[stmt_num].IS.n_set() - this->known.n_set()); - result = Intersection(copy(stmt[stmt_num].IS), known); - } else { - Relation known = Extend_Set(copy(this->known), - stmt[stmt_num].xform.n_out() - this->known.n_set()); - result = Intersection( - Range( - Restrict_Domain(copy(stmt[stmt_num].xform), - copy(stmt[stmt_num].IS))), known); - } - - result.simplify(2, 4); - - return result; -} - -std::vector Loop::getNewIS() const { - const int m = stmt.size(); - - std::vector new_IS(m); - for (int i = 0; i < m; i++) - new_IS[i] = getNewIS(i); - - return new_IS; -} - -void Loop::permute(const std::vector &pi) { - std::set active; - for (int i = 0; i < stmt.size(); i++) - active.insert(i); - - permute(active, pi); -} - -void Loop::original() { - std::set active; - for (int i = 0; i < stmt.size(); i++) - active.insert(i); - setLexicalOrder(0, active); -} - -void Loop::permute(const std::set &active, const std::vector &pi) { - if (active.size() == 0 || pi.size() == 0) - return; - - // check for sanity of parameters - int level = pi[0]; - for (int i = 1; i < pi.size(); i++) - if (pi[i] < level) - level = pi[i]; - if (level < 1) - throw std::invalid_argument("invalid permuation"); - std::vector reverse_pi(pi.size(), 0); - for (int i = 0; i < pi.size(); i++) - if (pi[i] >= level + pi.size()) - throw std::invalid_argument("invalid permutation"); - else - reverse_pi[pi[i] - level] = i + level; - for (int i = 0; i < reverse_pi.size(); i++) - if (reverse_pi[i] == 0) - throw std::invalid_argument("invalid permuation"); - int ref_stmt_num; - std::vector lex; - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - if (*i < 0 || *i >= stmt.size()) - throw std::invalid_argument("invalid statement " + to_string(*i)); - if (i == active.begin()) { - ref_stmt_num = *i; - lex = getLexicalOrder(*i); - } else { - if (level + pi.size() - 1 > stmt[*i].loop_level.size()) - throw std::invalid_argument("invalid permuation"); - std::vector lex2 = getLexicalOrder(*i); - for (int j = 0; j < 2 * level - 3; j += 2) - if (lex[j] != lex2[j]) - throw std::invalid_argument( - "statements to permute must be in the same subloop"); - for (int j = 0; j < pi.size(); j++) - if (!(stmt[*i].loop_level[level + j - 1].type - == stmt[ref_stmt_num].loop_level[level + j - 1].type - && stmt[*i].loop_level[level + j - 1].payload - == stmt[ref_stmt_num].loop_level[level + j - 1].payload)) - throw std::invalid_argument( - "permuted loops must have the same loop level types"); - } - } - - // Update transformation relations - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - int n = stmt[*i].xform.n_out(); - Relation mapping(n, n); - F_And *f_root = mapping.add_and(); - for (int j = 1; j <= n; j += 2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(j), 1); - h.update_coef(mapping.input_var(j), -1); - } - for (int j = 0; j < pi.size(); j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(2 * (level + j)), 1); - h.update_coef(mapping.input_var(2 * pi[j]), -1); - } - for (int j = 1; j < level; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(2 * j), 1); - h.update_coef(mapping.input_var(2 * j), -1); - } - for (int j = level + pi.size(); j <= stmt[*i].loop_level.size(); j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(2 * j), 1); - h.update_coef(mapping.input_var(2 * j), -1); - } - - stmt[*i].xform = Composition(mapping, stmt[*i].xform); - stmt[*i].xform.simplify(); - } - - // get the permuation for dependence vectors - std::vector t; - for (int i = 0; i < pi.size(); i++) - if (stmt[ref_stmt_num].loop_level[pi[i] - 1].type == LoopLevelOriginal) - t.push_back(stmt[ref_stmt_num].loop_level[pi[i] - 1].payload); - int max_dep_dim = -1; - int min_dep_dim = num_dep_dim; - for (int i = 0; i < t.size(); i++) { - if (t[i] > max_dep_dim) - max_dep_dim = t[i]; - if (t[i] < min_dep_dim) - min_dep_dim = t[i]; - } - if (min_dep_dim > max_dep_dim) - return; - if (max_dep_dim - min_dep_dim + 1 != t.size()) - throw loop_error("cannot update the dependence graph after permuation"); - std::vector dep_pi(num_dep_dim); - for (int i = 0; i < min_dep_dim; i++) - dep_pi[i] = i; - for (int i = min_dep_dim; i <= max_dep_dim; i++) - dep_pi[i] = t[i - min_dep_dim]; - for (int i = max_dep_dim + 1; i < num_dep_dim; i++) - dep_pi[i] = i; - - // update the dependence graph - DependenceGraph g; - for (int i = 0; i < dep.vertex.size(); i++) - g.insert(); - for (int i = 0; i < dep.vertex.size(); i++) - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); - j++) { - if ((active.find(i) != active.end() - && active.find(j->first) != active.end())) { - std::vector dv = j->second; - for (int k = 0; k < dv.size(); k++) { - switch (dv[k].type) { - case DEP_W2R: - case DEP_R2W: - case DEP_W2W: - case DEP_R2R: { - std::vector lbounds(num_dep_dim); - std::vector ubounds(num_dep_dim); - for (int d = 0; d < num_dep_dim; d++) { - lbounds[d] = dv[k].lbounds[dep_pi[d]]; - ubounds[d] = dv[k].ubounds[dep_pi[d]]; - } - dv[k].lbounds = lbounds; - dv[k].ubounds = ubounds; - break; - } - case DEP_CONTROL: { - break; - } - default: - throw loop_error("unknown dependence type"); - } - } - g.connect(i, j->first, dv); - } else if (active.find(i) == active.end() - && active.find(j->first) == active.end()) { - std::vector dv = j->second; - g.connect(i, j->first, dv); - } else { - std::vector dv = j->second; - for (int k = 0; k < dv.size(); k++) - switch (dv[k].type) { - case DEP_W2R: - case DEP_R2W: - case DEP_W2W: - case DEP_R2R: { - for (int d = 0; d < num_dep_dim; d++) - if (dep_pi[d] != d) { - dv[k].lbounds[d] = -posInfinity; - dv[k].ubounds[d] = posInfinity; - } - break; - } - case DEP_CONTROL: - break; - default: - throw loop_error("unknown dependence type"); - } - g.connect(i, j->first, dv); - } - } - dep = g; - - // update loop level information - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - int cur_dep_dim = min_dep_dim; - std::vector new_loop_level(stmt[*i].loop_level.size()); - for (int j = 1; j <= stmt[*i].loop_level.size(); j++) - if (j >= level && j < level + pi.size()) { - switch (stmt[*i].loop_level[reverse_pi[j - level] - 1].type) { - case LoopLevelOriginal: - new_loop_level[j - 1].type = LoopLevelOriginal; - new_loop_level[j - 1].payload = cur_dep_dim++; - new_loop_level[j - 1].parallel_level = - stmt[*i].loop_level[reverse_pi[j - level] - 1].parallel_level; - break; - case LoopLevelTile: { - new_loop_level[j - 1].type = LoopLevelTile; - int ref_level = stmt[*i].loop_level[reverse_pi[j - level] - - 1].payload; - if (ref_level >= level && ref_level < level + pi.size()) - new_loop_level[j - 1].payload = reverse_pi[ref_level - - level]; - else - new_loop_level[j - 1].payload = ref_level; - new_loop_level[j - 1].parallel_level = - stmt[*i].loop_level[reverse_pi[j - level] - 1].parallel_level; - break; - } - default: - throw loop_error( - "unknown loop level information for statement " - + to_string(*i)); - } - } else { - switch (stmt[*i].loop_level[j - 1].type) { - case LoopLevelOriginal: - new_loop_level[j - 1].type = LoopLevelOriginal; - new_loop_level[j - 1].payload = - stmt[*i].loop_level[j - 1].payload; - new_loop_level[j - 1].parallel_level = stmt[*i].loop_level[j - - 1].parallel_level; - break; - case LoopLevelTile: { - new_loop_level[j - 1].type = LoopLevelTile; - int ref_level = stmt[*i].loop_level[j - 1].payload; - if (ref_level >= level && ref_level < level + pi.size()) - new_loop_level[j - 1].payload = reverse_pi[ref_level - - level]; - else - new_loop_level[j - 1].payload = ref_level; - new_loop_level[j - 1].parallel_level = stmt[*i].loop_level[j - - 1].parallel_level; - break; - } - default: - throw loop_error( - "unknown loop level information for statement " - + to_string(*i)); - } - } - stmt[*i].loop_level = new_loop_level; - } - - setLexicalOrder(2 * level - 2, active); -} - -std::set Loop::split(int stmt_num, int level, const Relation &cond) { - // check for sanity of parameters - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - if (level <= 0 || level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level)); - - std::set result; - int dim = 2 * level - 1; - std::vector lex = getLexicalOrder(stmt_num); - std::set same_loop = getStatements(lex, dim - 1); - - Relation cond2 = copy(cond); - cond2.simplify(); - cond2 = EQs_to_GEQs(cond2); - Conjunct *c = cond2.single_conjunct(); - int cur_lex = lex[dim - 1]; - for (GEQ_Iterator gi(c->GEQs()); gi; gi++) { - int max_level = (*gi).max_tuple_pos(); - Relation single_cond(max_level); - single_cond.and_with_GEQ(*gi); - - // TODO: should decide where to place newly created statements with - // complementary split condition from dependence graph. - bool place_after; - if (max_level == 0) - place_after = true; - else if ((*gi).get_coef(cond2.set_var(max_level)) < 0) - place_after = true; - else - place_after = false; - - // original statements with split condition, - // new statements with complement of split condition - int old_num_stmt = stmt.size(); - std::map what_stmt_num; - apply_xform(same_loop); - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) { - int n = stmt[*i].IS.n_set(); - Relation part1, part2; - if (max_level > n) { - part1 = copy(stmt[*i].IS); - part2 = Relation::False(0); - } else { - part1 = Intersection(copy(stmt[*i].IS), - Extend_Set(copy(single_cond), n - max_level)); - part2 = Intersection(copy(stmt[*i].IS), - Extend_Set(Complement(copy(single_cond)), - n - max_level)); - } - - //split dependence check - - if (max_level > level) { - - DNF_Iterator di1(stmt[*i].IS.query_DNF()); - DNF_Iterator di2(part1.query_DNF()); - for (; di1 && di2; di1++, di2++) { - //printf("In next conjunct,\n"); - EQ_Iterator ei1 = (*di1)->EQs(); - EQ_Iterator ei2 = (*di2)->EQs(); - for (; ei1 && ei2; ei1++, ei2++) { - //printf(" In next equality constraint,\n"); - Constr_Vars_Iter cvi1(*ei1); - Constr_Vars_Iter cvi2(*ei2); - int dimension = (*cvi1).var->get_position(); - int same = 0; - bool identical = false; - if (identical = !strcmp((*cvi1).var->char_name(), - (*cvi2).var->char_name())) { - - for (; cvi1 && cvi2; cvi1++, cvi2++) { - - if (((*cvi1).coef != (*cvi2).coef - || (*ei1).get_const() - != (*ei2).get_const()) - || (strcmp((*cvi1).var->char_name(), - (*cvi2).var->char_name()))) { - - same++; - } - } - } - if ((same != 0) || !identical) { - - dimension = dimension - 1; - - while (stmt[*i].loop_level[dimension].type - == LoopLevelTile) - dimension = xform_index[dimension].first; - - dimension = stmt[*i].loop_level[dimension].payload; - - for (int i = 0; i < stmt.size(); i++) { - std::vector > D; - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); - j != dep.vertex[i].second.end(); j++) { - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if ((dv.hasNegative(dimension) - && !dv.quasi) - || (dv.hasPositive(dimension) - && dv.quasi)) - - throw loop_error( - "loop error: Split is illegal, dependence violation!"); - - } - } - } - - } - - GEQ_Iterator gi1 = (*di1)->GEQs(); - GEQ_Iterator gi2 = (*di2)->GEQs(); - - for (; gi1 && gi2; gi++, gi2++) { - - Constr_Vars_Iter cvi1(*gi1); - Constr_Vars_Iter cvi2(*gi2); - int dimension = (*cvi1).var->get_position(); - int same = 0; - bool identical = false; - if (identical = !strcmp((*cvi1).var->char_name(), - (*cvi2).var->char_name())) { - - for (; cvi1 && cvi2; cvi1++, cvi2++) { - - if (((*cvi1).coef != (*cvi2).coef - || (*gi1).get_const() - != (*gi2).get_const()) - || (strcmp((*cvi1).var->char_name(), - (*cvi2).var->char_name()))) { - - same++; - } - } - } - if ((same != 0) || !identical) { - dimension = dimension - 1; - - while (stmt[*i].loop_level[dimension].type - == LoopLevelTile) - dimension = xform_index[dimension].first; - - dimension = - stmt[*i].loop_level[dimension].payload; - - for (int i = 0; i < stmt.size(); i++) { - std::vector > D; - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); - j != dep.vertex[i].second.end(); - j++) { - for (int k = 0; k < j->second.size(); - k++) { - DependenceVector dv = j->second[k]; - if ((dv.hasNegative(dimension) - && !dv.quasi) - || (dv.hasPositive( - dimension) - && dv.quasi)) - - throw loop_error( - "loop error: Split is illegal, dependence violation!"); - - } - } - } - - } - - } - - } - - } - - DNF_Iterator di3(stmt[*i].IS.query_DNF()); - DNF_Iterator di4(part2.query_DNF()); - for (; di3 && di4; di3++, di4++) { - EQ_Iterator ei1 = (*di3)->EQs(); - EQ_Iterator ei2 = (*di4)->EQs(); - for (; ei1 && ei2; ei1++, ei2++) { - Constr_Vars_Iter cvi1(*ei1); - Constr_Vars_Iter cvi2(*ei2); - int dimension = (*cvi1).var->get_position(); - int same = 0; - bool identical = false; - if (identical = !strcmp((*cvi1).var->char_name(), - (*cvi2).var->char_name())) { - - for (; cvi1 && cvi2; cvi1++, cvi2++) { - - if (((*cvi1).coef != (*cvi2).coef - || (*ei1).get_const() - != (*ei2).get_const()) - || (strcmp((*cvi1).var->char_name(), - (*cvi2).var->char_name()))) { - - same++; - } - } - } - if ((same != 0) || !identical) { - dimension = dimension - 1; - - while (stmt[*i].loop_level[dimension].type - == LoopLevelTile) - dimension = xform_index[dimension].first; - - dimension = stmt[*i].loop_level[dimension].payload; - - for (int i = 0; i < stmt.size(); i++) { - std::vector > D; - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); - j != dep.vertex[i].second.end(); j++) { - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if ((dv.hasNegative(dimension) - && !dv.quasi) - || (dv.hasPositive(dimension) - && dv.quasi)) - - throw loop_error( - "loop error: Split is illegal, dependence violation!"); - - } - } - } - - } - - } - GEQ_Iterator gi1 = (*di3)->GEQs(); - GEQ_Iterator gi2 = (*di4)->GEQs(); - - for (; gi1 && gi2; gi++, gi2++) { - Constr_Vars_Iter cvi1(*gi1); - Constr_Vars_Iter cvi2(*gi2); - int dimension = (*cvi1).var->get_position(); - int same = 0; - bool identical = false; - if (identical = !strcmp((*cvi1).var->char_name(), - (*cvi2).var->char_name())) { - - for (; cvi1 && cvi2; cvi1++, cvi2++) { - - if (((*cvi1).coef != (*cvi2).coef - || (*gi1).get_const() - != (*gi2).get_const()) - || (strcmp((*cvi1).var->char_name(), - (*cvi2).var->char_name()))) { - - same++; - } - } - } - if ((same != 0) || !identical) { - dimension = dimension - 1; - - while (stmt[*i].loop_level[dimension].type - == LoopLevelTile) - dimension = xform_index[dimension].first; - - dimension = stmt[*i].loop_level[dimension].payload; - - for (int i = 0; i < stmt.size(); i++) { - std::vector > D; - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); - j != dep.vertex[i].second.end(); j++) { - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if ((dv.hasNegative(dimension) - && !dv.quasi) - || (dv.hasPositive(dimension) - && dv.quasi)) - - throw loop_error( - "loop error: Split is illegal, dependence violation!"); - - } - } - } - - } - - } - - } - - } - - stmt[*i].IS = part1; - - if (Intersection(copy(part2), - Extend_Set(copy(this->known), n - this->known.n_set())).is_upper_bound_satisfiable()) { - Statement new_stmt; - new_stmt.code = stmt[*i].code->clone(); - new_stmt.IS = part2; - new_stmt.xform = copy(stmt[*i].xform); - - new_stmt.loop_level = stmt[*i].loop_level; - stmt.push_back(new_stmt); - dep.insert(); - what_stmt_num[*i] = stmt.size() - 1; - if (*i == stmt_num) - result.insert(stmt.size() - 1); - - stmt_nesting_level_.push_back(stmt_nesting_level[*i]); - std::pair, - std::vector > dv = - test_data_dependences(ir_, stmt[*i].code, part1, - stmt[*i].code, part2, freevar, index, - stmt_nesting_level[*i], - stmt_nesting_level[stmt.size() - 1]); - - int part1_to_part2 = 0; - int part2_to_part1 = 0; - - for (int k = 0; k < dv.first.size(); k++) - if (is_dependence_valid_based_on_lex_order(*i, - what_stmt_num[*i], dv.first[k], true)) - part1_to_part2++; - else - part2_to_part1++; - - if (part1_to_part2 > 0 && part2_to_part1 > 0) - throw loop_error( - "loop error: Aborting, split resulted in impossible dependence cycle!"); - - for (int k = 0; k < dv.second.size(); k++) - if (is_dependence_valid_based_on_lex_order( - what_stmt_num[*i], *i, dv.second[k], false)) - part2_to_part1++; - - else - part1_to_part2++; - - if (part1_to_part2 > 0 && part2_to_part1 > 0) - throw loop_error( - "loop error: Aborting, split resulted in impossible dependence cycle!"); - bool temp_place_after; - if (part2_to_part1 > 0) - temp_place_after = false; - else - temp_place_after = true; - - if (i == same_loop.begin()) - place_after = temp_place_after; - else { - if (temp_place_after != place_after) - throw loop_error( - "loop error: Aborting, split resulted in impossible dependence cycle!"); - - } - - if (place_after) - assign_const(new_stmt.xform, dim - 1, cur_lex + 1); - else - assign_const(new_stmt.xform, dim - 1, cur_lex - 1); - - } - - } - // make adjacent lexical number available for new statements - if (place_after) { - lex[dim - 1] = cur_lex + 1; - shiftLexicalOrder(lex, dim - 1, 1); - } else { - lex[dim - 1] = cur_lex - 1; - shiftLexicalOrder(lex, dim - 1, -1); - } - // update dependence graph - int dep_dim = get_dep_dim_of(stmt_num, level); - for (int i = 0; i < old_num_stmt; i++) { - std::vector > > D; - - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); - j != dep.vertex[i].second.end(); j++) { - if (same_loop.find(i) != same_loop.end()) { - if (same_loop.find(j->first) != same_loop.end()) { - if (what_stmt_num.find(i) != what_stmt_num.end() - && what_stmt_num.find(j->first) - != what_stmt_num.end()) - dep.connect(what_stmt_num[i], - what_stmt_num[j->first], j->second); - if (place_after - && what_stmt_num.find(j->first) - != what_stmt_num.end()) { - std::vector dvs; - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.is_data_dependence() && dep_dim != -1) { - dv.lbounds[dep_dim] = -posInfinity; - dv.ubounds[dep_dim] = posInfinity; - } - dvs.push_back(dv); - } - if (dvs.size() > 0) - D.push_back( - std::make_pair(what_stmt_num[j->first], - dvs)); - } else if (!place_after - && what_stmt_num.find(i) - != what_stmt_num.end()) { - std::vector dvs; - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.is_data_dependence() && dep_dim != -1) { - dv.lbounds[dep_dim] = -posInfinity; - dv.ubounds[dep_dim] = posInfinity; - } - dvs.push_back(dv); - } - if (dvs.size() > 0) - dep.connect(what_stmt_num[i], j->first, dvs); - - } - } else { - if (what_stmt_num.find(i) != what_stmt_num.end()) - dep.connect(what_stmt_num[i], j->first, j->second); - } - } else if (same_loop.find(j->first) != same_loop.end()) { - if (what_stmt_num.find(j->first) != what_stmt_num.end()) - D.push_back( - std::make_pair(what_stmt_num[j->first], - j->second)); - } - } - - for (int j = 0; j < D.size(); j++) - dep.connect(i, D[j].first, D[j].second); - } - - } - - return result; -} - -void Loop::tile(int stmt_num, int level, int tile_size, int outer_level, - TilingMethodType method, int alignment_offset, int alignment_multiple) { - // check for sanity of parameters - if (tile_size < 0) - throw std::invalid_argument("invalid tile size"); - if (alignment_multiple < 1 || alignment_offset < 0) - throw std::invalid_argument("invalid alignment for tile"); - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - if (level <= 0) - throw std::invalid_argument("invalid loop level " + to_string(level)); - if (level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument( - "there is no loop level " + to_string(level) + " for statement " - + to_string(stmt_num)); - if (outer_level <= 0 || outer_level > level) - throw std::invalid_argument( - "invalid tile controlling loop level " - + to_string(outer_level)); - - int dim = 2 * level - 1; - int outer_dim = 2 * outer_level - 1; - std::vector lex = getLexicalOrder(stmt_num); - std::set same_tiled_loop = getStatements(lex, dim - 1); - std::set same_tile_controlling_loop = getStatements(lex, - outer_dim - 1); - - for (int i = 0; i < stmt.size(); i++) { - std::vector > D; - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); - j++) { - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - int dim2 = level - 1; - if ((dv.type != DEP_CONTROL) && (dv.type != DEP_UNKNOWN)) { - while (stmt[i].loop_level[dim2].type == LoopLevelTile) { - dim2 = stmt[i].loop_level[dim2].payload; - } - dim2 = stmt[i].loop_level[dim2].payload; - - if ((dv.hasNegative(dim2) && (!dv.quasi)) - || (dv.quasi && dv.hasPositive(dim2))) { - for (int l = outer_level; l < level; l++) - if (stmt[i].loop_level[l - 1].type - != LoopLevelTile) { - if (dv.isCarried( - stmt[i].loop_level[l - 1].payload)) - throw loop_error( - "loop error: Tiling is illegal, dependence violation!"); - } else { - - int dim3 = l - 1; - while (stmt[i].loop_level[l - 1].type - != LoopLevelTile) { - dim3 = stmt[i].loop_level[l - 1].payload; - - } - - dim3 = stmt[i].loop_level[l - 1].payload; - if (dim3 < level - 1) - if (dv.isCarried(dim3)) - throw loop_error( - "loop error: Tiling is illegal, dependence violation!"); - } - } - } - } - } - } - // special case for no tiling - if (tile_size == 0) { - for (std::set::iterator i = same_tile_controlling_loop.begin(); - i != same_tile_controlling_loop.end(); i++) { - Relation r(stmt[*i].xform.n_out(), stmt[*i].xform.n_out() + 2); - F_And *f_root = r.add_and(); - for (int j = 1; j <= 2 * outer_level - 1; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.input_var(j), 1); - h.update_coef(r.output_var(j), -1); - } - EQ_Handle h1 = f_root->add_EQ(); - h1.update_coef(r.output_var(2 * outer_level), 1); - EQ_Handle h2 = f_root->add_EQ(); - h2.update_coef(r.output_var(2 * outer_level + 1), 1); - for (int j = 2 * outer_level; j <= stmt[*i].xform.n_out(); j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.input_var(j), 1); - h.update_coef(r.output_var(j + 2), -1); - } - - stmt[*i].xform = Composition(copy(r), stmt[*i].xform); - } - } - // normal tiling - else { - std::set private_stmt; - for (std::set::iterator i = same_tile_controlling_loop.begin(); - i != same_tile_controlling_loop.end(); i++) { -// if (same_tiled_loop.find(*i) == same_tiled_loop.end() && !is_single_iteration(getNewIS(*i), dim)) -// same_tiled_loop.insert(*i); - - // should test dim's value directly but it is ok for now -// if (same_tiled_loop.find(*i) == same_tiled_loop.end() && get_const(stmt[*i].xform, dim+1, Output_Var) == posInfinity) - if (same_tiled_loop.find(*i) == same_tiled_loop.end() - && overflow.find(*i) != overflow.end()) - private_stmt.insert(*i); - } - - // extract the union of the iteration space to be considered - Relation hull; - { - Tuple < Relation > r_list; - Tuple r_mask; - - for (std::set::iterator i = same_tile_controlling_loop.begin(); - i != same_tile_controlling_loop.end(); i++) - if (private_stmt.find(*i) == private_stmt.end()) { - Relation r = project_onto_levels(getNewIS(*i), dim + 1, - true); - for (int j = outer_dim; j < dim; j++) - r = Project(r, j + 1, Set_Var); - for (int j = 0; j < outer_dim; j += 2) - r = Project(r, j + 1, Set_Var); - r_list.append(r); - r_mask.append(1); - } - - hull = Hull(r_list, r_mask, 1, true); - } - - // extract the bound of the dimension to be tiled - Relation bound = get_loop_bound(hull, dim); - if (!bound.has_single_conjunct()) { - // further simplify the bound - hull = Approximate(hull); - bound = get_loop_bound(hull, dim); - - int i = outer_dim - 2; - while (!bound.has_single_conjunct() && i >= 0) { - hull = Project(hull, i + 1, Set_Var); - bound = get_loop_bound(hull, dim); - i -= 2; - } - - if (!bound.has_single_conjunct()) - throw loop_error("cannot handle tile bounds"); - } - - // separate lower and upper bounds - std::vector lb_list, ub_list; - { - Conjunct *c = bound.query_DNF()->single_conjunct(); - for (GEQ_Iterator gi(c->GEQs()); gi; gi++) { - int coef = (*gi).get_coef(bound.set_var(dim + 1)); - if (coef < 0) - ub_list.push_back(*gi); - else if (coef > 0) - lb_list.push_back(*gi); - } - } - if (lb_list.size() == 0) - throw loop_error( - "unable to calculate tile controlling loop lower bound"); - if (ub_list.size() == 0) - throw loop_error( - "unable to calculate tile controlling loop upper bound"); - - // find the simplest lower bound for StridedTile or simplest iteration count for CountedTile - int simplest_lb = 0, simplest_ub = 0; - if (method == StridedTile) { - int best_cost = INT_MAX; - for (int i = 0; i < lb_list.size(); i++) { - int cost = 0; - for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - cost += 5; - break; - } - case Global_Var: { - cost += 2; - break; - } - default: - cost += 15; - break; - } - } - - if (cost < best_cost) { - best_cost = cost; - simplest_lb = i; - } - } - } else if (method == CountedTile) { - std::map s1, s2, s3; - int best_cost = INT_MAX; - for (int i = 0; i < lb_list.size(); i++) - for (int j = 0; j < ub_list.size(); j++) { - int cost = 0; - - for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - s1[(*ci).var] += (*ci).coef; - break; - } - case Global_Var: { - s2[(*ci).var] += (*ci).coef; - break; - } - case Exists_Var: - case Wildcard_Var: { - s3[(*ci).var] += (*ci).coef; - break; - } - default: - cost = INT_MAX - 2; - break; - } - } - - for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - s1[(*ci).var] += (*ci).coef; - break; - } - case Global_Var: { - s2[(*ci).var] += (*ci).coef; - break; - } - case Exists_Var: - case Wildcard_Var: { - s3[(*ci).var] += (*ci).coef; - break; - } - default: - if (cost == INT_MAX - 2) - cost = INT_MAX - 1; - else - cost = INT_MAX - 3; - break; - } - } - - if (cost == 0) { - for (std::map::iterator k = - s1.begin(); k != s1.end(); k++) - if ((*k).second != 0) - cost += 5; - for (std::map::iterator k = - s2.begin(); k != s2.end(); k++) - if ((*k).second != 0) - cost += 2; - for (std::map::iterator k = - s3.begin(); k != s3.end(); k++) - if ((*k).second != 0) - cost += 15; - } - - if (cost < best_cost) { - best_cost = cost; - simplest_lb = i; - simplest_ub = j; - } - } - } - - // prepare the new transformation relations - for (std::set::iterator i = same_tile_controlling_loop.begin(); - i != same_tile_controlling_loop.end(); i++) { - Relation r(stmt[*i].xform.n_out(), stmt[*i].xform.n_out() + 2); - F_And *f_root = r.add_and(); - for (int j = 0; j < outer_dim - 1; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.output_var(j + 1), 1); - h.update_coef(r.input_var(j + 1), -1); - } - - for (int j = outer_dim - 1; j < stmt[*i].xform.n_out(); j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.output_var(j + 3), 1); - h.update_coef(r.input_var(j + 1), -1); - } - - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.output_var(outer_dim), 1); - h.update_const(-lex[outer_dim - 1]); - - stmt[*i].xform = Composition(r, stmt[*i].xform); - } - - // add tiling constraints. - for (std::set::iterator i = same_tile_controlling_loop.begin(); - i != same_tile_controlling_loop.end(); i++) { - F_And *f_super_root = stmt[*i].xform.and_with_and(); - F_Exists *f_exists = f_super_root->add_exists(); - F_And *f_root = f_exists->add_and(); - - // create a lower bound variable for easy formula creation later - Variable_ID aligned_lb; - { - Variable_ID lb = f_exists->declare(); - coef_t coef = lb_list[simplest_lb].get_coef( - bound.set_var(dim + 1)); - if (coef == 1) { // e.g. if i >= m+5, then LB = m+5 - EQ_Handle h = f_root->add_EQ(); - h.update_coef(lb, 1); - for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - int pos = (*ci).var->get_position(); - if (pos != dim + 1) - h.update_coef(stmt[*i].xform.output_var(pos), - (*ci).coef); - break; - } - case Global_Var: { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = stmt[*i].xform.get_local(g); - else - v = stmt[*i].xform.get_local(g, - (*ci).var->function_of()); - h.update_coef(v, (*ci).coef); - break; - } - default: - throw loop_error("cannot handle tile bounds"); - } - } - h.update_const(lb_list[simplest_lb].get_const()); - } else { // e.g. if 2i >= m+5, then m+5 <= 2*LB < m+5+2 - GEQ_Handle h1 = f_root->add_GEQ(); - GEQ_Handle h2 = f_root->add_GEQ(); - for (Constr_Vars_Iter ci(lb_list[simplest_lb]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - int pos = (*ci).var->get_position(); - if (pos == dim + 1) { - h1.update_coef(lb, (*ci).coef); - h2.update_coef(lb, -(*ci).coef); - } else { - h1.update_coef(stmt[*i].xform.output_var(pos), - (*ci).coef); - h2.update_coef(stmt[*i].xform.output_var(pos), - -(*ci).coef); - } - break; - } - case Global_Var: { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = stmt[*i].xform.get_local(g); - else - v = stmt[*i].xform.get_local(g, - (*ci).var->function_of()); - h1.update_coef(v, (*ci).coef); - h2.update_coef(v, -(*ci).coef); - break; - } - default: - throw loop_error("cannot handle tile bounds"); - } - } - h1.update_const(lb_list[simplest_lb].get_const()); - h2.update_const(-lb_list[simplest_lb].get_const()); - h2.update_const(coef - 1); - } - - Variable_ID offset_lb; - if (alignment_offset == 0) - offset_lb = lb; - else { - EQ_Handle h = f_root->add_EQ(); - offset_lb = f_exists->declare(); - h.update_coef(offset_lb, 1); - h.update_coef(lb, -1); - h.update_const(alignment_offset); - } - - if (alignment_multiple == 1) { // trivial - aligned_lb = offset_lb; - } else { // e.g. to align at 4, aligned_lb = 4*alpha && LB-4 < 4*alpha <= LB - aligned_lb = f_exists->declare(); - Variable_ID e = f_exists->declare(); - - EQ_Handle h = f_root->add_EQ(); - h.update_coef(aligned_lb, 1); - h.update_coef(e, -alignment_multiple); - - GEQ_Handle h1 = f_root->add_GEQ(); - GEQ_Handle h2 = f_root->add_GEQ(); - h1.update_coef(e, alignment_multiple); - h2.update_coef(e, -alignment_multiple); - h1.update_coef(offset_lb, -1); - h2.update_coef(offset_lb, 1); - h1.update_const(alignment_multiple - 1); - } - } - - // create an upper bound variable for easy formula creation later - Variable_ID ub = f_exists->declare(); - { - coef_t coef = -ub_list[simplest_ub].get_coef( - bound.set_var(dim + 1)); - if (coef == 1) { // e.g. if i <= m+5, then UB = m+5 - EQ_Handle h = f_root->add_EQ(); - h.update_coef(ub, -1); - for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - int pos = (*ci).var->get_position(); - if (pos != dim + 1) - h.update_coef(stmt[*i].xform.output_var(pos), - (*ci).coef); - break; - } - case Global_Var: { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = stmt[*i].xform.get_local(g); - else - v = stmt[*i].xform.get_local(g, - (*ci).var->function_of()); - h.update_coef(v, (*ci).coef); - break; - } - default: - throw loop_error("cannot handle tile bounds"); - } - } - h.update_const(ub_list[simplest_ub].get_const()); - } else { // e.g. if 2i <= m+5, then m+5-2 < 2*UB <= m+5 - GEQ_Handle h1 = f_root->add_GEQ(); - GEQ_Handle h2 = f_root->add_GEQ(); - for (Constr_Vars_Iter ci(ub_list[simplest_ub]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - int pos = (*ci).var->get_position(); - if (pos == dim + 1) { - h1.update_coef(ub, -(*ci).coef); - h2.update_coef(ub, (*ci).coef); - } else { - h1.update_coef(stmt[*i].xform.output_var(pos), - -(*ci).coef); - h2.update_coef(stmt[*i].xform.output_var(pos), - (*ci).coef); - } - break; - } - case Global_Var: { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = stmt[*i].xform.get_local(g); - else - v = stmt[*i].xform.get_local(g, - (*ci).var->function_of()); - h1.update_coef(v, -(*ci).coef); - h2.update_coef(v, (*ci).coef); - break; - } - default: - throw loop_error("cannot handle tile bounds"); - } - } - h1.update_const(-ub_list[simplest_ub].get_const()); - h2.update_const(ub_list[simplest_ub].get_const()); - h1.update_const(coef - 1); - } - } - - // insert tile controlling loop constraints - if (method == StridedTile) { // e.g. ii = LB + 32 * alpha && alpha >= 0 - Variable_ID e = f_exists->declare(); - GEQ_Handle h1 = f_root->add_GEQ(); - h1.update_coef(e, 1); - - EQ_Handle h2 = f_root->add_EQ(); - h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1), 1); - h2.update_coef(e, -tile_size); - h2.update_coef(aligned_lb, -1); - } else if (method == CountedTile) { // e.g. 0 <= ii < ceiling((UB-LB+1)/32) - GEQ_Handle h1 = f_root->add_GEQ(); - h1.update_coef(stmt[*i].xform.output_var(outer_dim + 1), 1); - - GEQ_Handle h2 = f_root->add_GEQ(); - h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1), - -tile_size); - h2.update_coef(aligned_lb, -1); - h2.update_coef(ub, 1); - } - - // special care for private statements like overflow assignment - if (private_stmt.find(*i) != private_stmt.end()) { // e.g. ii <= UB - GEQ_Handle h = f_root->add_GEQ(); - h.update_coef(stmt[*i].xform.output_var(outer_dim + 1), -1); - h.update_coef(ub, 1); - } - // if (private_stmt.find(*i) != private_stmt.end()) { - // if (stmt[*i].xform.n_out() > dim+3) { // e.g. ii <= UB && i = ii - // GEQ_Handle h = f_root->add_GEQ(); - // h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); - // h.update_coef(ub, 1); - - // stmt[*i].xform = Project(stmt[*i].xform, dim+3, Output_Var); - // f_root = stmt[*i].xform.and_with_and(); - // EQ_Handle h1 = f_root->add_EQ(); - // h1.update_coef(stmt[*i].xform.output_var(dim+3), 1); - // h1.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); - // } - // else if (method == StridedTile) { // e.g. ii <= UB since i does not exist - // GEQ_Handle h = f_root->add_GEQ(); - // h.update_coef(stmt[*i].xform.output_var(outer_dim+1), -1); - // h.update_coef(ub, 1); - // } - // } - - // restrict original loop index inside the tile - else { - if (method == StridedTile) { // e.g. ii <= i < ii + tile_size - GEQ_Handle h1 = f_root->add_GEQ(); - h1.update_coef(stmt[*i].xform.output_var(dim + 3), 1); - h1.update_coef(stmt[*i].xform.output_var(outer_dim + 1), - -1); - - GEQ_Handle h2 = f_root->add_GEQ(); - h2.update_coef(stmt[*i].xform.output_var(dim + 3), -1); - h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1), 1); - h2.update_const(tile_size - 1); - } else if (method == CountedTile) { // e.g. LB+32*ii <= i < LB+32*ii+tile_size - GEQ_Handle h1 = f_root->add_GEQ(); - h1.update_coef(stmt[*i].xform.output_var(outer_dim + 1), - -tile_size); - h1.update_coef(stmt[*i].xform.output_var(dim + 3), 1); - h1.update_coef(aligned_lb, -1); - - GEQ_Handle h2 = f_root->add_GEQ(); - h2.update_coef(stmt[*i].xform.output_var(outer_dim + 1), - tile_size); - h2.update_coef(stmt[*i].xform.output_var(dim + 3), -1); - h2.update_const(tile_size - 1); - h2.update_coef(aligned_lb, 1); - } - } - } - } - - // update loop level information - for (std::set::iterator i = same_tile_controlling_loop.begin(); - i != same_tile_controlling_loop.end(); i++) { - for (int j = 1; j <= stmt[*i].loop_level.size(); j++) - switch (stmt[*i].loop_level[j - 1].type) { - case LoopLevelOriginal: - break; - case LoopLevelTile: - if (stmt[*i].loop_level[j - 1].payload >= outer_level) - stmt[*i].loop_level[j - 1].payload++; - break; - default: - throw loop_error( - "unknown loop level type for statement " - + to_string(*i)); - } - - LoopLevel ll; - ll.type = LoopLevelTile; - ll.payload = level + 1; - ll.parallel_level = 0; - stmt[*i].loop_level.insert( - stmt[*i].loop_level.begin() + (outer_level - 1), ll); - } -} - -std::set Loop::unroll(int stmt_num, int level, int unroll_amount) { - // check for sanity of parameters - if (unroll_amount < 0) - throw std::invalid_argument( - "invalid unroll amount " + to_string(unroll_amount)); - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement " + to_string(stmt_num)); - if (level <= 0 || level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level)); - - int dim = 2 * level - 1; - std::vector lex = getLexicalOrder(stmt_num); - std::set same_loop = getStatements(lex, dim - 1); - - // nothing to do - if (unroll_amount == 1) - return std::set(); - - for (int i = 0; i < stmt.size(); i++) { - std::vector > D; - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); - j++) { - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - int dim2 = level - 1; - if ((dv.type != DEP_CONTROL) && (dv.type != DEP_UNKNOWN)) { - - while (stmt[i].loop_level[dim2].type == LoopLevelTile) { - dim2 = xform_index[dim2].first; - } - dim2 = stmt[i].loop_level[dim2].payload; - - if (dv.isCarried(dim2) - && (dv.hasNegative(dim2) && !dv.quasi)) - throw loop_error( - "loop error: Unrolling is illegal, dependence violation!"); - - if (dv.isCarried(dim2) - && (dv.hasPositive(dim2) && dv.quasi)) - throw loop_error( - "loop error: Unrolling is illegal, dependence violation!"); - bool safe = false; - - if (dv.isCarried(dim2)) { - - if (!dv.quasi) { - if (dv.lbounds[dim2] != posInfinity) { - if (dv.lbounds[dim2] != negInfinity) - if (dv.lbounds[dim2] > unroll_amount) - safe = true; - } else - safe = true; - } else { - if (dv.ubounds[dim2] != negInfinity) { - if (dv.ubounds[dim2] != posInfinity) - if ((-(dv.ubounds[dim2])) > unroll_amount) - safe = true; - } else - safe = true; - } - - if (!safe) { - for (int l = level; l <= (n - 1) / 2; l++) { - int dim3 = l - 1; - - if (stmt[i].loop_level[dim3].type - != LoopLevelTile) - dim3 = stmt[i].loop_level[dim3].payload; - else { - while (stmt[i].loop_level[dim2].type - == LoopLevelTile) { - dim3 = stmt[i].loop_level[dim3].payload; - } - dim3 = stmt[i].loop_level[dim3].payload; - } - - if (dim3 > dim2) { - if ((dv.hasPositive(dim3) && !dv.quasi) - || (dv.hasNegative(dim3) && dv.quasi)) - break; - else if ((dv.hasNegative(dim3) && !dv.quasi) - || (dv.hasPositive(dim3) && dv.quasi)) - throw loop_error( - "loop error: Unrolling is illegal, dependence violation!"); - } - } - } - } - } - } - } - } - - // extract the intersection of the iteration space to be considered - Relation hull = Relation::True(level); - apply_xform(same_loop); - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); - i++) { - if (stmt[*i].IS.is_upper_bound_satisfiable()) { - Relation mapping(stmt[*i].IS.n_set(), level); - F_And *f_root = mapping.add_and(); - for (int j = 1; j <= level; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.input_var(j), 1); - h.update_coef(mapping.output_var(j), -1); - } - hull = Intersection(hull, - Range(Restrict_Domain(mapping, copy(stmt[*i].IS)))); - hull.simplify(2, 4); - } - } - for (int i = 1; i <= level; i++) { - std::string name = tmp_loop_var_name_prefix + to_string(i); - hull.name_set_var(i, name); - } - hull.setup_names(); - - // extract the exact loop bound of the dimension to be unrolled - if (is_single_loop_iteration(hull, level, this->known)) - return std::set(); - Relation bound = get_loop_bound(hull, level, this->known); - if (!bound.has_single_conjunct() || !bound.is_satisfiable() - || bound.is_tautology()) - throw loop_error("unable to extract loop bound for unrolling"); - - // extract the loop stride - EQ_Handle stride_eq; - int stride = 1; - { - bool simple_stride = true; - int strides = countStrides(bound.query_DNF()->single_conjunct(), - bound.set_var(level), stride_eq, simple_stride); - if (strides > 1) - throw loop_error("too many strides"); - else if (strides == 1) { - int sign = stride_eq.get_coef(bound.set_var(level)); - Constr_Vars_Iter it(stride_eq, true); - stride = abs((*it).coef / sign); - } - } - - // separate lower and upper bounds - std::vector lb_list, ub_list; - { - Conjunct *c = bound.query_DNF()->single_conjunct(); - for (GEQ_Iterator gi(c->GEQs()); gi; gi++) { - int coef = (*gi).get_coef(bound.set_var(level)); - if (coef < 0) - ub_list.push_back(*gi); - else if (coef > 0) - lb_list.push_back(*gi); - } - } - - // simplify overflow expression for each pair of upper and lower bounds - std::vector > > overflow_table( - lb_list.size(), - std::vector >(ub_list.size(), - std::map())); - bool is_overflow_simplifiable = true; - for (int i = 0; i < lb_list.size(); i++) { - if (!is_overflow_simplifiable) - break; - - for (int j = 0; j < ub_list.size(); j++) { - // lower bound or upper bound has non-unit coefficient, can't simplify - if (ub_list[j].get_coef(bound.set_var(level)) != -1 - || lb_list[i].get_coef(bound.set_var(level)) != 1) { - is_overflow_simplifiable = false; - break; - } - - for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - if ((*ci).var != bound.set_var(level)) - overflow_table[i][j][(*ci).var] += (*ci).coef; - - break; - } - case Global_Var: { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = bound.get_local(g); - else - v = bound.get_local(g, (*ci).var->function_of()); - overflow_table[i][j][(*ci).var] += (*ci).coef; - break; - } - default: - throw loop_error("failed to calculate overflow amount"); - } - } - overflow_table[i][j][NULL] += ub_list[j].get_const(); - - for (Constr_Vars_Iter ci(lb_list[i]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: { - if ((*ci).var != bound.set_var(level)) { - overflow_table[i][j][(*ci).var] += (*ci).coef; - if (overflow_table[i][j][(*ci).var] == 0) - overflow_table[i][j].erase( - overflow_table[i][j].find((*ci).var)); - } - break; - } - case Global_Var: { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = bound.get_local(g); - else - v = bound.get_local(g, (*ci).var->function_of()); - overflow_table[i][j][(*ci).var] += (*ci).coef; - if (overflow_table[i][j][(*ci).var] == 0) - overflow_table[i][j].erase( - overflow_table[i][j].find((*ci).var)); - break; - } - default: - throw loop_error("failed to calculate overflow amount"); - } - } - overflow_table[i][j][NULL] += lb_list[i].get_const(); - - overflow_table[i][j][NULL] += stride; - if (unroll_amount == 0 - || (overflow_table[i][j].size() == 1 - && overflow_table[i][j][NULL] / stride - < unroll_amount)) - unroll_amount = overflow_table[i][j][NULL] / stride; - } - } - - // loop iteration count can't be determined, bail out gracefully - if (unroll_amount == 0) - return std::set(); - - // further simply overflow calculation using coefficients' modular - if (is_overflow_simplifiable) { - for (int i = 0; i < lb_list.size(); i++) - for (int j = 0; j < ub_list.size(); j++) - if (stride == 1) { - for (std::map::iterator k = - overflow_table[i][j].begin(); - k != overflow_table[i][j].end();) - if ((*k).first != NULL) { - int t = int_mod_hat((*k).second, unroll_amount); - if (t == 0) { - overflow_table[i][j].erase(k++); - } else { - int t2 = hull.query_variable_mod((*k).first, - unroll_amount); - if (t2 != INT_MAX) { - overflow_table[i][j][NULL] += t * t2; - overflow_table[i][j].erase(k++); - } else { - (*k).second = t; - k++; - } - } - } else - k++; - - overflow_table[i][j][NULL] = int_mod_hat( - overflow_table[i][j][NULL], unroll_amount); - - // Since we don't have MODULO instruction in SUIF yet (only MOD), make all coef positive in the final formula - for (std::map::iterator k = - overflow_table[i][j].begin(); - k != overflow_table[i][j].end(); k++) - if ((*k).second < 0) - (*k).second += unroll_amount; - } - } - - // build overflow statement - CG_outputBuilder *ocg = ir->builder(); - CG_outputRepr *overflow_code = NULL; - Relation cond_upper(level), cond_lower(level); - Relation overflow_constraint(0); - F_And *overflow_constraint_root = overflow_constraint.add_and(); - std::vector over_var_list; - if (is_overflow_simplifiable && lb_list.size() == 1) { - for (int i = 0; i < ub_list.size(); i++) { - if (overflow_table[0][i].size() == 1) { - // upper splitting condition - GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]); - h.update_const( - ((overflow_table[0][i][NULL] / stride) % unroll_amount) - * -stride); - } else { - // upper splitting condition - std::string over_name = overflow_var_name_prefix - + to_string(overflow_var_name_counter++); - Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name); - over_var_list.push_back(over_free_var); - GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]); - h.update_coef(cond_upper.get_local(over_free_var), -stride); - - // insert constraint 0 <= overflow < unroll_amount - Variable_ID v = overflow_constraint.get_local(over_free_var); - GEQ_Handle h1 = overflow_constraint_root->add_GEQ(); - h1.update_coef(v, 1); - GEQ_Handle h2 = overflow_constraint_root->add_GEQ(); - h2.update_coef(v, -1); - h2.update_const(unroll_amount - 1); - - // create overflow assignment - bound.setup_names(); - CG_outputRepr *rhs = NULL; - for (std::map::iterator j = - overflow_table[0][i].begin(); - j != overflow_table[0][i].end(); j++) - if ((*j).first != NULL) { - CG_outputRepr *t = ocg->CreateIdent((*j).first->name()); - if ((*j).second != 1) - t = ocg->CreateTimes(ocg->CreateInt((*j).second), - t); - rhs = ocg->CreatePlus(rhs, t); - } else if ((*j).second != 0) - rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second)); - - if (stride != 1) - rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride)); - rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount)); - - CG_outputRepr *lhs = ocg->CreateIdent(over_name); - init_code = ocg->StmtListAppend(init_code, - ocg->CreateAssignment(0, lhs, ocg->CreateInt(0))); - lhs = ocg->CreateIdent(over_name); - overflow_code = ocg->StmtListAppend(overflow_code, - ocg->CreateAssignment(0, lhs, rhs)); - } - } - - // lower splitting condition - GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[0]); - } else if (is_overflow_simplifiable && ub_list.size() == 1) { - for (int i = 0; i < lb_list.size(); i++) { - - if (overflow_table[i][0].size() == 1) { - // lower splitting condition - GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]); - h.update_const(overflow_table[i][0][NULL] * -stride); - } else { - // lower splitting condition - std::string over_name = overflow_var_name_prefix - + to_string(overflow_var_name_counter++); - Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name); - over_var_list.push_back(over_free_var); - GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]); - h.update_coef(cond_lower.get_local(over_free_var), -stride); - - // insert constraint 0 <= overflow < unroll_amount - Variable_ID v = overflow_constraint.get_local(over_free_var); - GEQ_Handle h1 = overflow_constraint_root->add_GEQ(); - h1.update_coef(v, 1); - GEQ_Handle h2 = overflow_constraint_root->add_GEQ(); - h2.update_coef(v, -1); - h2.update_const(unroll_amount - 1); - - // create overflow assignment - bound.setup_names(); - CG_outputRepr *rhs = NULL; - for (std::map::iterator j = - overflow_table[0][i].begin(); - j != overflow_table[0][i].end(); j++) - if ((*j).first != NULL) { - CG_outputRepr *t = ocg->CreateIdent((*j).first->name()); - if ((*j).second != 1) - t = ocg->CreateTimes(ocg->CreateInt((*j).second), - t); - rhs = ocg->CreatePlus(rhs, t); - } else if ((*j).second != 0) - rhs = ocg->CreatePlus(rhs, ocg->CreateInt((*j).second)); - - if (stride != 1) - rhs = ocg->CreateIntegerCeil(rhs, ocg->CreateInt(stride)); - rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount)); - - CG_outputRepr *lhs = ocg->CreateIdent(over_name); - init_code = ocg->StmtListAppend(init_code, - ocg->CreateAssignment(0, lhs, ocg->CreateInt(0))); - lhs = ocg->CreateIdent(over_name); - overflow_code = ocg->StmtListAppend(overflow_code, - ocg->CreateAssignment(0, lhs, rhs)); - } - } - - // upper splitting condition - GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[0]); - } else { - std::string over_name = overflow_var_name_prefix - + to_string(overflow_var_name_counter++); - Free_Var_Decl *over_free_var = new Free_Var_Decl(over_name); - over_var_list.push_back(over_free_var); - - Tuple lb_repr_list, ub_repr_list; - for (int i = 0; i < lb_list.size(); i++) { - //lb_repr_list.append(outputLBasRepr(ocg, lb_list[i], bound, bound.set_var(dim+1), stride, stride_eq, Relation::True(bound.n_set()), std::vector(bound.n_set(), NULL))); - lb_repr_list.append( - outputLBasRepr(ocg, lb_list[i], bound, - bound.set_var(dim + 1), stride, stride_eq, - Relation::True(bound.n_set()), - std::vector(bound.n_set()))); - GEQ_Handle h = cond_lower.and_with_GEQ(lb_list[i]); - } - for (int i = 0; i < ub_list.size(); i++) { - //ub_repr_list.append(outputUBasRepr(ocg, ub_list[i], bound, bound.set_var(dim+1), stride, stride_eq, std::vector(bound.n_set(), NULL))); - ub_repr_list.append( - outputUBasRepr(ocg, ub_list[i], bound, - bound.set_var(dim + 1), stride, stride_eq, - std::vector(bound.n_set()))); - GEQ_Handle h = cond_upper.and_with_GEQ(ub_list[i]); - h.update_coef(cond_upper.get_local(over_free_var), -stride); - } - - CG_outputRepr *lbRepr, *ubRepr; - if (lb_repr_list.size() > 1) - lbRepr = ocg->CreateInvoke("max", lb_repr_list); - else if (lb_repr_list.size() == 1) - lbRepr = lb_repr_list[1]; - - if (ub_repr_list.size() > 1) - ubRepr = ocg->CreateInvoke("min", ub_repr_list); - else if (ub_repr_list.size() == 1) - ubRepr = ub_repr_list[1]; - - // create overflow assignment - bound.setup_names(); - CG_outputRepr *rhs = ocg->CreatePlus(ocg->CreateMinus(ubRepr, lbRepr), - ocg->CreateInt(1)); - if (stride != 1) - rhs = ocg->CreateIntegerDivide(rhs, ocg->CreateInt(stride)); - rhs = ocg->CreateIntegerMod(rhs, ocg->CreateInt(unroll_amount)); - CG_outputRepr *lhs = ocg->CreateIdent(over_name); - init_code = ocg->StmtListAppend(init_code, - ocg->CreateAssignment(0, lhs, ocg->CreateInt(0))); - lhs = ocg->CreateIdent(over_name); - overflow_code = ocg->CreateAssignment(0, lhs, rhs); - - // insert constraint 0 <= overflow < unroll_amount - Variable_ID v = overflow_constraint.get_local(over_free_var); - GEQ_Handle h1 = overflow_constraint_root->add_GEQ(); - h1.update_coef(v, 1); - GEQ_Handle h2 = overflow_constraint_root->add_GEQ(); - h2.update_coef(v, -1); - h2.update_const(unroll_amount - 1); - } - - // insert overflow statement - int overflow_stmt_num = -1; - if (overflow_code != NULL) { - // build iteration space for overflow statement - Relation mapping(level, level - 1); - F_And *f_root = mapping.add_and(); - for (int i = 1; i < level; i++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(i), 1); - h.update_coef(mapping.input_var(i), -1); - } - Relation overflow_IS = Range(Restrict_Domain(mapping, copy(hull))); - for (int i = 1; i < level; i++) - overflow_IS.name_set_var(i, hull.set_var(i)->name()); - overflow_IS.setup_names(); - - // build dumb transformation relation for overflow statement - Relation overflow_xform(level - 1, 2 * (level - 1) + 1); - f_root = overflow_xform.add_and(); - for (int i = 1; i <= level - 1; i++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(overflow_xform.output_var(2 * i), 1); - h.update_coef(overflow_xform.input_var(i), -1); - - h = f_root->add_EQ(); - h.update_coef(overflow_xform.output_var(2 * i - 1), 1); - h.update_const(-lex[2 * i - 2]); - } - EQ_Handle h = f_root->add_EQ(); - h.update_coef(overflow_xform.output_var(2 * (level - 1) + 1), 1); - h.update_const(-lex[2 * (level - 1)]); - - shiftLexicalOrder(lex, dim - 1, 1); - Statement overflow_stmt; - overflow_stmt.code = overflow_code; - overflow_stmt.IS = overflow_IS; - overflow_stmt.xform = overflow_xform; - overflow_stmt.loop_level = std::vector(level - 1); - for (int i = 0; i < level - 1; i++) { - overflow_stmt.loop_level[i].type = - stmt[stmt_num].loop_level[i].type; - if (stmt[stmt_num].loop_level[i].type == LoopLevelTile - && stmt[stmt_num].loop_level[i].payload >= level) - overflow_stmt.loop_level[i].payload = -1; - else - overflow_stmt.loop_level[i].payload = - stmt[stmt_num].loop_level[i].payload; - overflow_stmt.loop_level[i].parallel_level = - stmt[stmt_num].loop_level[i].parallel_level; - } - stmt.push_back(overflow_stmt); - dep.insert(); - overflow_stmt_num = stmt.size() - 1; - overflow[overflow_stmt_num] = over_var_list; - - // update the global known information on overflow variable - this->known = Intersection(this->known, - Extend_Set(copy(overflow_constraint), - this->known.n_set() - overflow_constraint.n_set())); - - // update dependence graph - DependenceVector dv; - dv.type = DEP_CONTROL; - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) - dep.connect(overflow_stmt_num, *i, dv); - dv.type = DEP_W2W; - { - IR_ScalarSymbol *overflow_sym = NULL; - std::vector scalars = ir->FindScalarRef( - overflow_code); - for (int i = scalars.size() - 1; i >= 0; i--) - if (scalars[i]->is_write()) { - overflow_sym = scalars[i]->symbol(); - break; - } - for (int i = scalars.size() - 1; i >= 0; i--) - delete scalars[i]; - dv.sym = overflow_sym; - } - dv.lbounds = std::vector(num_dep_dim, 0); - dv.ubounds = std::vector(num_dep_dim, 0); - int dep_dim = get_last_dep_dim_before(stmt_num, level); - for (int i = dep_dim + 1; i < num_dep_dim; i++) { - dv.lbounds[i] = -posInfinity; - dv.ubounds[i] = posInfinity; - } - for (int i = 0; i <= dep_dim; i++) { - if (i != 0) { - dv.lbounds[i - 1] = 0; - dv.ubounds[i - 1] = 0; - } - dv.lbounds[i] = 1; - dv.ubounds[i] = posInfinity; - dep.connect(overflow_stmt_num, overflow_stmt_num, dv); - } - } - - // split the loop so it can be fully unrolled - std::set result = split(stmt_num, level, cond_upper); - std::set result2 = split(stmt_num, level, cond_lower); - for (std::set::iterator i = result2.begin(); i != result2.end(); i++) - result.insert(*i); - - // check if unrolled statements can be trivially lumped together as one statement - bool can_be_lumped = true; - if (can_be_lumped) { - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) - if (*i != stmt_num) { - if (stmt[*i].loop_level.size() - != stmt[stmt_num].loop_level.size()) { - can_be_lumped = false; - break; - } - for (int j = 0; j < stmt[stmt_num].loop_level.size(); j++) - if (!(stmt[*i].loop_level[j].type - == stmt[stmt_num].loop_level[j].type - && stmt[*i].loop_level[j].payload - == stmt[stmt_num].loop_level[j].payload)) { - can_be_lumped = false; - break; - } - if (!can_be_lumped) - break; - std::vector lex2 = getLexicalOrder(*i); - for (int j = 2 * level; j < lex.size() - 1; j += 2) - if (lex[j] != lex2[j]) { - can_be_lumped = false; - break; - } - if (!can_be_lumped) - break; - } - } - if (can_be_lumped) { - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) - if (is_inner_loop_depend_on_level(stmt[*i].IS, level, known)) { - can_be_lumped = false; - break; - } - } - if (can_be_lumped) { - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) - if (*i != stmt_num) { - if (!(Must_Be_Subset(copy(stmt[*i].IS), copy(stmt[stmt_num].IS)) - && Must_Be_Subset(copy(stmt[stmt_num].IS), - copy(stmt[*i].IS)))) { - can_be_lumped = false; - break; - } - } - } - if (can_be_lumped) { - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) { - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[*i].second.begin(); - j != dep.vertex[*i].second.end(); j++) - if (same_loop.find(j->first) != same_loop.end()) { - for (int k = 0; k < j->second.size(); k++) - if (j->second[k].type == DEP_CONTROL - || j->second[k].type == DEP_UNKNOWN) { - can_be_lumped = false; - break; - } - if (!can_be_lumped) - break; - } - if (!can_be_lumped) - break; - } - } - - // add strides to original statements - // for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) - // add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride); - - // std::vector depending_overflow_var; - // for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - // add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride); - // if (overflow.find(*i) != overflow.end()) { - // // TO DO: It should check whether overflow vaiable depends on - // // this loop index and by how much. This step is important if - // // you want to unroll loops in arbitrary order. - // depending_overflow_var.insert(depending_overflow_var.end(), overflow[*i].begin(), overflow[*i].end()); - - // continue; - // } - // } - -// std::map > pending; -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { -// add_loop_stride(stmt[*i].IS, bound, level-1, unroll_amount * stride); - -// if (overflow.find(*i) != overflow.end()) { -// // TO DO: It should check whether overflow vaiable depends on -// // this loop index and by how much. This step is important if -// // you want to unroll loops in arbitrary order. -// depending_overflow_var.insert(depending_overflow_var.end(), overflow[*i].begin(), overflow[*i].end()); - -// continue; -// } - -// // create copy for each unroll amount -// for (int j = 1; j < unroll_amount; j++) { -// Tuple funcList; -// Tuple loop_vars; -// loop_vars.append(stmt[*i].IS.set_var((dim+1)/2)->name()); -// funcList.append(ocg->CreatePlus(ocg->CreateIdent(stmt[*i].IS.set_var(level)->name()), ocg->CreateInt(j*stride))); -// CG_outputRepr *code = ocg->CreatePlaceHolder(0, stmt[*i].code->clone(), funcList, loop_vars); - -// // prepare the new statment to insert -// Statement unrolled_stmt; -// unrolled_stmt.IS = copy(stmt[*i].IS); -// // adjust_loop_bound(unrolled_stmt.IS, (dim-1)/2, j); -// unrolled_stmt.xform = copy(stmt[*i].xform); -// unrolled_stmt.code = code; -// unrolled_stmt.loop_level = stmt[*i].loop_level; -// pending[*i].push_back(unrolled_stmt); -// } -// } - -// // adjust iteration space due to loop bounds depending on this loop -// // index and affected overflow variables -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { -// for (int j = 0; j < pending[*i].size(); j++) { -// adjust_loop_bound(pending[*i][j].IS, (dim-1)/2, j+1, depending_overflow_var); -// //pending[*i][j].IS = Intersection(pending[*i][j].IS, Extend_Set(copy(this->known), pending[*i][j].IS.n_set() - this->known.n_set())); -// } -// } - - // insert unrolled statements - int old_num_stmt = stmt.size(); - if (!can_be_lumped) { - std::map > what_stmt_num; - - for (int j = 1; j < unroll_amount; j++) { - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) { - Statement new_stmt; - - Tuple funcList; - Tuple loop_vars; - loop_vars.append(stmt[*i].IS.set_var(level)->name()); - funcList.append( - ocg->CreatePlus( - ocg->CreateIdent( - stmt[*i].IS.set_var(level)->name()), - ocg->CreateInt(j * stride))); - new_stmt.code = ocg->CreatePlaceHolder(0, - stmt[*i].code->clone(), funcList, loop_vars); - - new_stmt.IS = adjust_loop_bound(stmt[*i].IS, level, j * stride); - add_loop_stride(new_stmt.IS, bound, level - 1, - unroll_amount * stride); - - new_stmt.xform = copy(stmt[*i].xform); - new_stmt.loop_level = stmt[*i].loop_level; - stmt.push_back(new_stmt); - dep.insert(); - what_stmt_num[*i].push_back(stmt.size() - 1); - } - } - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) - add_loop_stride(stmt[*i].IS, bound, level - 1, - unroll_amount * stride); - - // update dependence graph - if (stmt[stmt_num].loop_level[level - 1].type == LoopLevelOriginal) { - int dep_dim = stmt[stmt_num].loop_level[level - 1].payload; - int new_stride = unroll_amount * stride; - for (int i = 0; i < old_num_stmt; i++) { - std::vector > D; - - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); - j != dep.vertex[i].second.end();) { - if (same_loop.find(i) != same_loop.end()) { - if (same_loop.find(j->first) != same_loop.end()) { - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.type == DEP_CONTROL - || dv.type == DEP_UNKNOWN) { - D.push_back(std::make_pair(j->first, dv)); - for (int kk = 0; kk < unroll_amount - 1; - kk++) - if (what_stmt_num[i][kk] != -1 - && what_stmt_num[j->first][kk] - != -1) - dep.connect(what_stmt_num[i][kk], - what_stmt_num[j->first][kk], - dv); - } else { - coef_t lb = dv.lbounds[dep_dim]; - coef_t ub = dv.ubounds[dep_dim]; - if (ub == lb - && int_mod(lb, - static_cast(new_stride)) - == 0) { - D.push_back( - std::make_pair(j->first, dv)); - for (int kk = 0; kk < unroll_amount - 1; - kk++) - if (what_stmt_num[i][kk] != -1 - && what_stmt_num[j->first][kk] - != -1) - dep.connect( - what_stmt_num[i][kk], - what_stmt_num[j->first][kk], - dv); - } else if (lb == -posInfinity - && ub == posInfinity) { - D.push_back( - std::make_pair(j->first, dv)); - for (int kk = 0; kk < unroll_amount; - kk++) - if (kk == 0) - D.push_back( - std::make_pair(j->first, - dv)); - else if (what_stmt_num[j->first][kk - - 1] != -1) - D.push_back( - std::make_pair( - what_stmt_num[j->first][kk - - 1], - dv)); - for (int t = 0; t < unroll_amount - 1; - t++) - if (what_stmt_num[i][t] != -1) - for (int kk = 0; - kk < unroll_amount; - kk++) - if (kk == 0) - dep.connect( - what_stmt_num[i][t], - j->first, dv); - else if (what_stmt_num[j->first][kk - - 1] != -1) - dep.connect( - what_stmt_num[i][t], - what_stmt_num[j->first][kk - - 1], - dv); - } else { - for (int kk = 0; kk < unroll_amount; - kk++) { - if (lb != -posInfinity) { - if (kk * stride - < int_mod(lb, - static_cast(new_stride))) - dv.lbounds[dep_dim] = - floor( - static_cast(lb) - / new_stride) - * new_stride - + new_stride; - else - dv.lbounds[dep_dim] = - floor( - static_cast(lb) - / new_stride) - * new_stride; - } - if (ub != posInfinity) { - if (kk * stride - > int_mod(ub, - static_cast(new_stride))) - dv.ubounds[dep_dim] = - floor( - static_cast(ub) - / new_stride) - * new_stride - - new_stride; - else - dv.ubounds[dep_dim] = - floor( - static_cast(ub) - / new_stride) - * new_stride; - } - if (dv.ubounds[dep_dim] - >= dv.lbounds[dep_dim]) { - if (kk == 0) - D.push_back( - std::make_pair( - j->first, - dv)); - else if (what_stmt_num[j->first][kk - - 1] != -1) - D.push_back( - std::make_pair( - what_stmt_num[j->first][kk - - 1], - dv)); - } - } - for (int t = 0; t < unroll_amount - 1; - t++) - if (what_stmt_num[i][t] != -1) - for (int kk = 0; - kk < unroll_amount; - kk++) { - if (lb != -posInfinity) { - if (kk * stride - < int_mod( - lb + t - + 1, - static_cast(new_stride))) - dv.lbounds[dep_dim] = - floor( - static_cast(lb - + (t - + 1) - * stride) - / new_stride) - * new_stride - + new_stride; - else - dv.lbounds[dep_dim] = - floor( - static_cast(lb - + (t - + 1) - * stride) - / new_stride) - * new_stride; - } - if (ub != posInfinity) { - if (kk * stride - > int_mod( - ub + t - + 1, - static_cast(new_stride))) - dv.ubounds[dep_dim] = - floor( - static_cast(ub - + (t - + 1) - * stride) - / new_stride) - * new_stride - - new_stride; - else - dv.ubounds[dep_dim] = - floor( - static_cast(ub - + (t - + 1) - * stride) - / new_stride) - * new_stride; - } - if (dv.ubounds[dep_dim] - >= dv.lbounds[dep_dim]) { - if (kk == 0) - dep.connect( - what_stmt_num[i][t], - j->first, - dv); - else if (what_stmt_num[j->first][kk - - 1] != -1) - dep.connect( - what_stmt_num[i][t], - what_stmt_num[j->first][kk - - 1], - dv); - } - } - } - } - } - - dep.vertex[i].second.erase(j++); - } else { - for (int kk = 0; kk < unroll_amount - 1; kk++) - if (what_stmt_num[i][kk] != -1) - dep.connect(what_stmt_num[i][kk], j->first, - j->second); - - j++; - } - } else { - if (same_loop.find(j->first) != same_loop.end()) - for (int k = 0; k < j->second.size(); k++) - for (int kk = 0; kk < unroll_amount - 1; kk++) - if (what_stmt_num[j->first][kk] != -1) - D.push_back( - std::make_pair( - what_stmt_num[j->first][kk], - j->second[k])); - j++; - } - } - - for (int j = 0; j < D.size(); j++) - dep.connect(i, D[j].first, D[j].second); - } - } - - // reset lexical order for the unrolled loop body - std::set new_same_loop; - for (std::map >::iterator i = - what_stmt_num.begin(); i != what_stmt_num.end(); i++) { - new_same_loop.insert(i->first); - for (int j = 0; j < i->second.size(); j++) - new_same_loop.insert(i->second[j]); - } - setLexicalOrder(dim + 1, new_same_loop); - } else { - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) - add_loop_stride(stmt[*i].IS, bound, level - 1, - unroll_amount * stride); - - int max_level = stmt[stmt_num].loop_level.size(); - std::vector > stmt_order; - for (std::set::iterator i = same_loop.begin(); - i != same_loop.end(); i++) - stmt_order.push_back( - std::make_pair( - get_const(stmt[*i].xform, 2 * max_level, - Output_Var), *i)); - sort(stmt_order.begin(), stmt_order.end()); - - Statement new_stmt; - new_stmt.code = NULL; - for (int j = 1; j < unroll_amount; j++) - for (int i = 0; i < stmt_order.size(); i++) { - Tuple funcList; - Tuple loop_vars; - loop_vars.append( - stmt[stmt_order[i].second].IS.set_var(level)->name()); - funcList.append( - ocg->CreatePlus( - ocg->CreateIdent( - stmt[stmt_order[i].second].IS.set_var( - level)->name()), - ocg->CreateInt(j * stride))); - CG_outputRepr *code = ocg->CreatePlaceHolder(0, - stmt[stmt_order[i].second].code->clone(), funcList, - loop_vars); - new_stmt.code = ocg->StmtListAppend(new_stmt.code, code); - } - - new_stmt.IS = copy(stmt[stmt_num].IS); - new_stmt.xform = copy(stmt[stmt_num].xform); - assign_const(new_stmt.xform, 2 * max_level, - stmt_order[stmt_order.size() - 1].first + 1); - new_stmt.loop_level = stmt[stmt_num].loop_level; - stmt.push_back(new_stmt); - dep.insert(); - - // update dependence graph - if (stmt[stmt_num].loop_level[level - 1].type == LoopLevelOriginal) { - int dep_dim = stmt[stmt_num].loop_level[level - 1].payload; - int new_stride = unroll_amount * stride; - for (int i = 0; i < old_num_stmt; i++) { - std::vector > > D; - - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); - j != dep.vertex[i].second.end();) { - if (same_loop.find(i) != same_loop.end()) { - if (same_loop.find(j->first) != same_loop.end()) { - std::vector dvs11, dvs12, dvs22, - dvs21; - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.type == DEP_CONTROL - || dv.type == DEP_UNKNOWN) { - if (i == j->first) { - dvs11.push_back(dv); - dvs22.push_back(dv); - } else - throw loop_error( - "unrolled statements lumped together illegally"); - } else { - coef_t lb = dv.lbounds[dep_dim]; - coef_t ub = dv.ubounds[dep_dim]; - if (ub == lb - && int_mod(lb, - static_cast(new_stride)) - == 0) { - dvs11.push_back(dv); - dvs22.push_back(dv); - } else { - if (lb != -posInfinity) - dv.lbounds[dep_dim] = ceil( - static_cast(lb) - / new_stride) - * new_stride; - if (ub != posInfinity) - dv.ubounds[dep_dim] = floor( - static_cast(ub) - / new_stride) - * new_stride; - if (dv.ubounds[dep_dim] - >= dv.lbounds[dep_dim]) - dvs11.push_back(dv); - - if (lb != -posInfinity) - dv.lbounds[dep_dim] = ceil( - static_cast(lb) - / new_stride) - * new_stride; - if (ub != posInfinity) - dv.ubounds[dep_dim] = ceil( - static_cast(ub) - / new_stride) - * new_stride; - if (dv.ubounds[dep_dim] - >= dv.lbounds[dep_dim]) - dvs21.push_back(dv); - - if (lb != -posInfinity) - dv.lbounds[dep_dim] = floor( - static_cast(lb) - / new_stride) - * new_stride; - if (ub != posInfinity) - dv.ubounds[dep_dim] = floor( - static_cast(ub - - stride) - / new_stride) - * new_stride; - if (dv.ubounds[dep_dim] - >= dv.lbounds[dep_dim]) - dvs12.push_back(dv); - - if (lb != -posInfinity) - dv.lbounds[dep_dim] = floor( - static_cast(lb) - / new_stride) - * new_stride; - if (ub != posInfinity) - dv.ubounds[dep_dim] = ceil( - static_cast(ub - - stride) - / new_stride) - * new_stride; - if (dv.ubounds[dep_dim] - >= dv.lbounds[dep_dim]) - dvs22.push_back(dv); - } - } - } - if (dvs11.size() > 0) - D.push_back(std::make_pair(i, dvs11)); - if (dvs22.size() > 0) - dep.connect(old_num_stmt, old_num_stmt, dvs22); - if (dvs12.size() > 0) - D.push_back( - std::make_pair(old_num_stmt, dvs12)); - if (dvs21.size() > 0) - dep.connect(old_num_stmt, i, dvs21); - - dep.vertex[i].second.erase(j++); - } else { - dep.connect(old_num_stmt, j->first, j->second); - j++; - } - } else { - if (same_loop.find(j->first) != same_loop.end()) - D.push_back( - std::make_pair(old_num_stmt, j->second)); - j++; - } - } - - for (int j = 0; j < D.size(); j++) - dep.connect(i, D[j].first, D[j].second); - } - } - } - - return result; -} - -std::vector Loop::getLexicalOrder(int stmt_num) const { - assert(stmt_num < stmt.size()); - - const int n = stmt[stmt_num].xform.n_out(); - std::vector lex(n, 0); - - for (int i = 0; i < n; i += 2) - lex[i] = get_const(stmt[stmt_num].xform, i, Output_Var); - - return lex; -} - -std::set Loop::getStatements(const std::vector &lex, int dim) const { - const int m = stmt.size(); - - std::set same_loops; - for (int i = 0; i < m; i++) { - if (dim < 0) - same_loops.insert(i); - else { - std::vector a_lex = getLexicalOrder(i); - int j; - for (j = 0; j <= dim; j += 2) - if (lex[j] != a_lex[j]) - break; - if (j > dim) - same_loops.insert(i); - } - } - - return same_loops; -} - -void Loop::shiftLexicalOrder(const std::vector &lex, int dim, int amount) { - const int m = stmt.size(); - - if (amount == 0) - return; - - for (int i = 0; i < m; i++) { - std::vector lex2 = getLexicalOrder(i); - - bool need_shift = true; - - for (int j = 0; j < dim; j++) - if (lex2[j] != lex[j]) { - need_shift = false; - break; - } - - if (!need_shift) - continue; - - if (amount > 0) { - if (lex2[dim] < lex[dim]) - continue; - } else if (amount < 0) { - if (lex2[dim] > lex[dim]) - continue; - } - - assign_const(stmt[i].xform, dim, lex2[dim] + amount); - } -} - -void Loop::setLexicalOrder(int dim, const std::set &active, - int starting_order) { - if (active.size() == 0) - return; - - // check for sanity of parameters - if (dim < 0 || dim % 2 != 0) - throw std::invalid_argument( - "invalid constant loop level to set lexicographical order"); - std::vector lex; - int ref_stmt_num; - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - if ((*i) < 0 || (*i) >= stmt.size()) - throw std::invalid_argument( - "invalid statement number " + to_string(*i)); - if (dim >= stmt[*i].xform.n_out()) - throw std::invalid_argument( - "invalid constant loop level to set lexicographical order"); - if (i == active.begin()) { - lex = getLexicalOrder(*i); - ref_stmt_num = *i; - } else { - std::vector lex2 = getLexicalOrder(*i); - for (int j = 0; j < dim; j += 2) - if (lex[j] != lex2[j]) - throw std::invalid_argument( - "statements are not in the same sub loop nest"); - } - } - - // sepearate statements by current loop level types - int level = (dim + 2) / 2; - std::map, std::set > active_by_level_type; - std::set active_by_no_level; - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - if (level > stmt[*i].loop_level.size()) - active_by_no_level.insert(*i); - else - active_by_level_type[std::make_pair( - stmt[*i].loop_level[level - 1].type, - stmt[*i].loop_level[level - 1].payload)].insert(*i); - } - - // further separate statements due to control dependences - std::vector > active_by_level_type_splitted; - for (std::map, std::set >::iterator i = - active_by_level_type.begin(); i != active_by_level_type.end(); i++) - active_by_level_type_splitted.push_back(i->second); - for (std::set::iterator i = active_by_no_level.begin(); - i != active_by_no_level.end(); i++) - for (int j = active_by_level_type_splitted.size() - 1; j >= 0; j--) { - std::set controlled, not_controlled; - for (std::set::iterator k = - active_by_level_type_splitted[j].begin(); - k != active_by_level_type_splitted[j].end(); k++) { - std::vector dvs = dep.getEdge(*i, *k); - bool is_controlled = false; - for (int kk = 0; kk < dvs.size(); kk++) - if (dvs[kk].type = DEP_CONTROL) { - is_controlled = true; - break; - } - if (is_controlled) - controlled.insert(*k); - else - not_controlled.insert(*k); - } - if (controlled.size() != 0 && not_controlled.size() != 0) { - active_by_level_type_splitted.erase( - active_by_level_type_splitted.begin() + j); - active_by_level_type_splitted.push_back(controlled); - active_by_level_type_splitted.push_back(not_controlled); - } - } - - // set lexical order separating loops with different loop types first - if (active_by_level_type_splitted.size() + active_by_no_level.size() > 1) { - int dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1; - - Graph, Empty> g; - for (std::vector >::iterator i = - active_by_level_type_splitted.begin(); - i != active_by_level_type_splitted.end(); i++) - g.insert(*i); - for (std::set::iterator i = active_by_no_level.begin(); - i != active_by_no_level.end(); i++) { - std::set t; - t.insert(*i); - g.insert(t); - } - for (int i = 0; i < g.vertex.size(); i++) - for (int j = i + 1; j < g.vertex.size(); j++) { - bool connected = false; - for (std::set::iterator ii = g.vertex[i].first.begin(); - ii != g.vertex[i].first.end(); ii++) { - for (std::set::iterator jj = g.vertex[j].first.begin(); - jj != g.vertex[j].first.end(); jj++) { - std::vector dvs = dep.getEdge(*ii, - *jj); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() - || (dvs[k].is_data_dependence() - && !dvs[k].has_been_carried_before( - dep_dim))) { - g.connect(i, j); - connected = true; - break; - } - if (connected) - break; - } - if (connected) - break; - } - connected = false; - for (std::set::iterator ii = g.vertex[i].first.begin(); - ii != g.vertex[i].first.end(); ii++) { - for (std::set::iterator jj = g.vertex[j].first.begin(); - jj != g.vertex[j].first.end(); jj++) { - std::vector dvs = dep.getEdge(*jj, - *ii); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() - || (dvs[k].is_data_dependence() - && !dvs[k].has_been_carried_before( - dep_dim))) { - g.connect(j, i); - connected = true; - break; - } - if (connected) - break; - } - if (connected) - break; - } - } - - std::vector > s = g.topoSort(); - if (s.size() != g.vertex.size()) - throw loop_error( - "cannot separate statements with different loop types at loop level " - + to_string(level)); - - // assign lexical order - int order = starting_order; - for (int i = 0; i < s.size(); i++) { - std::set &cur_scc = g.vertex[*(s[i].begin())].first; - int sz = cur_scc.size(); - if (sz == 1) { - int cur_stmt = *(cur_scc.begin()); - assign_const(stmt[cur_stmt].xform, dim, order); - for (int j = dim + 2; j < stmt[cur_stmt].xform.n_out(); j += 2) - assign_const(stmt[cur_stmt].xform, j, 0); - order++; - } else { - setLexicalOrder(dim, cur_scc, order); - order += sz; - } - } - } - // set lexical order seperating single iteration statements and loops - else { - std::set true_singles; - std::set nonsingles; - std::map > fake_singles; - - // sort out statements that do not require loops - for (std::set::iterator i = active.begin(); i != active.end(); - i++) { - Relation cur_IS = getNewIS(*i); - if (is_single_iteration(cur_IS, dim + 1)) { - bool is_all_single = true; - for (int j = dim + 3; j < stmt[*i].xform.n_out(); j += 2) - if (!is_single_iteration(cur_IS, j)) { - is_all_single = false; - break; - } - if (is_all_single) - true_singles.insert(*i); - else { - try { - fake_singles[get_const(cur_IS, dim + 1, Set_Var)].insert( - *i); - } catch (const std::exception &e) { - fake_singles[posInfinity].insert(*i); - } - } - } else - nonsingles.insert(*i); - } - - // split nonsingles forcibly according to negative dependences present (loop unfusible) - int dep_dim = get_dep_dim_of(ref_stmt_num, level); - Graph g2; - for (std::set::iterator i = nonsingles.begin(); - i != nonsingles.end(); i++) - g2.insert(*i); - for (int i = 0; i < g2.vertex.size(); i++) - for (int j = i + 1; j < g2.vertex.size(); j++) { - std::vector dvs = dep.getEdge( - g2.vertex[i].first, g2.vertex[j].first); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() - || (dvs[k].is_data_dependence() - && dvs[k].has_negative_been_carried_at( - dep_dim))) { - g2.connect(i, j); - break; - } - dvs = dep.getEdge(g2.vertex[j].first, g2.vertex[i].first); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() - || (dvs[k].is_data_dependence() - && dvs[k].has_negative_been_carried_at( - dep_dim))) { - g2.connect(j, i); - break; - } - } - - std::vector > s2 = g2.packed_topoSort(); - - std::vector > splitted_nonsingles; - for (int i = 0; i < s2.size(); i++) { - std::set cur_scc; - for (std::set::iterator j = s2[i].begin(); j != s2[i].end(); - j++) - cur_scc.insert(g2.vertex[*j].first); - splitted_nonsingles.push_back(cur_scc); - } - - // convert to dependence graph for grouped statements - dep_dim = get_last_dep_dim_before(ref_stmt_num, level) + 1; - Graph, Empty> g; - for (std::set::iterator i = true_singles.begin(); - i != true_singles.end(); i++) { - std::set t; - t.insert(*i); - g.insert(t); - } - for (int i = 0; i < splitted_nonsingles.size(); i++) { - g.insert(splitted_nonsingles[i]); - } - for (std::map >::iterator i = - fake_singles.begin(); i != fake_singles.end(); i++) - g.insert((*i).second); - - for (int i = 0; i < g.vertex.size(); i++) - for (int j = i + 1; j < g.vertex.size(); j++) { - bool connected = false; - for (std::set::iterator ii = g.vertex[i].first.begin(); - ii != g.vertex[i].first.end(); ii++) { - for (std::set::iterator jj = g.vertex[j].first.begin(); - jj != g.vertex[j].first.end(); jj++) { - std::vector dvs = dep.getEdge(*ii, - *jj); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() - || (dvs[k].is_data_dependence() - && !dvs[k].has_been_carried_before( - dep_dim))) { - g.connect(i, j); - connected = true; - break; - } - if (connected) - break; - } - if (connected) - break; - } - connected = false; - for (std::set::iterator ii = g.vertex[i].first.begin(); - ii != g.vertex[i].first.end(); ii++) { - for (std::set::iterator jj = g.vertex[j].first.begin(); - jj != g.vertex[j].first.end(); jj++) { - std::vector dvs = dep.getEdge(*jj, - *ii); - for (int k = 0; k < dvs.size(); k++) - if (dvs[k].is_control_dependence() - || (dvs[k].is_data_dependence() - && !dvs[k].has_been_carried_before( - dep_dim))) { - g.connect(j, i); - connected = true; - break; - } - if (connected) - break; - } - if (connected) - break; - } - } - - // topological sort according to chun's permute algorithm - std::vector > s = g.topoSort(); - - // assign lexical order - int order = starting_order; - for (int i = 0; i < s.size(); i++) { - // translate each SCC into original statements - std::set cur_scc; - for (std::set::iterator j = s[i].begin(); j != s[i].end(); j++) - copy(g.vertex[*j].first.begin(), g.vertex[*j].first.end(), - inserter(cur_scc, cur_scc.begin())); - - // now assign the constant - for (std::set::iterator j = cur_scc.begin(); - j != cur_scc.end(); j++) - assign_const(stmt[*j].xform, dim, order); - - if (cur_scc.size() > 1) - setLexicalOrder(dim + 2, cur_scc); - else if (cur_scc.size() == 1) { - int cur_stmt = *(cur_scc.begin()); - for (int j = dim + 2; j < stmt[cur_stmt].xform.n_out(); j += 2) - assign_const(stmt[cur_stmt].xform, j, 0); - } - - if (cur_scc.size() > 0) - order++; - } - } -} - -void Loop::apply_xform() { - std::set active; - for (int i = 0; i < stmt.size(); i++) - active.insert(i); - apply_xform(active); -} - -void Loop::apply_xform(int stmt_num) { - std::set active; - active.insert(stmt_num); - apply_xform(active); -} - -void Loop::apply_xform(std::set &active) { - int max_n = 0; - - CG_outputBuilder *ocg = ir->builder(); - for (std::set::iterator i = active.begin(); i != active.end(); i++) { - int n = stmt[*i].loop_level.size(); - if (n > max_n) - max_n = n; - - std::vector lex = getLexicalOrder(*i); - - Relation mapping(2 * n + 1, n); - F_And *f_root = mapping.add_and(); - for (int j = 1; j <= n; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(j), 1); - h.update_coef(mapping.input_var(2 * j), -1); - } - mapping = Composition(mapping, stmt[*i].xform); - mapping.simplify(); - - // match omega input/output variables to variable names in the code - for (int j = 1; j <= stmt[*i].IS.n_set(); j++) - mapping.name_input_var(j, stmt[*i].IS.set_var(j)->name()); - for (int j = 1; j <= n; j++) - mapping.name_output_var(j, - tmp_loop_var_name_prefix - + to_string(tmp_loop_var_name_counter + j - 1)); - mapping.setup_names(); - - Relation known = Extend_Set(copy(this->known), - mapping.n_out() - this->known.n_set()); - //stmt[*i].code = outputStatement(ocg, stmt[*i].code, 0, mapping, known, std::vector(mapping.n_out(), NULL)); - stmt[*i].code = outputStatement(ocg, stmt[*i].code, 0, mapping, known, - std::vector(mapping.n_out())); - stmt[*i].IS = Range(Restrict_Domain(mapping, stmt[*i].IS)); - stmt[*i].IS.simplify(); - - // replace original transformation relation with straight 1-1 mapping - mapping = Relation(n, 2 * n + 1); - f_root = mapping.add_and(); - for (int j = 1; j <= n; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(2 * j), 1); - h.update_coef(mapping.input_var(j), -1); - } - for (int j = 1; j <= 2 * n + 1; j += 2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(j), 1); - h.update_const(-lex[j - 1]); - } - stmt[*i].xform = mapping; - } - - tmp_loop_var_name_counter += max_n; -} - -void Loop::addKnown(const Relation &cond) { - int n1 = this->known.n_set(); - - Relation r = copy(cond); - int n2 = r.n_set(); - - if (n1 < n2) - this->known = Extend_Set(this->known, n2 - n1); - else if (n1 > n2) - r = Extend_Set(r, n1 - n2); - - this->known = Intersection(this->known, r); -} - -bool Loop::nonsingular(const std::vector > &T) { - if (stmt.size() == 0) - return true; - - // check for sanity of parameters - for (int i = 0; i < stmt.size(); i++) { - if (stmt[i].loop_level.size() != num_dep_dim) - throw std::invalid_argument( - "nonsingular loop transformations must be applied to original perfect loop nest"); - for (int j = 0; j < stmt[i].loop_level.size(); j++) - if (stmt[i].loop_level[j].type != LoopLevelOriginal) - throw std::invalid_argument( - "nonsingular loop transformations must be applied to original perfect loop nest"); - } - if (T.size() != num_dep_dim) - throw std::invalid_argument("invalid transformation matrix"); - for (int i = 0; i < stmt.size(); i++) - if (T[i].size() != num_dep_dim + 1 && T[i].size() != num_dep_dim) - throw std::invalid_argument("invalid transformation matrix"); - - // build relation from matrix - Relation mapping(2 * num_dep_dim + 1, 2 * num_dep_dim + 1); - F_And *f_root = mapping.add_and(); - for (int i = 0; i < num_dep_dim; i++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(2 * (i + 1)), -1); - for (int j = 0; j < num_dep_dim; j++) - if (T[i][j] != 0) - h.update_coef(mapping.input_var(2 * (j + 1)), T[i][j]); - if (T[i].size() == num_dep_dim + 1) - h.update_const(T[i][num_dep_dim]); - } - for (int i = 1; i <= 2 * num_dep_dim + 1; i += 2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.output_var(i), -1); - h.update_coef(mapping.input_var(i), 1); - } - - // update transformation relations - for (int i = 0; i < stmt.size(); i++) - stmt[i].xform = Composition(copy(mapping), stmt[i].xform); - - // update dependence graph - for (int i = 0; i < dep.vertex.size(); i++) - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); - j++) { - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - switch (dv.type) { - case DEP_W2R: - case DEP_R2W: - case DEP_W2W: - case DEP_R2R: { - std::vector lbounds(num_dep_dim), ubounds( - num_dep_dim); - for (int p = 0; p < num_dep_dim; p++) { - coef_t lb = 0; - coef_t ub = 0; - for (int q = 0; q < num_dep_dim; q++) { - if (T[p][q] > 0) { - if (lb == -posInfinity - || dv.lbounds[q] == -posInfinity) - lb = -posInfinity; - else - lb += T[p][q] * dv.lbounds[q]; - if (ub == posInfinity - || dv.ubounds[q] == posInfinity) - ub = posInfinity; - else - ub += T[p][q] * dv.ubounds[q]; - } else if (T[p][q] < 0) { - if (lb == -posInfinity - || dv.ubounds[q] == posInfinity) - lb = -posInfinity; - else - lb += T[p][q] * dv.ubounds[q]; - if (ub == posInfinity - || dv.lbounds[q] == -posInfinity) - ub = posInfinity; - else - ub += T[p][q] * dv.lbounds[q]; - } - } - if (T[p].size() == num_dep_dim + 1) { - if (lb != -posInfinity) - lb += T[p][num_dep_dim]; - if (ub != posInfinity) - ub += T[p][num_dep_dim]; - } - lbounds[p] = lb; - ubounds[p] = ub; - } - dv.lbounds = lbounds; - dv.ubounds = ubounds; - - break; - } - default: - ; - } - } - j->second = dvs; - } - - // set constant loop values - std::set active; - for (int i = 0; i < stmt.size(); i++) - active.insert(i); - setLexicalOrder(0, active); - - return true; -} - -void Loop::skew(const std::set &stmt_nums, int level, - const std::vector &skew_amount) { - if (stmt_nums.size() == 0) - return; - - // check for sanity of parameters - int ref_stmt_num = *(stmt_nums.begin()); - std::vector > array_of_deps; - for (std::set::const_iterator i = stmt_nums.begin(); - i != stmt_nums.end(); i++) { - if (*i < 0 || *i >= stmt.size()) - throw std::invalid_argument( - "invalid statement number " + to_string(*i)); - if (level < 1 || level > stmt[*i].loop_level.size()) - throw std::invalid_argument( - "invalid loop level " + to_string(level)); - for (int j = stmt[*i].loop_level.size(); j < skew_amount.size(); j++) - if (skew_amount[j] != 0) - throw std::invalid_argument("invalid skewing formula"); - } - - // set trasformation relations - for (std::set::const_iterator i = stmt_nums.begin(); - i != stmt_nums.end(); i++) { - int n = stmt[*i].xform.n_out(); - Relation r(n, n); - F_And *f_root = r.add_and(); - for (int j = 1; j <= n; j++) - if (j != 2 * level) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.input_var(j), 1); - h.update_coef(r.output_var(j), -1); - } - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.output_var(2 * level), -1); - for (int j = 0; j < skew_amount.size(); j++) - if (skew_amount[j] != 0) - h.update_coef(r.input_var(2 * (j + 1)), skew_amount[j]); - - stmt[*i].xform = Composition(r, stmt[*i].xform); - stmt[*i].xform.simplify(); - applyXform(*i); - std::set dont_consider; - //} - - // update dependence graph - if (stmt[ref_stmt_num].loop_level[level - 1].type - == LoopLevelOriginal) { - int dep_dim = stmt[ref_stmt_num].loop_level[level - 1].payload; - for (std::set::const_iterator i = stmt_nums.begin(); - i != stmt_nums.end(); i++) - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[*i].second.begin(); - j != dep.vertex[*i].second.end(); j++) - if (stmt_nums.find(j->first) != stmt_nums.end()) { - // dependence between skewed statements - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - if (dv.is_data_dependence()) { - coef_t lb = 0; - coef_t ub = 0; - for (int kk = 0; kk < skew_amount.size(); - kk++) { - int cur_dep_dim = get_dep_dim_of(*i, - kk + 1); - if (skew_amount[kk] > 0) { - if (lb != -posInfinity - && stmt[*i].loop_level[kk].type - == LoopLevelOriginal - && dv.lbounds[cur_dep_dim] - != -posInfinity) - lb += skew_amount[kk] - * dv.lbounds[cur_dep_dim]; - else { - if (cur_dep_dim != -1 - && !(dv.lbounds[cur_dep_dim] - == 0 - && dv.ubounds[cur_dep_dim] - == 0)) - lb = -posInfinity; - } - if (ub != posInfinity - && stmt[*i].loop_level[kk].type - == LoopLevelOriginal - && dv.ubounds[cur_dep_dim] - != posInfinity) - ub += skew_amount[kk] - * dv.ubounds[cur_dep_dim]; - else { - if (cur_dep_dim != -1 - && !(dv.lbounds[cur_dep_dim] - == 0 - && dv.ubounds[cur_dep_dim] - == 0)) - ub = posInfinity; - } - } else if (skew_amount[kk] < 0) { - if (lb != -posInfinity - && stmt[*i].loop_level[kk].type - == LoopLevelOriginal - && dv.ubounds[cur_dep_dim] - != posInfinity) - lb += skew_amount[kk] - * dv.ubounds[cur_dep_dim]; - else { - if (cur_dep_dim != -1 - && !(dv.lbounds[cur_dep_dim] - == 0 - && dv.ubounds[cur_dep_dim] - == 0)) - lb = -posInfinity; - } - if (ub != posInfinity - && stmt[*i].loop_level[kk].type - == LoopLevelOriginal - && dv.lbounds[cur_dep_dim] - != -posInfinity) - ub += skew_amount[kk] - * dv.lbounds[cur_dep_dim]; - else { - if (cur_dep_dim != -1 - && !(dv.lbounds[cur_dep_dim] - == 0 - && dv.ubounds[cur_dep_dim] - == 0)) - ub = posInfinity; - } - } - } - if ((dv.isCarried(dep_dim) - && dv.hasPositive(dep_dim)) && dv.quasi) - dv.quasi = false; - - if ((dv.isCarried(dep_dim) - && dv.hasNegative(dep_dim)) - && !dv.quasi) - throw loop_error( - "loop error: Skewing is illegal, dependence violation!"); - dv.lbounds[dep_dim] = lb; - dv.ubounds[dep_dim] = ub; - if ((dv.isCarried(dep_dim) - && dv.hasPositive(dep_dim)) && dv.quasi) - dv.quasi = false; - - if ((dv.isCarried(dep_dim) - && dv.hasNegative(dep_dim)) - && !dv.quasi) - throw loop_error( - "loop error: Skewing is illegal, dependence violation!"); - } - } - - j->second = dvs; - } - } else { - // dependence from skewed statement to unskewed statement becomes jumbled, - // put distance value at skewed dimension to unknown - /*std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - if (dv.is_data_dependence()) { - dv.lbounds[dep_dim] = -posInfinity; - dv.ubounds[dep_dim] = posInfinity; - } - } - j->second = dvs; - */ - dont_consider.insert(j->first); - } - for (int l = 0; l < dep.vertex.size(); l++) - if (stmt_nums.find(l) == stmt_nums.end()) - if (dont_consider.find(l) == stmt_nums.end() - && (dep.vertex[l].second.find(*i) - != dep.vertex[l].second.end())) - dont_consider.insert(l); - array_of_deps.push_back(dont_consider); - } - /*for (int i = 0; i < dep.vertex.size(); i++) - if (stmt_nums.find(i) == stmt_nums.end()) - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); - j != dep.vertex[i].second.end(); j++) - if (stmt_nums.find(j->first) != stmt_nums.end()) { - // dependence from unskewed statement to skewed statement becomes jumbled, - // put distance value at skewed dimension to unknown - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - if (dv.is_data_dependence()) { - dv.lbounds[dep_dim] = -posInfinity; - dv.ubounds[dep_dim] = posInfinity; - } - } - j->second = dvs; - } - }*/ - std::set::const_iterator w = stmt_nums.begin(); - for (int i = 0; i < array_of_deps.size() && w != stmt_nums.end(); i++) - for (std::set::const_iterator j = array_of_deps[i].begin(); - j != array_of_deps[i].end(); j++) { - if (dep.vertex[*w].second.find(*j) != dep.vertex[*w].second.end()) - dep.disconnect(*w, *j); - if (dep.vertex[*j].second.find(*w) != dep.vertex[*j].second.end()) - dep.disconnect(*j, *w); - int x, y; - std::pair, - std::vector > dv_s; - if ((*w) <= (*j)) { - x = *w; - y = *j; - - dv_s = test_data_dependences(ir_, stmt[x].code, stmt[x].IS, - stmt[y].code, stmt[y].IS, freevar, index, x, y); - } else { - x = *j; - y = *w; - dv_s = test_data_dependences(ir_, stmt[y].code, stmt[y].IS, - stmt[x].code, stmt[x].IS, freevar, index, x, y); - } - for (int k = 0; k < dv_s.first.size(); k++) { - if (is_dependence_valid_based_on_lex_order(x, y, dv_s.first[k], - true)) - dep.connect(x, y, dv_s.first[k]); - else - dep.connect(y, x, dv_s.first[k].reverse()); - } - for (int k = 0; k < dv_s.second.size(); k++) { - if (is_dependence_valid_based_on_lex_order(x, y, dv_s.second[k], - false)) - dep.connect(y, x, dv_s.second[k]); - else - dep.connect(x, y, dv_s.second[k].reverse()); - } - w++; - } -} - -void Loop::shift(const std::set &stmt_nums, int level, int shift_amount) { - if (stmt_nums.size() == 0) - return; - - // check for sanity of parameters - int ref_stmt_num = *(stmt_nums.begin()); - for (std::set::const_iterator i = stmt_nums.begin(); - i != stmt_nums.end(); i++) { - if (*i < 0 || *i >= stmt.size()) - throw std::invalid_argument( - "invalid statement number " + to_string(*i)); - if (level < 1 || level > stmt[*i].loop_level.size()) - throw std::invalid_argument( - "invalid loop level " + to_string(level)); - } - - // do nothing - if (shift_amount == 0) - return; - - // set trasformation relations - for (std::set::const_iterator i = stmt_nums.begin(); - i != stmt_nums.end(); i++) { - int n = stmt[*i].xform.n_out(); - - Relation r(n, n); - F_And *f_root = r.add_and(); - for (int j = 1; j <= n; j++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(r.input_var(j), 1); - h.update_coef(r.output_var(j), -1); - if (j == 2 * level) - h.update_const(shift_amount); - } - - stmt[*i].xform = Composition(r, stmt[*i].xform); - stmt[*i].xform.simplify(); - } - - // update dependence graph - if (stmt[ref_stmt_num].loop_level[level - 1].type == LoopLevelOriginal) { - int dep_dim = stmt[ref_stmt_num].loop_level[level - 1].payload; - for (std::set::const_iterator i = stmt_nums.begin(); - i != stmt_nums.end(); i++) - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[*i].second.begin(); - j != dep.vertex[*i].second.end(); j++) - if (stmt_nums.find(j->first) == stmt_nums.end()) { - // dependence from shifted statement to unshifted statement - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - if (dv.is_data_dependence()) { - if (dv.lbounds[dep_dim] != -posInfinity) - dv.lbounds[dep_dim] -= shift_amount; - if (dv.ubounds[dep_dim] != posInfinity) - dv.ubounds[dep_dim] -= shift_amount; - } - } - j->second = dvs; - } - for (int i = 0; i < dep.vertex.size(); i++) - if (stmt_nums.find(i) == stmt_nums.end()) - for (DependenceGraph::EdgeList::iterator j = - dep.vertex[i].second.begin(); - j != dep.vertex[i].second.end(); j++) - if (stmt_nums.find(j->first) != stmt_nums.end()) { - // dependence from unshifted statement to shifted statement - std::vector dvs = j->second; - for (int k = 0; k < dvs.size(); k++) { - DependenceVector &dv = dvs[k]; - if (dv.is_data_dependence()) { - if (dv.lbounds[dep_dim] != -posInfinity) - dv.lbounds[dep_dim] += shift_amount; - if (dv.ubounds[dep_dim] != posInfinity) - dv.ubounds[dep_dim] += shift_amount; - } - } - j->second = dvs; - } - } -} - -// bool Loop::fuse(const std::set &stmt_nums, int level) { -// if (stmt_nums.size() == 0 || stmt_nums.size() == 1) -// return true; -// int dim = 2*level-1; - -// // check for sanity of parameters -// std::vector ref_lex; -// for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) { -// if (*i < 0 || *i >= stmt.size()) -// throw std::invalid_argument("invalid statement number " + to_string(*i)); -// if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2) -// throw std::invalid_argument("invalid loop level " + to_string(level)); -// if (ref_lex.size() == 0) -// ref_lex = getLexicalOrder(*i); -// else { -// std::vector lex = getLexicalOrder(*i); -// for (int j = 0; j < dim-1; j+=2) -// if (lex[j] != ref_lex[j]) -// throw std::invalid_argument("statements for fusion must be in the same level-" + to_string(level-1) + " subloop"); -// } -// } - -// // collect lexicographical order values from to-be-fused statements -// std::set lex_values; -// for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) { -// std::vector lex = getLexicalOrder(*i); -// lex_values.insert(lex[dim-1]); -// } -// if (lex_values.size() == 1) -// return true; - -// // negative dependence would prevent fusion -// int dep_dim = xform_index[dim].first; -// for (std::set::iterator i = lex_values.begin(); i != lex_values.end(); i++) { -// ref_lex[dim-1] = *i; -// std::set a = getStatements(ref_lex, dim-1); -// std::set::iterator j = i; -// j++; -// for (; j != lex_values.end(); j++) { -// ref_lex[dim-1] = *j; -// std::set b = getStatements(ref_lex, dim-1); -// for (std::set::iterator ii = a.begin(); ii != a.end(); ii++) -// for (std::set::iterator jj = b.begin(); jj != b.end(); jj++) { -// std::vector dvs; -// dvs = dep.getEdge(*ii, *jj); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim)) -// throw loop_error("loop error: statements " + to_string(*ii) + " and " + to_string(*jj) + " cannot be fused together due to negative dependence"); -// dvs = dep.getEdge(*jj, *ii); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim) && dvs[k].hasNegative(dep_dim)) -// throw loop_error("loop error: statements " + to_string(*jj) + " and " + to_string(*ii) + " cannot be fused together due to negative dependence"); -// } -// } -// } - -// // collect all other lexicographical order values from the subloop -// // enclosing these to-be-fused loops -// std::set same_loop = getStatements(ref_lex, dim-3); -// std::set other_lex_values; -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { -// std::vector lex = getLexicalOrder(*i); -// if (lex_values.find(lex[dim-1]) == lex_values.end()) -// other_lex_values.insert(lex[dim-1]); -// } - -// // update to-be-fused loops due to dependence cycle -// Graph, Empty> g; -// { -// std::set t; -// for (std::set::iterator i = lex_values.begin(); i != lex_values.end(); i++) { -// ref_lex[dim-1] = *i; -// std::set t2 = getStatements(ref_lex, dim-1); -// std::set_union(t.begin(), t.end(), t2.begin(), t2.end(), inserter(t, t.begin())); -// } -// g.insert(t); -// } -// for (std::set::iterator i = other_lex_values.begin(); i != other_lex_values.end(); i++) { -// ref_lex[dim-1] = *i; -// std::set t = getStatements(ref_lex, dim-1); -// g.insert(t); -// } -// for (int i = 0; i < g.vertex.size(); i++) -// for (int j = i+1; j < g.vertex.size(); j++) -// for (std::set::iterator ii = g.vertex[i].first.begin(); ii != g.vertex[i].first.end(); ii++) -// for (std::set::iterator jj = g.vertex[j].first.begin(); jj != g.vertex[j].first.end(); jj++) { -// std::vector dvs; -// dvs = dep.getEdge(*ii, *jj); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g.connect(i, j); -// break; -// } -// dvs = dep.getEdge(*jj, *ii); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g.connect(j, i); -// break; -// } -// } -// std::vector > s = g.topoSort(); -// int fused_lex_value = 0; -// for (int i = 0; i < s.size(); i++) -// if (s[i].find(0) != s[i].end()) { -// // now add additional lexicographical order values -// for (std::set::iterator j = s[i].begin(); j != s[i].end(); j++) -// if (*j != 0) { -// int stmt = *(g.vertex[*j].first.begin()); -// std::vector lex = getLexicalOrder(stmt); -// lex_values.insert(lex[dim-1]); -// } - -// if (s.size() > 1) { -// if (i == 0) { -// int min_lex_value; -// for (std::set::iterator j = s[i+1].begin(); j != s[i+1].end(); j++) { -// int stmt = *(g.vertex[*j].first.begin()); -// std::vector lex = getLexicalOrder(stmt); -// if (j == s[i+1].begin()) -// min_lex_value = lex[dim-1]; -// else if (lex[dim-1] < min_lex_value) -// min_lex_value = lex[dim-1]; -// } -// fused_lex_value = min_lex_value - 1; -// } -// else { -// int max_lex_value; -// for (std::set::iterator j = s[i-1].begin(); j != s[i-1].end(); j++) { -// int stmt = *(g.vertex[*j].first.begin()); -// std::vector lex = getLexicalOrder(stmt); -// if (j == s[i-1].begin()) -// max_lex_value = lex[dim-1]; -// else if (lex[dim-1] > max_lex_value) -// max_lex_value = lex[dim-1]; -// } -// fused_lex_value = max_lex_value + 1; -// } -// } - -// break; -// } - -// // sort the newly updated to-be-fused lexicographical order values -// std::vector ordered_lex_values; -// for (std::set::iterator i = lex_values.begin(); i != lex_values.end(); i++) -// ordered_lex_values.push_back(*i); -// std::sort(ordered_lex_values.begin(), ordered_lex_values.end()); - -// // make sure internal loops inside to-be-fused loops have the same -// // lexicographical order before and after fusion -// std::vector > inside_lex_range(ordered_lex_values.size()); -// for (int i = 0; i < ordered_lex_values.size(); i++) { -// ref_lex[dim-1] = ordered_lex_values[i]; -// std::set the_stmts = getStatements(ref_lex, dim-1); -// std::set::iterator j = the_stmts.begin(); -// std::vector lex = getLexicalOrder(*j); -// int min_inside_lex_value = lex[dim+1]; -// int max_inside_lex_value = lex[dim+1]; -// j++; -// for (; j != the_stmts.end(); j++) { -// std::vector lex = getLexicalOrder(*j); -// if (lex[dim+1] < min_inside_lex_value) -// min_inside_lex_value = lex[dim+1]; -// if (lex[dim+1] > max_inside_lex_value) -// max_inside_lex_value = lex[dim+1]; -// } -// inside_lex_range[i].first = min_inside_lex_value; -// inside_lex_range[i].second = max_inside_lex_value; -// } -// for (int i = 1; i < ordered_lex_values.size(); i++) -// if (inside_lex_range[i].first <= inside_lex_range[i-1].second) { -// int shift_lex_value = inside_lex_range[i-1].second - inside_lex_range[i].first + 1; -// ref_lex[dim-1] = ordered_lex_values[i]; -// ref_lex[dim+1] = inside_lex_range[i].first; -// shiftLexicalOrder(ref_lex, dim+1, shift_lex_value); -// inside_lex_range[i].first += shift_lex_value; -// inside_lex_range[i].second += shift_lex_value; -// } - -// // set lexicographical order for fused loops -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { -// std::vector lex = getLexicalOrder(*i); -// if (lex_values.find(lex[dim-1]) != lex_values.end()) -// assign_const(stmt[*i].xform, dim-1, fused_lex_value); -// } - -// // no need to update dependence graph -// ; - -// return true; -// } - -// bool Loop::distribute(const std::set &stmt_nums, int level) { -// if (stmt_nums.size() == 0 || stmt_nums.size() == 1) -// return true; -// int dim = 2*level-1; - -// // check for sanity of parameters -// std::vector ref_lex; -// for (std::set::const_iterator i = stmt_nums.begin(); i != stmt_nums.end(); i++) { -// if (*i < 0 || *i >= stmt.size()) -// throw std::invalid_argument("invalid statement number " + to_string(*i)); -// if (level < 1 || level > (stmt[*i].xform.n_out()-1)/2) -// throw std::invalid_argument("invalid loop level " + to_string(level)); -// if (ref_lex.size() == 0) -// ref_lex = getLexicalOrder(*i); -// else { -// std::vector lex = getLexicalOrder(*i); -// for (int j = 0; j <= dim-1; j+=2) -// if (lex[j] != ref_lex[j]) -// throw std::invalid_argument("statements for distribution must be in the same level-" + to_string(level) + " subloop"); -// } -// } - -// // find SCC in the to-be-distributed loop -// int dep_dim = xform_index[dim].first; -// std::set same_loop = getStatements(ref_lex, dim-1); -// Graph g; -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) -// g.insert(*i); -// for (int i = 0; i < g.vertex.size(); i++) -// for (int j = i+1; j < g.vertex.size(); j++) { -// std::vector dvs; -// dvs = dep.getEdge(g.vertex[i].first, g.vertex[j].first); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g.connect(i, j); -// break; -// } -// dvs = dep.getEdge(g.vertex[j].first, g.vertex[i].first); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g.connect(j, i); -// break; -// } -// } -// std::vector > s = g.topoSort(); - -// // find statements that cannot be distributed due to dependence cycle -// Graph, Empty> g2; -// for (int i = 0; i < s.size(); i++) { -// std::set t; -// for (std::set::iterator j = s[i].begin(); j != s[i].end(); j++) -// if (stmt_nums.find(g.vertex[*j].first) != stmt_nums.end()) -// t.insert(g.vertex[*j].first); -// if (!t.empty()) -// g2.insert(t); -// } -// for (int i = 0; i < g2.vertex.size(); i++) -// for (int j = i+1; j < g2.vertex.size(); j++) -// for (std::set::iterator ii = g2.vertex[i].first.begin(); ii != g2.vertex[i].first.end(); ii++) -// for (std::set::iterator jj = g2.vertex[j].first.begin(); jj != g2.vertex[j].first.end(); jj++) { -// std::vector dvs; -// dvs = dep.getEdge(*ii, *jj); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g2.connect(i, j); -// break; -// } -// dvs = dep.getEdge(*jj, *ii); -// for (int k = 0; k < dvs.size(); k++) -// if (dvs[k].isCarried(dep_dim)) { -// g2.connect(j, i); -// break; -// } -// } -// std::vector > s2 = g2.topoSort(); - -// // nothing to distribute -// if (s2.size() == 1) -// throw loop_error("loop error: no statement can be distributed due to dependence cycle"); - -// std::vector > s3; -// for (int i = 0; i < s2.size(); i++) { -// std::set t; -// for (std::set::iterator j = s2[i].begin(); j != s2[i].end(); j++) -// std::set_union(t.begin(), t.end(), g2.vertex[*j].first.begin(), g2.vertex[*j].first.end(), inserter(t, t.begin())); -// s3.push_back(t); -// } - -// // associate other affected statements with the right distributed statements -// for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) -// if (stmt_nums.find(*i) == stmt_nums.end()) { -// bool is_inserted = false; -// int potential_insertion_point = 0; -// for (int j = 0; j < s3.size(); j++) { -// for (std::set::iterator k = s3[j].begin(); k != s3[j].end(); k++) { -// std::vector dvs; -// dvs = dep.getEdge(*i, *k); -// for (int kk = 0; kk < dvs.size(); kk++) -// if (dvs[kk].isCarried(dep_dim)) { -// s3[j].insert(*i); -// is_inserted = true; -// break; -// } -// dvs = dep.getEdge(*k, *i); -// for (int kk = 0; kk < dvs.size(); kk++) -// if (dvs[kk].isCarried(dep_dim)) -// potential_insertion_point = j; -// } -// if (is_inserted) -// break; -// } - -// if (!is_inserted) -// s3[potential_insertion_point].insert(*i); -// } - -// // set lexicographical order after distribution -// int order = ref_lex[dim-1]; -// shiftLexicalOrder(ref_lex, dim-1, s3.size()-1); -// for (std::vector >::iterator i = s3.begin(); i != s3.end(); i++) { -// for (std::set::iterator j = (*i).begin(); j != (*i).end(); j++) -// assign_const(stmt[*j].xform, dim-1, order); -// order++; -// } - -// // no need to update dependence graph -// ; - -// return true; -// } - diff --git a/mem_mapping_utils.cc b/mem_mapping_utils.cc deleted file mode 100644 index 645fe59..0000000 --- a/mem_mapping_utils.cc +++ /dev/null @@ -1,76 +0,0 @@ -#include -#include -#include -#include "rose.h" -#include "mem_mapping_utils.hh" - -using namespace SageBuilder; -using namespace SageInterface; - -memory_mapping::memory_mapping (bool used, const char * array_name){ - this->mem_used = used; - this->add(array_name); -} - -texture_memory_mapping::texture_memory_mapping(bool used, const char* array_name) : memory_mapping(used, array_name) { } -constant_memory_mapping::constant_memory_mapping(bool used, const char* array_name) : memory_mapping(used, array_name) { } -//texture_memory_mapping::texture_memory_mapping (bool used, const char* array_name, int width, int height) { -// tex_mem_used = used; -// this->add(array_name, width, height); -//} - -void memory_mapping::add(const char * array_name) { - this->mapped_array_name.push_back(std::string(array_name)); - //std::vector ivec = std::vector(); - //dims[std::string(array_name)] = ivec; -} -//void texture_memory_mapping::add(const char* array_name, int width, int height) { -// tex_mapped_array_name.push_back(std::string(array_name)); -// std::vector ivec = std::vector(); -// ivec.push_back(width); -// ivec.push_back(height); -// dims[std::string(array_name)] = ivec; -//} - -bool memory_mapping::is_mem_used(){ - return this->mem_used; -} -bool memory_mapping::is_array_mapped(const char * array_name){ - - for( int i=0; imapped_symbol[std::string(array_name)] = sym; -} -void texture_memory_mapping::set_devptr_symbol(const char * array_name, SgVariableSymbol* sym) { - devptr_symbol[std::string(array_name)] = sym; -} -void memory_mapping::set_vardef(const char* array_name, VarDefs* vardef) { - this->vardefs[std::string(array_name)] = vardef; -} -SgVarRefExp* memory_mapping::get_mapped_symbol_exp(const char * array_name) { - return buildVarRefExp(this->mapped_symbol[std::string(array_name)]); -} -SgVarRefExp* texture_memory_mapping::get_devptr_symbol_exp(const char * array_name) { - return buildVarRefExp(devptr_symbol[std::string(array_name)]); -} -VarDefs* memory_mapping::get_vardef(const char* vardef_name) { - return this->vardefs[std::string(vardef_name)]; -} -//int texture_memory_mapping::get_dims(const char* array_name) { -// return (int)(dims[std::string(array_name)].size()); -//} -//int texture_memory_mapping::get_dim_length(const char* array_name, int dim) { -// return dims[std::string(array_name)][dim]; -//} -memory_mapping::memory_mapping() { - mem_used = false; -} -texture_memory_mapping::texture_memory_mapping() : memory_mapping() { } -constant_memory_mapping::constant_memory_mapping() : memory_mapping() { } - - diff --git a/mem_mapping_utils.hh b/mem_mapping_utils.hh deleted file mode 100644 index 8ff0545..0000000 --- a/mem_mapping_utils.hh +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef MEM_MAPPING_UTILS_HH -#define MEM_MAPPING_UTILS_HH - -#include -#include -#include -#include "rose.h" - -using namespace SageInterface; -using namespace SageBuilder; - -struct VarDefs; - -class memory_mapping { -private: - bool mem_used; - std::vector< std::string > mapped_array_name; - std::map mapped_symbol; - std::map vardefs; -public: - memory_mapping(); - memory_mapping(bool used, const char* array_name); - void add(const char* array_name); - bool is_mem_used(); - bool is_array_mapped(const char* array_name); - void set_mapped_symbol(const char* array_name, SgVariableSymbol* sym); - void set_vardef(const char* array_name, VarDefs* vardef); - SgVarRefExp* get_mapped_symbol_exp(const char* array_name); - VarDefs* get_vardef(const char* vardef_name); -}; - -//protonu --class introduced to hold texture memory information in one single place -//this might help me get over the weird memory issues I am having with the Loop class -//where someone/something corrupts my memory - -class texture_memory_mapping : public memory_mapping { -private: - std::map devptr_symbol; - // a hack for multi-dimensional texture mapping - //std::map > dims; -public: - texture_memory_mapping ( bool used, const char * array_name); - //texture_memory_mapping (bool used, const char* array_name, int width, int height); - // this function is a hack to get arround a bug - // void add(const char* array_name, int width, int height); - void set_devptr_symbol(const char * array_name, SgVariableSymbol* sym); - SgVarRefExp* get_devptr_symbol_exp(const char * array_name); - //int get_dim_length(const char* array_name, int dim); - //int get_dims(const char* array_name); - texture_memory_mapping(); -}; - -class constant_memory_mapping : public memory_mapping { -public: - constant_memory_mapping(); - constant_memory_mapping(bool used, const char* array_name); -}; - -#endif diff --git a/omega/INSTALL b/omega/INSTALL deleted file mode 100644 index f3c3558..0000000 --- a/omega/INSTALL +++ /dev/null @@ -1,34 +0,0 @@ -BUILD -===== - -0. Install Rose using the rose installation instructions given. - -1. Edit Makefile.config. Change BUILD_CODEGEN to false if you don't want - CodeGen+ library to be built. - -2. Do "make depend". - -3. Optionally, do "make clean" to remove object files or "make veryclean" - to additionally remove target files. - -4. Do "make". - - -INSTALLATION -============ - -You can use Omega+ and CodeGen+ in source directory since all links -are already created in bin/, lib/ and include/ subdirectories. - -omega/ source directory root - bin/ command line interface "oc" - lib/ libraries "libomega.a" and "libcode_gen.a" - include/ - omega.h main Omega+ header file - omega/ Omega+ header files - basic/ basic utility header files - code_gen/ CodeGen+ header files - -You can also do "make install" to copy necessary files into -/usr/local for root account, or use home directory for other accounts. - diff --git a/omega/README b/omega/README deleted file mode 100644 index 378f4bd..0000000 --- a/omega/README +++ /dev/null @@ -1,96 +0,0 @@ -Omega+ and CodeGen+ 2.2 open source release -See LICENSE file for copyright information. - -Omega+ is a mathematical library for manipulating integer linear -constraints over integer variables in first order logic, and -operations on integer sets and their mappings. CodeGen+ is a code -generation library by scanning the points in a union of polytopes. -A command-line interface to libraries is also included. - - -What is new? -============ - -version 2.2: - * Redesigned polyhedra scanning which generates higher quality code - than before especially for complex set of polyhedra. - * New SimpleHull for hull approximation (deprecate Hull). - * Command line editing and history support in calculator. - -version 2.1: - * Updated "effort" parameter's meaning in MMGenerateCode: value n - (n >= 0, default to 1) means that control overheads are removed - from all n-depth innermost loops. - * Enhanced stride handling in the code generation. - * Support code generation for a set of iteration spaces with different - dimensionality. - * New ConvexRepresentation that reduces the number of conjuncts in a union - (deprecate CheckForConvexPairs and CheckForConvexRepresentation). - * Handle floor/ceiling defined variables cleanly in output code. - * Use namespace omega for the library. - * New closure functions contributed by Klimek Tomasz (R^+ and R^@). - -version 2.0: - * Improved internal code generation interface so that it generates both - string and rose ouput now, and more easily extendable for new compiler - intermediate representations. - * Improved gist function so that integer modular constraints are handled - more gracefully. - * Merge duplicate if-conditions in generated code, which might still miss - a few opportunities due to the way AST is constructed. - * Correct output/input variable substitution for non-unimodular - mapping relations. - * Deprecate Omega's assert/Exit interface. - * Some fixing in calculator's parsing and interactive interface. - -version 1.2 (Omega Project): - * Support for code generation with memory mappings, as described in - Tina Shen's MASPLAS '98 paper. This is available in oc via the - tcodegen function; see examples/calc/mm* for examples. - * Use of the compile-time flags -DSTILL_CHECK_MULT=1 -DNDEBUG turns off - all assertions and chechk _except_ some checks for integer overflow - during variable elimination in the omega core. Unless you know a priori - that overflow cannot occur, you should use this instead of just plain - -DNDEBUG when optimizing. - * You can now use "assertUnsatisfiable relation" to cause oc to quit if - "relation" could be satisfiable. This is mainly useful when running oc - in a script. - -version 1.1 (Omega Project): - * An exact convex hull computation. - * An improved system for handling inexact relations, including taking - upper and lower bounds, checking for subsets, and checking tautologies. - * Better handling of existentially quantified variables: we can now - negate and generate code for sets like: - {[i]: 1 <= i <= n && exists (alpha: i <= 10*alpha <= i+k)}. - * An Example operator, that gives a sample solution to set or relation. - -version 0.90 (Omega Project): - * Initial release. - - -DIRECTORIES -=========== - -omega/ - omega_lib/ source files for the Omega+ library - code_gen/ source files for the CodeGen+ library - omega_calc/ source files for the calculator - examples/ script examples using calculator - c_code/ code examples for using libraries - bin/ links to executables: oc - lib/ links to libraries: libomega.a, libcode_gen.a - include/ links to header files - - -DOCUMENTATION AND QUESTIONS -=========================== - -There are only old documents from the Omega Project under doc/ subdirectory -for now. - -Software website: - http://www.chunchen.info/omega - -For questions, bug reports or suggestions, please contact: - mailto:riverofdreams@gmail.com diff --git a/omega/ROSE_INSTALL.txt b/omega/ROSE_INSTALL.txt deleted file mode 100644 index 79e0c43..0000000 --- a/omega/ROSE_INSTALL.txt +++ /dev/null @@ -1,77 +0,0 @@ -INSTALLATION STEPS: - -1) Please install Boost library version <= 1.45.0 using these instruccions - -1. Download BOOST. -Download BOOST at www.boost.org/users/download. - -2. Untar BOOST. -Type tar -zxf BOOST-[VersionNumber].tar.gz to untar the BOOST distribution. - -3. Create a separate install tree. -Type mkdir installTree to create a location for the install. - -4. Run the bootstrap.sh script. -Type ./bootstrap.sh --prefix=[installTree] - -5. Run bjam. -Type ./bjam install --prefix=[installTree] - - -6) set your BOOSTHOME environment variable to where you've installed BOOST. - -7) Download the latest version of rose from the website. - https://outreach.scidac.gov/frs/?group_id=24 - -8) set the JAVA_HOME environment variable in your ${HOME}/.bashrc - eg. export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk - -9) add this to the LD_LIBRARY_PATH environment variable - - LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/i386/server:$LD_LIBRARY_PATH - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${BOOSTHOME}/lib - -10) make a new empty directory separate from the downloaded source directory($ROSE_SRC) for rose. - eg. mkdir ${HOME}/compileTree - -11) set your ROSEHOME environment variable in ${HOME}/.bashrc to ${HOME}/compileTree - -12) run the following command from this ${ROSEHOME} - ${ROSE_SRC}/configure --prefix=${ROSEHOME} --with-boost=${BOOSTHOME} --with-boost-libdir=${BOOSTHOME}/lib -with-haskell=no - -13) run the following command to compile: - make install-core - - -14) Install lua version <= 5.1 (usually not necessary to set the LUAHOME environment variable unless - you installed it in a local directory, in which case set LUAHOME to that directory). Lua is only required for - cuda-chill and not plain chill. - -15) If you are installing for CUDA-CHILL set the CUDACHILL environment variable to true - else false - - -16) Install omega by doing the following commands - i) make clean - ii) make veryclean - iii)make depend - iv) make - -17) Set your OMEGAHOME environment variable to the appropriate directory in ${HOME}/.bashrc - -18) Install cuda-chill by doing the following commands - i) make clean - ii) make veryclean - iii)make depend-cuda-chill - iv) make cuda-chill - - else if you are installing just plain chill - export CUDACHILL=false; (remember to rebuild plain omega as well) - i) make clean - ii) make veryclean - iii)make depend - iv) make - -19) Go to examples/cuda-chill and run ../../cuda-chill mm.lua - -20) If running plain Chill go to examples/chill and run ../../chill gemm.script diff --git a/omega/bin/oc b/omega/bin/oc deleted file mode 120000 index be58273..0000000 --- a/omega/bin/oc +++ /dev/null @@ -1 +0,0 @@ -../omega_calc/obj/oc \ No newline at end of file diff --git a/orig_loop_datacopy.cc b/orig_loop_datacopy.cc deleted file mode 100644 index 04741bc..0000000 --- a/orig_loop_datacopy.cc +++ /dev/null @@ -1,1175 +0,0 @@ -/***************************************************************************** - Copyright (C) 2008 University of Southern California - Copyright (C) 2009-2010 University of Utah - All Rights Reserved. - - Purpose: - Various data copy schemes. - - Notes: - - History: - 02/20/09 Created by Chun Chen by splitting original datacopy from loop.cc -*****************************************************************************/ - -#include -#include -#include "loop.hh" -#include "omegatools.hh" -#include "ir_code.hh" -#include "chill_error.hh" - -using namespace omega; - -// -// data copy function by referring arrays by numbers. -// e.g. A[i] = A[i-1] + B[i] -// parameter array_ref_num=[0,2] means to copy data touched by A[i-1] and A[i] -// -bool Loop::datacopy(const std::vector > > &array_ref_nums, int level, - bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) { - // check for sanity of parameters - std::set same_loop; - for (int i = 0; i < array_ref_nums.size(); i++) { - int stmt_num = array_ref_nums[i].first; - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement number " + to_string(stmt_num)); - if (level <= 0 || level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level)); - if (i == 0) { - std::vector lex = getLexicalOrder(stmt_num); - same_loop = getStatements(lex, 2*level-2); - } - else if (same_loop.find(stmt_num) == same_loop.end()) - throw std::invalid_argument("array references for data copy must be located in the same subloop"); - } - - // convert array reference numbering scheme to actual array references - std::vector > > selected_refs; - for (int i = 0; i < array_ref_nums.size(); i++) { - if (array_ref_nums[i].second.size() == 0) - continue; - - int stmt_num = array_ref_nums[i].first; - selected_refs.push_back(std::make_pair(stmt_num, std::vector())); - std::vector refs = ir->FindArrayRef(stmt[stmt_num].code); - std::vector selected(refs.size(), false); - for (int j = 0; j < array_ref_nums[i].second.size(); j++) { - int ref_num = array_ref_nums[i].second[j]; - if (ref_num < 0 || ref_num >= refs.size()) { - for (int k = 0; k < refs.size(); k++) - delete refs[k]; - throw std::invalid_argument("invalid array reference number " + to_string(ref_num) + " in statement " + to_string(stmt_num)); - } - selected_refs[selected_refs.size()-1].second.push_back(refs[ref_num]); - selected[ref_num] = true; - } - for (int j = 0; j < refs.size(); j++) - if (!selected[j]) - delete refs[j]; - } - if (selected_refs.size() == 0) - throw std::invalid_argument("found no array references to copy"); - - // do the copy - return datacopy_privatized(selected_refs, level, std::vector(), allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type); -} - -// -// data copy function by referring arrays by name. -// e.g. A[i] = A[i-1] + B[i] -// parameter array_name=A means to copy data touched by A[i-1] and A[i] -// -bool Loop::datacopy(int stmt_num, int level, const std::string &array_name, - bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) { - // check for sanity of parameters - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement number " + to_string(stmt_num)); - if (level <= 0 || level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level)); - - // collect array references by name - std::vector lex = getLexicalOrder(stmt_num); - int dim = 2*level - 1; - std::set same_loop = getStatements(lex, dim-1); - - std::vector > > selected_refs; - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - std::vector t; - std::vector refs = ir->FindArrayRef(stmt[*i].code); - for (int j = 0; j < refs.size(); j++) - if (refs[j]->name() == array_name) - t.push_back(refs[j]); - else - delete refs[j]; - if (t.size() != 0) - selected_refs.push_back(std::make_pair(*i, t)); - } - if (selected_refs.size() == 0) - throw std::invalid_argument("found no array references with name " + to_string(array_name) + " to copy"); - - // do the copy - return datacopy_privatized(selected_refs, level, std::vector(), allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type); -} - - -bool Loop::datacopy_privatized(int stmt_num, int level, const std::string &array_name, const std::vector &privatized_levels, - bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) { - // check for sanity of parameters - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement number " + to_string(stmt_num)); - if (level <= 0 || level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level)); - - // collect array references by name - std::vector lex = getLexicalOrder(stmt_num); - int dim = 2*level - 1; - std::set same_loop = getStatements(lex, dim-1); - - std::vector > > selected_refs; - for (std::set::iterator i = same_loop.begin(); i != same_loop.end(); i++) { - selected_refs.push_back(std::make_pair(*i, std::vector())); - - std::vector refs = ir->FindArrayRef(stmt[*i].code); - for (int j = 0; j < refs.size(); j++) - if (refs[j]->name() == array_name) - selected_refs[selected_refs.size()-1].second.push_back(refs[j]); - else - delete refs[j]; - } - if (selected_refs.size() == 0) - throw std::invalid_argument("found no array references with name " + to_string(array_name) + " to copy"); - - // do the copy - return datacopy_privatized(selected_refs, level, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type); -} - - -bool Loop::datacopy_privatized(const std::vector > > &array_ref_nums, int level, const std::vector &privatized_levels, bool allow_extra_read, int fastest_changing_dimension, int padding_stride, int padding_alignment, int memory_type) { - // check for sanity of parameters - std::set same_loop; - for (int i = 0; i < array_ref_nums.size(); i++) { - int stmt_num = array_ref_nums[i].first; - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement number " + to_string(stmt_num)); - if (level <= 0 || level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level)); - if (i == 0) { - std::vector lex = getLexicalOrder(stmt_num); - same_loop = getStatements(lex, 2*level-2); - } - else if (same_loop.find(stmt_num) == same_loop.end()) - throw std::invalid_argument("array references for data copy must be located in the same subloop"); - } - - // convert array reference numbering scheme to actual array references - std::vector > > selected_refs; - for (int i = 0; i < array_ref_nums.size(); i++) { - if (array_ref_nums[i].second.size() == 0) - continue; - - int stmt_num = array_ref_nums[i].first; - selected_refs.push_back(std::make_pair(stmt_num, std::vector())); - std::vector refs = ir->FindArrayRef(stmt[stmt_num].code); - std::vector selected(refs.size(), false); - for (int j = 0; j < array_ref_nums[i].second.size(); j++) { - int ref_num = array_ref_nums[i].second[j]; - if (ref_num < 0 || ref_num >= refs.size()) { - for (int k = 0; k < refs.size(); k++) - delete refs[k]; - throw std::invalid_argument("invalid array reference number " + to_string(ref_num) + " in statement " + to_string(stmt_num)); - } - selected_refs[selected_refs.size()-1].second.push_back(refs[ref_num]); - selected[ref_num] = true; - } - for (int j = 0; j < refs.size(); j++) - if (!selected[j]) - delete refs[j]; - } - if (selected_refs.size() == 0) - throw std::invalid_argument("found no array references to copy"); - - // do the copy - return datacopy_privatized(selected_refs, level, privatized_levels, allow_extra_read, fastest_changing_dimension, padding_stride, padding_alignment, memory_type); -} - - -// -// Implement low level datacopy function with lots of options. -// -bool Loop::datacopy_privatized(const std::vector > > &stmt_refs, int level, - const std::vector &privatized_levels, - bool allow_extra_read, int fastest_changing_dimension, - int padding_stride, int padding_alignment, int memory_type) { - if (stmt_refs.size() == 0) - return true; - - // check for sanity of parameters - IR_ArraySymbol *sym = NULL; - std::vector lex; - std::set active; - if (level <= 0) - throw std::invalid_argument("invalid loop level " + to_string(level)); - for (int i = 0; i < privatized_levels.size(); i++) { - if (i == 0) { - if (privatized_levels[i] < level) - throw std::invalid_argument("privatized loop levels must be no less than level " + to_string(level)); - } - else if (privatized_levels[i] <= privatized_levels[i-1]) - throw std::invalid_argument("privatized loop levels must be in ascending order"); - } - for (int i = 0; i < stmt_refs.size(); i++) { - int stmt_num = stmt_refs[i].first; - active.insert(stmt_num); - if (stmt_num < 0 || stmt_num >= stmt.size()) - throw std::invalid_argument("invalid statement number " + to_string(stmt_num)); - if (privatized_levels.size() != 0) { - if (privatized_levels[privatized_levels.size()-1] > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(privatized_levels[privatized_levels.size()-1]) + " for statement " + to_string(stmt_num)); - } - else { - if (level > stmt[stmt_num].loop_level.size()) - throw std::invalid_argument("invalid loop level " + to_string(level) + " for statement " + to_string(stmt_num)); - } - for (int j = 0; j < stmt_refs[i].second.size(); j++) { - if (sym == NULL) { - sym = stmt_refs[i].second[j]->symbol(); - lex = getLexicalOrder(stmt_num); - } - else { - IR_ArraySymbol *t = stmt_refs[i].second[j]->symbol(); - if (t->name() != sym->name()) { - delete t; - delete sym; - throw std::invalid_argument("try to copy data from different arrays"); - } - delete t; - } - } - } - if (!(fastest_changing_dimension >= -1 && fastest_changing_dimension < sym->n_dim())) - throw std::invalid_argument("invalid fastest changing dimension for the array to be copied"); - if (padding_stride < 0) - throw std::invalid_argument("invalid temporary array stride requirement"); - if (padding_alignment == -1 || padding_alignment == 0) - throw std::invalid_argument("invalid temporary array alignment requirement"); - - int dim = 2*level - 1; - int n_dim = sym->n_dim(); - - if (fastest_changing_dimension == -1) - switch (sym->layout_type()) { - case IR_ARRAY_LAYOUT_ROW_MAJOR: - fastest_changing_dimension = n_dim - 1; - break; - case IR_ARRAY_LAYOUT_COLUMN_MAJOR: - fastest_changing_dimension = 0; - break; - default: - throw loop_error("unsupported array layout"); - } - - - // build iteration spaces for all reads and for all writes separately - apply_xform(active); - bool has_write_refs = false; - bool has_read_refs = false; - Relation wo_copy_is = Relation::False(level-1+privatized_levels.size()+n_dim); - Relation ro_copy_is = Relation::False(level-1+privatized_levels.size()+n_dim); - for (int i = 0; i < stmt_refs.size(); i++) { - int stmt_num = stmt_refs[i].first; - - for (int j = 0; j < stmt_refs[i].second.size(); j++) { - Relation mapping(stmt[stmt_num].IS.n_set(), level-1+privatized_levels.size()+n_dim); - for (int k = 1; k <= mapping.n_inp(); k++) - mapping.name_input_var(k, stmt[stmt_num].IS.set_var(k)->name()); - mapping.setup_names(); - F_And *f_root = mapping.add_and(); - for (int k = 1; k <= level-1; k++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.input_var(k), 1); - h.update_coef(mapping.output_var(k), -1); - } - for (int k = 0; k < privatized_levels.size(); k++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.input_var(privatized_levels[k]), 1); - h.update_coef(mapping.output_var(level+k), -1); - } - for (int k = 0; k < n_dim; k++) { - CG_outputRepr *repr = stmt_refs[i].second[j]->index(k); - exp2formula(ir, mapping, f_root, freevar, repr, mapping.output_var(level-1+privatized_levels.size()+k+1), 'w', IR_COND_EQ, false); - repr->clear(); - delete repr; - } - Relation r = Range(Restrict_Domain(mapping, Intersection(copy(stmt[stmt_num].IS), Extend_Set(copy(this->known), stmt[stmt_num].IS.n_set() - this->known.n_set())))); - if (stmt_refs[i].second[j]->is_write()) { - has_write_refs = true; - wo_copy_is = Union(wo_copy_is, r); - wo_copy_is.simplify(2, 4); - } - else { - has_read_refs = true; - //protonu--removing the next line for now - ro_copy_is = Union(ro_copy_is, r); - ro_copy_is.simplify(2, 4); - //ro_copy_is = ConvexRepresentation(Union(ro_copy_is, r)); - - } - } - } - - if (allow_extra_read) { - Relation t = DecoupledConvexHull(copy(ro_copy_is)); - if (t.number_of_conjuncts() > 1) - ro_copy_is = RectHull(ro_copy_is); - else - ro_copy_is = t; - } - else { - Relation t = ConvexRepresentation(copy(ro_copy_is)); - if (t.number_of_conjuncts() > 1) - ro_copy_is = RectHull(ro_copy_is); - else - ro_copy_is = t; - } - wo_copy_is = ConvexRepresentation(wo_copy_is); - - if (allow_extra_read) { - Tuple Rs; - Tuple active; - for (DNF_Iterator di(ro_copy_is.query_DNF()); di; di++) { - Rs.append(Relation(ro_copy_is, di.curr())); - active.append(1); - } - Relation the_gcs = Relation::True(ro_copy_is.n_set()); - for (int i = level-1+privatized_levels.size()+1; i <= level-1+privatized_levels.size()+n_dim; i++) { - Relation r = greatest_common_step(Rs, active, i, Relation::Null()); - the_gcs = Intersection(the_gcs, r); - } - - ro_copy_is = Approximate(ro_copy_is); - ro_copy_is = ConvexRepresentation(ro_copy_is); - ro_copy_is = Intersection(ro_copy_is, the_gcs); - ro_copy_is.simplify(); - } - for (int i = 1; i <= level-1+privatized_levels.size()+n_dim; i++) { - wo_copy_is.name_set_var(i, tmp_loop_var_name_prefix+to_string(i)); - ro_copy_is.name_set_var(i, tmp_loop_var_name_prefix+to_string(i)); - } - wo_copy_is.setup_names(); - ro_copy_is.setup_names(); - - // build merged iteration space for calculating temporary array size - bool already_use_recthull = false; - Relation untampered_copy_is = ConvexRepresentation(Union(copy(wo_copy_is), copy(ro_copy_is))); - Relation copy_is = untampered_copy_is; - if (copy_is.number_of_conjuncts() > 1) { - try { - copy_is = ConvexHull(copy(untampered_copy_is)); - } - catch (const std::overflow_error &e) { - copy_is = RectHull(copy(untampered_copy_is)); - already_use_recthull = true; - } - } - - -Retry_copy_is: - // extract temporary array information - CG_outputBuilder *ocg = ir->builder(); - std::vector index_lb(n_dim); // initialized to NULL - std::vector index_stride(n_dim, 1); - std::vector is_index_eq(n_dim, false); - std::vector > index_sz(0); - Relation reduced_copy_is = copy(copy_is); - - for (int i = 0; i < n_dim; i++) { - if (i != 0) - reduced_copy_is = Project(reduced_copy_is, level-1+privatized_levels.size()+i, Set_Var); - Relation bound = get_loop_bound(reduced_copy_is, level-1+privatized_levels.size()+i); - - // extract stride - EQ_Handle stride_eq; - { - bool simple_stride = true; - int strides = countStrides(bound.query_DNF()->single_conjunct(), bound.set_var(level-1+privatized_levels.size()+i+1), stride_eq, simple_stride); - if (strides > 1) { - throw loop_error("too many strides"); - } - else if (strides == 1) { - int sign = stride_eq.get_coef(bound.set_var(level-1+privatized_levels.size()+i+1)); - Constr_Vars_Iter it(stride_eq, true); - index_stride[i] = abs((*it).coef/sign); - } - } - - // check if this arary index requires loop - Conjunct *c = bound.query_DNF()->single_conjunct(); - for (EQ_Iterator ei(c->EQs()); ei; ei++) { - if ((*ei).has_wildcards()) - continue; - - int coef = (*ei).get_coef(bound.set_var(level-1+privatized_levels.size()+i+1)); - if (coef != 0) { - int sign = 1; - if (coef < 0) { - coef = -coef; - sign = -1; - } - - CG_outputRepr *op = NULL; - for (Constr_Vars_Iter ci(*ei); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: - { - if ((*ci).var != bound.set_var(level-1+privatized_levels.size()+i+1)) - if ((*ci).coef*sign == 1) - op = ocg->CreateMinus(op, ocg->CreateIdent((*ci).var->name())); - else if ((*ci).coef*sign == -1) - op = ocg->CreatePlus(op, ocg->CreateIdent((*ci).var->name())); - else if ((*ci).coef*sign > 1) - op = ocg->CreateMinus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent((*ci).var->name()))); - else // (*ci).coef*sign < -1 - op = ocg->CreatePlus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent((*ci).var->name()))); - break; - } - case Global_Var: - { - Global_Var_ID g = (*ci).var->get_global_var(); - if ((*ci).coef*sign == 1) - op = ocg->CreateMinus(op, ocg->CreateIdent(g->base_name())); - else if ((*ci).coef*sign == -1) - op = ocg->CreatePlus(op, ocg->CreateIdent(g->base_name())); - else if ((*ci).coef*sign > 1) - op = ocg->CreateMinus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent(g->base_name()))); - else // (*ci).coef*sign < -1 - op = ocg->CreatePlus(op, ocg->CreateTimes(ocg->CreateInt(abs((*ci).coef)), ocg->CreateIdent(g->base_name()))); - break; - } - default: - throw loop_error("unsupported array index expression"); - } - } - if ((*ei).get_const() != 0) - op = ocg->CreatePlus(op, ocg->CreateInt(-sign*((*ei).get_const()))); - if (coef != 1) - op = ocg->CreateIntegerDivide(op, ocg->CreateInt(coef)); - - index_lb[i] = op; - is_index_eq[i] = true; - break; - } - } - if (is_index_eq[i]) - continue; - - // seperate lower and upper bounds - std::vector lb_list, ub_list; - for (GEQ_Iterator gi(c->GEQs()); gi; gi++) { - int coef = (*gi).get_coef(bound.set_var(level-1+privatized_levels.size()+i+1)); - if (coef != 0 && (*gi).has_wildcards()) { - bool clean_bound = true; - GEQ_Handle h; - for (Constr_Vars_Iter cvi(*gi, true); gi; gi++) - if (!findFloorInequality(bound, (*cvi).var, h, bound.set_var(level-1+privatized_levels.size()+i+1))) { - clean_bound = false; - break; - } - if (!clean_bound) - continue; - } - - if (coef > 0) - lb_list.push_back(*gi); - else if (coef < 0) - ub_list.push_back(*gi); - } - if (lb_list.size() == 0 || ub_list.size() == 0) - if (already_use_recthull) - throw loop_error("failed to calcuate array footprint size"); - else { - copy_is = RectHull(copy(untampered_copy_is)); - already_use_recthull = true; - goto Retry_copy_is; - } - - // build lower bound representation - Tuple lb_repr_list; - for (int j = 0; j < lb_list.size(); j++) - lb_repr_list.append(outputLBasRepr(ocg, lb_list[j], bound, - bound.set_var(level-1+privatized_levels.size()+i+1), - index_stride[i], stride_eq, Relation::True(bound.n_set()), - std::vector(bound.n_set()))); - - if (lb_repr_list.size() > 1) - index_lb[i] = ocg->CreateInvoke("max", lb_repr_list); - else if (lb_repr_list.size() == 1) - index_lb[i] = lb_repr_list[1]; - - // build temporary array size representation - { - Relation cal(copy_is.n_set(), 1); - F_And *f_root = cal.add_and(); - for (int j = 0; j < ub_list.size(); j++) - for (int k = 0; k < lb_list.size(); k++) { - GEQ_Handle h = f_root->add_GEQ(); - - for (Constr_Vars_Iter ci(ub_list[j]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: - { - int pos = (*ci).var->get_position(); - h.update_coef(cal.input_var(pos), (*ci).coef); - break; - } - case Global_Var: - { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = cal.get_local(g); - else - v = cal.get_local(g, (*ci).var->function_of()); - h.update_coef(v, (*ci).coef); - break; - } - default: - throw loop_error("cannot calculate temporay array size statically"); - } - } - h.update_const(ub_list[j].get_const()); - - for (Constr_Vars_Iter ci(lb_list[k]); ci; ci++) { - switch ((*ci).var->kind()) { - case Input_Var: - { - int pos = (*ci).var->get_position(); - h.update_coef(cal.input_var(pos), (*ci).coef); - break; - } - case Global_Var: - { - Global_Var_ID g = (*ci).var->get_global_var(); - Variable_ID v; - if (g->arity() == 0) - v = cal.get_local(g); - else - v = cal.get_local(g, (*ci).var->function_of()); - h.update_coef(v, (*ci).coef); - break; - } - default: - throw loop_error("cannot calculate temporay array size statically"); - } - } - h.update_const(lb_list[k].get_const()); - - h.update_const(1); - h.update_coef(cal.output_var(1), -1); - } - - cal = Restrict_Domain(cal, copy(copy_is)); - for (int j = 1; j <= cal.n_inp(); j++) - cal = Project(cal, j, Input_Var); - cal.simplify(); - - // pad temporary array size - // TODO: for variable array size, create padding formula - Conjunct *c = cal.query_DNF()->single_conjunct(); - bool is_index_bound_const = false; - for (GEQ_Iterator gi(c->GEQs()); gi && !is_index_bound_const; gi++) - if ((*gi).is_const(cal.output_var(1))) { - coef_t size = (*gi).get_const() / (-(*gi).get_coef(cal.output_var(1))); - if (padding_stride != 0) { - size = (size + index_stride[i] - 1) / index_stride[i]; - if (i == fastest_changing_dimension) - size = size * padding_stride; - } - if (i == fastest_changing_dimension) { - if (padding_alignment > 1) { // align to boundary for data packing - int residue = size % padding_alignment; - if (residue) - size = size+padding_alignment-residue; - } - else if (padding_alignment < -1) { // un-alignment for memory bank conflicts - while (gcd(size, static_cast(-padding_alignment)) != 1) - size++; - } - } - index_sz.push_back(std::make_pair(i, ocg->CreateInt(size))); - is_index_bound_const = true; - } - - if (!is_index_bound_const) { - for (GEQ_Iterator gi(c->GEQs()); gi && !is_index_bound_const; gi++) { - int coef = (*gi).get_coef(cal.output_var(1)); - if (coef < 0) { - CG_outputRepr *op = NULL; - for (Constr_Vars_Iter ci(*gi); ci; ci++) { - if ((*ci).var != cal.output_var(1)) { - switch((*ci).var->kind()) { - case Global_Var: - { - Global_Var_ID g = (*ci).var->get_global_var(); - if ((*ci).coef == 1) - op = ocg->CreatePlus(op, ocg->CreateIdent(g->base_name())); - else if ((*ci).coef == -1) - op = ocg->CreateMinus(op, ocg->CreateIdent(g->base_name())); - else if ((*ci).coef > 1) - op = ocg->CreatePlus(op, ocg->CreateTimes(ocg->CreateInt((*ci).coef), ocg->CreateIdent(g->base_name()))); - else // (*ci).coef < -1 - op = ocg->CreateMinus(op, ocg->CreateTimes(ocg->CreateInt(-(*ci).coef), ocg->CreateIdent(g->base_name()))); - break; - } - default: - throw loop_error("failed to generate array index bound code"); - } - } - } - int c = (*gi).get_const(); - if (c > 0) - op = ocg->CreatePlus(op, ocg->CreateInt(c)); - else if (c < 0) - op = ocg->CreateMinus(op, ocg->CreateInt(-c)); - if (padding_stride != 0) { - if (i == fastest_changing_dimension) { - coef_t g = gcd(index_stride[i], static_cast(padding_stride)); - coef_t t1 = index_stride[i] / g; - if (t1 != 1) - op = ocg->CreateIntegerDivide(ocg->CreatePlus(op, ocg->CreateInt(t1-1)), ocg->CreateInt(t1)); - coef_t t2 = padding_stride / g; - if (t2 != 1) - op = ocg->CreateTimes(op, ocg->CreateInt(t2)); - } - else if (index_stride[i] != 1) { - op = ocg->CreateIntegerDivide(ocg->CreatePlus(op, ocg->CreateInt(index_stride[i]-1)), ocg->CreateInt(index_stride[i])); - } - } - - index_sz.push_back(std::make_pair(i, op)); - break; - } - } - } - } - } - - // change the temporary array index order - for (int i = 0; i < index_sz.size(); i++) - if (index_sz[i].first == fastest_changing_dimension) - switch (sym->layout_type()) { - case IR_ARRAY_LAYOUT_ROW_MAJOR: - std::swap(index_sz[index_sz.size()-1], index_sz[i]); - break; - case IR_ARRAY_LAYOUT_COLUMN_MAJOR: - std::swap(index_sz[0], index_sz[i]); - break; - default: - throw loop_error("unsupported array layout"); - } - - // declare temporary array or scalar - IR_Symbol *tmp_sym; - if (index_sz.size() == 0) { - tmp_sym = ir->CreateScalarSymbol(sym, memory_type); - } - else { - std::vector tmp_array_size(index_sz.size()); - for (int i = 0; i < index_sz.size(); i++) - tmp_array_size[i] = index_sz[i].second->clone(); - tmp_sym = ir->CreateArraySymbol(sym, tmp_array_size, memory_type); - } - - // create temporary array read initialization code - CG_outputRepr *copy_code_read; - if (has_read_refs) - if (index_sz.size() == 0) { - IR_ScalarRef *tmp_scalar_ref = ir->CreateScalarRef(static_cast(tmp_sym)); - - std::vector rhs_index(n_dim); - for (int i = 0; i < index_lb.size(); i++) - if (is_index_eq[i]) - rhs_index[i] = index_lb[i]->clone(); - else - rhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name()); - IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, rhs_index); - - copy_code_read = ir->builder()->CreateAssignment(0, tmp_scalar_ref->convert(), copied_array_ref->convert()); - } - else { - std::vector lhs_index(index_sz.size()); - for (int i = 0; i < index_sz.size(); i++) { - int cur_index_num = index_sz[i].first; - CG_outputRepr *cur_index_repr = ocg->CreateMinus(ocg->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+cur_index_num+1)->name()), index_lb[cur_index_num]->clone()); - if (padding_stride != 0) { - if (i == n_dim-1) { - coef_t g = gcd(index_stride[cur_index_num], static_cast(padding_stride)); - coef_t t1 = index_stride[cur_index_num] / g; - if (t1 != 1) - cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(t1)); - coef_t t2 = padding_stride / g; - if (t2 != 1) - cur_index_repr = ocg->CreateTimes(cur_index_repr, ocg->CreateInt(t2)); - } - else if (index_stride[cur_index_num] != 1) { - cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(index_stride[cur_index_num])); - } - } - - if (ir->ArrayIndexStartAt() != 0) - cur_index_repr = ocg->CreatePlus(cur_index_repr, ocg->CreateInt(ir->ArrayIndexStartAt())); - lhs_index[i] = cur_index_repr; - } - - IR_ArrayRef *tmp_array_ref = ir->CreateArrayRef(static_cast(tmp_sym), lhs_index); - - std::vector rhs_index(n_dim); - for (int i = 0; i < index_lb.size(); i++) - if (is_index_eq[i]) - rhs_index[i] = index_lb[i]->clone(); - else - rhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name()); - IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, rhs_index); - - copy_code_read = ir->builder()->CreateAssignment(0, tmp_array_ref->convert(), copied_array_ref->convert()); - } - - // create temporary array write back code - CG_outputRepr *copy_code_write; - if (has_write_refs) - if (index_sz.size() == 0) { - IR_ScalarRef *tmp_scalar_ref = ir->CreateScalarRef(static_cast(tmp_sym)); - - std::vector rhs_index(n_dim); - for (int i = 0; i < index_lb.size(); i++) - if (is_index_eq[i]) - rhs_index[i] = index_lb[i]->clone(); - else - rhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name()); - IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, rhs_index); - - copy_code_write = ir->builder()->CreateAssignment(0, copied_array_ref->convert(), tmp_scalar_ref->convert()); - } - else { - std::vector lhs_index(n_dim); - for (int i = 0; i < index_lb.size(); i++) - if (is_index_eq[i]) - lhs_index[i] = index_lb[i]->clone(); - else - lhs_index[i] = ir->builder()->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+i+1)->name()); - IR_ArrayRef *copied_array_ref = ir->CreateArrayRef(sym, lhs_index); - - std::vector rhs_index(index_sz.size()); - for (int i = 0; i < index_sz.size(); i++) { - int cur_index_num = index_sz[i].first; - CG_outputRepr *cur_index_repr = ocg->CreateMinus(ocg->CreateIdent(copy_is.set_var(level-1+privatized_levels.size()+cur_index_num+1)->name()), index_lb[cur_index_num]->clone()); - if (padding_stride != 0) { - if (i == n_dim-1) { - coef_t g = gcd(index_stride[cur_index_num], static_cast(padding_stride)); - coef_t t1 = index_stride[cur_index_num] / g; - if (t1 != 1) - cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(t1)); - coef_t t2 = padding_stride / g; - if (t2 != 1) - cur_index_repr = ocg->CreateTimes(cur_index_repr, ocg->CreateInt(t2)); - } - else if (index_stride[cur_index_num] != 1) { - cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(index_stride[cur_index_num])); - } - } - - if (ir->ArrayIndexStartAt() != 0) - cur_index_repr = ocg->CreatePlus(cur_index_repr, ocg->CreateInt(ir->ArrayIndexStartAt())); - rhs_index[i] = cur_index_repr; - } - IR_ArrayRef *tmp_array_ref = ir->CreateArrayRef(static_cast(tmp_sym), rhs_index); - - copy_code_write = ir->builder()->CreateAssignment(0, copied_array_ref->convert(), tmp_array_ref->convert()); - } - - // now we can remove those loops for array indexes that are - // dependent on others - if (!(index_sz.size() == n_dim && (sym->layout_type() == IR_ARRAY_LAYOUT_ROW_MAJOR || n_dim <= 1))) { - Relation mapping(level-1+privatized_levels.size()+n_dim, level-1+privatized_levels.size()+index_sz.size()); - F_And *f_root = mapping.add_and(); - for (int i = 1; i <= level-1+privatized_levels.size(); i++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.input_var(i), 1); - h.update_coef(mapping.output_var(i), -1); - } - - int cur_index = 0; - std::vector mapped_index(index_sz.size()); - for (int i = 0; i < n_dim; i++) - if (!is_index_eq[i]) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(mapping.input_var(level-1+privatized_levels.size()+i+1), 1); - switch (sym->layout_type()) { - case IR_ARRAY_LAYOUT_COLUMN_MAJOR: { - h.update_coef(mapping.output_var(level-1+privatized_levels.size()+index_sz.size()-cur_index), -1); - mapped_index[index_sz.size()-cur_index-1] = i; - break; - } - case IR_ARRAY_LAYOUT_ROW_MAJOR: { - h.update_coef(mapping.output_var(level-1+privatized_levels.size()+cur_index+1), -1); - mapped_index[cur_index] = i; - break; - } - default: - throw loop_error("unsupported array layout"); - } - cur_index++; - } - - wo_copy_is = Range(Restrict_Domain(copy(mapping), wo_copy_is)); - ro_copy_is = Range(Restrict_Domain(copy(mapping), ro_copy_is)); - for (int i = 1; i <= level-1+privatized_levels.size(); i++) { - wo_copy_is.name_set_var(i, copy_is.set_var(i)->name()); - ro_copy_is.name_set_var(i, copy_is.set_var(i)->name()); - } - for (int i = 0; i < index_sz.size(); i++) { - wo_copy_is.name_set_var(level-1+privatized_levels.size()+i+1, copy_is.set_var(level-1+privatized_levels.size()+mapped_index[i]+1)->name()); - ro_copy_is.name_set_var(level-1+privatized_levels.size()+i+1, copy_is.set_var(level-1+privatized_levels.size()+mapped_index[i]+1)->name()); - } - wo_copy_is.setup_names(); - ro_copy_is.setup_names(); - } - - // insert read copy statement - int old_num_stmt = stmt.size(); - int ro_copy_stmt_num = -1; - if (has_read_refs) { - Relation copy_xform(ro_copy_is.n_set(), 2*ro_copy_is.n_set()+1); - { - F_And *f_root = copy_xform.add_and(); - for (int i = 1; i <= ro_copy_is.n_set(); i++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(copy_xform.input_var(i), 1); - h.update_coef(copy_xform.output_var(2*i), -1); - } - for (int i = 1; i <= dim; i+=2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(copy_xform.output_var(i), -1); - h.update_const(lex[i-1]); - } - for (int i = dim+2; i <= copy_xform.n_out(); i+=2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(copy_xform.output_var(i), 1); - } - } - - Statement copy_stmt_read; - copy_stmt_read.IS = ro_copy_is; - copy_stmt_read.xform = copy_xform; - copy_stmt_read.code = copy_code_read; - copy_stmt_read.loop_level = std::vector(ro_copy_is.n_set()); - for (int i = 0; i < level-1; i++) { - copy_stmt_read.loop_level[i].type = stmt[*(active.begin())].loop_level[i].type; - if (stmt[*(active.begin())].loop_level[i].type == LoopLevelTile && - stmt[*(active.begin())].loop_level[i].payload >= level) { - int j; - for (j = 0; j < privatized_levels.size(); j++) - if (privatized_levels[j] == stmt[*(active.begin())].loop_level[i].payload) - break; - if (j == privatized_levels.size()) - copy_stmt_read.loop_level[i].payload = -1; - else - copy_stmt_read.loop_level[i].payload = level + j; - } - else - copy_stmt_read.loop_level[i].payload = stmt[*(active.begin())].loop_level[i].payload; - copy_stmt_read.loop_level[i].parallel_level = stmt[*(active.begin())].loop_level[i].parallel_level; - } - for (int i = 0; i < privatized_levels.size(); i++) { - copy_stmt_read.loop_level[level-1+i].type = stmt[*(active.begin())].loop_level[privatized_levels[i]].type; - copy_stmt_read.loop_level[level-1+i].payload = stmt[*(active.begin())].loop_level[privatized_levels[i]].payload; - copy_stmt_read.loop_level[level-1+i].parallel_level = stmt[*(active.begin())].loop_level[privatized_levels[i]].parallel_level; - } - int left_num_dim = num_dep_dim - (get_last_dep_dim_before(*(active.begin()), level) + 1); - for (int i = 0; i < min(left_num_dim, static_cast(index_sz.size())); i++) { - copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelOriginal; - copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].payload = num_dep_dim-left_num_dim+i; - copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0; - } - for (int i = min(left_num_dim, static_cast(index_sz.size())); i < index_sz.size(); i++) { - copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelUnknown; - copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].payload = -1; - copy_stmt_read.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0; - } - - shiftLexicalOrder(lex, dim-1, 1); - stmt.push_back(copy_stmt_read); - ro_copy_stmt_num = stmt.size() - 1; - dep.insert(); - } - - // insert write copy statement - int wo_copy_stmt_num = -1; - if (has_write_refs) { - Relation copy_xform(wo_copy_is.n_set(), 2*wo_copy_is.n_set()+1); - { - F_And *f_root = copy_xform.add_and(); - for (int i = 1; i <= wo_copy_is.n_set(); i++) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(copy_xform.input_var(i), 1); - h.update_coef(copy_xform.output_var(2*i), -1); - } - for (int i = 1; i <= dim; i+=2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(copy_xform.output_var(i), -1); - h.update_const(lex[i-1]); - } - for (int i = dim+2; i <= copy_xform.n_out(); i+=2) { - EQ_Handle h = f_root->add_EQ(); - h.update_coef(copy_xform.output_var(i), 1); - } - } - - Statement copy_stmt_write; - copy_stmt_write.IS = wo_copy_is; - copy_stmt_write.xform = copy_xform; - copy_stmt_write.code = copy_code_write; - copy_stmt_write.loop_level = std::vector(wo_copy_is.n_set()); - for (int i = 0; i < level-1; i++) { - copy_stmt_write.loop_level[i].type = stmt[*(active.begin())].loop_level[i].type; - if (stmt[*(active.begin())].loop_level[i].type == LoopLevelTile && - stmt[*(active.begin())].loop_level[i].payload >= level) { - int j; - for (j = 0; j < privatized_levels.size(); j++) - if (privatized_levels[j] == stmt[*(active.begin())].loop_level[i].payload) - break; - if (j == privatized_levels.size()) - copy_stmt_write.loop_level[i].payload = -1; - else - copy_stmt_write.loop_level[i].payload = level + j; - } - else - copy_stmt_write.loop_level[i].payload = stmt[*(active.begin())].loop_level[i].payload; - copy_stmt_write.loop_level[i].parallel_level = stmt[*(active.begin())].loop_level[i].parallel_level; - } - for (int i = 0; i < privatized_levels.size(); i++) { - copy_stmt_write.loop_level[level-1+i].type = stmt[*(active.begin())].loop_level[privatized_levels[i]].type; - copy_stmt_write.loop_level[level-1+i].payload = stmt[*(active.begin())].loop_level[privatized_levels[i]].payload; - copy_stmt_write.loop_level[level-1+i].parallel_level = stmt[*(active.begin())].loop_level[privatized_levels[i]].parallel_level; - } - int left_num_dim = num_dep_dim - (get_last_dep_dim_before(*(active.begin()), level) + 1); - for (int i = 0; i < min(left_num_dim, static_cast(index_sz.size())); i++) { - copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelOriginal; - copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].payload = num_dep_dim-left_num_dim+i; - copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0; - } - for (int i = min(left_num_dim, static_cast(index_sz.size())); i < index_sz.size(); i++) { - copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].type = LoopLevelUnknown; - copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].payload = -1; - copy_stmt_write.loop_level[level-1+privatized_levels.size()+i].parallel_level = 0; - } - - lex[dim-1]++; - shiftLexicalOrder(lex, dim-1, -2); - stmt.push_back(copy_stmt_write); - wo_copy_stmt_num = stmt.size() - 1; - dep.insert(); - } - - // replace original array accesses with temporary array accesses - for (int i =0; i < stmt_refs.size(); i++) - for (int j = 0; j < stmt_refs[i].second.size(); j++) { - if (index_sz.size() == 0) { - IR_ScalarRef *tmp_scalar_ref = ir->CreateScalarRef(static_cast(tmp_sym)); - ir->ReplaceExpression(stmt_refs[i].second[j], tmp_scalar_ref->convert()); - } - else { - std::vector index_repr(index_sz.size()); - for (int k = 0; k < index_sz.size(); k++) { - int cur_index_num = index_sz[k].first; - - CG_outputRepr *cur_index_repr = ocg->CreateMinus(stmt_refs[i].second[j]->index(cur_index_num), index_lb[cur_index_num]->clone()); - if (padding_stride != 0) { - if (k == n_dim-1) { - coef_t g = gcd(index_stride[cur_index_num], static_cast(padding_stride)); - coef_t t1 = index_stride[cur_index_num] / g; - if (t1 != 1) - cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(t1)); - coef_t t2 = padding_stride / g; - if (t2 != 1) - cur_index_repr = ocg->CreateTimes(cur_index_repr, ocg->CreateInt(t2)); - } - else if (index_stride[cur_index_num] != 1) { - cur_index_repr = ocg->CreateIntegerDivide(cur_index_repr, ocg->CreateInt(index_stride[cur_index_num])); - } - } - - if (ir->ArrayIndexStartAt() != 0) - cur_index_repr = ocg->CreatePlus(cur_index_repr, ocg->CreateInt(ir->ArrayIndexStartAt())); - index_repr[k] = cur_index_repr; - } - - IR_ArrayRef *tmp_array_ref = ir->CreateArrayRef(static_cast(tmp_sym), index_repr); - ir->ReplaceExpression(stmt_refs[i].second[j], tmp_array_ref->convert()); - } - } - - // update dependence graph - int dep_dim = get_last_dep_dim_before(*(active.begin()), level) + 1; - if (ro_copy_stmt_num != -1) { - for (int i = 0; i < old_num_stmt; i++) { - std::vector > D; - - for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();) { - if (active.find(i) != active.end() && active.find(j->first) == active.end()) { - std::vector dvs1, dvs2; - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_R2R || dv.type == DEP_R2W)) - dvs1.push_back(dv); - else - dvs2.push_back(dv); - } - j->second = dvs2; - if (dvs1.size() > 0) - dep.connect(ro_copy_stmt_num, j->first, dvs1); - } - else if (active.find(i) == active.end() && active.find(j->first) != active.end()) { - std::vector dvs1, dvs2; - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_R2R || dv.type == DEP_W2R)) - dvs1.push_back(dv); - else - dvs2.push_back(dv); - } - j->second = dvs2; - if (dvs1.size() > 0) - D.push_back(dvs1); - } - - if (j->second.size() == 0) - dep.vertex[i].second.erase(j++); - else - j++; - } - - for (int j = 0; j < D.size(); j++) - dep.connect(i, ro_copy_stmt_num, D[j]); - } - - // insert dependences from copy statement loop to copied statements - DependenceVector dv; - dv.type = DEP_W2R; - dv.sym = tmp_sym->clone(); - dv.lbounds = std::vector(num_dep_dim, 0); - dv.ubounds = std::vector(num_dep_dim, 0); - for (int i = dep_dim; i < num_dep_dim; i++) { - dv.lbounds[i] = -posInfinity; - dv.ubounds[i] = posInfinity; - } - for (std::set::iterator i = active.begin(); i != active.end(); i++) - dep.connect(ro_copy_stmt_num, *i, dv); - } - - if (wo_copy_stmt_num != -1) { - for (int i = 0; i < old_num_stmt; i++) { - std::vector > D; - - for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end();) { - if (active.find(i) != active.end() && active.find(j->first) == active.end()) { - std::vector dvs1, dvs2; - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_W2R || dv.type == DEP_W2W)) - dvs1.push_back(dv); - else - dvs2.push_back(dv); - } - j->second = dvs2; - if (dvs1.size() > 0) - dep.connect(wo_copy_stmt_num, j->first, dvs1); - } - else if (active.find(i) == active.end() && active.find(j->first) != active.end()) { - std::vector dvs1, dvs2; - for (int k = 0; k < j->second.size(); k++) { - DependenceVector dv = j->second[k]; - if (dv.sym != NULL && dv.sym->name() == sym->name() && (dv.type == DEP_R2W || dv.type == DEP_W2W)) - dvs1.push_back(dv); - else - dvs2.push_back(dv); - } - j->second = dvs2; - if (dvs1.size() > 0) - D.push_back(dvs1); - } - - if (j->second.size() == 0) - dep.vertex[i].second.erase(j++); - else - j++; - } - - for (int j = 0; j < D.size(); j++) - dep.connect(i, wo_copy_stmt_num, D[j]); - } - - // insert dependences from copied statements to write statements - DependenceVector dv; - dv.type = DEP_W2R; - dv.sym = tmp_sym->clone(); - dv.lbounds = std::vector(num_dep_dim, 0); - dv.ubounds = std::vector(num_dep_dim, 0); - for (int i = dep_dim; i < num_dep_dim; i++) { - dv.lbounds[i] = -posInfinity; - dv.ubounds[i] = posInfinity; - } - for (std::set::iterator i = active.begin(); i != active.end(); i++) - dep.connect(*i, wo_copy_stmt_num, dv); - - } - - // update variable name for dependences among copied statements - for (int i = 0; i < old_num_stmt; i++) { - if (active.find(i) != active.end()) - for (DependenceGraph::EdgeList::iterator j = dep.vertex[i].second.begin(); j != dep.vertex[i].second.end(); j++) - if (active.find(j->first) != active.end()) - for (int k = 0; k < j->second.size(); k++) { - IR_Symbol *s = tmp_sym->clone(); - j->second[k].sym = s; - } - } - - // insert anti-dependence from write statement to read statement - if (ro_copy_stmt_num != -1 && wo_copy_stmt_num != -1) - if (dep_dim >= 0) { - DependenceVector dv; - dv.type = DEP_R2W; - dv.sym = tmp_sym->clone(); - dv.lbounds = std::vector(num_dep_dim, 0); - dv.ubounds = std::vector(num_dep_dim, 0); - for (int k = dep_dim; k < num_dep_dim; k++) { - dv.lbounds[k] = -posInfinity; - dv.ubounds[k] = posInfinity; - } - for (int k = 0; k < dep_dim; k++) { - if (k != 0) { - dv.lbounds[k-1] = 0; - dv.ubounds[k-1] = 0; - } - dv.lbounds[k] = 1; - dv.ubounds[k] = posInfinity; - dep.connect(wo_copy_stmt_num, ro_copy_stmt_num, dv); - } - } - - - // cleanup - delete sym; - delete tmp_sym; - for (int i = 0; i < index_lb.size(); i++) { - index_lb[i]->clear(); - delete index_lb[i]; - } - for (int i = 0; i < index_sz.size(); i++) { - index_sz[i].second->clear(); - delete index_sz[i].second; - } - - return true; -} -- cgit v1.2.3-70-g09d2