From bff810cc371a38f493d688c54f71013f5a7d53bf Mon Sep 17 00:00:00 2001 From: dhuth Date: Wed, 27 Aug 2014 09:52:06 -0600 Subject: Initial commit --- loop_cuda_rose.hh | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 loop_cuda_rose.hh (limited to 'loop_cuda_rose.hh') diff --git a/loop_cuda_rose.hh b/loop_cuda_rose.hh new file mode 100644 index 0000000..5260035 --- /dev/null +++ b/loop_cuda_rose.hh @@ -0,0 +1,132 @@ +#ifndef LOOP_CUDA_HH +#define LOOP_CUDA_HH + +#include "loop.hh" +#include "mem_mapping_utils.hh" +#include +#include "rose.h" +//#include + +using namespace omega; +using namespace SageBuilder; +enum MemoryMode { GlobalMem, SharedMem, TexMem }; + +struct VarDefs { + std::string name; + std::string secondName; + SgExpression* size_expr; //array size as an expression (can be a product of other variables etc) + SgType* type; + SgVariableSymbol* in_data; //Variable of array to copy data in from (before kernel call) + SgVariableSymbol* out_data; //Variable of array to copy data out to (after kernel call) + std::vector size_multi_dim; //-1 if linearized, the constant size N, of a NxN 2D array otherwise + bool tex_mapped; //protonu-- true if this variable will be texture mapped, so no need to pass it as a argument + bool cons_mapped; + std::string original_name; //this is such a hack, to store the original name, to store a table to textures used +}; + + +class LoopCuda: public Loop{ + +public: + //std::vector new_procs; //Need adding to a fse + std::vector< std::vector > idxNames; + std::vector< std::pair > syncs; + bool useIdxNames; + std::vector index; + + SgSymbolTable* symtab; + SgSymbolTable* parameter_symtab; + SgSymbolTable* body_symtab; + SgGlobal* globals; + SgGlobal* globalScope; + SgScopeStatement* func_body; + SgFunctionDefinition* func_definition; + //protonu--inserting this here, Gabe's implementation had it + //the struct statment as nonSplitLevels + std::vector > stmt_nonSplitLevels; + + texture_memory_mapping *texture; //protonu + constant_memory_mapping *constant_mem; + std::map array_dims; + omega::CG_outputRepr *setup_code; + omega::CG_outputRepr *teardown_code; + + unsigned int code_gen_flags; + enum CodeGenFlags { + GenInit = 0x00, + GenCudaizeV2 = 0x02, + }; + + + //varibles used by cudaize_codegen + //block x, y sizes, N and num_red + int cu_bx, cu_by, cu_n, cu_num_reduce; + //block statement and level + int cu_block_stmt, cu_block_level; + //thread x, y, z + int cu_tx, cu_ty, cu_tz; + + //Anand: Adding CG_outputRepr* representations of cu_bx, cu_by, cu_tx, cu_ty + //and cu_tz for non constant loop bounds + + CG_outputRepr *cu_bx_repr, *cu_by_repr, *cu_tx_repr, *cu_ty_repr, *cu_tz_repr; + + //tile statements, and loop-levels (cudaize v1) + std::vector< std::vector > cu_thread_loop; + std::vector cu_thread_sync; + MemoryMode cu_mode; + + std::string cu_nx_name, cu_ny_name, cu_kernel_name; + int nonDummyLevel(int stmt, int level); + bool symbolExists(std::string s); + void addSync(int stmt, std::string idx); + void renameIndex(int stmt, std::string idx, std::string newName); + bool validIndexes(int stmt, const std::vector& idxs); + CG_outputRepr* extractCudaUB(int stmt_num, int level, int &outUpperBound, int &outLowerBound); + + void printCode(int effort=1, bool actuallyPrint=true) const; + void printRuntimeInfo() const; + void printIndexes() const; + SgNode* getCode(int effort = 1) const; + void printIS(); + + + void permute_cuda(int stmt, const std::vector& curOrder); + //protonu-writing a wrapper for the Chun's new permute function + bool permute(int stmt_num, const std::vector &pi); + //end--protonu. + void tile_cuda(int stmt, int level, int outer_level); + void tile_cuda(int level, int tile_size, int outer_level, std::string idxName, std::string ctrlName, TilingMethodType method=StridedTile); + void tile_cuda(int stmt, int level, int tile_size, int outer_level, std::string idxName, std::string ctrlName, TilingMethodType method=StridedTile); + bool datacopy_privatized_cuda(int stmt_num, int level, const std::string &array_name, const std::vector &privatized_levels, bool allow_extra_read = false, int fastest_changing_dimension = -1, int padding_stride = 1, int padding_alignment = 1, bool cuda_shared=false); + bool datacopy_cuda(int stmt_num, int level, const std::string &array_name, std::vector new_idxs, bool allow_extra_read = false, int fastest_changing_dimension = -1, int padding_stride = 1, int padding_alignment = 4, bool cuda_shared=false); + bool unroll_cuda(int stmt_num, int level, int unroll_amount); + //protonu--using texture memory + void copy_to_texture(const char *array_name); + void copy_to_constant(const char *array_name); + int findCurLevel(int stmt, std::string idx); + /** + * + * @param kernel_name Name of the GPU generated kernel + * @param nx Iteration space over the x dimention + * @param ny Iteration space over the y dimention + * @param tx Tile dimention over x dimention + * @param ty Tile dimention over the y dimention + * @param num_reduce The number of dimentions to reduce by mapping to the GPU implicit blocks/threads + */ + //stmnt_num is referenced from the perspective of being inside the cudize block loops + bool cudaize_v2(std::string kernel_name, std::map array_dims, + std::vector blockIdxs, std::vector threadIdxs); + SgNode* cudaize_codegen_v2(); + SgNode* codegen(); + + //protonu--have to add the constructors for the new class + //and maybe destructors (?) + LoopCuda(); + //LoopCuda(IR_Code *ir, tree_for *tf, global_symtab* gsym); + LoopCuda(IR_Control *ir_c, int loop_num);//protonu-added so as to not change ir_suif + ~LoopCuda(); + +}; + +#endif -- cgit v1.2.3-70-g09d2