summaryrefslogtreecommitdiff
path: root/test-chill/test-cases/examples/cuda-chill/mriq.lua
diff options
context:
space:
mode:
Diffstat (limited to 'test-chill/test-cases/examples/cuda-chill/mriq.lua')
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mriq.lua55
1 files changed, 55 insertions, 0 deletions
diff --git a/test-chill/test-cases/examples/cuda-chill/mriq.lua b/test-chill/test-cases/examples/cuda-chill/mriq.lua
new file mode 100644
index 0000000..1170111
--- /dev/null
+++ b/test-chill/test-cases/examples/cuda-chill/mriq.lua
@@ -0,0 +1,55 @@
+--CUBLAS 2 MM Multiply
+
+--This function form intializes "CUDAIZE v2" versus "CUDAIZE v1" if you
+--call init() and use global variables to specify procedure and loop
+
+--Second parameter is procedure # and third is loop #
+init("mriq.c", "ComputeQCPU", 0)
+
+dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
+ --copy_to_shared methods
+N=32768
+M=3072
+TI=128
+TJ=128
+
+permute(0,{"j","i"})
+--tile_by_index({"j","i"}, {TI,TJ}, {l1_control="jj", l2_control="ii"}, {"jj","ii", "j", "i"})
+tile_by_index({"i"}, {TJ}, {l1_control="ii",l1_tile="i"}, {"ii", "j","i"})
+tile_by_index({"j"}, {TI}, {l1_control="jj"}, {"ii","jj", "j", "i"})
+--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
+--print_code()
+
+normalize_index("j")
+normalize_index("i")
+--print_code()
+--tile_by_index({"i"}, {TI}, {l1_control="iii",l1_tile="i"}, {"ii","jj", "iii","j","i"})
+--print_code()
+cudaize("Kernel_GPU", {x=N,y=N,z=N,Qr=N,Qi=N,kVals=M},{block={"jj"}, thread={"j"}})
+
+copy_to_shared("tx","kVals",1)
+--copy_to_shared("tx","x",1)
+--copy_to_shared("tx","y",1)
+--copy_to_shared("tx","z",1)
+
+--copy_to_texture("kVals")
+--datacopy(0, 3, "kVals", {"tt","t"},false,0,1,-16,true)
+--print_code()
+--datacopy_privatized(0,"tx","kVals",{"tx"})
+--copy_to_registers("tx","kVals")
+copy_to_registers("ii","x")
+copy_to_registers("ii","y")
+copy_to_registers("ii","z")
+copy_to_registers("ii","Qi")
+copy_to_registers("ii","Qr")
+--[[datacopy_privatized(0,"tx","x",{"tx"})
+datacopy_privatized(0,"tx","y",{"tx"})
+datacopy_privatized(0,"tx","z",{"tx"})
+datacopy_privatized(0,"tx","Qi",{"tx"})
+datacopy_privatized(0,"tx","Qr",{"tx"})
+
+
+]]--
+--unroll(0,5,64)
+print_code()
+--unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels