summaryrefslogtreecommitdiff
path: root/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua
diff options
context:
space:
mode:
Diffstat (limited to 'test-chill/test-cases/examples/cuda-chill/mv-shadow.lua')
-rw-r--r--test-chill/test-cases/examples/cuda-chill/mv-shadow.lua65
1 files changed, 0 insertions, 65 deletions
diff --git a/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua b/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua
deleted file mode 100644
index 43e8491..0000000
--- a/test-chill/test-cases/examples/cuda-chill/mv-shadow.lua
+++ /dev/null
@@ -1,65 +0,0 @@
-init("mv-shadow.c","normalMV",0)
-dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers,
- --copy_to_shared methods
-
-N=129
-TI=32
-TJ=64
-
-N=1024
-TI=16
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
---Tile the i and j loops, introducing "ii" as the control loop for the "i"
---tile and "k" as the control loop for the "j" tile, with the final order
---of {"ii", "k", "i", "j"}
-tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"})
---tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"})
---tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"})
---print_code()
---normalize_index applies a tile of size one to the loop level specified
---by the input index. This is useful for getting a zero lower bound and a
---hard upper bound on a loop, instead of bounds relative to previous loop
---levels.
---normalize_index("ii")
-normalize_index("i")
-print_code()
-
---Cudaize now determines the grid dimensions from the loops themselves
---(the upper bounds of the block and thread loops). It also renames the
---given block and thread loops' indexes to the appropriate values from
---the set {"bx","by","tx","ty","tz"}. The second parameter specifies the
---sizes of the arrays to be copied in the CUDA scaffolding.
-cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}})
---print_code()
-
---Does a datacopy, tile, and add_sync to get a shared memory copy
-
---copy_to_shared("tx", "b", 1)
---copy_to_shared("tx", "c", -16)
---print_code()
---copy_to_texture("b")
---copy_to_texture("c")
-copy_to_registers("k", "a")
---print_code()
-
-unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels
---copy_to_texture("b")
---print_code()
---unroll(0,5,0)
---print_code()