diff options
author | dhuth <derickhuth@gmail.com> | 2014-08-27 09:52:06 -0600 |
---|---|---|
committer | dhuth <derickhuth@gmail.com> | 2014-08-27 09:52:06 -0600 |
commit | bff810cc371a38f493d688c54f71013f5a7d53bf (patch) | |
tree | fbe86954bb3c01deb21da9e41ebff5baa2889a45 /examples/cuda-chill/mv-shadow.lua | |
download | chill-bff810cc371a38f493d688c54f71013f5a7d53bf.tar.gz chill-bff810cc371a38f493d688c54f71013f5a7d53bf.tar.bz2 chill-bff810cc371a38f493d688c54f71013f5a7d53bf.zip |
Initial commit
Diffstat (limited to 'examples/cuda-chill/mv-shadow.lua')
-rw-r--r-- | examples/cuda-chill/mv-shadow.lua | 65 |
1 files changed, 65 insertions, 0 deletions
diff --git a/examples/cuda-chill/mv-shadow.lua b/examples/cuda-chill/mv-shadow.lua new file mode 100644 index 0000000..43e8491 --- /dev/null +++ b/examples/cuda-chill/mv-shadow.lua @@ -0,0 +1,65 @@ +init("mv-shadow.c","normalMV",0) +dofile("cudaize.lua") --defines custom tile_by_index, copy_to_registers, + --copy_to_shared methods + +N=129 +TI=32 +TJ=64 + +N=1024 +TI=16 + + + + + + + + + + + + + + + + +--Tile the i and j loop, introducing "ii" as the control loop for the "i" +--tile, "k" for the control loop fo the "j" tile, with the final order +--of {"ii", "k", "i", "j"} +tile_by_index({"i","j"}, {TI,TJ}, {l1_control="ii", l2_control="k"}, {"ii", "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="iii"}, {"ii", "k", "iii","i", "j"}) +--tile_by_index({"j"}, {TI}, {l2_control="k"}, { "k", "i", "j"}) +--tile_by_index({"i"}, {TI}, {l1_control="ii"}, {"ii", "i", "j"}) +--print_code() +--Normalize indx will do a tile size of one over the loop level specified +--by the input index. This is useful to get a zero lower bound and hard +--upper bound on a loop instead of it being relative to previous loop +--levels. +--normalize_index("ii") +normalize_index("i") +print_code() + +--Cudaize now determines the grid dimentions from the loops themselves +--(the upper bounds of the block and thread loops). It also renames the +--given block and thread loops's indexes to the approviate values from +--the set {"bx","by","tx","ty","tz"}. The second parameter specifies the +--size of the arrays to be copied in the CUDA scaffolding. +cudaize("mv_GPU", {a=N, b=N, c=N*N}, {block={"ii"}, thread={"i"}}) +--print_code() + +--Does a datacopy, tile, and add_sync to get a shared memory copy + +--copy_to_shared("tx", "b", 1) +--copy_to_shared("tx", "c", -16) +--print_code() +--copy_to_texture("b") +--copy_to_texture("c") +copy_to_registers("k", "a") +--print_code() + +unroll_to_depth(1) --won't unroll past thread/loop mapping, unrolls up to two loop levels +--copy_to_texture("b") +--print_code() +--unroll(0,5,0) +--print_code() |