diff options
Diffstat (limited to 'src/CuBaseLib')
52 files changed, 8173 insertions, 0 deletions
diff --git a/src/CuBaseLib/.depend.mk b/src/CuBaseLib/.depend.mk new file mode 100644 index 0000000..0bc1b34 --- /dev/null +++ b/src/CuBaseLib/.depend.mk @@ -0,0 +1,279 @@ +cudevice.o: cudevice.cc cudevice.h /usr/include/c++/4.6/map \ + /usr/include/c++/4.6/bits/stl_tree.h \ + /usr/include/c++/4.6/bits/stl_algobase.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \ + /usr/include/features.h /usr/include/bits/predefs.h \ + /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \ + /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \ + /usr/include/c++/4.6/bits/functexcept.h \ + /usr/include/c++/4.6/bits/exception_defines.h \ + /usr/include/c++/4.6/bits/cpp_type_traits.h \ + /usr/include/c++/4.6/ext/type_traits.h \ + /usr/include/c++/4.6/ext/numeric_traits.h \ + /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \ + /usr/include/c++/4.6/bits/concept_check.h \ + /usr/include/c++/4.6/bits/stl_iterator_base_types.h \ + /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/4.6/bits/stl_iterator.h \ + /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/allocator.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \ + /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \ + /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/stl_function.h \ + /usr/include/c++/4.6/backward/binders.h \ + /usr/include/c++/4.6/bits/stl_map.h \ + /usr/include/c++/4.6/initializer_list \ + /usr/include/c++/4.6/bits/stl_multimap.h \ + /usr/include/c++/4.6/bits/range_access.h /usr/include/c++/4.6/string \ + /usr/include/c++/4.6/bits/stringfwd.h \ + /usr/include/c++/4.6/bits/char_traits.h \ + /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \ + /usr/include/wchar.h /usr/include/stdio.h \ + /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \ + /usr/include/bits/wchar.h \ + /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \ + /usr/include/xlocale.h /usr/include/bits/wchar2.h \ + /usr/include/c++/4.6/bits/localefwd.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \ + /usr/include/c++/4.6/clocale /usr/include/locale.h \ + /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \ + /usr/include/c++/4.6/cctype /usr/include/ctype.h \ + /usr/include/bits/types.h /usr/include/bits/typesizes.h \ + /usr/include/endian.h /usr/include/bits/endian.h \ + /usr/include/bits/byteswap.h /usr/include/c++/4.6/bits/ostream_insert.h \ + /usr/include/c++/4.6/bits/cxxabi_forced.h \ + /usr/include/c++/4.6/bits/basic_string.h \ + /usr/include/c++/4.6/ext/atomicity.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \ + /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \ + /usr/include/bits/sched.h /usr/include/bits/time.h \ + /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \ + /usr/include/bits/setjmp.h /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \ + /usr/include/bits/confname.h /usr/include/getopt.h \ + /usr/include/bits/unistd.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \ + /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \ + /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \ + /usr/include/c++/4.6/bits/ios_base.h \ + /usr/include/c++/4.6/bits/locale_classes.h \ + /usr/include/c++/4.6/bits/locale_classes.tcc \ + /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \ + /usr/include/c++/4.6/bits/basic_ios.h \ + /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \ + /usr/include/wctype.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \ + /usr/include/c++/4.6/bits/streambuf_iterator.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \ + /usr/include/c++/4.6/bits/locale_facets.tcc \ + /usr/include/c++/4.6/bits/basic_ios.tcc \ + /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \ + /usr/include/c++/4.6/bits/istream.tcc \ + /usr/local/cuda-5.0/include/cublas.h \ + /usr/local/cuda-5.0/include/cuda_runtime.h \ + /usr/local/cuda-5.0/include/host_config.h \ + /usr/local/cuda-5.0/include/builtin_types.h \ + /usr/local/cuda-5.0/include/device_types.h \ + /usr/local/cuda-5.0/include/host_defines.h \ + /usr/local/cuda-5.0/include/driver_types.h \ + /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \ + /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \ + /usr/include/limits.h /usr/include/bits/posix1_lim.h \ + /usr/include/bits/local_lim.h /usr/include/linux/limits.h \ + /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \ + /usr/include/bits/stdio_lim.h \ + /usr/local/cuda-5.0/include/surface_types.h \ + /usr/local/cuda-5.0/include/texture_types.h \ + /usr/local/cuda-5.0/include/vector_types.h \ + /usr/local/cuda-5.0/include/channel_descriptor.h \ + /usr/local/cuda-5.0/include/cuda_runtime_api.h \ + /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \ + /usr/local/cuda-5.0/include/driver_functions.h \ + /usr/local/cuda-5.0/include/vector_functions.h \ + /usr/local/cuda-5.0/include/cublas_api.h \ + /usr/local/cuda-5.0/include/cuComplex.h /usr/include/math.h \ + /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \ + /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \ + /usr/include/bits/nan.h /usr/include/bits/mathdef.h \ + /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \ + /usr/local/cuda-5.0/include/cuda.h /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \ + /usr/include/sys/types.h /usr/include/sys/select.h \ + /usr/include/bits/select.h /usr/include/bits/sigset.h \ + /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \ + /usr/include/alloca.h /usr/include/bits/stdlib.h cumatrix.h \ + /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \ + ../KaldiLib/Matrix.h /usr/include/c++/4.6/stdexcept ../KaldiLib/cblas.h \ + ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \ + /usr/include/c++/4.6/cstdlib /usr/include/string.h \ + /usr/include/bits/string3.h ../KaldiLib/MathAux.h \ + /usr/include/c++/4.6/cmath ../KaldiLib/Types.h ../KaldiLib/Error.h \ + /usr/include/execinfo.h ../KaldiLib/Matrix.tcc \ + /usr/include/c++/4.6/cfloat \ + /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \ + /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \ + /usr/include/c++/4.6/cstdio /usr/include/libio.h \ + /usr/include/_G_config.h /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h /usr/include/bits/stdio2.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \ + /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \ + /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \ + /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \ + /usr/include/c++/4.6/bits/stl_algo.h \ + /usr/include/c++/4.6/bits/algorithmfwd.h \ + /usr/include/c++/4.6/bits/stl_heap.h \ + /usr/include/c++/4.6/bits/stl_tempbuf.h \ + /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \ + /usr/include/c++/4.6/vector \ + /usr/include/c++/4.6/bits/stl_uninitialized.h \ + /usr/include/c++/4.6/bits/stl_vector.h \ + /usr/include/c++/4.6/bits/stl_bvector.h \ + /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \ + /usr/include/c++/4.6/cstddef ../KaldiLib/Vector.tcc \ + /usr/include/c++/4.6/cstring ../KaldiLib/Matrix.h cukernels.h \ + /usr/local/cuda-5.0/include/vector_types.h cumatrix.tcc \ + /usr/local/cuda-5.0/include/cuda_runtime_api.h ../KaldiLib/Timer.h \ + /usr/include/sys/time.h cucommon.h ../KaldiLib/Error.h cuvector.h \ + ../KaldiLib/Vector.h cuvector.tcc cudevice.h cumath.h +cumath.o: cumath.cc cumath.h cumatrix.h /usr/include/c++/4.6/sstream \ + /usr/include/c++/4.6/istream /usr/include/c++/4.6/ios \ + /usr/include/c++/4.6/iosfwd \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \ + /usr/include/features.h /usr/include/bits/predefs.h \ + /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \ + /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \ + /usr/include/c++/4.6/bits/stringfwd.h \ + /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \ + /usr/include/wchar.h /usr/include/stdio.h \ + /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \ + /usr/include/bits/wchar.h \ + /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \ + /usr/include/xlocale.h /usr/include/bits/wchar2.h \ + /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \ + /usr/include/c++/4.6/bits/stl_algobase.h \ + /usr/include/c++/4.6/bits/functexcept.h \ + /usr/include/c++/4.6/bits/exception_defines.h \ + /usr/include/c++/4.6/bits/cpp_type_traits.h \ + /usr/include/c++/4.6/ext/type_traits.h \ + /usr/include/c++/4.6/ext/numeric_traits.h \ + /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \ + /usr/include/c++/4.6/bits/concept_check.h \ + /usr/include/c++/4.6/bits/stl_iterator_base_types.h \ + /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/4.6/bits/stl_iterator.h \ + /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \ + /usr/include/c++/4.6/clocale /usr/include/locale.h \ + /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \ + /usr/include/ctype.h /usr/include/bits/types.h \ + /usr/include/bits/typesizes.h /usr/include/endian.h \ + /usr/include/bits/endian.h /usr/include/bits/byteswap.h \ + /usr/include/c++/4.6/bits/ios_base.h \ + /usr/include/c++/4.6/ext/atomicity.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \ + /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \ + /usr/include/bits/sched.h /usr/include/bits/time.h \ + /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \ + /usr/include/bits/setjmp.h /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \ + /usr/include/bits/confname.h /usr/include/getopt.h \ + /usr/include/bits/unistd.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \ + /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \ + /usr/include/c++/4.6/bits/allocator.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \ + /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \ + /usr/include/c++/4.6/bits/ostream_insert.h \ + /usr/include/c++/4.6/bits/cxxabi_forced.h \ + /usr/include/c++/4.6/bits/stl_function.h \ + /usr/include/c++/4.6/backward/binders.h \ + /usr/include/c++/4.6/bits/range_access.h \ + /usr/include/c++/4.6/bits/basic_string.h \ + /usr/include/c++/4.6/initializer_list \ + /usr/include/c++/4.6/bits/basic_string.tcc \ + /usr/include/c++/4.6/bits/locale_classes.tcc \ + /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \ + /usr/include/c++/4.6/bits/basic_ios.h \ + /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \ + /usr/include/wctype.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \ + /usr/include/c++/4.6/bits/streambuf_iterator.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \ + /usr/include/c++/4.6/bits/locale_facets.tcc \ + /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \ + /usr/include/c++/4.6/bits/ostream.tcc \ + /usr/include/c++/4.6/bits/istream.tcc \ + /usr/include/c++/4.6/bits/sstream.tcc ../KaldiLib/Matrix.h \ + /usr/include/stdlib.h /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h /usr/include/sys/types.h \ + /usr/include/sys/select.h /usr/include/bits/select.h \ + /usr/include/bits/sigset.h /usr/include/bits/select2.h \ + /usr/include/sys/sysmacros.h /usr/include/alloca.h \ + /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \ + /usr/include/c++/4.6/iostream ../KaldiLib/cblas.h ../KaldiLib/clapack.h \ + ../KaldiLib/cblas.h ../KaldiLib/Common.h /usr/include/c++/4.6/cstdlib \ + /usr/include/string.h /usr/include/bits/string3.h ../KaldiLib/MathAux.h \ + /usr/include/c++/4.6/cmath /usr/include/math.h \ + /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \ + /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \ + /usr/include/bits/nan.h /usr/include/bits/mathdef.h \ + /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \ + ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \ + ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \ + /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \ + /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \ + /usr/include/c++/4.6/cstdio /usr/include/libio.h \ + /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \ + /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \ + /usr/include/bits/stdio2.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \ + /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \ + /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \ + /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \ + /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \ + /usr/include/c++/4.6/bits/stl_algo.h \ + /usr/include/c++/4.6/bits/algorithmfwd.h \ + /usr/include/c++/4.6/bits/stl_heap.h \ + /usr/include/c++/4.6/bits/stl_tempbuf.h \ + /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \ + /usr/include/c++/4.6/vector \ + /usr/include/c++/4.6/bits/stl_uninitialized.h \ + /usr/include/c++/4.6/bits/stl_vector.h \ + /usr/include/c++/4.6/bits/stl_bvector.h \ + /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \ + /usr/include/c++/4.6/cstddef ../KaldiLib/Vector.tcc \ + /usr/include/c++/4.6/cstring ../KaldiLib/Matrix.h cukernels.h \ + /usr/local/cuda-5.0/include/vector_types.h \ + /usr/local/cuda-5.0/include/builtin_types.h \ + /usr/local/cuda-5.0/include/device_types.h \ + /usr/local/cuda-5.0/include/host_defines.h \ + /usr/local/cuda-5.0/include/driver_types.h \ + /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \ + /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \ + /usr/include/limits.h /usr/include/bits/posix1_lim.h \ + /usr/include/bits/local_lim.h /usr/include/linux/limits.h \ + /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \ + /usr/local/cuda-5.0/include/surface_types.h \ + /usr/local/cuda-5.0/include/texture_types.h \ + /usr/local/cuda-5.0/include/vector_types.h cumatrix.tcc \ + /usr/local/cuda-5.0/include/cuda_runtime_api.h \ + /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \ + /usr/local/cuda-5.0/include/cublas.h \ + /usr/local/cuda-5.0/include/cuda_runtime.h \ + /usr/local/cuda-5.0/include/host_config.h \ + /usr/local/cuda-5.0/include/channel_descriptor.h \ + /usr/local/cuda-5.0/include/cuda_runtime_api.h \ + /usr/local/cuda-5.0/include/driver_functions.h \ + /usr/local/cuda-5.0/include/vector_functions.h \ + /usr/local/cuda-5.0/include/cublas_api.h \ + /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \ + /usr/include/sys/time.h cucommon.h ../KaldiLib/Error.h cuvector.h \ + ../KaldiLib/Vector.h cuvector.tcc cudevice.h /usr/include/c++/4.6/map \ + /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \ + /usr/include/c++/4.6/bits/stl_multimap.h diff --git a/src/CuBaseLib/.svn/entries b/src/CuBaseLib/.svn/entries new file mode 100644 index 0000000..c84cfaf --- /dev/null +++ b/src/CuBaseLib/.svn/entries @@ -0,0 +1,572 @@ +10 + +dir +117 +svn+ssh://merlin.fit.vutbr.cz/svn/TNet/trunk/src/CuBaseLib +svn+ssh://merlin.fit.vutbr.cz/svn/TNet + + + +2012-03-23T13:22:49.912359Z +110 +iveselyk + + + + + + + + + + + + + + +bda6da93-004a-4ae9-8e07-715c10848801 + +cuvector.h +file + + + + +2012-04-02T13:49:13.000000Z +b3ee9f2c8cb663233c57f8409626b2b4 +2011-04-04T17:14:16.666438Z +46 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +2176 + +cukernels.h +file + + + + +2012-04-02T13:49:13.000000Z +45a1ded3d70c77dd88ad829ab297aef7 +2011-09-19T11:12:27.685840Z +69 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +4100 + +cumatrix.h +file + + + + +2012-04-02T13:49:13.000000Z +311eb907ea742705a8cb7cedf708a316 +2012-02-07T17:50:53.635354Z +103 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +5223 + +curand.tcc +file + + + + +2012-04-02T13:49:13.000000Z +002df511ddd5aaaf936cc8d4188fc1ae +2011-02-24T12:12:08.754106Z +34 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +5068 + +curandkernels.h +file + + + + +2012-04-02T13:49:13.000000Z +63dd2817bc838424d9a7991b3bfd7653 +2011-02-24T12:12:08.754106Z +34 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +897 + +cukernels.cu +file + + + + +2012-04-02T13:49:13.000000Z +11ba351040079306c93d33a36a3fd663 +2011-09-19T11:12:27.685840Z +69 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +17013 + +cuvector.tcc +file + + + + +2012-04-02T13:49:14.000000Z +9a81d31c3acf55bd49fd9480e68369b7 +2011-04-29T12:18:20.752880Z +49 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +6514 + +curand.h +file + + + + +2012-04-02T13:49:13.000000Z +45b3049e6e49bf0d360033072c9dde18 +2011-02-24T12:12:08.754106Z +34 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +654 + +cumatrix.tcc +file + + + + +2012-04-02T13:49:14.000000Z +5e150cb43f80d5d923c605865689c340 +2012-02-07T17:50:53.635354Z +103 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +17625 + +cumath.cc +file + + + + +2012-04-02T13:49:14.000000Z +fba9157202573cb0c816b13f12b4e983 +2011-03-07T10:43:43.160610Z +40 +iveselyk + + + + + + + + + + + + + + + + + + + + + +17426 + +curandkernels.cu +file + + + + +2012-04-02T13:49:14.000000Z +e799b057186a7997cd3501d1d9f25341 +2011-02-24T12:12:08.754106Z +34 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +3765 + +cucommon.h +file + + + + +2012-04-02T13:49:14.000000Z +f38d7325c130f751cc00c75591dbd75a +2011-02-24T12:12:08.754106Z +34 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +959 + +cumath.h +file + + + + +2012-04-02T13:49:14.000000Z +f67afa65b4ff27708ee807b74f7cbf04 +2011-03-07T10:43:43.160610Z +40 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +5743 + +cudevice.cc +file + + + + +2012-04-02T13:49:14.000000Z +d67d39c4be979930583bc7ba52f45f3c +2012-03-23T13:22:49.912359Z +110 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +3221 + +Makefile +file + + + + +2012-04-02T13:49:14.000000Z +9998baf0d675da3b4da44d283c46e7d7 +2011-03-24T17:03:17.103393Z +43 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +995 + +cudevice.h +file + + + + +2012-04-02T13:49:14.000000Z +c87181021fd59feadcb9b72ca97c893d +2012-03-23T13:22:49.912359Z +110 +iveselyk +has-props + + + + + + + + + + + + + + + + + + + + +1606 + diff --git a/src/CuBaseLib/.svn/prop-base/Makefile.svn-base b/src/CuBaseLib/.svn/prop-base/Makefile.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/Makefile.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/cucommon.h.svn-base b/src/CuBaseLib/.svn/prop-base/cucommon.h.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/cucommon.h.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/cudevice.cc.svn-base b/src/CuBaseLib/.svn/prop-base/cudevice.cc.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/cudevice.cc.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/cudevice.h.svn-base b/src/CuBaseLib/.svn/prop-base/cudevice.h.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/cudevice.h.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/cukernels.cu.svn-base b/src/CuBaseLib/.svn/prop-base/cukernels.cu.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/cukernels.cu.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/cukernels.h.svn-base b/src/CuBaseLib/.svn/prop-base/cukernels.h.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/cukernels.h.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/cumath.h.svn-base b/src/CuBaseLib/.svn/prop-base/cumath.h.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/cumath.h.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/cumatrix.h.svn-base b/src/CuBaseLib/.svn/prop-base/cumatrix.h.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/cumatrix.h.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/cumatrix.tcc.svn-base b/src/CuBaseLib/.svn/prop-base/cumatrix.tcc.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/cumatrix.tcc.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/curand.h.svn-base b/src/CuBaseLib/.svn/prop-base/curand.h.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/curand.h.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/curand.tcc.svn-base b/src/CuBaseLib/.svn/prop-base/curand.tcc.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/curand.tcc.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/curandkernels.cu.svn-base b/src/CuBaseLib/.svn/prop-base/curandkernels.cu.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/curandkernels.cu.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/curandkernels.h.svn-base b/src/CuBaseLib/.svn/prop-base/curandkernels.h.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/curandkernels.h.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/cuvector.h.svn-base b/src/CuBaseLib/.svn/prop-base/cuvector.h.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/cuvector.h.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/prop-base/cuvector.tcc.svn-base b/src/CuBaseLib/.svn/prop-base/cuvector.tcc.svn-base new file mode 100644 index 0000000..eff4929 --- /dev/null +++ b/src/CuBaseLib/.svn/prop-base/cuvector.tcc.svn-base @@ -0,0 +1,5 @@ +K 12 +svn:keywords +V 23 +Date Revision Author Id +END diff --git a/src/CuBaseLib/.svn/text-base/Makefile.svn-base b/src/CuBaseLib/.svn/text-base/Makefile.svn-base new file mode 100644 index 0000000..b574c4a --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/Makefile.svn-base @@ -0,0 +1,59 @@ + +include ../tnet.mk + +INCLUDE = -I. -I../ -I../KaldiLib + + +CUDA_INCLUDE= -I$(CUDA_TK_BASE)/include +CUDA_BIN=$(CUDA_TK_BASE)/bin + + +CUSRC=$(wildcard *.cu) +CUOBJ=$(patsubst %.cu, %.o, $(CUSRC)) + + + +CUDA_FLAGS = -g -Xcompiler -fPIC --verbose +ifeq ($(BITS64), true) + CUDA_FLAGS += --machine 64 + BUT_FORCE_GCC64 = ln -s `which x86_64-linux-gcc` $(PWD)/gcc + BUT_UNLINK_GCC64 = unlink $(PWD)/gcc +else + CUDA_FLAGS += --machine 32 +endif + +ifeq ($(DOUBLEPRECISION), true) + CUDA_FLAGS += --gpu-architecture compute_13 --gpu-code sm_13 +endif + + + + +all : libCuBase.a + +libCuBase.a : $(CUOBJ) $(OBJ) + $(AR) ruv $@ $? + $(RANLIB) $@ + + +%.o : %.cu + $(BUT_FORCE_GCC64) + export PATH=$(PWD):$(CUDA_BIN):$(PATH); $(CUDA_BIN)/nvcc -c $< -o $@ -I. $(CUDA_INCLUDE) $(CUDA_FLAGS) + $(BUT_UNLINK_GCC64) + +%.o : %.cc + $(CXX) -c $< -o $@ $(CXXFLAGS) $(CUDA_INCLUDE) $(INCLUDE) + + + + +.PHONY: clean depend + +clean : + rm -f *.o *.a + +depend: + $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) $(CUDA_INCLUDE) > .depend.mk + +-include .depend.mk + diff --git a/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base b/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base new file mode 100644 index 0000000..6dc7e94 --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base @@ -0,0 +1,46 @@ +#ifndef _CUCOMMON_H_ +#define _CUCOMMON_H_ + +#include <iostream> +#include <sstream> + +#include <cuda_runtime_api.h> + +#include "Error.h" + + + +#define cuSafeCall(fun) \ +{ \ + int ret; \ + if((ret = (fun)) != 0) { \ + std::ostringstream os; \ + os << "CUDA ERROR #" << ret << " " << __FILE__ ":" << __LINE__ << " " << __func__ << "()" << " '" << #fun << "' " << cudaGetErrorString((cudaError_t)ret); \ + throw(MyException(os.str())); \ + } \ + cudaThreadSynchronize(); \ +} + + + + +namespace TNet { + + /** The size of edge of CUDA square block **/ + static const int CUBLOCK = 16; + + /** Number of blocks in which is split task of size 'size' **/ + inline int n_blocks(int size, int block_size) + { return size / block_size + ((size % block_size == 0)? 0 : 1); } + + /** Printing dim3 output operator **/ + inline std::ostream& operator<<(std::ostream& os, dim3 arr) { + os << "[" << arr.x << "," << arr.y << "," << arr.z << "]"; + return os; + } + +} + + + +#endif diff --git a/src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base b/src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base new file mode 100644 index 0000000..90c5bf3 --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base @@ -0,0 +1,129 @@ + +#include <cudevice.h> +#include <cublas.h> +#include <cuda.h> + +/////////////////// +//DEBUG: Just make sure it compiles... +#include "cumatrix.h" +#include "cuvector.h" +#include "cumath.h" +template class TNet::CuMatrix<float>; +template class TNet::CuVector<float>; +template class TNet::CuMath<float>; +/////////////////// + +namespace TNet { + + + /********************************************************************************** + * CuDevice:: + */ + CuDevice:: + CuDevice() + : mIsPresent(false), mVerbose(false) + { + //get number of devices + int N_GPU = 0; + cudaGetDeviceCount(&N_GPU); + + //select device if more than one + if(N_GPU > 1) { + char name[128]; + size_t free, total; + std::vector<float> free_mem_ratio; + //get ratios of memory use + std::cout << "Selecting from " << N_GPU << " GPUs\n"; + for(int n=0; n<N_GPU; n++) { + std::cout << "cudaSetDevice(" << n << "): "; + cuSafeCall(cudaSetDevice(n));//context created by cuSafeCall(...) + cuDeviceGetName(name,128,n); + std::cout << name << "\t"; + cuSafeCall(cuMemGetInfo(&free,&total)); + std::cout << "free: " << free/1024/1024 << "M, " + << "total: "<< total/1024/1024 << "M, " + << "ratio: "<< free/(float)total << "\n"; + free_mem_ratio.push_back(free/(float)total); + cudaThreadExit();//destroy context + } + //find GPU with max free memory + int max_id=0; + for(int n=1; n<free_mem_ratio.size(); n++) { + if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n; + } + std::cout << "Selected device: " << max_id << " (automatically)\n"; + cuSafeCall(cudaSetDevice(max_id)); + } + + if(N_GPU > 0) { + //initialize the CUBLAS + cuSafeCall(cublasInit()); + mIsPresent = true; + } else { + Warning("No CUDA enabled GPU is present!"); + } + } + + CuDevice:: + ~CuDevice() + { + if(mIsPresent) { + cuSafeCall(cublasShutdown()); + if(mVerbose) { + TraceLog("CUBLAS released"); + PrintProfile(); + } + } else { + Warning("No CUDA enabled GPU was present!"); + } + } + + + void + CuDevice:: + SelectGPU(int gpu_id) + { + //get number of devices + int N_GPU = 0; + cudaGetDeviceCount(&N_GPU); + if(gpu_id >= N_GPU) { + KALDI_ERR << "Cannot select GPU " << gpu_id + << ", detected " << N_GPU << " CUDA capable cards!"; + } + //release old card + cuSafeCall(cublasShutdown()); + cudaThreadExit(); + //select new card + cuSafeCall(cudaSetDevice(gpu_id)); + //initialize CUBLAS + cuSafeCall(cublasInit()); + std::cout << "Selected device " << gpu_id << " (manually)\n"; + } + + + std::string + CuDevice:: + GetFreeMemory() + { + size_t mem_free, mem_total; + cuMemGetInfo(&mem_free, &mem_total); + std::ostringstream os; + os << "Free:" << mem_free/(1024*1024) << "MB " + << "Used:" << (mem_total-mem_free)/(1024*1024) << "MB " + << "Total:" << mem_total/(1024*1024) << "MB"; + return os.str(); + } + + + //////////////////////////////////////////////// + // Instance of the static singleton + // + CuDevice CuDevice::msDevice; + // + //////////////////////////////////////////////// + + + +} + + diff --git a/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base b/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base new file mode 100644 index 0000000..c5eeb7b --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base @@ -0,0 +1,79 @@ +#ifndef _CUDEVICE_H_ +#define _CUDEVICE_H_ + +#include <map> +#include <string> +#include <iostream> + +namespace TNet { + + /** + * Singleton object which represents CUDA device + * responsible for CUBLAS initilalisation + * and memory block registration + */ + class CuDevice + { + // Singleton interface... + private: + CuDevice(); + CuDevice(CuDevice&); + CuDevice& operator=(CuDevice&); + + public: + ~CuDevice(); + static CuDevice& Instantiate() + { return msDevice; } + + private: + static CuDevice msDevice; + + + /**********************************/ + // Instance interface + public: + + void SelectGPU(int gpu_id); + + /// Check if the CUDA device is in the system + bool IsPresent() + { return mIsPresent; } + + void Verbose(bool verbose) + { mVerbose = verbose; } + + /// Sum the IO time + void AccuProfile(const std::string& key,double time) + { + if(mProfileMap.find(key) == mProfileMap.end()) { + mProfileMap[key] = 0.0; + } + mProfileMap[key] += time; + } + + void PrintProfile() + { + std::cout << "[cudevice profile]\n"; + std::map<std::string, double>::iterator it; + for(it = mProfileMap.begin(); it != mProfileMap.end(); ++it) { + std::cout << it->first << "\t" << it->second << "s\n"; + } + } + + void ResetProfile() + { mProfileMap.clear(); } + + std::string GetFreeMemory(); + + + private: + std::map<std::string, double> mProfileMap; + bool mIsPresent; + bool mVerbose; + }; //class CuDevice + + +} + + +#endif diff --git a/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base b/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base new file mode 100644 index 0000000..d6f866d --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base @@ -0,0 +1,626 @@ + +#include <cfloat> +#include "cukernels.h" + + + +/***************** + * CUDA kernels + */ +//CuMatrix +template<typename T> +__global__ +static void _set_const(T* mat, T value, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + mat[index] = value; +} + + + +template<typename T> +__global__ +static void _apply_log(T* mat, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + mat[index] = log(mat[index]); +} + + +template<typename T> +__global__ +static void _apply_mask(T* mat, const float* mask, MatrixDim dmat, MatrixDim dmask) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*dmat.stride; + int index2 = i + j*dmask.stride; + if ( i < dmat.cols && j < dmat.rows ) + if(mask[index2] == 0) mat[index] = 0; +} + + +template<typename T> +__global__ +static void _apply_l1(T* mat, T l1, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) { + T value = mat[index]; + T tgt; + if(abs(value) < l1) { + tgt = 0; + } else { + tgt = (value > 0?value-l1:value+l1); + } + mat[index] = tgt; + } +} + + +template<typename T> +__global__ +static void _scale_cols(T* mat, const T* scale, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + mat[index] *= scale[i]; +} + + +template<typename T> +__global__ +static void _scale_rows(T* mat, const T* scale, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + mat[index] *= scale[j]; +} + + +template<typename T> +__global__ +static void _add_scaled(T alpha, const T* A, T beta, T* dst, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + dst[index] = alpha*A[index] + beta*dst[index]; +} + + +template<typename T> +__global__ +static void _add_scaled_row(T alpha, const T* row, T beta, T* dst, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + +#if 0 + //this does not accelerate :( + __shared__ T aux[16]; + if(threadIdx.y == 0 && i < d.cols) aux[threadIdx.x] = row[i]; + __syncthreads(); + + if ( i < d.cols && j < d.rows ) + dst[index] = alpha*aux[threadIdx.x] + beta*dst[index]; +#else + if ( i < d.cols && j < d.rows ) + dst[index] = alpha*row[i] + beta*dst[index]; +#endif +} + + +template<typename T> +__global__ +static void _mul_elem(T* mat, const T* A, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + mat[index] = mat[index] * A[index]; +} + + +template<typename T> +__global__ +static void _log_elem(T* mat, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) { + if(mat[index] < FLT_MIN) mat[index] = FLT_MIN; + mat[index] = log(mat[index]); + } +} + + + + +//CuVector +template<typename T> +__global__ +static void _add_col_sum(T alpha, const T* mat, T beta, T* vec, MatrixDim d) { + + int i = blockIdx.x * blockDim.x + threadIdx.x; + + //This should be called 1-D + int j = blockIdx.y * blockDim.y + threadIdx.y; + if(j > 0) return; + + if(i < d.cols) { + double sum = 0.0; + for(int k = 0; k < d.rows; k++) { + sum += mat[i+k*d.stride]; + } + vec[i] = alpha*sum + beta*vec[i]; + } +} + + +template<typename T> +__global__ +static void _add_col_sum_reduce(T alpha, const T* mat, T beta, T* vec, MatrixDim d) { + + //flipped x,y for reducing... x..row, y..col + int j = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + + if(blockIdx.x > 0) return; + if(blockDim.y != 1) return; + + //copy vector to shared mem + __shared__ T aux[512]; + aux[threadIdx.x] = mat[i+j*d.stride]; + __syncthreads(); + + T sum = _sum_reduce(aux); + __syncthreads(); + //copy out the result + vec[i] = alpha*sum + beta*vec[i]; +} + + + +//CuMath +template<typename T> +__global__ +static void _sigmoid(T*y, const T*x, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + T res = 1.0 / (1.0 + exp(-x[index])); + /* + if(res < 0.001) res = 0.001; + if(res > 0.999) res = 0.999; + */ + y[index] = res; + } +} + + +template<typename T> +__global__ +static void _diff_sigmoid(T*eout, const T*e, const T*y, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) + eout[index] = y[index]*(1.0-y[index]) * e[index]; +} + + +template<typename T> +__global__ +static void _softmax(T*y, const T*x, MatrixDim d) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + if(j >= d.rows) return; + + //copy to output and find max... + double max = -1e20; + double sum = 0.0; + for(int i=0; i<d.cols; i++) { + if(max < x[i+j*d.stride]) max = x[i+j*d.stride]; + y[i+j*d.stride] = x[i+j*d.stride]; + } + //subtract max, apply exp, sum up... + for(int i=0; i<d.cols; i++) { + y[i+j*d.stride] = exp(y[i+j*d.stride] - max); + sum += y[i+j*d.stride]; + } + //normalize by sum... + for(int i=0; i<d.cols; i++) { + y[i+j*d.stride] /= sum; + } +} + + + + +template<typename T> +__device__ +static T _max_reduce(T buffer[]) { + + // Total number of active threads + int nTotalThreads = blockDim.x; + __syncthreads(); + + while(nTotalThreads > 1) { + int halfPoint = ((1+nTotalThreads) >> 1); // divide by two + // only the first half of the threads will be active. + if (threadIdx.x < halfPoint) { + // Get the shared value stored by another thread + T temp = -1e20; + if(threadIdx.x+halfPoint < nTotalThreads) { + temp = buffer[threadIdx.x + halfPoint]; + } + if (temp > buffer[threadIdx.x]) buffer[threadIdx.x] = temp; + } + __syncthreads(); + nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two. + } + // the result + return buffer[0]; +} + + + + +template<typename T> +__device__ +static T _sum_reduce(T buffer[]) { + + // Total number of active threads + int nTotalThreads = blockDim.x; + __syncthreads(); + + while(nTotalThreads > 1) { + int halfPoint = ((1+nTotalThreads) >> 1); // divide by two + // only the first half of the threads will be active. + if (threadIdx.x < halfPoint) { + // Get the shared value stored by another thread + T temp = 0.0; + if(threadIdx.x+halfPoint < nTotalThreads) { + temp = buffer[threadIdx.x + halfPoint]; + } + buffer[threadIdx.x] += temp; + } + __syncthreads(); + nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two. + } + // the result + return buffer[0]; +} + + + +template<typename T> +__global__ +static void _softmax_reduce(T*y, const T*x, MatrixDim d) { + + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + if(blockIdx.x > 0) return; + if(blockDim.y > 1) return; + + __shared__ T row_data[256]; + __shared__ T aux[256]; + + //copy the input to row_data + row_data[i] = x[i+j*d.stride]; + __syncthreads(); + + //copy input to aux + aux[i] = row_data[i]; + __syncthreads(); + //get the maximum value + T max = _max_reduce(aux); + __syncthreads(); + + //calculate exp(data-max) + row_data[i] = exp(row_data[i]-max); + + //copy the values to aux + aux[i] = row_data[i]; + __syncthreads(); + //get the sum + T sum = _sum_reduce(aux); + __syncthreads(); + + //divide the values + row_data[i] /= sum; + //copy out + y[i+j*d.stride] = row_data[i]; + +} + + + +template<typename T> +__global__ +static void _expand(T* y, const T* x, const int* off, MatrixDim d_out, MatrixDim d_in) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d_out.stride; + if( i < d_out.cols && j < d_out.rows ) { + int src_col = i % d_in.cols; + int src_row = j + off[i / d_in.cols]; + if(src_row < 0) src_row = 0; + if(src_row >= d_in.rows) src_row = d_in.rows-1; + y[index] = x[src_col + src_row*d_in.stride]; + } +} + + +template<typename T> +__global__ +static void _rearrange(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d_out.stride; + if( i < d_out.cols && j < d_out.rows ) { + int src_col = copy_from[i]; + if(src_col >= 0 && src_col < d_in.cols) { + y[index] = x[src_col + j*d_in.stride]; + } else { + y[index] = 1.0/0.0; + } + } +} + + +template<typename T> +__global__ +static void _randomize(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d_out.stride; + if( i < d_out.cols && j < d_out.rows ) { + int src_row = copy_from[j]; + y[index] = x[i + src_row*d_in.stride]; + } +} + + +template<typename T> +__global__ +static void _check_class(const T* out, const T* des, int* match, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + if(j>0) return; + + if(i<d.rows) { + int out_id = -1, des_id = -2; + T out_max = -1e20, des_max = -1e20; + + for(int k=0; k<d.cols; k++) { + T val = out[k + i*d.stride]; + if(val > out_max) { out_max = val; out_id = k; } + } + for(int k=0; k<d.cols; k++) { + T val = des[k + i*d.stride]; + if(val > des_max) { des_max = val; des_id = k; } + } + + match[i] = ((out_id == des_id)?1:0); + } +} + + +template<typename T> +__device__ +static int _max_id_reduce(T val[],int idx[]) { + + // Total number of active threads + int nTotalThreads = blockDim.x; + __syncthreads(); + + while(nTotalThreads > 1) { + int halfPoint = ((1+nTotalThreads) >> 1); // divide by two + // only the first half of the threads will be active. + if (threadIdx.x < halfPoint) { + // Get the shared value stored by another thread + T temp = -1e20; + if(threadIdx.x+halfPoint < nTotalThreads) { + temp = val[idx[threadIdx.x + halfPoint]]; + } + if (temp > val[idx[threadIdx.x]]) idx[threadIdx.x]=idx[threadIdx.x + halfPoint]; + } + __syncthreads(); + nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two. + } + // the result + return idx[0]; +} + + + + + + +template<typename T> +__global__ +static void _check_class_reduce(const T* out, const T* des, int* match, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + if(blockIdx.x > 0) return; + if(blockDim.y != 1) return; + + __shared__ T value[256]; + __shared__ int index[256]; + + value[threadIdx.x] = out[i+j*d.stride]; + index[threadIdx.x] = threadIdx.x; + __syncthreads(); + + int out_max = _max_id_reduce(value,index); + __syncthreads(); + + value[threadIdx.x] = des[i+j*d.stride]; + index[threadIdx.x] = threadIdx.x; + __syncthreads(); + + int des_max = _max_id_reduce(value,index); + __syncthreads(); + + if(threadIdx.x == 0) { + match[j] = ((out_max == des_max)?1:0); + } +} + + + + +/************** + * C wrappers around CUDA kernels + */ +//:FLOAT: +//CuMatrix +void cudaF_set_const(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) +{ _set_const<<<Gr,Bl>>>(mat,value,d); } + +void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) +{ _apply_log<<<Gr,Bl>>>(mat,d); } + +void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask) +{ _apply_mask<<<Gr,Bl>>>(mat,mask,dmat,dmask); } + +void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d) +{ _apply_l1<<<Gr,Bl>>>(mat,l1,d); } + +void cudaF_scale_cols(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) +{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); } + +void cudaF_scale_rows(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) +{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); } + +void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d) +{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); } + +void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d) +{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); } + +void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d) +{ _mul_elem<<<Gr,Bl>>>(mat,A,d); } + +void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d) +{ _log_elem<<<Gr,Bl>>>(mat,d); } + +//CuVector +void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d) +{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); } + +void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d) +{ _add_col_sum_reduce<<<Gr,Bl>>>(alpha,mat,beta,vec,d); } + +//CuMath +void cudaF_sigmoid (dim3 Gr, dim3 Bl, float *y, const float*x, MatrixDim d) +{ _sigmoid<<<Gr,Bl>>>(y, x, d); } + +void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float*eout, const float*e, const float*y, MatrixDim d) { + _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d); +} + +void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d) +{ _softmax<<<Gr,Bl>>>(y, x, d); } + +void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d) +{ _softmax_reduce<<<Gr,Bl>>>(y, x, d); } + + +void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in) +{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); } + + +void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); } + + +void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); } + + +void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d) +{ _check_class<<<Gr,Bl>>>(out,des,match,d); } + +void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d) +{ _check_class_reduce<<<Gr,Bl>>>(out,des,match,d); } + + + + +//:DOUBLE: +//CuMatrix +void cudaD_set_const(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) +{ _set_const<<<Gr,Bl>>>(mat,value,d); } + +void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) +{ _apply_log<<<Gr,Bl>>>(mat,d); } + +void cudaD_scale_cols(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) +{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); } + +void cudaD_scale_rows(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) +{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); } + +void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d) +{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); } + +void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d) +{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); } + +void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d) +{ _mul_elem<<<Gr,Bl>>>(mat,A,d); } + +void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d) +{ _log_elem<<<Gr,Bl>>>(mat,d); } + +//CuVector +void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d) +{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); } + +//CuMath +void cudaD_sigmoid (dim3 Gr, dim3 Bl, double *y, const double*x, MatrixDim d) +{ _sigmoid<<<Gr,Bl>>>(y, x, d); } + + +void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double*eout, const double*e, const double*y, MatrixDim d) { + _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d); +} + +void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d) +{ _softmax<<<Gr,Bl>>>(y, x, d); } + + +void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in) +{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); } + + +void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); } + + +void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); } + + +void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d) +{ _check_class<<<Gr,Bl>>>(out,des,match,d); } + + + + diff --git a/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base b/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base new file mode 100644 index 0000000..d8320b5 --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base @@ -0,0 +1,81 @@ +#ifndef _cuda_kernels_h_ +#define _cuda_kernels_h_ + + +extern "C" { + +#pragma GCC diagnostic ignored "-Wshadow"; +#include <vector_types.h> +#pragma GCC diagnostic warning "-Wshadow"; + + typedef struct MatrixDim_ { + int rows; + int cols; + int stride; + } MatrixDim; + + /************* + * Float instances + */ + //CuMatrix + void cudaF_set_const(dim3 Gr, dim3 Bl, float*mat, float value, MatrixDim d); + void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); + void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask); + void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d); + void cudaF_scale_cols(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d); + void cudaF_scale_rows(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d); + void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d); + void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d); + void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d); + void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d); + + //CuVector + void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d); + void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d); + + //CuMath + void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d); + void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d); + void cudaF_sigmoid (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d); + void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float* eout, const float* e, const float* y, MatrixDim d); + + void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in); + void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in); + void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in); + + void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d); + void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d); + + + + /************* + * Double instances + */ + //CuMatrix + void cudaD_set_const(dim3 Gr, dim3 Bl, double*mat, double value, MatrixDim d); + void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); + void cudaD_scale_cols(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d); + void cudaD_scale_rows(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d); + void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d); + void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d); + void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d); + void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d); + + //CuVector + void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d); + + //CuMath + void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d); + void cudaD_sigmoid (dim3 Gr, dim3 Bl, double*y, const double*x, MatrixDim d); + void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double* eout, const double* e, const double* y, MatrixDim d); + + void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in); + void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in); + void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in); + + void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d); + + +} + +#endif diff --git a/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base b/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base new file mode 100644 index 0000000..d718324 --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base @@ -0,0 +1,574 @@ + + + +#include "cumath.h" +#include "cukernels.h" + + +namespace TNet { + + ////////////////////////////////////////////////////////////////////////////// + //// CuMath<> Template specializations (float) + //// + template<> + void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK)); + + cudaF_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + template<> + void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK)); + + cudaF_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X) + { + Timer tim; tim.Start(); + +#if 0 + //disable 'reduce' functions + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(X.Rows(),CUBLOCK); + + cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); +#else + if(X.Cols() > 256) { + //use old implementation (can't use reduction due to + //limited size of shared memory) + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(X.Rows(),CUBLOCK); + + cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); + } else { + //use implementation with reduction + dim3 dimBlock(X.Cols(),1); + dim3 dimGrid(1,X.Rows()); + + cudaF_softmax_reduce(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); + } +#endif + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf) + { + Timer tim; tim.Start(); + + assert(Y.Rows() == X.Rows()); + assert((X.Cols() % block_transf.Rows()) == 0); + assert((Y.Cols() % block_transf.Cols()) == 0); + assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols())); + + int blocks = X.Cols() / block_transf.Rows(); + + for(int i = 0; i < blocks; i++) { + int m = block_transf.Cols(); + int n = X.Rows(); + int k = block_transf.Rows(); + + /* + //DEBUG MESSAGE + std::cout << "N N " << m << " " << n << " " << k << " " + << 1.0 << " " << block_transf << " " << block_transf.Stride() + << " " << X+i*k << " " << X.Stride() << " " + << 0.0 << " " << Y+i*n << " " << Y.Stride() + << "\n" << std::flush; + */ + + + cublasSgemm('N', 'N', m, n, k, + 1.0, block_transf.pCUData(), block_transf.Stride(), + X.pCUData()+i*k, X.Stride(), + 0.0, Y.pCUData()+i*m, Y.Stride()); + } + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + template<> + void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets) + { + Timer tim; tim.Start(); + + assert(Y.Rows() == X.Rows()); + assert(X.Cols() * frameOffsets.Dim() == Y.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); + + cudaF_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom) + { + Timer tim; tim.Start(); + + assert(copyFrom.Dim() == Y.Cols()); + assert(Y.Rows() == X.Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); + + cudaF_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom) + { + Timer tim; tim.Start(); + + assert(X.Cols() == Y.Cols()); + assert(X.Rows() == Y.Rows()); + assert(copyFrom.Dim() <= Y.Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK)); + + MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim(); + MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim(); + + cudaF_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match) + { + Timer tim; tim.Start(); + + assert(out.Cols() == des.Cols()); + assert(out.Rows() == des.Rows()); + assert(out.Stride() == des.Stride()); + assert(match.Dim() == out.Rows()); + + if(out.Cols() > 256) { + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(out.Rows(),CUBLOCK); + + cudaF_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); + cuSafeCall(cudaGetLastError()); + } else { + dim3 dimBlock(out.Cols(),1); + dim3 dimGrid(1,out.Rows()); + + cudaF_check_class_reduce(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); + cuSafeCall(cudaGetLastError()); + } + + + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC) + { + Timer tim; tim.Start(); + // CUBLAS is col major, TNet is row major + // keep trans..., just swap A&B argumets: A->B B->A + // + // WARNING + // NO DIMENSION CHECK!!! + + //m,n,k is cublas m,n,k + size_t m = ((transB=='T' || transB=='t')? B.Rows() : B.Cols()); + size_t n = ((transA=='T' || transA=='t')? A.Cols() : A.Rows()); + size_t k = ((transB=='T' || transB=='t')? B.Cols() : B.Rows()); + size_t k1 = ((transA=='T' || transA=='t')? A.Rows() : A.Cols()); + + k = ((k<k1)?k:k1); + m = ((m<C.Cols())?m:C.Cols()); + n = ((n<C.Rows())?m:C.Rows()); + +#if 0 + std::cout << "A " << transA << " "<< A.Rows() << " " << A.Cols() << " " << A.Stride() << " " << offA + << "; B " << transB << " "<< B.Rows() << " " << B.Cols() << " " << B.Stride() << " " << offB + << "; C " << C.Rows() << " " << C.Cols() << " " << C.Stride() << " " << offC + << "; alpha" << alpha << " beta" << beta << " REALmnk:" << m <<" "<< n <<" "<< k << std::endl; +#endif + + + cublasSgemm(transB, transA, m, n, k, + alpha, B.pCUData()+offB, B.Stride(), + A.pCUData()+offA, A.Stride(), + beta, C.pCUData()+offC, C.Stride()); + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + +/* + + template<> + void CuMath<float>::Gemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY) + { + Timer tim; tim.Start(); + // CUBLAS is col major, TNet is row major + // y = alpha * op(A) * x + beta * y, + + size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) + size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) + + // switch the trans parameter! + char cu_trans; + if(trans == 't' || trans == 'T') { + cu_trans = 'n'; + } else if (trans == 'n' || trans == 'N') { + cu_trans = 't'; + } else { + Error(std::string("Unknown trans")+trans); + } + + //check the dims + if(cu_trans == 'n') { + assert(dimX == n); + assert(dimY == m); + } else { + assert(dimX == m); + assert(dimY == n); + } + + //run gemv + cublasSgemv(cu_trans,m,n,alpha, + A.pCUData(), A.Stride(), x, 1, + beta, y, 1); + + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + */ + + /** + * offsetY tells how many outputs of 'Ax' mutiplication is skipped at the beginning, + */ + template<> + void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY) + { + Timer tim; tim.Start(); + // CUBLAS is col major, TNet is row major + // y = alpha * op(A) * x + beta * y, + + size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) + size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) + + // switch the trans parameter! + char cu_trans; + if(trans == 't' || trans == 'T') { + cu_trans = 'n'; + } else if (trans == 'n' || trans == 'N') { + cu_trans = 't'; + } else { + Error(std::string("Unknown trans")+trans); + } + + // select part of matrix for compute + size_t cu_offset = 0; + if(cu_trans == 'n') { + cu_offset += offsetY; + assert(m >= dimY+offsetY); + m = dimY; + } else { + cu_offset += offsetY*A.Stride(); + assert(n >= dimY+offsetY); + n = dimY; + } + + //check the dims + if(cu_trans == 'n') { + assert(dimX == n); + assert(dimY == m); + } else { + assert(dimX == m); + assert(dimY == n); + } + + //run gemv + cublasSgemv(cu_trans,m,n,alpha, + A.pCUData()+cu_offset, A.Stride(), x, 1, + beta, y, 1); + + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A) { + Timer tim; tim.Start(); + // CUBLAS is col major, TNet is row major + // -> switch x and y + + // A = alpha * x * transpose(y) + A, + + assert(dimX == A.Rows()); + assert(dimY == A.Cols()); + + size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) + size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) + + cublasSger(m,n,alpha,y,1,x,1,A.pCUData(),A.Stride()); + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out) + { + Timer tim; tim.Start(); + + assert(out.Dim() % in.Dim() == 0); + int n_copies = out.Dim()/in.Dim(); + CuVector<int> offsets(n_copies); + //offsets.SetConst(0); done implicitly! + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(out.Dim(), CUBLOCK)); + + MatrixDim dim_in = { 1, in.Dim(), in.Dim() }; + MatrixDim dim_out = { 1, out.Dim(), out.Dim() }; + cudaF_expand(dimGrid, dimBlock, out.pCUData(), in.pCUData(), offsets.pCUData(), dim_out, dim_in); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out) + { + Timer tim; tim.Start(); + + assert(in.Dim() % out.Dim() == 0); + + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(out.Dim(),CUBLOCK); + + MatrixDim dim = { in.Dim()/out.Dim(), out.Dim(), out.Dim() }; + + cudaF_add_col_sum(dimGrid,dimBlock,alpha,in.pCUData(),beta,out.pCUData(),dim); + + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + ////////////////////////////////////////////////////////////////////////////// + //// CuMath<> Template specializations (double) + //// + template<> + void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK)); + + cudaD_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + template<> + void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK)); + + cudaD_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X) + { + Timer tim; tim.Start(); + + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(X.Rows(),CUBLOCK); + + cudaD_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf) + { + Timer tim; tim.Start(); + + assert(Y.Rows() == X.Rows()); + assert((X.Cols() % block_transf.Rows()) == 0); + assert((Y.Cols() % block_transf.Cols()) == 0); + assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols())); + + int blocks = X.Cols() / block_transf.Rows(); + + for(int i = 0; i < blocks; i++) { + int m = block_transf.Cols(); + int n = X.Rows(); + int k = block_transf.Rows(); + + /* + //DEBUG MESSAGE + std::cout << "N N " << m << " " << n << " " << k << " " + << 1.0 << " " << block_transf << " " << block_transf.Stride() + << " " << X+i*k << " " << X.Stride() << " " + << 0.0 << " " << Y+i*n << " " << Y.Stride() + << "\n" << std::flush; + */ + + + cublasDgemm('N', 'N', m, n, k, + 1.0, block_transf.pCUData(), block_transf.Stride(), + X.pCUData()+i*k, X.Stride(), + 0.0, Y.pCUData()+i*m, Y.Stride()); + } + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + template<> + void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets) + { + Timer tim; tim.Start(); + + assert(Y.Rows() == X.Rows()); + assert(X.Cols() * frameOffsets.Dim() == Y.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); + + cudaD_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom) + { + Timer tim; tim.Start(); + + assert(copyFrom.Dim() == Y.Cols()); + assert(Y.Rows() == X.Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); + + cudaD_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom) + { + Timer tim; tim.Start(); + + assert(X.Cols() == Y.Cols()); + assert(X.Rows() == Y.Rows()); + assert(copyFrom.Dim() <= Y.Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK)); + + MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim(); + MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim(); + + cudaD_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match) + { + Timer tim; tim.Start(); + + assert(out.Cols() == des.Cols()); + assert(out.Rows() == des.Rows()); + assert(out.Stride() == des.Stride()); + assert(match.Dim() == out.Rows()); + + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(out.Rows(),CUBLOCK); + + cudaD_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + +} diff --git a/src/CuBaseLib/.svn/text-base/cumath.h.svn-base b/src/CuBaseLib/.svn/text-base/cumath.h.svn-base new file mode 100644 index 0000000..5680082 --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cumath.h.svn-base @@ -0,0 +1,146 @@ +#ifndef _CUMATH_H_ +#define _CUMATH_H_ + +#include "cumatrix.h" + +#include "Timer.h" +#include "cudevice.h" + +namespace TNet { + + + /** + * Group of Math operations for the NN training + */ + template<typename _ElemT> + class CuMath + { + public: + + /// Y = Sigmoid(X) + static void Sigmoid(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X) + { Error("__func__ Not implemented"); } + + /// Eout = E(1-E) * Y + static void DiffSigmoid(CuMatrix<_ElemT>& Eout, const CuMatrix<_ElemT>& Ein, const CuMatrix<_ElemT>& Y) + { Error("__func__ Not implemented"); } + + /// Y = Softmax(X) + static void Softmax(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X) + { Error("__func__ Not implemented"); } + + /// for DCT in FeaCat + static void BlockLinearity(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuMatrix<_ElemT>& block_transf) + { Error("__func__ Not implemented"); } + + static void Expand(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& frameOffsets) + { Error("__func__ Not implemented"); } + + /// ie. switch cols according to copyFrom + static void Rearrange(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom) + { Error("__func__ Not implemented"); } + + /// ie. switch rows according to copyFrom + static void Randomize(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom) + { Error("__func__ Not implemented"); } + + /// check match in the classification for Xentropy + static void CheckClass(const CuMatrix<_ElemT>& out, const CuMatrix<_ElemT> &des, CuVector<int>& match) + { Error("__func__ Not implemented"); } + + /// gemm with offset for CuSharedLinearity + static void OffsetGemm(char transA, char transB, _ElemT alpha, const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B, _ElemT beta, CuMatrix<_ElemT>& C, int offA, int offB, int offC) + { Error("__func__ Not implemented"); } + + /// gemv with offset for CuRecurrent + static void OffsetGemv(char trans, _ElemT alpha, const CuMatrix<_ElemT>& A, const _ElemT* x, size_t dimX, _ElemT beta, _ElemT* y, size_t dimY, size_t offsetY) + { Error("__func__ Not implemented"); } + + /// ger for weight updates in CuRecurrent + static void BlasGer(_ElemT alpha, const _ElemT* x, size_t dimX, const _ElemT* y, size_t dimY, CuMatrix<_ElemT>& A) + { Error("__func__ Not implemented"); } + + /// concatenate one vector several times for CuSharedLinearity + static void VecExpand(const CuVector<_ElemT>&in, CuVector<_ElemT>&out) + { Error("__func__ Not implemented"); } + + /// sum the vector as if it was matrix data for CuSharedLinearity + static void VecAddColSum(_ElemT alpha, const CuVector<_ElemT>&in, _ElemT beta, CuVector<_ElemT>&out) + { Error("__func__ Not implemented"); } + + }; //class CuMath:: + + + ////////////////////////////////////////////////////////////////////////////// + //// CuMath<> Template specializations (float) + //// + template<> + void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X); + + template<> + void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y); + + template<> + void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X); + + template<> + void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf); + + template<> + void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets); + + template<> + void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom); + + template<> + void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom); + + template<> + void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match); + + template<> + void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC); + + template<> + void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY); + + template<> + void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A); + + template<> + void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out); + + template<> + void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out); + + + ////////////////////////////////////////////////////////////////////////////// + //// CuMath<> Template specializations (double) + //// + template<> + void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X); + + template<> + void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y); + + template<> + void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X); + + template<> + void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf); + + template<> + void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets); + + template<> + void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom); + + template<> + void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom); + + template<> + void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match); + +} + +#endif diff --git a/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base b/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base new file mode 100644 index 0000000..4e767e3 --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base @@ -0,0 +1,199 @@ +#ifndef _CUMATRIX_H_ +#define _CUMATRIX_H_ + +#include <sstream> + +#include "Matrix.h" +#include "cukernels.h" + + + +namespace TNet { + + template<typename _ElemT> class CuVector; + + /** + * Matrix for CUDA computing + */ + template<typename _ElemT> + class CuMatrix + { + typedef CuMatrix<_ElemT> ThisType; + + public: + + /// Default Constructor + CuMatrix<_ElemT>() + : mRows(0), mCols(0), mStride(0), mpCUData(NULL) + { } + /// Constructor with memory initialisation + CuMatrix<_ElemT>(size_t rows, size_t cols) + : mRows(0), mCols(0), mStride(0), mpCUData(NULL) + { Init(rows, cols); } + + /// Destructor + ~CuMatrix() + { Destroy(); } + + /// Dimensions + size_t Rows() const + { return mRows; } + + size_t Cols() const + { return mCols; } + + size_t Stride() const + { return mStride; } + + ::MatrixDim Dim() const + { ::MatrixDim d = { + static_cast<int>(mRows), + static_cast<int>(mCols), + static_cast<int>(mStride) + }; + return d; + } + + /// Get raw pointer + const _ElemT* pCUData() const + { return mpCUData; } + _ElemT* pCUData() + { return mpCUData; } + + /// Get raw row pointer + const _ElemT* pCURowData(size_t r) const + { assert(r < Rows()); return mpCUData+r*mStride; } + _ElemT* pCURowData(size_t r) + { assert(r < Rows()); return mpCUData+r*mStride; } + + /// Get size of matrix in bytes + size_t MSize() const + { return mRows*mStride*sizeof(_ElemT); } + /// Get size of matrix row in bytes + size_t MRowSize() const + { return mStride*sizeof(_ElemT); } + + /// Allocate the memory + ThisType& Init(size_t rows, size_t cols); + + /// Deallocate the memory + void Destroy(); + + /// Copy functions (reallocates when needed) + ThisType& CopyFrom(const CuMatrix<_ElemT>& rSrc); + ThisType& CopyFrom(const Matrix<_ElemT>& rSrc); + Matrix<_ElemT>& CopyTo(Matrix<_ElemT>& rDst) const; + + /// Copy rowCnt rows from rSrc, starting by row srcOri, + /// copying to memory block starting by row dstOri + void CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri); + + /// Copy colCnt columns from rSrc, starting by col srcOri, + /// copying to memory block starting by row dstOri + void CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri); + + + // Math operations, some calling kernels + // + void SetZero(); + + void SetConst(_ElemT value) + { Error("__func__ Not implemented"); } + + void ApplyLog() + { Error("__func__ Not implemented"); } + + void ApplyMask(const CuMatrix<BaseFloat>& mask) + { Error("__func__ Not implemented"); } + + void ApplyL1(BaseFloat l1) + { Error("__func__ Not implemented"); } + + /// scale i'th column by scale[i] + void ScaleCols(const CuVector<_ElemT>& scale) + { Error("__func__ Not implemented"); } + + /// scale i'th row by scale[i] + void ScaleRows(const CuVector<_ElemT>& scale) + { Error("__func__ Not implemented"); } + + /// B = aplha * A + beta * B + void AddScaled(_ElemT alpha, const CuMatrix<_ElemT>& A, _ElemT beta) + { Error("__func__ Not implemented"); } + + /// B = aplha * row + beta * B + void AddScaledRow(_ElemT alpha, const CuVector<_ElemT>& row, _ElemT beta) + { Error("__func__ Not implemented"); } + + /// C = alpha * A(^T)*B(^T) + beta * C + void Gemm(char transa, char transb, + _ElemT alpha, + const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B, + _ElemT beta) + { Error("__func__ Not implemented"); } + + /// A = alpha * x*y^T + A + void BlasGer(_ElemT alpha, + const CuVector<_ElemT>& x, const CuVector<_ElemT>& y) + { Error("__func__ Not implemented"); } + + + /// Multiply two matrices elementhwise: C = A .* C + void MulElem(const CuMatrix<_ElemT>& A) + { Error("__func__ Not implemented"); } + + /// A = log(A) + void LogElem() + { Error("__func__ Not implemented"); } + + void Print() const + { + Matrix<_ElemT> mat(Rows(),Cols()); + CopyTo(mat); + std::cout << mat; + } + + + + void CheckData() + { + Matrix<_ElemT> mat; + CopyTo(mat); + for(size_t i=0; i<Rows(); i++) { + for(size_t j=0; j<Cols(); j++) { + if(std::isnan(mat(i,j)) || std::isinf(mat(i,j))) { + std::ostringstream os; + os << "Invalid value:" << mat(i,j) << "at row"<<i<<" col"<<j<<"\n"; + Error(os.str()); + } + } + } + } + + + private: + size_t mRows; + size_t mCols; + size_t mStride; + + _ElemT* mpCUData; + + }; + + + /// Prints the matrix dimensions and pointer to stream + template<typename _ElemT> + inline std::ostream& operator << (std::ostream& out, const CuMatrix<_ElemT>& mat) + { + out << "[CUMATRIX R" << mat.Rows() << " C" << mat.Cols() << " S" << mat.Stride() + << " PTR" << mat.pCUData() << "]" << std::flush; + return out; + } + + +} + + +#include "cumatrix.tcc" + +#endif diff --git a/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base b/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base new file mode 100644 index 0000000..4582e8d --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base @@ -0,0 +1,627 @@ + +#include <cuda_runtime_api.h> +#include <cublas.h> + +#include "Timer.h" +#include "cucommon.h" +#include "cuvector.h" +#include "cudevice.h" + +namespace TNet { + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuMatrix<_ElemT>& + CuMatrix<_ElemT>:: + Init(size_t rows, size_t cols) + { + if(mRows == rows && mCols == cols) { + //SetZero(); + return *this; + } + + Destroy(); + + size_t row_bytes = cols * sizeof(_ElemT); + size_t pitch; + cuSafeCall(cudaMallocPitch((void**)&mpCUData, &pitch, row_bytes, rows)); + mRows = rows; mCols = cols; + mStride = pitch/sizeof(_ElemT); + SetZero(); + + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuMatrix<_ElemT>:: + Destroy() + { + if(NULL != mpCUData) { + cuSafeCall(cudaFree(mpCUData)); + mpCUData = NULL; + } + mRows = mCols = mStride = 0; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuMatrix<_ElemT>& + CuMatrix<_ElemT>:: + CopyFrom(const CuMatrix<_ElemT>& rSrc) + { + Init(rSrc.Rows(),rSrc.Cols()); + + Timer tim; tim.Start(); + + size_t dst_pitch = mStride*sizeof(_ElemT); + size_t src_pitch = rSrc.Stride()*sizeof(_ElemT); + size_t width = rSrc.Cols()*sizeof(_ElemT); + cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pCUData(), src_pitch, width, rSrc.Rows(), cudaMemcpyDeviceToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromD2D",tim.Val()); + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuMatrix<_ElemT>& + CuMatrix<_ElemT>:: + CopyFrom(const Matrix<_ElemT>& rSrc) + { + Init(rSrc.Rows(),rSrc.Cols()); + + Timer tim; tim.Start(); + + size_t dst_pitch = mStride*sizeof(_ElemT); + size_t src_pitch = rSrc.Stride()*sizeof(_ElemT); + size_t width = rSrc.Cols()*sizeof(_ElemT); + cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pData(), src_pitch, width, rSrc.Rows(), cudaMemcpyHostToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromH2D",tim.Val()); + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + Matrix<_ElemT>& + CuMatrix<_ElemT>:: + CopyTo(Matrix<_ElemT>& rDst) const + { + if(rDst.Rows() != Rows() || rDst.Cols() != Cols()) { + rDst.Init(Rows(),Cols()); + } + + Timer tim; tim.Start(); + + size_t src_pitch = mStride*sizeof(_ElemT); + size_t dst_pitch = rDst.Stride()*sizeof(_ElemT); + size_t width = Cols()*sizeof(_ElemT); + cuSafeCall(cudaMemcpy2D(rDst.pData(), dst_pitch, pCUData(), src_pitch, width, Rows(), cudaMemcpyDeviceToHost)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToD2H",tim.Val()); + + return rDst; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuMatrix<_ElemT>:: + CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri) + { + assert(rowCnt+srcOri <= rSrc.Rows()); + assert(rowCnt+dstOri <= Rows()); + assert(Cols() == rSrc.Cols()); + + Timer tim; tim.Start(); + + size_t dst_pitch = mStride*sizeof(_ElemT); + size_t src_pitch = rSrc.Stride()*sizeof(_ElemT); + size_t width = rSrc.Cols()*sizeof(_ElemT); + + const _ElemT* p_src = rSrc.pCUData() + srcOri*rSrc.Stride(); + _ElemT* p_dst = mpCUData + dstOri*mStride; + + cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, rowCnt, cudaMemcpyDeviceToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyRowsD2D",tim.Val()); + + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuMatrix<_ElemT>:: + CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri) + { + assert(colCnt+srcOri <= rSrc.Cols()); + assert(colCnt+dstOri <= Cols()); + assert(Rows() == rSrc.Rows()); + + Timer tim; tim.Start(); + + size_t dst_pitch = mStride*sizeof(_ElemT); + size_t src_pitch = rSrc.Stride()*sizeof(_ElemT); + size_t width = colCnt*sizeof(_ElemT); + + const _ElemT* p_src = rSrc.pCUData() + srcOri; + _ElemT* p_dst = mpCUData + dstOri; + + cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, Rows(), cudaMemcpyDeviceToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyColsD2D",tim.Val()); + + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuMatrix<_ElemT>:: + SetZero() + { + Timer tim; tim.Start(); + cuSafeCall(cudaMemset(mpCUData, 0, mRows*mStride*sizeof(_ElemT))); + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero",tim.Val()); + } + + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + + //////////////////////////////////////////////////////////////////////// + //// CuMatrix:: templeate specializations (float) + //// + template<> + inline void CuMatrix<float>::SetConst(float value) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_set_const(dimGrid,dimBlock,mpCUData,value,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::ApplyLog() + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_apply_log(dimGrid,dimBlock,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::ApplyMask(const CuMatrix<BaseFloat>& mask) + { + Timer tim; tim.Start(); + + assert(mask.Rows() == Rows()); + assert(mask.Cols() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_apply_mask(dimGrid,dimBlock,mpCUData,mask.pCUData(),Dim(),mask.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::ApplyL1(float l1) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_apply_l1(dimGrid,dimBlock,mpCUData,l1,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::ScaleCols(const CuVector<float>& scale) + { + Timer tim; tim.Start(); + + assert(scale.Dim() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_scale_cols(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<float>::ScaleRows(const CuVector<float>& scale) + { + Timer tim; tim.Start(); + + assert(scale.Dim() == Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_scale_rows(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<float>::AddScaled(float alpha, const CuMatrix<float>& A, float beta) + { + Timer tim; tim.Start(); + + assert(A.Rows() == Rows()); + assert(A.Cols() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_add_scaled(dimGrid,dimBlock,alpha,A.pCUData(),beta,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<float>::AddScaledRow(float alpha, const CuVector<float>& row, float beta) + { + Timer tim; tim.Start(); + + if(row.Dim() != Cols()) { + std::ostringstream os; + os << "Non matching dimensions: Cols:" << Cols() << " VectorDim:" << row.Dim(); + Error(os.str()); + } + assert(row.Dim() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<float>::Gemm(char transa, char transb, + float alpha, + const CuMatrix<float>& A, const CuMatrix<float>& B, + float beta) + { + // CUBLAS is col major, TNet is row major + // keep trans..., just swap A&B argumets: A->B B->A + size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols()); + size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows()); + size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows()); + size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols()); + + assert(m == Cols()); + assert(n == Rows()); + assert(k == k1); + + #if 0 + //DEBUG MESSAGE + std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " << + alpha << " " << B << " " << B.Stride() << " " << + A << " " << A.Stride() << " " << beta << " " << C << " " << + C.Stride() << "\n" << std::flush; + #endif + + Timer tim; tim.Start(); + + cublasSgemm(transb, transa, m, n, k, + alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(), + beta, mpCUData, Stride()); + + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::BlasGer(float alpha, + const CuVector<float>& x, const CuVector<float>& y) + { + // CUBLAS is col major, TNet is row major + // just swap x and y + assert(x.Dim() == Rows()); + assert(y.Dim() == Cols()); + + Timer tim; tim.Start(); + + cublasSger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride()); + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<float>::MulElem(const CuMatrix<float>& A) + { + Timer tim; tim.Start(); + + assert(mCols == A.Cols()); + assert(mRows == A.Rows()); + assert(mStride == A.Stride()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_mul_elem(dimGrid,dimBlock,mpCUData, A.pCUData(), Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::LogElem() + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_log_elem(dimGrid,dimBlock,mpCUData, Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + + //////////////////////////////////////////////////////////////////////// + //// CuMatrix:: templeate specializations (double) + //// + template<> + inline void CuMatrix<double>::SetConst(double value) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_set_const(dimGrid,dimBlock,mpCUData,value,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<double>::ApplyLog() + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_apply_log(dimGrid,dimBlock,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<double>::ScaleCols(const CuVector<double>& scale) + { + Timer tim; tim.Start(); + + assert(scale.Dim() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_scale_cols(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<double>::ScaleRows(const CuVector<double>& scale) + { + Timer tim; tim.Start(); + + assert(scale.Dim() == Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_scale_rows(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<double>::AddScaled(double alpha, const CuMatrix<double>& A, double beta) + { + Timer tim; tim.Start(); + + assert(A.Rows() == Rows()); + assert(A.Cols() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_add_scaled(dimGrid,dimBlock,alpha,A.pCUData(),beta,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<double>::AddScaledRow(double alpha, const CuVector<double>& row, double beta) + { + Timer tim; tim.Start(); + + assert(row.Dim() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<double>::Gemm(char transa, char transb, + double alpha, + const CuMatrix<double>& A, const CuMatrix<double>& B, + double beta) + { + // CUBLAS is col major, TNet is row major + // keep trans..., just swap A&B argumets: A->B B->A + size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols()); + size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows()); + size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows()); + size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols()); + + assert(m == Cols()); + assert(n == Rows()); + assert(k == k1); + + #if 0 + //DEBUG MESSAGE + std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " << + alpha << " " << B << " " << B.Stride() << " " << + A << " " << A.Stride() << " " << beta << " " << C << " " << + C.Stride() << "\n" << std::flush; + #endif + + Timer tim; tim.Start(); + + cublasDgemm(transb, transa, m, n, k, + alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(), + beta, mpCUData, Stride()); + + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + template<> + inline void CuMatrix<double>::BlasGer(double alpha, + const CuVector<double>& x, const CuVector<double>& y) + { + // CUBLAS is col major, TNet is row major + // just swap x and y + assert(x.Dim() == Rows()); + assert(y.Dim() == Cols()); + + Timer tim; tim.Start(); + + cublasDger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride()); + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + template<> + inline void CuMatrix<double>::MulElem(const CuMatrix<double>& A) + { + Timer tim; tim.Start(); + + assert(mCols == A.Cols()); + assert(mRows == A.Rows()); + assert(mStride == A.Stride()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_mul_elem(dimGrid,dimBlock,mpCUData, A.pCUData(), Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<double>::LogElem() + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_log_elem(dimGrid,dimBlock,mpCUData, Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + +} diff --git a/src/CuBaseLib/.svn/text-base/curand.h.svn-base b/src/CuBaseLib/.svn/text-base/curand.h.svn-base new file mode 100644 index 0000000..8aa66d5 --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/curand.h.svn-base @@ -0,0 +1,40 @@ +#ifndef _CU_RAND_H_ +#define _CU_RAND_H_ + + +#include "cumatrix.h" + + +namespace TNet { + + template<typename T> + class CuRand { + public: + + CuRand(size_t rows, size_t cols) + { SeedGpu(rows,cols); } + + ~CuRand() { } + + void SeedGpu(size_t rows, size_t cols); + void Rand(CuMatrix<T>& tgt); + void GaussRand(CuMatrix<T>& tgt); + + void BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states); + void AddGaussNoise(CuMatrix<T>& tgt, T gscale = 1.0); + + private: + static void SeedRandom(Matrix<unsigned>& mat); + + private: + CuMatrix<unsigned> z1, z2, z3, z4; + CuMatrix<T> tmp; + }; + +} + + +#include "curand.tcc" + + +#endif diff --git a/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base b/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base new file mode 100644 index 0000000..e337189 --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base @@ -0,0 +1,228 @@ + +#include <cstdlib> +#include "curandkernels.h" + + +namespace TNet { + + + + template<typename T> + inline void + CuRand<T>:: + SeedGpu(size_t rows, size_t cols) + { + Matrix<unsigned> mat(rows,cols); + SeedRandom(mat); + z1.CopyFrom(mat); + SeedRandom(mat); + z2.CopyFrom(mat); + SeedRandom(mat); + z3.CopyFrom(mat); + SeedRandom(mat); + z4.CopyFrom(mat); + + /* + std::cout << "RANDININIT" << std::endl; + z1.Print(); + z2.Print(); + z3.Print(); + z4.Print(); + std::cout << "RANDININIT" << std::endl; + */ + + tmp.Init(rows,cols); + } + + + + template<typename T> + inline void + CuRand<T>:: + SeedRandom(Matrix<unsigned>& mat) { + for(size_t j=0; j<mat.Rows(); j++) { + for(size_t i=0; i<mat.Cols(); i++) { + unsigned value = 0; + while(value <= 128) { value = lrand48(); } + mat(j,i) = value; + } + } + } + + + template<typename T> + inline void + CuRand<T>:: + AddGaussNoise(CuMatrix<T>& tgt, T gscale) + { + GaussRand(tmp); + tgt.AddScaled(gscale,tmp,1.0); + } + + + + + + + + //////////////////////////////////////////////////////////////////////////// + //// invalid general wrappers over CUDA kernels + template<typename T> + inline void + CuRand<T>:: + Rand(CuMatrix<T>& tgt) + { Error("Unimplemented"); } + + template<typename T> + inline void + CuRand<T>:: + GaussRand(CuMatrix<T>& tgt) + { Error("Unimplemented"); } + + template<typename T> + inline void + CuRand<T>:: + BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states) + { Error("Unimplemented"); } + + + ////////////////////////////////////////////////////////////////////////// + //// float specializations + template<> + inline void + CuRand<float>:: + Rand(CuMatrix<float>& tgt) + { + Timer tim; tim.Start(); + + tgt.Init(z1.Rows(), z1.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK)); + + cudaF_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim()); + + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void + CuRand<float>:: + GaussRand(CuMatrix<float>& tgt) + { + + Timer tim; tim.Start(); + + tgt.Init(z1.Rows(), z1.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK)); + + cudaF_gauss_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim()); + + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void + CuRand<float>:: + BinarizeProbs(const CuMatrix<float>& probs, CuMatrix<float>& states) + { + if(probs.Rows() != z1.Rows() || probs.Cols() != z1.Cols()) { + Error("Non matching dims!!"); + } + + states.Init(z1.Rows(),z1.Cols()); + Rand(tmp); + + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(),CUBLOCK)); + + cudaF_binarize_probs(dimGrid,dimBlock,states.pCUData(), probs.pCUData(), tmp.pCUData(),states.Dim()); + + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + ////////////////////////////////////////////////////////////////////////// + //// double specializations + template<> + inline void + CuRand<double>:: + Rand(CuMatrix<double>& tgt) + { + Timer tim; tim.Start(); + + tgt.Init(z1.Rows(), z1.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK)); + + cudaD_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim()); + + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void + CuRand<double>:: + GaussRand(CuMatrix<double>& tgt) + { + + Timer tim; tim.Start(); + + tgt.Init(z1.Rows(), z1.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK)); + + cudaD_gauss_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim()); + + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void + CuRand<double>:: + BinarizeProbs(const CuMatrix<double>& probs, CuMatrix<double>& states) + { + if(probs.Rows() != z1.Rows() || probs.Cols() != z1.Cols()) { + Error("Non matching dims!!"); + } + + states.Init(z1.Rows(),z1.Cols()); + Rand(tmp); + + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(),CUBLOCK)); + + cudaD_binarize_probs(dimGrid,dimBlock,states.pCUData(), probs.pCUData(), tmp.pCUData(),states.Dim()); + + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + +} diff --git a/src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base b/src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base new file mode 100644 index 0000000..7e1c8dd --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base @@ -0,0 +1,135 @@ + +#include "curandkernels.h" + + + +// +//Hybrid Tauss/LCG random number generator +// +//http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html + + +// S1, S2, S3, and M are all constants, and z is part of the +// private per-thread generator state. +__device__ +static unsigned TausStep(unsigned &z, int S1, int S2, int S3, unsigned M) +{ + unsigned b=(((z << S1) ^ z) >> S2); + return z = (((z & M) << S3) ^ b); +} + +// A and C are constants +__device__ +static unsigned LCGStep(unsigned &z, unsigned A, unsigned C) +{ + return z=(A*z+C); +} + +template<typename T> +__device__ +static T HybridTaus(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4) +{ + // Combined period is lcm(p1,p2,p3,p4)~ 2^121 + T randval; + do { + randval = 2.3283064365387e-10 * ( // Periods + TausStep(z1, 13, 19, 12, 4294967294UL) ^ // p1=2^31-1 + TausStep(z2, 2, 25, 4, 4294967288UL) ^ // p2=2^30-1 + TausStep(z3, 3, 11, 17, 4294967280UL) ^ // p3=2^28-1 + LCGStep(z4, 1664525, 1013904223UL) // p4=2^32 + ); + } while (!(randval > 0.0 && randval < 1.0)); + return randval; +} + + + + +template<typename T> +__global__ +static void _rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + mat[index] = HybridTaus<T>(z1[index],z2[index],z3[index],z4[index]); + } +} + +/* +float2 BoxMuller() +{ + float u0=HybridTaus (), u1=HybridTaus (); + float r=sqrt(-2 log(u0)); + float theta=2*PI*u1; + return make_float2(r*sin(theta),r*cos(theta)); +} +*/ + +template<typename T> +__device__ +static T BoxMuller(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4) +{ + const T M_2PI = 6.283185307179586476925286766558; + + T u0 = HybridTaus<T>(z1,z2,z3,z4), u1 = HybridTaus<T>(z1,z2,z3,z4); + T r = sqrt(-2.0 * log(u0)); + T theta = M_2PI * u1; + return r*sin(theta); + +} + + +template<typename T> +__global__ +static void _gauss_rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + mat[index] = BoxMuller<T>(z1[index],z2[index],z3[index],z4[index]); + } +} + + +template<typename T> +__global__ +static void _binarize_probs(T* states, const T* probs, const T* rand, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + states[index] = ((probs[index] > rand[index])? 1.0 : 0.0); + } +} + + + +/************ + * :FLOAT: + */ +void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d) +{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); } + + +/************ + * :DOUBLE: + */ +void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d) +{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); } + diff --git a/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base b/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base new file mode 100644 index 0000000..69b589f --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base @@ -0,0 +1,34 @@ +#ifndef _cuda_rand_kernels_h_ +#define _cuda_rand_kernels_h_ + + +#include "cukernels.h" + + +extern "C" { + //************** + //float + // + void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d); + + + void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d); + + + void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d); + + //************** + //double + // + void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d); + + + void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d); + + + void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d); + +} + + +#endif diff --git a/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base b/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base new file mode 100644 index 0000000..945565a --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base @@ -0,0 +1,104 @@ +#ifndef _CUVECTOR_H_ +#define _CUVECTOR_H_ + +#include "Vector.h" + +namespace TNet { + + template<typename _ElemT> class CuMatrix; + + /** + * Matrix for CUDA computing + */ + template<typename _ElemT> + class CuVector + { + typedef CuVector<_ElemT> ThisType; + + public: + + /// Default Constructor + CuVector<_ElemT>() + : mDim(0), mpCUData(NULL) + { } + /// Constructor with memory initialisation + CuVector<_ElemT>(size_t dim) + : mDim(0), mpCUData(NULL) + { Init(dim); } + + /// Destructor + ~CuVector() + { Destroy(); } + + /// Dimensions + size_t Dim() const + { return mDim; } + + /* + ::MatrixDim Dim() const + { ::MatrixDim d = { mDim, 1, 1 }; return d; } + */ + + /// Get raw pointer + const _ElemT* pCUData() const + { return mpCUData; } + _ElemT* pCUData() + { return mpCUData; } + + /// Allocate the memory + ThisType& Init(size_t dim); + + /// Deallocate the memory + void Destroy(); + + /// Copy functions (reallocates when needed) + ThisType& CopyFrom(const CuVector<_ElemT>& rSrc); + ThisType& CopyFrom(const Vector<_ElemT>& rSrc); + Vector<_ElemT>& CopyTo(Vector<_ElemT>& rDst) const; + + + + // Math operations + // + void SetZero(); + + void SetConst(_ElemT value) + { Error("__func__ Not implemented"); } + + void AddScaled(_ElemT alpha, const CuVector<_ElemT>& vec, _ElemT beta) + { Error("__func__ Not implemented"); } + + void AddColSum(_ElemT alpha, const CuMatrix<_ElemT>& mat, _ElemT beta) + { Error("__func__ Not implemented"); } + + void Print() const + { + Vector<_ElemT> vec(Dim()); + CopyTo(vec); + std::cout << vec << "\n"; + } + + + private: + size_t mDim; + _ElemT* mpCUData; + }; + + + /// Prints the matrix dimensions and pointer to stream + template<typename _ElemT> + inline std::ostream& operator << (std::ostream& out, const CuVector<_ElemT>& vec) + { + size_t d = vec.Dim(); + out << "[CuVector D" << d + << " PTR" << vec.pCUData() << "]" << std::flush; + return out; + } + + +} + + +#include "cuvector.tcc" + +#endif diff --git a/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base b/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base new file mode 100644 index 0000000..0107859 --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base @@ -0,0 +1,254 @@ + +#include <cuda_runtime_api.h> + +#include "Timer.h" +#include "cucommon.h" +#include "cumatrix.h" +#include "cudevice.h" + +namespace TNet { + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuVector<_ElemT>& + CuVector<_ElemT>:: + Init(size_t dim) + { + if(mDim == dim) { + //SetZero(); + return *this; + } + + Destroy(); + + cuSafeCall(cudaMalloc((void**)&mpCUData, dim*sizeof(_ElemT))); + mDim = dim; + SetZero(); + + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuVector<_ElemT>:: + Destroy() + { + if(NULL != mpCUData) { + cuSafeCall(cudaFree(mpCUData)); + mpCUData = NULL; + } + mDim = 0; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuVector<_ElemT>& + CuVector<_ElemT>:: + CopyFrom(const CuVector<_ElemT>& rSrc) + { + Init(rSrc.Dim()); + + Timer tim; tim.Start(); + + cuSafeCall(cudaMemcpy(mpCUData, rSrc.pCUData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyDeviceToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromD2D",tim.Val()); + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuVector<_ElemT>& + CuVector<_ElemT>:: + CopyFrom(const Vector<_ElemT>& rSrc) + { + Init(rSrc.Dim()); + + Timer tim; tim.Start(); + + cuSafeCall(cudaMemcpy(mpCUData, rSrc.pData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyHostToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromH2D",tim.Val()); + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + Vector<_ElemT>& + CuVector<_ElemT>:: + CopyTo(Vector<_ElemT>& rDst) const + { + if(rDst.Dim() != mDim) { + rDst.Init(mDim); + } + + Timer tim; tim.Start(); + + cuSafeCall(cudaMemcpy(rDst.pData(), pCUData(), mDim*sizeof(_ElemT), cudaMemcpyDeviceToHost)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyToD2H",tim.Val()); + + return rDst; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + + template<typename _ElemT> + void + CuVector<_ElemT>:: + SetZero() + { + Timer tim; tim.Start(); + cuSafeCall(cudaMemset(mpCUData, 0, mDim*sizeof(_ElemT))); + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::SetZero",tim.Val()); + } + + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + + + + //////////////////////////////////////////////////////////////////////// + //// CuVector:: templeate specializations (float) + //// + template<> + inline void CuVector<float>::SetConst(float value) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaF_set_const(dimGrid,dimBlock,mpCUData,value,d); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<float>::AddScaled(float alpha, const CuVector<float>& vec, float beta) + { + Timer tim; tim.Start(); + + assert(vec.Dim() == Dim()); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaF_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<float>::AddColSum(float alpha, const CuMatrix<float>& mat, float beta) + { + Timer tim; tim.Start(); + + assert(mat.Cols() == Dim()); + + /** + * Rows()<=512 limit due to limited shared memory + * Cols()<=256 limit due to coalesced memory alignment: + * matrices with huge strides have slow access!!! + */ + if(mat.Rows() > 512 || mat.Cols() > 256) { + size_t dimBlock = CUBLOCK*2; + size_t dimGrid = n_blocks(Dim(),CUBLOCK*2); + + cudaF_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim()); + cuSafeCall(cudaGetLastError()); + } else { + dim3 dimBlock(mat.Rows(),1); + dim3 dimGrid(1,Dim()); + + cudaF_add_col_sum_reduce(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim()); + cuSafeCall(cudaGetLastError()); + } + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + //////////////////////////////////////////////////////////////////////// + //// CuVector:: templeate specializations (double) + //// + template<> + inline void CuVector<double>::SetConst(double value) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaD_set_const(dimGrid,dimBlock,mpCUData,value,d); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<double>::AddScaled(double alpha, const CuVector<double>& vec, double beta) + { + Timer tim; tim.Start(); + + assert(vec.Dim() == Dim()); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaD_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<double>::AddColSum(double alpha, const CuMatrix<double>& mat, double beta) + { + Timer tim; tim.Start(); + + assert(mat.Cols() == Dim()); + + size_t dimBlock = CUBLOCK*2; + size_t dimGrid = n_blocks(Dim(),CUBLOCK*2); + + cudaD_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + +} + + + diff --git a/src/CuBaseLib/Makefile b/src/CuBaseLib/Makefile new file mode 100644 index 0000000..cb7c00b --- /dev/null +++ b/src/CuBaseLib/Makefile @@ -0,0 +1,59 @@ + +include ../tnet.mk + +INCLUDE = -I. -I../ -I../KaldiLib + + +CUDA_INCLUDE= -I$(CUDA_TK_BASE)/include +CUDA_BIN=$(CUDA_TK_BASE)/bin + + +CUSRC=$(wildcard *.cu) +CUOBJ=$(patsubst %.cu, %.o, $(CUSRC)) + + + +CUDA_FLAGS = -g -Xcompiler -fPIC --verbose +ifeq ($(BITS64), true) + CUDA_FLAGS += --machine 64 + #BUT_FORCE_GCC64 = ln -s `which x86_64-linux-gcc` $(PWD)/gcc + #BUT_UNLINK_GCC64 = unlink $(PWD)/gcc +else + CUDA_FLAGS += --machine 32 +endif + +ifeq ($(DOUBLEPRECISION), true) + CUDA_FLAGS += --gpu-architecture compute_13 --gpu-code sm_13 +endif + + + + +all : libCuBase.a + +libCuBase.a : $(CUOBJ) $(OBJ) + $(AR) ruv $@ $? + $(RANLIB) $@ + + +%.o : %.cu + $(BUT_FORCE_GCC64) + export PATH=$(PWD):$(CUDA_BIN):$(PATH); $(CUDA_BIN)/nvcc -c $< -o $@ -I. $(CUDA_INCLUDE) $(CUDA_FLAGS) + $(BUT_UNLINK_GCC64) + +%.o : %.cc + $(CXX) -c $< -o $@ $(CXXFLAGS) $(CUDA_INCLUDE) $(INCLUDE) + + + + +.PHONY: clean depend + +clean : + rm -f *.o *.a + +depend: + $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) $(CUDA_INCLUDE) > .depend.mk + +-include .depend.mk + diff --git a/src/CuBaseLib/cucommon.h b/src/CuBaseLib/cucommon.h new file mode 100644 index 0000000..6dc7e94 --- /dev/null +++ b/src/CuBaseLib/cucommon.h @@ -0,0 +1,46 @@ +#ifndef _CUCOMMON_H_ +#define _CUCOMMON_H_ + +#include <iostream> +#include <sstream> + +#include <cuda_runtime_api.h> + +#include "Error.h" + + + +#define cuSafeCall(fun) \ +{ \ + int ret; \ + if((ret = (fun)) != 0) { \ + std::ostringstream os; \ + os << "CUDA ERROR #" << ret << " " << __FILE__ ":" << __LINE__ << " " << __func__ << "()" << " '" << #fun << "' " << cudaGetErrorString((cudaError_t)ret); \ + throw(MyException(os.str())); \ + } \ + cudaThreadSynchronize(); \ +} + + + + +namespace TNet { + + /** The size of edge of CUDA square block **/ + static const int CUBLOCK = 16; + + /** Number of blocks in which is split task of size 'size' **/ + inline int n_blocks(int size, int block_size) + { return size / block_size + ((size % block_size == 0)? 0 : 1); } + + /** Printing dim3 output operator **/ + inline std::ostream& operator<<(std::ostream& os, dim3 arr) { + os << "[" << arr.x << "," << arr.y << "," << arr.z << "]"; + return os; + } + +} + + + +#endif diff --git a/src/CuBaseLib/cudevice.cc b/src/CuBaseLib/cudevice.cc new file mode 100644 index 0000000..90c5bf3 --- /dev/null +++ b/src/CuBaseLib/cudevice.cc @@ -0,0 +1,129 @@ + +#include <cudevice.h> +#include <cublas.h> +#include <cuda.h> + +/////////////////// +//DEBUG: Just make sure it compiles... +#include "cumatrix.h" +#include "cuvector.h" +#include "cumath.h" +template class TNet::CuMatrix<float>; +template class TNet::CuVector<float>; +template class TNet::CuMath<float>; +/////////////////// + +namespace TNet { + + + /********************************************************************************** + * CuDevice:: + */ + CuDevice:: + CuDevice() + : mIsPresent(false), mVerbose(false) + { + //get number of devices + int N_GPU = 0; + cudaGetDeviceCount(&N_GPU); + + //select device if more than one + if(N_GPU > 1) { + char name[128]; + size_t free, total; + std::vector<float> free_mem_ratio; + //get ratios of memory use + std::cout << "Selecting from " << N_GPU << " GPUs\n"; + for(int n=0; n<N_GPU; n++) { + std::cout << "cudaSetDevice(" << n << "): "; + cuSafeCall(cudaSetDevice(n));//context created by cuSafeCall(...) + cuDeviceGetName(name,128,n); + std::cout << name << "\t"; + cuSafeCall(cuMemGetInfo(&free,&total)); + std::cout << "free: " << free/1024/1024 << "M, " + << "total: "<< total/1024/1024 << "M, " + << "ratio: "<< free/(float)total << "\n"; + free_mem_ratio.push_back(free/(float)total); + cudaThreadExit();//destroy context + } + //find GPU with max free memory + int max_id=0; + for(int n=1; n<free_mem_ratio.size(); n++) { + if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n; + } + std::cout << "Selected device: " << max_id << " (automatically)\n"; + cuSafeCall(cudaSetDevice(max_id)); + } + + if(N_GPU > 0) { + //initialize the CUBLAS + cuSafeCall(cublasInit()); + mIsPresent = true; + } else { + Warning("No CUDA enabled GPU is present!"); + } + } + + CuDevice:: + ~CuDevice() + { + if(mIsPresent) { + cuSafeCall(cublasShutdown()); + if(mVerbose) { + TraceLog("CUBLAS released"); + PrintProfile(); + } + } else { + Warning("No CUDA enabled GPU was present!"); + } + } + + + void + CuDevice:: + SelectGPU(int gpu_id) + { + //get number of devices + int N_GPU = 0; + cudaGetDeviceCount(&N_GPU); + if(gpu_id >= N_GPU) { + KALDI_ERR << "Cannot select GPU " << gpu_id + << ", detected " << N_GPU << " CUDA capable cards!"; + } + //release old card + cuSafeCall(cublasShutdown()); + cudaThreadExit(); + //select new card + cuSafeCall(cudaSetDevice(gpu_id)); + //initialize CUBLAS + cuSafeCall(cublasInit()); + std::cout << "Selected device " << gpu_id << " (manually)\n"; + } + + + std::string + CuDevice:: + GetFreeMemory() + { + size_t mem_free, mem_total; + cuMemGetInfo(&mem_free, &mem_total); + std::ostringstream os; + os << "Free:" << mem_free/(1024*1024) << "MB " + << "Used:" << (mem_total-mem_free)/(1024*1024) << "MB " + << "Total:" << mem_total/(1024*1024) << "MB"; + return os.str(); + } + + + //////////////////////////////////////////////// + // Instance of the static singleton + // + CuDevice CuDevice::msDevice; + // + //////////////////////////////////////////////// + + + +} + + diff --git a/src/CuBaseLib/cudevice.h b/src/CuBaseLib/cudevice.h new file mode 100644 index 0000000..c5eeb7b --- /dev/null +++ b/src/CuBaseLib/cudevice.h @@ -0,0 +1,79 @@ +#ifndef _CUDEVICE_H_ +#define _CUDEVICE_H_ + +#include <map> +#include <string> +#include <iostream> + +namespace TNet { + + /** + * Singleton object which represents CUDA device + * responsible for CUBLAS initilalisation + * and memory block registration + */ + class CuDevice + { + // Singleton interface... + private: + CuDevice(); + CuDevice(CuDevice&); + CuDevice& operator=(CuDevice&); + + public: + ~CuDevice(); + static CuDevice& Instantiate() + { return msDevice; } + + private: + static CuDevice msDevice; + + + /**********************************/ + // Instance interface + public: + + void SelectGPU(int gpu_id); + + /// Check if the CUDA device is in the system + bool IsPresent() + { return mIsPresent; } + + void Verbose(bool verbose) + { mVerbose = verbose; } + + /// Sum the IO time + void AccuProfile(const std::string& key,double time) + { + if(mProfileMap.find(key) == mProfileMap.end()) { + mProfileMap[key] = 0.0; + } + mProfileMap[key] += time; + } + + void PrintProfile() + { + std::cout << "[cudevice profile]\n"; + std::map<std::string, double>::iterator it; + for(it = mProfileMap.begin(); it != mProfileMap.end(); ++it) { + std::cout << it->first << "\t" << it->second << "s\n"; + } + } + + void ResetProfile() + { mProfileMap.clear(); } + + std::string GetFreeMemory(); + + + private: + std::map<std::string, double> mProfileMap; + bool mIsPresent; + bool mVerbose; + }; //class CuDevice + + +} + + +#endif diff --git a/src/CuBaseLib/cukernels.cu b/src/CuBaseLib/cukernels.cu new file mode 100644 index 0000000..d6f866d --- /dev/null +++ b/src/CuBaseLib/cukernels.cu @@ -0,0 +1,626 @@ + +#include <cfloat> +#include "cukernels.h" + + + +/***************** + * CUDA kernels + */ +//CuMatrix +template<typename T> +__global__ +static void _set_const(T* mat, T value, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + mat[index] = value; +} + + + +template<typename T> +__global__ +static void _apply_log(T* mat, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + mat[index] = log(mat[index]); +} + + +template<typename T> +__global__ +static void _apply_mask(T* mat, const float* mask, MatrixDim dmat, MatrixDim dmask) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*dmat.stride; + int index2 = i + j*dmask.stride; + if ( i < dmat.cols && j < dmat.rows ) + if(mask[index2] == 0) mat[index] = 0; +} + + +template<typename T> +__global__ +static void _apply_l1(T* mat, T l1, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) { + T value = mat[index]; + T tgt; + if(abs(value) < l1) { + tgt = 0; + } else { + tgt = (value > 0?value-l1:value+l1); + } + mat[index] = tgt; + } +} + + +template<typename T> +__global__ +static void _scale_cols(T* mat, const T* scale, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + mat[index] *= scale[i]; +} + + +template<typename T> +__global__ +static void _scale_rows(T* mat, const T* scale, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + mat[index] *= scale[j]; +} + + +template<typename T> +__global__ +static void _add_scaled(T alpha, const T* A, T beta, T* dst, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + dst[index] = alpha*A[index] + beta*dst[index]; +} + + +template<typename T> +__global__ +static void _add_scaled_row(T alpha, const T* row, T beta, T* dst, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + +#if 0 + //this does not accelerate :( + __shared__ T aux[16]; + if(threadIdx.y == 0 && i < d.cols) aux[threadIdx.x] = row[i]; + __syncthreads(); + + if ( i < d.cols && j < d.rows ) + dst[index] = alpha*aux[threadIdx.x] + beta*dst[index]; +#else + if ( i < d.cols && j < d.rows ) + dst[index] = alpha*row[i] + beta*dst[index]; +#endif +} + + +template<typename T> +__global__ +static void _mul_elem(T* mat, const T* A, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) + mat[index] = mat[index] * A[index]; +} + + +template<typename T> +__global__ +static void _log_elem(T* mat, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if ( i < d.cols && j < d.rows ) { + if(mat[index] < FLT_MIN) mat[index] = FLT_MIN; + mat[index] = log(mat[index]); + } +} + + + + +//CuVector +template<typename T> +__global__ +static void _add_col_sum(T alpha, const T* mat, T beta, T* vec, MatrixDim d) { + + int i = blockIdx.x * blockDim.x + threadIdx.x; + + //This should be called 1-D + int j = blockIdx.y * blockDim.y + threadIdx.y; + if(j > 0) return; + + if(i < d.cols) { + double sum = 0.0; + for(int k = 0; k < d.rows; k++) { + sum += mat[i+k*d.stride]; + } + vec[i] = alpha*sum + beta*vec[i]; + } +} + + +template<typename T> +__global__ +static void _add_col_sum_reduce(T alpha, const T* mat, T beta, T* vec, MatrixDim d) { + + //flipped x,y for reducing... x..row, y..col + int j = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + + if(blockIdx.x > 0) return; + if(blockDim.y != 1) return; + + //copy vector to shared mem + __shared__ T aux[512]; + aux[threadIdx.x] = mat[i+j*d.stride]; + __syncthreads(); + + T sum = _sum_reduce(aux); + __syncthreads(); + //copy out the result + vec[i] = alpha*sum + beta*vec[i]; +} + + + +//CuMath +template<typename T> +__global__ +static void _sigmoid(T*y, const T*x, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + T res = 1.0 / (1.0 + exp(-x[index])); + /* + if(res < 0.001) res = 0.001; + if(res > 0.999) res = 0.999; + */ + y[index] = res; + } +} + + +template<typename T> +__global__ +static void _diff_sigmoid(T*eout, const T*e, const T*y, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) + eout[index] = y[index]*(1.0-y[index]) * e[index]; +} + + +template<typename T> +__global__ +static void _softmax(T*y, const T*x, MatrixDim d) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + if(j >= d.rows) return; + + //copy to output and find max... + double max = -1e20; + double sum = 0.0; + for(int i=0; i<d.cols; i++) { + if(max < x[i+j*d.stride]) max = x[i+j*d.stride]; + y[i+j*d.stride] = x[i+j*d.stride]; + } + //subtract max, apply exp, sum up... + for(int i=0; i<d.cols; i++) { + y[i+j*d.stride] = exp(y[i+j*d.stride] - max); + sum += y[i+j*d.stride]; + } + //normalize by sum... + for(int i=0; i<d.cols; i++) { + y[i+j*d.stride] /= sum; + } +} + + + + +template<typename T> +__device__ +static T _max_reduce(T buffer[]) { + + // Total number of active threads + int nTotalThreads = blockDim.x; + __syncthreads(); + + while(nTotalThreads > 1) { + int halfPoint = ((1+nTotalThreads) >> 1); // divide by two + // only the first half of the threads will be active. + if (threadIdx.x < halfPoint) { + // Get the shared value stored by another thread + T temp = -1e20; + if(threadIdx.x+halfPoint < nTotalThreads) { + temp = buffer[threadIdx.x + halfPoint]; + } + if (temp > buffer[threadIdx.x]) buffer[threadIdx.x] = temp; + } + __syncthreads(); + nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two. + } + // the result + return buffer[0]; +} + + + + +template<typename T> +__device__ +static T _sum_reduce(T buffer[]) { + + // Total number of active threads + int nTotalThreads = blockDim.x; + __syncthreads(); + + while(nTotalThreads > 1) { + int halfPoint = ((1+nTotalThreads) >> 1); // divide by two + // only the first half of the threads will be active. + if (threadIdx.x < halfPoint) { + // Get the shared value stored by another thread + T temp = 0.0; + if(threadIdx.x+halfPoint < nTotalThreads) { + temp = buffer[threadIdx.x + halfPoint]; + } + buffer[threadIdx.x] += temp; + } + __syncthreads(); + nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two. + } + // the result + return buffer[0]; +} + + + +template<typename T> +__global__ +static void _softmax_reduce(T*y, const T*x, MatrixDim d) { + + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + if(blockIdx.x > 0) return; + if(blockDim.y > 1) return; + + __shared__ T row_data[256]; + __shared__ T aux[256]; + + //copy the input to row_data + row_data[i] = x[i+j*d.stride]; + __syncthreads(); + + //copy input to aux + aux[i] = row_data[i]; + __syncthreads(); + //get the maximum value + T max = _max_reduce(aux); + __syncthreads(); + + //calculate exp(data-max) + row_data[i] = exp(row_data[i]-max); + + //copy the values to aux + aux[i] = row_data[i]; + __syncthreads(); + //get the sum + T sum = _sum_reduce(aux); + __syncthreads(); + + //divide the values + row_data[i] /= sum; + //copy out + y[i+j*d.stride] = row_data[i]; + +} + + + +template<typename T> +__global__ +static void _expand(T* y, const T* x, const int* off, MatrixDim d_out, MatrixDim d_in) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d_out.stride; + if( i < d_out.cols && j < d_out.rows ) { + int src_col = i % d_in.cols; + int src_row = j + off[i / d_in.cols]; + if(src_row < 0) src_row = 0; + if(src_row >= d_in.rows) src_row = d_in.rows-1; + y[index] = x[src_col + src_row*d_in.stride]; + } +} + + +template<typename T> +__global__ +static void _rearrange(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d_out.stride; + if( i < d_out.cols && j < d_out.rows ) { + int src_col = copy_from[i]; + if(src_col >= 0 && src_col < d_in.cols) { + y[index] = x[src_col + j*d_in.stride]; + } else { + y[index] = 1.0/0.0; + } + } +} + + +template<typename T> +__global__ +static void _randomize(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d_out.stride; + if( i < d_out.cols && j < d_out.rows ) { + int src_row = copy_from[j]; + y[index] = x[i + src_row*d_in.stride]; + } +} + + +template<typename T> +__global__ +static void _check_class(const T* out, const T* des, int* match, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + if(j>0) return; + + if(i<d.rows) { + int out_id = -1, des_id = -2; + T out_max = -1e20, des_max = -1e20; + + for(int k=0; k<d.cols; k++) { + T val = out[k + i*d.stride]; + if(val > out_max) { out_max = val; out_id = k; } + } + for(int k=0; k<d.cols; k++) { + T val = des[k + i*d.stride]; + if(val > des_max) { des_max = val; des_id = k; } + } + + match[i] = ((out_id == des_id)?1:0); + } +} + + +template<typename T> +__device__ +static int _max_id_reduce(T val[],int idx[]) { + + // Total number of active threads + int nTotalThreads = blockDim.x; + __syncthreads(); + + while(nTotalThreads > 1) { + int halfPoint = ((1+nTotalThreads) >> 1); // divide by two + // only the first half of the threads will be active. + if (threadIdx.x < halfPoint) { + // Get the shared value stored by another thread + T temp = -1e20; + if(threadIdx.x+halfPoint < nTotalThreads) { + temp = val[idx[threadIdx.x + halfPoint]]; + } + if (temp > val[idx[threadIdx.x]]) idx[threadIdx.x]=idx[threadIdx.x + halfPoint]; + } + __syncthreads(); + nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two. + } + // the result + return idx[0]; +} + + + + + + +template<typename T> +__global__ +static void _check_class_reduce(const T* out, const T* des, int* match, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + if(blockIdx.x > 0) return; + if(blockDim.y != 1) return; + + __shared__ T value[256]; + __shared__ int index[256]; + + value[threadIdx.x] = out[i+j*d.stride]; + index[threadIdx.x] = threadIdx.x; + __syncthreads(); + + int out_max = _max_id_reduce(value,index); + __syncthreads(); + + value[threadIdx.x] = des[i+j*d.stride]; + index[threadIdx.x] = threadIdx.x; + __syncthreads(); + + int des_max = _max_id_reduce(value,index); + __syncthreads(); + + if(threadIdx.x == 0) { + match[j] = ((out_max == des_max)?1:0); + } +} + + + + +/************** + * C wrappers around CUDA kernels + */ +//:FLOAT: +//CuMatrix +void cudaF_set_const(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) +{ _set_const<<<Gr,Bl>>>(mat,value,d); } + +void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) +{ _apply_log<<<Gr,Bl>>>(mat,d); } + +void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask) +{ _apply_mask<<<Gr,Bl>>>(mat,mask,dmat,dmask); } + +void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d) +{ _apply_l1<<<Gr,Bl>>>(mat,l1,d); } + +void cudaF_scale_cols(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) +{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); } + +void cudaF_scale_rows(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) +{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); } + +void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d) +{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); } + +void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d) +{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); } + +void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d) +{ _mul_elem<<<Gr,Bl>>>(mat,A,d); } + +void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d) +{ _log_elem<<<Gr,Bl>>>(mat,d); } + +//CuVector +void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d) +{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); } + +void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d) +{ _add_col_sum_reduce<<<Gr,Bl>>>(alpha,mat,beta,vec,d); } + +//CuMath +void cudaF_sigmoid (dim3 Gr, dim3 Bl, float *y, const float*x, MatrixDim d) +{ _sigmoid<<<Gr,Bl>>>(y, x, d); } + +void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float*eout, const float*e, const float*y, MatrixDim d) { + _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d); +} + +void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d) +{ _softmax<<<Gr,Bl>>>(y, x, d); } + +void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d) +{ _softmax_reduce<<<Gr,Bl>>>(y, x, d); } + + +void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in) +{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); } + + +void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); } + + +void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); } + + +void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d) +{ _check_class<<<Gr,Bl>>>(out,des,match,d); } + +void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d) +{ _check_class_reduce<<<Gr,Bl>>>(out,des,match,d); } + + + + +//:DOUBLE: +//CuMatrix +void cudaD_set_const(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) +{ _set_const<<<Gr,Bl>>>(mat,value,d); } + +void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) +{ _apply_log<<<Gr,Bl>>>(mat,d); } + +void cudaD_scale_cols(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) +{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); } + +void cudaD_scale_rows(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) +{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); } + +void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d) +{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); } + +void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d) +{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); } + +void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d) +{ _mul_elem<<<Gr,Bl>>>(mat,A,d); } + +void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d) +{ _log_elem<<<Gr,Bl>>>(mat,d); } + +//CuVector +void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d) +{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); } + +//CuMath +void cudaD_sigmoid (dim3 Gr, dim3 Bl, double *y, const double*x, MatrixDim d) +{ _sigmoid<<<Gr,Bl>>>(y, x, d); } + + +void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double*eout, const double*e, const double*y, MatrixDim d) { + _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d); +} + +void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d) +{ _softmax<<<Gr,Bl>>>(y, x, d); } + + +void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in) +{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); } + + +void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); } + + +void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in) +{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); } + + +void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d) +{ _check_class<<<Gr,Bl>>>(out,des,match,d); } + + + + diff --git a/src/CuBaseLib/cukernels.h b/src/CuBaseLib/cukernels.h new file mode 100644 index 0000000..d8320b5 --- /dev/null +++ b/src/CuBaseLib/cukernels.h @@ -0,0 +1,81 @@ +#ifndef _cuda_kernels_h_ +#define _cuda_kernels_h_ + + +extern "C" { + +#pragma GCC diagnostic ignored "-Wshadow"; +#include <vector_types.h> +#pragma GCC diagnostic warning "-Wshadow"; + + typedef struct MatrixDim_ { + int rows; + int cols; + int stride; + } MatrixDim; + + /************* + * Float instances + */ + //CuMatrix + void cudaF_set_const(dim3 Gr, dim3 Bl, float*mat, float value, MatrixDim d); + void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); + void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask); + void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d); + void cudaF_scale_cols(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d); + void cudaF_scale_rows(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d); + void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d); + void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d); + void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d); + void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d); + + //CuVector + void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d); + void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d); + + //CuMath + void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d); + void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d); + void cudaF_sigmoid (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d); + void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float* eout, const float* e, const float* y, MatrixDim d); + + void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in); + void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in); + void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in); + + void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d); + void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d); + + + + /************* + * Double instances + */ + //CuMatrix + void cudaD_set_const(dim3 Gr, dim3 Bl, double*mat, double value, MatrixDim d); + void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); + void cudaD_scale_cols(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d); + void cudaD_scale_rows(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d); + void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d); + void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d); + void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d); + void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d); + + //CuVector + void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d); + + //CuMath + void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d); + void cudaD_sigmoid (dim3 Gr, dim3 Bl, double*y, const double*x, MatrixDim d); + void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double* eout, const double* e, const double* y, MatrixDim d); + + void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in); + void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in); + void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in); + + void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d); + + +} + +#endif diff --git a/src/CuBaseLib/cumath.cc b/src/CuBaseLib/cumath.cc new file mode 100644 index 0000000..d718324 --- /dev/null +++ b/src/CuBaseLib/cumath.cc @@ -0,0 +1,574 @@ + + + +#include "cumath.h" +#include "cukernels.h" + + +namespace TNet { + + ////////////////////////////////////////////////////////////////////////////// + //// CuMath<> Template specializations (float) + //// + template<> + void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK)); + + cudaF_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + template<> + void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK)); + + cudaF_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X) + { + Timer tim; tim.Start(); + +#if 0 + //disable 'reduce' functions + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(X.Rows(),CUBLOCK); + + cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); +#else + if(X.Cols() > 256) { + //use old implementation (can't use reduction due to + //limited size of shared memory) + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(X.Rows(),CUBLOCK); + + cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); + } else { + //use implementation with reduction + dim3 dimBlock(X.Cols(),1); + dim3 dimGrid(1,X.Rows()); + + cudaF_softmax_reduce(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); + } +#endif + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf) + { + Timer tim; tim.Start(); + + assert(Y.Rows() == X.Rows()); + assert((X.Cols() % block_transf.Rows()) == 0); + assert((Y.Cols() % block_transf.Cols()) == 0); + assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols())); + + int blocks = X.Cols() / block_transf.Rows(); + + for(int i = 0; i < blocks; i++) { + int m = block_transf.Cols(); + int n = X.Rows(); + int k = block_transf.Rows(); + + /* + //DEBUG MESSAGE + std::cout << "N N " << m << " " << n << " " << k << " " + << 1.0 << " " << block_transf << " " << block_transf.Stride() + << " " << X+i*k << " " << X.Stride() << " " + << 0.0 << " " << Y+i*n << " " << Y.Stride() + << "\n" << std::flush; + */ + + + cublasSgemm('N', 'N', m, n, k, + 1.0, block_transf.pCUData(), block_transf.Stride(), + X.pCUData()+i*k, X.Stride(), + 0.0, Y.pCUData()+i*m, Y.Stride()); + } + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + template<> + void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets) + { + Timer tim; tim.Start(); + + assert(Y.Rows() == X.Rows()); + assert(X.Cols() * frameOffsets.Dim() == Y.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); + + cudaF_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom) + { + Timer tim; tim.Start(); + + assert(copyFrom.Dim() == Y.Cols()); + assert(Y.Rows() == X.Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); + + cudaF_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom) + { + Timer tim; tim.Start(); + + assert(X.Cols() == Y.Cols()); + assert(X.Rows() == Y.Rows()); + assert(copyFrom.Dim() <= Y.Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK)); + + MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim(); + MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim(); + + cudaF_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match) + { + Timer tim; tim.Start(); + + assert(out.Cols() == des.Cols()); + assert(out.Rows() == des.Rows()); + assert(out.Stride() == des.Stride()); + assert(match.Dim() == out.Rows()); + + if(out.Cols() > 256) { + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(out.Rows(),CUBLOCK); + + cudaF_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); + cuSafeCall(cudaGetLastError()); + } else { + dim3 dimBlock(out.Cols(),1); + dim3 dimGrid(1,out.Rows()); + + cudaF_check_class_reduce(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); + cuSafeCall(cudaGetLastError()); + } + + + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC) + { + Timer tim; tim.Start(); + // CUBLAS is col major, TNet is row major + // keep trans..., just swap A&B argumets: A->B B->A + // + // WARNING + // NO DIMENSION CHECK!!! + + //m,n,k is cublas m,n,k + size_t m = ((transB=='T' || transB=='t')? B.Rows() : B.Cols()); + size_t n = ((transA=='T' || transA=='t')? A.Cols() : A.Rows()); + size_t k = ((transB=='T' || transB=='t')? B.Cols() : B.Rows()); + size_t k1 = ((transA=='T' || transA=='t')? A.Rows() : A.Cols()); + + k = ((k<k1)?k:k1); + m = ((m<C.Cols())?m:C.Cols()); + n = ((n<C.Rows())?m:C.Rows()); + +#if 0 + std::cout << "A " << transA << " "<< A.Rows() << " " << A.Cols() << " " << A.Stride() << " " << offA + << "; B " << transB << " "<< B.Rows() << " " << B.Cols() << " " << B.Stride() << " " << offB + << "; C " << C.Rows() << " " << C.Cols() << " " << C.Stride() << " " << offC + << "; alpha" << alpha << " beta" << beta << " REALmnk:" << m <<" "<< n <<" "<< k << std::endl; +#endif + + + cublasSgemm(transB, transA, m, n, k, + alpha, B.pCUData()+offB, B.Stride(), + A.pCUData()+offA, A.Stride(), + beta, C.pCUData()+offC, C.Stride()); + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + +/* + + template<> + void CuMath<float>::Gemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY) + { + Timer tim; tim.Start(); + // CUBLAS is col major, TNet is row major + // y = alpha * op(A) * x + beta * y, + + size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) + size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) + + // switch the trans parameter! + char cu_trans; + if(trans == 't' || trans == 'T') { + cu_trans = 'n'; + } else if (trans == 'n' || trans == 'N') { + cu_trans = 't'; + } else { + Error(std::string("Unknown trans")+trans); + } + + //check the dims + if(cu_trans == 'n') { + assert(dimX == n); + assert(dimY == m); + } else { + assert(dimX == m); + assert(dimY == n); + } + + //run gemv + cublasSgemv(cu_trans,m,n,alpha, + A.pCUData(), A.Stride(), x, 1, + beta, y, 1); + + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + */ + + /** + * offsetY tells how many outputs of 'Ax' mutiplication is skipped at the beginning, + */ + template<> + void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY) + { + Timer tim; tim.Start(); + // CUBLAS is col major, TNet is row major + // y = alpha * op(A) * x + beta * y, + + size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) + size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) + + // switch the trans parameter! + char cu_trans; + if(trans == 't' || trans == 'T') { + cu_trans = 'n'; + } else if (trans == 'n' || trans == 'N') { + cu_trans = 't'; + } else { + Error(std::string("Unknown trans")+trans); + } + + // select part of matrix for compute + size_t cu_offset = 0; + if(cu_trans == 'n') { + cu_offset += offsetY; + assert(m >= dimY+offsetY); + m = dimY; + } else { + cu_offset += offsetY*A.Stride(); + assert(n >= dimY+offsetY); + n = dimY; + } + + //check the dims + if(cu_trans == 'n') { + assert(dimX == n); + assert(dimY == m); + } else { + assert(dimX == m); + assert(dimY == n); + } + + //run gemv + cublasSgemv(cu_trans,m,n,alpha, + A.pCUData()+cu_offset, A.Stride(), x, 1, + beta, y, 1); + + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A) { + Timer tim; tim.Start(); + // CUBLAS is col major, TNet is row major + // -> switch x and y + + // A = alpha * x * transpose(y) + A, + + assert(dimX == A.Rows()); + assert(dimY == A.Cols()); + + size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) + size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) + + cublasSger(m,n,alpha,y,1,x,1,A.pCUData(),A.Stride()); + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out) + { + Timer tim; tim.Start(); + + assert(out.Dim() % in.Dim() == 0); + int n_copies = out.Dim()/in.Dim(); + CuVector<int> offsets(n_copies); + //offsets.SetConst(0); done implicitly! + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(out.Dim(), CUBLOCK)); + + MatrixDim dim_in = { 1, in.Dim(), in.Dim() }; + MatrixDim dim_out = { 1, out.Dim(), out.Dim() }; + cudaF_expand(dimGrid, dimBlock, out.pCUData(), in.pCUData(), offsets.pCUData(), dim_out, dim_in); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out) + { + Timer tim; tim.Start(); + + assert(in.Dim() % out.Dim() == 0); + + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(out.Dim(),CUBLOCK); + + MatrixDim dim = { in.Dim()/out.Dim(), out.Dim(), out.Dim() }; + + cudaF_add_col_sum(dimGrid,dimBlock,alpha,in.pCUData(),beta,out.pCUData(),dim); + + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + ////////////////////////////////////////////////////////////////////////////// + //// CuMath<> Template specializations (double) + //// + template<> + void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK)); + + cudaD_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + template<> + void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK)); + + cudaD_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X) + { + Timer tim; tim.Start(); + + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(X.Rows(),CUBLOCK); + + cudaD_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf) + { + Timer tim; tim.Start(); + + assert(Y.Rows() == X.Rows()); + assert((X.Cols() % block_transf.Rows()) == 0); + assert((Y.Cols() % block_transf.Cols()) == 0); + assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols())); + + int blocks = X.Cols() / block_transf.Rows(); + + for(int i = 0; i < blocks; i++) { + int m = block_transf.Cols(); + int n = X.Rows(); + int k = block_transf.Rows(); + + /* + //DEBUG MESSAGE + std::cout << "N N " << m << " " << n << " " << k << " " + << 1.0 << " " << block_transf << " " << block_transf.Stride() + << " " << X+i*k << " " << X.Stride() << " " + << 0.0 << " " << Y+i*n << " " << Y.Stride() + << "\n" << std::flush; + */ + + + cublasDgemm('N', 'N', m, n, k, + 1.0, block_transf.pCUData(), block_transf.Stride(), + X.pCUData()+i*k, X.Stride(), + 0.0, Y.pCUData()+i*m, Y.Stride()); + } + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + template<> + void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets) + { + Timer tim; tim.Start(); + + assert(Y.Rows() == X.Rows()); + assert(X.Cols() * frameOffsets.Dim() == Y.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); + + cudaD_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom) + { + Timer tim; tim.Start(); + + assert(copyFrom.Dim() == Y.Cols()); + assert(Y.Rows() == X.Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); + + cudaD_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom) + { + Timer tim; tim.Start(); + + assert(X.Cols() == Y.Cols()); + assert(X.Rows() == Y.Rows()); + assert(copyFrom.Dim() <= Y.Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK)); + + MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim(); + MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim(); + + cudaD_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match) + { + Timer tim; tim.Start(); + + assert(out.Cols() == des.Cols()); + assert(out.Rows() == des.Rows()); + assert(out.Stride() == des.Stride()); + assert(match.Dim() == out.Rows()); + + size_t dimBlock = CUBLOCK; + size_t dimGrid = n_blocks(out.Rows(),CUBLOCK); + + cudaD_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + +} diff --git a/src/CuBaseLib/cumath.h b/src/CuBaseLib/cumath.h new file mode 100644 index 0000000..5680082 --- /dev/null +++ b/src/CuBaseLib/cumath.h @@ -0,0 +1,146 @@ +#ifndef _CUMATH_H_ +#define _CUMATH_H_ + +#include "cumatrix.h" + +#include "Timer.h" +#include "cudevice.h" + +namespace TNet { + + + /** + * Group of Math operations for the NN training + */ + template<typename _ElemT> + class CuMath + { + public: + + /// Y = Sigmoid(X) + static void Sigmoid(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X) + { Error("__func__ Not implemented"); } + + /// Eout = E(1-E) * Y + static void DiffSigmoid(CuMatrix<_ElemT>& Eout, const CuMatrix<_ElemT>& Ein, const CuMatrix<_ElemT>& Y) + { Error("__func__ Not implemented"); } + + /// Y = Softmax(X) + static void Softmax(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X) + { Error("__func__ Not implemented"); } + + /// for DCT in FeaCat + static void BlockLinearity(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuMatrix<_ElemT>& block_transf) + { Error("__func__ Not implemented"); } + + static void Expand(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& frameOffsets) + { Error("__func__ Not implemented"); } + + /// ie. switch cols according to copyFrom + static void Rearrange(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom) + { Error("__func__ Not implemented"); } + + /// ie. switch rows according to copyFrom + static void Randomize(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom) + { Error("__func__ Not implemented"); } + + /// check match in the classification for Xentropy + static void CheckClass(const CuMatrix<_ElemT>& out, const CuMatrix<_ElemT> &des, CuVector<int>& match) + { Error("__func__ Not implemented"); } + + /// gemm with offset for CuSharedLinearity + static void OffsetGemm(char transA, char transB, _ElemT alpha, const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B, _ElemT beta, CuMatrix<_ElemT>& C, int offA, int offB, int offC) + { Error("__func__ Not implemented"); } + + /// gemv with offset for CuRecurrent + static void OffsetGemv(char trans, _ElemT alpha, const CuMatrix<_ElemT>& A, const _ElemT* x, size_t dimX, _ElemT beta, _ElemT* y, size_t dimY, size_t offsetY) + { Error("__func__ Not implemented"); } + + /// ger for weight updates in CuRecurrent + static void BlasGer(_ElemT alpha, const _ElemT* x, size_t dimX, const _ElemT* y, size_t dimY, CuMatrix<_ElemT>& A) + { Error("__func__ Not implemented"); } + + /// concatenate one vector several times for CuSharedLinearity + static void VecExpand(const CuVector<_ElemT>&in, CuVector<_ElemT>&out) + { Error("__func__ Not implemented"); } + + /// sum the vector as if it was matrix data for CuSharedLinearity + static void VecAddColSum(_ElemT alpha, const CuVector<_ElemT>&in, _ElemT beta, CuVector<_ElemT>&out) + { Error("__func__ Not implemented"); } + + }; //class CuMath:: + + + ////////////////////////////////////////////////////////////////////////////// + //// CuMath<> Template specializations (float) + //// + template<> + void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X); + + template<> + void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y); + + template<> + void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X); + + template<> + void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf); + + template<> + void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets); + + template<> + void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom); + + template<> + void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom); + + template<> + void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match); + + template<> + void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC); + + template<> + void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY); + + template<> + void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A); + + template<> + void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out); + + template<> + void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out); + + + ////////////////////////////////////////////////////////////////////////////// + //// CuMath<> Template specializations (double) + //// + template<> + void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X); + + template<> + void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y); + + template<> + void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X); + + template<> + void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf); + + template<> + void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets); + + template<> + void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom); + + template<> + void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom); + + template<> + void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match); + +} + +#endif diff --git a/src/CuBaseLib/cumatrix.h b/src/CuBaseLib/cumatrix.h new file mode 100644 index 0000000..887b92d --- /dev/null +++ b/src/CuBaseLib/cumatrix.h @@ -0,0 +1,221 @@ +#ifndef _CUMATRIX_H_ +#define _CUMATRIX_H_ + +#include <sstream> + +#include "Matrix.h" +#include "cukernels.h" + + + +namespace TNet { + + template<typename _ElemT> class CuVector; + + /** + * \brief Matrix for CUDA computing + */ + template<typename _ElemT> + class CuMatrix + { + typedef CuMatrix<_ElemT> ThisType; + + public: + + /// Default Constructor + CuMatrix<_ElemT>() + : mRows(0), mCols(0), mStride(0), mpCUData(NULL),isOwn(false) + { } + /// Constructor with memory initialisation + CuMatrix<_ElemT>(size_t rows, size_t cols) + : mRows(0), mCols(0), mStride(0), mpCUData(NULL),isOwn(false) + { Init(rows, cols); } + + /// Destructor + ~CuMatrix() + { Destroy(); } + + /// Dimensions + size_t Rows() const + { return mRows; } + + size_t Cols() const + { return mCols; } + + size_t Stride() const + { return mStride; } + + ::MatrixDim Dim() const + { ::MatrixDim d = { + static_cast<int>(mRows), + static_cast<int>(mCols), + static_cast<int>(mStride) + }; + return d; + } + + /// Get raw pointer + const _ElemT* pCUData() const + { return mpCUData; } + _ElemT* pCUData() + { return mpCUData; } + + /// Get raw row pointer + const _ElemT* pCURowData(size_t r) const + { assert(r < Rows()); return mpCUData+r*mStride; } + _ElemT* pCURowData(size_t r) + { assert(r < Rows()); return mpCUData+r*mStride; } + + /// Get size of matrix in bytes + size_t MSize() const + { return mRows*mStride*sizeof(_ElemT); } + /// Get size of matrix row in bytes + size_t MRowSize() const + { return mStride*sizeof(_ElemT); } + + /// Allocate the memory + ThisType& Init(size_t rows, size_t cols); + /// Copy the ptr of rSrc starting at x with span of cols + ThisType& Init(CuMatrix<_ElemT>& rSrc, size_t x, size_t cols); + /// Copy the settings of rSrc + ThisType& Init(CuMatrix<_ElemT>& rSrc); + + /// Deallocate the memory + void Destroy(); + + /// Copy functions (reallocates when needed) + ThisType& CopyFrom(const CuMatrix<_ElemT>& rSrc); + ThisType& CopyFrom(const Matrix<_ElemT>& rSrc); + Matrix<_ElemT>& CopyTo(Matrix<_ElemT>& rDst) const; + + /// Copy rowCnt rows from rSrc, starting by row srcOri, + /// copying to memory block starting by row dstOri + void CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri); + + /// Copy colCnt columns from rSrc, starting by col srcOri, + /// copying to memory block starting by row dstOri + void CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri); + + + // Math operations, some calling kernels + // + void SetZero(); + + void SetConst(_ElemT value) + { Error("__func__ Not implemented"); } + + /// Natural Logarithm of every elements + void ApplyLog() + { Error("__func__ Not implemented"); } + + /// Setting values to zero if mask[i][j]==0 + void ApplyMask(const CuMatrix<BaseFloat>& mask) + { Error("__func__ Not implemented"); } + + /** + * \brief Apply Lasso function + * + * \param l1 \f$ L^1 \_ Norm \f$ function parameter + * + * Lasso: \f[ Y_{ij} = \left\{ + * \begin{array}{lr} + * X_{ij} + l1 & , X_{ij} < -l1 \\ + * 0 & , |X_{ij}| \le l1 \\ + * X_{ij} - l1 & , X_{ij} > -l1 + * \end{array} + * \right. \f] + */ + void ApplyL1(BaseFloat l1) + { Error("__func__ Not implemented"); } + + /// scale i'th column by scale[i] + void ScaleCols(const CuVector<_ElemT>& scale) + { Error("__func__ Not implemented"); } + + /// scale i'th row by scale[i] + void ScaleRows(const CuVector<_ElemT>& scale) + { Error("__func__ Not implemented"); } + + /// B = aplha * A + beta * B + void AddScaled(_ElemT alpha, const CuMatrix<_ElemT>& A, _ElemT beta) + { Error("__func__ Not implemented"); } + + /// B = aplha * row + beta * B + void AddScaledRow(_ElemT alpha, const CuVector<_ElemT>& row, _ElemT beta) + { Error("__func__ Not implemented"); } + + /// C = alpha * A(^T)*B(^T) + beta * C + void Gemm(char transa, char transb, + _ElemT alpha, + const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B, + _ElemT beta) + { Error("__func__ Not implemented"); } + + /// A = alpha * x*y^T + A + void BlasGer(_ElemT alpha, + const CuVector<_ElemT>& x, const CuVector<_ElemT>& y) + { Error("__func__ Not implemented"); } + + + /// Multiply two matrices elementhwise: C = A .* C + void MulElem(const CuMatrix<_ElemT>& A) + { Error("__func__ Not implemented"); } + + /// A = log(A) + void LogElem() + { Error("__func__ Not implemented"); } + + void Print() const + { + Matrix<_ElemT> mat(Rows(),Cols()); + CopyTo(mat); + std::cout << mat; + } + + + + /// Check if contains invalid value + void CheckData() + { + Matrix<_ElemT> mat; + CopyTo(mat); + for(size_t i=0; i<Rows(); i++) { + for(size_t j=0; j<Cols(); j++) { + if(std::isnan(mat(i,j)) || std::isinf(mat(i,j))) { + std::ostringstream os; + os << "Invalid value:" << mat(i,j) << "at row"<<i<<" col"<<j<<"\n"; + Error(os.str()); + } + } + } + } + + + private: + size_t mRows; + size_t mCols; + size_t mStride; + + _ElemT* mpCUData; + + bool isOwn; + + }; + + + /// Prints the matrix dimensions and pointer to stream + template<typename _ElemT> + inline std::ostream& operator << (std::ostream& out, const CuMatrix<_ElemT>& mat) + { + out << "[CUMATRIX R" << mat.Rows() << " C" << mat.Cols() << " S" << mat.Stride() + << " PTR" << mat.pCUData() << "]" << std::flush; + return out; + } + + +} + + +#include "cumatrix.tcc" + +#endif diff --git a/src/CuBaseLib/cumatrix.h~ b/src/CuBaseLib/cumatrix.h~ new file mode 100644 index 0000000..a0ad5cd --- /dev/null +++ b/src/CuBaseLib/cumatrix.h~ @@ -0,0 +1,214 @@ +#ifndef _CUMATRIX_H_ +#define _CUMATRIX_H_ + +#include <sstream> + +#include "Matrix.h" +#include "cukernels.h" + + + +namespace TNet { + + template<typename _ElemT> class CuVector; + + /** + * \brief Matrix for CUDA computing + */ + template<typename _ElemT> + class CuMatrix + { + typedef CuMatrix<_ElemT> ThisType; + + public: + + /// Default Constructor + CuMatrix<_ElemT>() + : mRows(0), mCols(0), mStride(0), mpCUData(NULL) + { } + /// Constructor with memory initialisation + CuMatrix<_ElemT>(size_t rows, size_t cols) + : mRows(0), mCols(0), mStride(0), mpCUData(NULL) + { Init(rows, cols); } + + /// Destructor + ~CuMatrix() + { Destroy(); } + + /// Dimensions + size_t Rows() const + { return mRows; } + + size_t Cols() const + { return mCols; } + + size_t Stride() const + { return mStride; } + + ::MatrixDim Dim() const + { ::MatrixDim d = { + static_cast<int>(mRows), + static_cast<int>(mCols), + static_cast<int>(mStride) + }; + return d; + } + + /// Get raw pointer + const _ElemT* pCUData() const + { return mpCUData; } + _ElemT* pCUData() + { return mpCUData; } + + /// Get raw row pointer + const _ElemT* pCURowData(size_t r) const + { assert(r < Rows()); return mpCUData+r*mStride; } + _ElemT* pCURowData(size_t r) + { assert(r < Rows()); return mpCUData+r*mStride; } + + /// Get size of matrix in bytes + size_t MSize() const + { return mRows*mStride*sizeof(_ElemT); } + /// Get size of matrix row in bytes + size_t MRowSize() const + { return mStride*sizeof(_ElemT); } + + /// Allocate the memory + ThisType& Init(size_t rows, size_t cols); + + /// Deallocate the memory + void Destroy(); + + /// Copy functions (reallocates when needed) + ThisType& CopyFrom(const CuMatrix<_ElemT>& rSrc); + ThisType& CopyFrom(const Matrix<_ElemT>& rSrc); + Matrix<_ElemT>& CopyTo(Matrix<_ElemT>& rDst) const; + + /// Copy rowCnt rows from rSrc, starting by row srcOri, + /// copying to memory block starting by row dstOri + void CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri); + + /// Copy colCnt columns from rSrc, starting by col srcOri, + /// copying to memory block starting by row dstOri + void CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri); + + + // Math operations, some calling kernels + // + void SetZero(); + + void SetConst(_ElemT value) + { Error("__func__ Not implemented"); } + + /// Natural Logarithm of every elements + void ApplyLog() + { Error("__func__ Not implemented"); } + + /// Setting values to zero if mask[i][j]==0 + void ApplyMask(const CuMatrix<BaseFloat>& mask) + { Error("__func__ Not implemented"); } + + /** + * \brief Apply Lasso function + * + * \param l1 \f$ L^1 \_ Norm \f$ function parameter + * + * Lasso: \f[ Y_{ij} = \left\{ + * \begin{array}{lr} + * X_{ij} + l1 & , X_{ij} < -l1 \\ + * 0 & , |X_{ij}| \le l1 \\ + * X_{ij} - l1 & , X_{ij} > -l1 + * \end{array} + * \right. \f] + */ + void ApplyL1(BaseFloat l1) + { Error("__func__ Not implemented"); } + + /// scale i'th column by scale[i] + void ScaleCols(const CuVector<_ElemT>& scale) + { Error("__func__ Not implemented"); } + + /// scale i'th row by scale[i] + void ScaleRows(const CuVector<_ElemT>& scale) + { Error("__func__ Not implemented"); } + + /// B = aplha * A + beta * B + void AddScaled(_ElemT alpha, const CuMatrix<_ElemT>& A, _ElemT beta) + { Error("__func__ Not implemented"); } + + /// B = aplha * row + beta * B + void AddScaledRow(_ElemT alpha, const CuVector<_ElemT>& row, _ElemT beta) + { Error("__func__ Not implemented"); } + + /// C = alpha * A(^T)*B(^T) + beta * C + void Gemm(char transa, char transb, + _ElemT alpha, + const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B, + _ElemT beta) + { Error("__func__ Not implemented"); } + + /// A = alpha * x*y^T + A + void BlasGer(_ElemT alpha, + const CuVector<_ElemT>& x, const CuVector<_ElemT>& y) + { Error("__func__ Not implemented"); } + + + /// Multiply two matrices elementhwise: C = A .* C + void MulElem(const CuMatrix<_ElemT>& A) + { Error("__func__ Not implemented"); } + + /// A = log(A) + void LogElem() + { Error("__func__ Not implemented"); } + + void Print() const + { + Matrix<_ElemT> mat(Rows(),Cols()); + CopyTo(mat); + std::cout << mat; + } + + + + void CheckData() + { + Matrix<_ElemT> mat; + CopyTo(mat); + for(size_t i=0; i<Rows(); i++) { + for(size_t j=0; j<Cols(); j++) { + if(std::isnan(mat(i,j)) || std::isinf(mat(i,j))) { + std::ostringstream os; + os << "Invalid value:" << mat(i,j) << "at row"<<i<<" col"<<j<<"\n"; + Error(os.str()); + } + } + } + } + + + private: + size_t mRows; + size_t mCols; + size_t mStride; + + _ElemT* mpCUData; + + }; + + + /// Prints the matrix dimensions and pointer to stream + template<typename _ElemT> + inline std::ostream& operator << (std::ostream& out, const CuMatrix<_ElemT>& mat) + { + out << "[CUMATRIX R" << mat.Rows() << " C" << mat.Cols() << " S" << mat.Stride() + << " PTR" << mat.pCUData() << "]" << std::flush; + return out; + } + + +} + + +#include "cumatrix.tcc" + +#endif diff --git a/src/CuBaseLib/cumatrix.tcc b/src/CuBaseLib/cumatrix.tcc new file mode 100644 index 0000000..7d6a136 --- /dev/null +++ b/src/CuBaseLib/cumatrix.tcc @@ -0,0 +1,660 @@ + +#include <cuda_runtime_api.h> +#include <cublas.h> + +#include "Timer.h" +#include "cucommon.h" +#include "cuvector.h" +#include "cudevice.h" + +namespace TNet { + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuMatrix<_ElemT>& + CuMatrix<_ElemT>:: + Init(size_t rows, size_t cols) + { + if(mRows == rows && mCols == cols) { + //SetZero(); + return *this; + } + + Destroy(); + + size_t row_bytes = cols * sizeof(_ElemT); + size_t pitch; + cuSafeCall(cudaMallocPitch((void**)&mpCUData, &pitch, row_bytes, rows+1)); + mRows = rows; mCols = cols; + mStride = pitch/sizeof(_ElemT); + SetZero(); + + isOwn=true; + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuMatrix<_ElemT>& + CuMatrix<_ElemT>:: + Init(CuMatrix<_ElemT>& rSrc, size_t x, size_t cols) + { + mRows = rSrc.Rows(); + mCols = cols; + mStride = rSrc.Stride(); + mpCUData = rSrc.pCUData() + x; + isOwn=false; + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuMatrix<_ElemT>& + CuMatrix<_ElemT>:: + Init(CuMatrix<_ElemT>& rSrc) + { + mRows = rSrc.Rows(); + mCols = rSrc.Cols(); + mStride = rSrc.Stride(); + mpCUData = rSrc.pCUData(); + isOwn=false; + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuMatrix<_ElemT>:: + Destroy() + { + if(NULL != mpCUData && isOwn) { + cuSafeCall(cudaFree(mpCUData)); + } + mpCUData = NULL; + mRows = mCols = mStride = 0; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuMatrix<_ElemT>& + CuMatrix<_ElemT>:: + CopyFrom(const CuMatrix<_ElemT>& rSrc) + { + Init(rSrc.Rows(),rSrc.Cols()); + + Timer tim; tim.Start(); + + size_t dst_pitch = mStride*sizeof(_ElemT); + size_t src_pitch = rSrc.Stride()*sizeof(_ElemT); + size_t width = rSrc.Cols()*sizeof(_ElemT); + cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pCUData(), src_pitch, width, rSrc.Rows(), cudaMemcpyDeviceToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromD2D",tim.Val()); + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuMatrix<_ElemT>& + CuMatrix<_ElemT>:: + CopyFrom(const Matrix<_ElemT>& rSrc) + { + Init(rSrc.Rows(),rSrc.Cols()); + + Timer tim; tim.Start(); + + size_t dst_pitch = mStride*sizeof(_ElemT); + size_t src_pitch = rSrc.Stride()*sizeof(_ElemT); + size_t width = rSrc.Cols()*sizeof(_ElemT); + cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pData(), src_pitch, width, rSrc.Rows(), cudaMemcpyHostToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromH2D",tim.Val()); + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + Matrix<_ElemT>& + CuMatrix<_ElemT>:: + CopyTo(Matrix<_ElemT>& rDst) const + { + if(rDst.Rows() != Rows() || rDst.Cols() != Cols()) { + rDst.Init(Rows(),Cols()); + } + + Timer tim; tim.Start(); + + size_t src_pitch = mStride*sizeof(_ElemT); + size_t dst_pitch = rDst.Stride()*sizeof(_ElemT); + size_t width = Cols()*sizeof(_ElemT); + cuSafeCall(cudaMemcpy2D(rDst.pData(), dst_pitch, pCUData(), src_pitch, width, Rows(), cudaMemcpyDeviceToHost)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToD2H",tim.Val()); + + return rDst; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuMatrix<_ElemT>:: + CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri) + { + assert(rowCnt+srcOri <= rSrc.Rows()); + assert(rowCnt+dstOri <= Rows()); + assert(Cols() == rSrc.Cols()); + + Timer tim; tim.Start(); + + size_t dst_pitch = mStride*sizeof(_ElemT); + size_t src_pitch = rSrc.Stride()*sizeof(_ElemT); + size_t width = rSrc.Cols()*sizeof(_ElemT); + + const _ElemT* p_src = rSrc.pCUData() + srcOri*rSrc.Stride(); + _ElemT* p_dst = mpCUData + dstOri*mStride; + + cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, rowCnt, cudaMemcpyDeviceToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyRowsD2D",tim.Val()); + + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuMatrix<_ElemT>:: + CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri) + { + assert(colCnt+srcOri <= rSrc.Cols()); + assert(colCnt+dstOri <= Cols()); + assert(Rows() == rSrc.Rows()); + + Timer tim; tim.Start(); + + size_t dst_pitch = mStride*sizeof(_ElemT); + size_t src_pitch = rSrc.Stride()*sizeof(_ElemT); + size_t width = colCnt*sizeof(_ElemT); + + const _ElemT* p_src = rSrc.pCUData() + srcOri; + _ElemT* p_dst = mpCUData + dstOri; + + cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, Rows(), cudaMemcpyDeviceToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyColsD2D",tim.Val()); + + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuMatrix<_ElemT>:: + SetZero() + { + Timer tim; tim.Start(); + cuSafeCall(cudaMemset(mpCUData, 0, mRows*mStride*sizeof(_ElemT))); + tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero",tim.Val()); + } + + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + + //////////////////////////////////////////////////////////////////////// + //// CuMatrix:: templeate specializations (float) + //// + template<> + inline void CuMatrix<float>::SetConst(float value) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + (dimGrid,dimBlock,mpCUData,value,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::ApplyLog() + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_apply_log(dimGrid,dimBlock,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::ApplyMask(const CuMatrix<BaseFloat>& mask) + { + Timer tim; tim.Start(); + + assert(mask.Rows() == Rows()); + assert(mask.Cols() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_apply_mask(dimGrid,dimBlock,mpCUData,mask.pCUData(),Dim(),mask.Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::ApplyL1(float l1) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_apply_l1(dimGrid,dimBlock,mpCUData,l1,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::ScaleCols(const CuVector<float>& scale) + { + Timer tim; tim.Start(); + + assert(scale.Dim() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_scale_cols(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<float>::ScaleRows(const CuVector<float>& scale) + { + Timer tim; tim.Start(); + + assert(scale.Dim() == Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_scale_rows(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<float>::AddScaled(float alpha, const CuMatrix<float>& A, float beta) + { + Timer tim; tim.Start(); + + assert(A.Rows() == Rows()); + assert(A.Cols() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_add_scaled(dimGrid,dimBlock,alpha,A.pCUData(),beta,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<float>::AddScaledRow(float alpha, const CuVector<float>& row, float beta) + { + Timer tim; tim.Start(); + + if(row.Dim() != Cols()) { + std::ostringstream os; + os << "Non matching dimensions: Cols:" << Cols() << " VectorDim:" << row.Dim(); + Error(os.str()); + } + assert(row.Dim() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<float>::Gemm(char transa, char transb, + float alpha, + const CuMatrix<float>& A, const CuMatrix<float>& B, + float beta) + { + // CUBLAS is col major, TNet is row major + // keep trans..., just swap A&B argumets: A->B B->A + size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols()); + size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows()); + size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows()); + size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols()); + + assert(m == Cols()); + assert(n == Rows()); + assert(k == k1); + + #if 0 + //DEBUG MESSAGE + std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " << + alpha << " " << B << " " << B.Stride() << " " << + A << " " << A.Stride() << " " << beta << " " << C << " " << + C.Stride() << "\n" << std::flush; + #endif + + Timer tim; tim.Start(); + + cublasSgemm(transb, transa, m, n, k, + alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(), + beta, mpCUData, Stride()); + + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::BlasGer(float alpha, + const CuVector<float>& x, const CuVector<float>& y) + { + // CUBLAS is col major, TNet is row major + // just swap x and y + assert(x.Dim() == Rows()); + assert(y.Dim() == Cols()); + + Timer tim; tim.Start(); + + cublasSger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride()); + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<float>::MulElem(const CuMatrix<float>& A) + { + Timer tim; tim.Start(); + + assert(mCols == A.Cols()); + assert(mRows == A.Rows()); + assert(mStride == A.Stride()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_mul_elem(dimGrid,dimBlock,mpCUData, A.pCUData(), Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<float>::LogElem() + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaF_log_elem(dimGrid,dimBlock,mpCUData, Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + + //////////////////////////////////////////////////////////////////////// + //// CuMatrix:: templeate specializations (double) + //// + template<> + inline void CuMatrix<double>::SetConst(double value) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_set_const(dimGrid,dimBlock,mpCUData,value,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<double>::ApplyLog() + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_apply_log(dimGrid,dimBlock,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<double>::ScaleCols(const CuVector<double>& scale) + { + Timer tim; tim.Start(); + + assert(scale.Dim() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_scale_cols(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<double>::ScaleRows(const CuVector<double>& scale) + { + Timer tim; tim.Start(); + + assert(scale.Dim() == Rows()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_scale_rows(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<double>::AddScaled(double alpha, const CuMatrix<double>& A, double beta) + { + Timer tim; tim.Start(); + + assert(A.Rows() == Rows()); + assert(A.Cols() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_add_scaled(dimGrid,dimBlock,alpha,A.pCUData(),beta,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<double>::AddScaledRow(double alpha, const CuVector<double>& row, double beta) + { + Timer tim; tim.Start(); + + assert(row.Dim() == Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + template<> + inline void CuMatrix<double>::Gemm(char transa, char transb, + double alpha, + const CuMatrix<double>& A, const CuMatrix<double>& B, + double beta) + { + // CUBLAS is col major, TNet is row major + // keep trans..., just swap A&B argumets: A->B B->A + size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols()); + size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows()); + size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows()); + size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols()); + + assert(m == Cols()); + assert(n == Rows()); + assert(k == k1); + + #if 0 + //DEBUG MESSAGE + std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " << + alpha << " " << B << " " << B.Stride() << " " << + A << " " << A.Stride() << " " << beta << " " << C << " " << + C.Stride() << "\n" << std::flush; + #endif + + Timer tim; tim.Start(); + + cublasDgemm(transb, transa, m, n, k, + alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(), + beta, mpCUData, Stride()); + + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + template<> + inline void CuMatrix<double>::BlasGer(double alpha, + const CuVector<double>& x, const CuVector<double>& y) + { + // CUBLAS is col major, TNet is row major + // just swap x and y + assert(x.Dim() == Rows()); + assert(y.Dim() == Cols()); + + Timer tim; tim.Start(); + + cublasDger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride()); + cuSafeCall(cublasGetError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + template<> + inline void CuMatrix<double>::MulElem(const CuMatrix<double>& A) + { + Timer tim; tim.Start(); + + assert(mCols == A.Cols()); + assert(mRows == A.Rows()); + assert(mStride == A.Stride()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_mul_elem(dimGrid,dimBlock,mpCUData, A.pCUData(), Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuMatrix<double>::LogElem() + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK)); + + cudaD_log_elem(dimGrid,dimBlock,mpCUData, Dim()); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + +} diff --git a/src/CuBaseLib/curand.h b/src/CuBaseLib/curand.h new file mode 100644 index 0000000..8aa66d5 --- /dev/null +++ b/src/CuBaseLib/curand.h @@ -0,0 +1,40 @@ +#ifndef _CU_RAND_H_ +#define _CU_RAND_H_ + + +#include "cumatrix.h" + + +namespace TNet { + + template<typename T> + class CuRand { + public: + + CuRand(size_t rows, size_t cols) + { SeedGpu(rows,cols); } + + ~CuRand() { } + + void SeedGpu(size_t rows, size_t cols); + void Rand(CuMatrix<T>& tgt); + void GaussRand(CuMatrix<T>& tgt); + + void BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states); + void AddGaussNoise(CuMatrix<T>& tgt, T gscale = 1.0); + + private: + static void SeedRandom(Matrix<unsigned>& mat); + + private: + CuMatrix<unsigned> z1, z2, z3, z4; + CuMatrix<T> tmp; + }; + +} + + +#include "curand.tcc" + + +#endif diff --git a/src/CuBaseLib/curand.tcc b/src/CuBaseLib/curand.tcc new file mode 100644 index 0000000..e337189 --- /dev/null +++ b/src/CuBaseLib/curand.tcc @@ -0,0 +1,228 @@ + +#include <cstdlib> +#include "curandkernels.h" + + +namespace TNet { + + + + template<typename T> + inline void + CuRand<T>:: + SeedGpu(size_t rows, size_t cols) + { + Matrix<unsigned> mat(rows,cols); + SeedRandom(mat); + z1.CopyFrom(mat); + SeedRandom(mat); + z2.CopyFrom(mat); + SeedRandom(mat); + z3.CopyFrom(mat); + SeedRandom(mat); + z4.CopyFrom(mat); + + /* + std::cout << "RANDININIT" << std::endl; + z1.Print(); + z2.Print(); + z3.Print(); + z4.Print(); + std::cout << "RANDININIT" << std::endl; + */ + + tmp.Init(rows,cols); + } + + + + template<typename T> + inline void + CuRand<T>:: + SeedRandom(Matrix<unsigned>& mat) { + for(size_t j=0; j<mat.Rows(); j++) { + for(size_t i=0; i<mat.Cols(); i++) { + unsigned value = 0; + while(value <= 128) { value = lrand48(); } + mat(j,i) = value; + } + } + } + + + template<typename T> + inline void + CuRand<T>:: + AddGaussNoise(CuMatrix<T>& tgt, T gscale) + { + GaussRand(tmp); + tgt.AddScaled(gscale,tmp,1.0); + } + + + + + + + + //////////////////////////////////////////////////////////////////////////// + //// invalid general wrappers over CUDA kernels + template<typename T> + inline void + CuRand<T>:: + Rand(CuMatrix<T>& tgt) + { Error("Unimplemented"); } + + template<typename T> + inline void + CuRand<T>:: + GaussRand(CuMatrix<T>& tgt) + { Error("Unimplemented"); } + + template<typename T> + inline void + CuRand<T>:: + BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states) + { Error("Unimplemented"); } + + + ////////////////////////////////////////////////////////////////////////// + //// float specializations + template<> + inline void + CuRand<float>:: + Rand(CuMatrix<float>& tgt) + { + Timer tim; tim.Start(); + + tgt.Init(z1.Rows(), z1.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK)); + + cudaF_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim()); + + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void + CuRand<float>:: + GaussRand(CuMatrix<float>& tgt) + { + + Timer tim; tim.Start(); + + tgt.Init(z1.Rows(), z1.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK)); + + cudaF_gauss_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim()); + + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void + CuRand<float>:: + BinarizeProbs(const CuMatrix<float>& probs, CuMatrix<float>& states) + { + if(probs.Rows() != z1.Rows() || probs.Cols() != z1.Cols()) { + Error("Non matching dims!!"); + } + + states.Init(z1.Rows(),z1.Cols()); + Rand(tmp); + + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(),CUBLOCK)); + + cudaF_binarize_probs(dimGrid,dimBlock,states.pCUData(), probs.pCUData(), tmp.pCUData(),states.Dim()); + + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + ////////////////////////////////////////////////////////////////////////// + //// double specializations + template<> + inline void + CuRand<double>:: + Rand(CuMatrix<double>& tgt) + { + Timer tim; tim.Start(); + + tgt.Init(z1.Rows(), z1.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK)); + + cudaD_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim()); + + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void + CuRand<double>:: + GaussRand(CuMatrix<double>& tgt) + { + + Timer tim; tim.Start(); + + tgt.Init(z1.Rows(), z1.Cols()); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK)); + + cudaD_gauss_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim()); + + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void + CuRand<double>:: + BinarizeProbs(const CuMatrix<double>& probs, CuMatrix<double>& states) + { + if(probs.Rows() != z1.Rows() || probs.Cols() != z1.Cols()) { + Error("Non matching dims!!"); + } + + states.Init(z1.Rows(),z1.Cols()); + Rand(tmp); + + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK,CUBLOCK); + dim3 dimGrid(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(),CUBLOCK)); + + cudaD_binarize_probs(dimGrid,dimBlock,states.pCUData(), probs.pCUData(), tmp.pCUData(),states.Dim()); + + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + +} diff --git a/src/CuBaseLib/curandkernels.cu b/src/CuBaseLib/curandkernels.cu new file mode 100644 index 0000000..5e42258 --- /dev/null +++ b/src/CuBaseLib/curandkernels.cu @@ -0,0 +1,135 @@ + +#include "curandkernels.h" + + + +// +//Hybrid Tausworthe/LCG random number generator +// +//http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html + + +// S1, S2, S3, and M are all constants, and z is part of the +// private per-thread generator state. +__device__ +static unsigned TausStep(unsigned &z, int S1, int S2, int S3, unsigned M) +{ + unsigned b=(((z << S1) ^ z) >> S2); + return z = (((z & M) << S3) ^ b); +} + +// A and C are constants +__device__ +static unsigned LCGStep(unsigned &z, unsigned A, unsigned C) +{ + return z=(A*z+C); +} + +template<typename T> +__device__ +static T HybridTaus(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4) +{ + // Combined period is lcm(p1,p2,p3,p4)~ 2^121 + T randval; + do { + randval = 2.3283064365387e-10 * ( // Periods + TausStep(z1, 13, 19, 12, 4294967294UL) ^ // p1=2^31-1 + TausStep(z2, 2, 25, 4, 4294967288UL) ^ // p2=2^30-1 + TausStep(z3, 3, 11, 17, 4294967280UL) ^ // p3=2^28-1 + LCGStep(z4, 1664525, 1013904223UL) // p4=2^32 + ); + } while (!(randval > 0.0 && randval < 1.0)); + return randval; +} + + + + +template<typename T> +__global__ +static void _rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + mat[index] = HybridTaus<T>(z1[index],z2[index],z3[index],z4[index]); + } +} + +/* +float2 BoxMuller() +{ + float u0=HybridTaus (), u1=HybridTaus (); + float r=sqrt(-2 log(u0)); + float theta=2*PI*u1; + return make_float2(r*sin(theta),r*cos(theta)); +} +*/ + +template<typename T> +__device__ +static T BoxMuller(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4) +{ + const T M_2PI = 6.283185307179586476925286766558; + + T u0 = HybridTaus<T>(z1,z2,z3,z4), u1 = HybridTaus<T>(z1,z2,z3,z4); + T r = sqrt(-2.0 * log(u0)); + T theta = M_2PI * u1; + return r*sin(theta); + +} + + +template<typename T> +__global__ +static void _gauss_rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + mat[index] = BoxMuller<T>(z1[index],z2[index],z3[index],z4[index]); + } +} + + +template<typename T> +__global__ +static void _binarize_probs(T* states, const T* probs, const T* rand, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + states[index] = ((probs[index] > rand[index])? 1.0 : 0.0); + } +} + + + +/************ + * :FLOAT: + */ +void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d) +{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); } + + +/************ + * :DOUBLE: + */ +void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d) +{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); } + diff --git a/src/CuBaseLib/curandkernels.cu~ b/src/CuBaseLib/curandkernels.cu~ new file mode 100644 index 0000000..7e1c8dd --- /dev/null +++ b/src/CuBaseLib/curandkernels.cu~ @@ -0,0 +1,135 @@ + +#include "curandkernels.h" + + + +// +//Hybrid Tauss/LCG random number generator +// +//http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html + + +// S1, S2, S3, and M are all constants, and z is part of the +// private per-thread generator state. +__device__ +static unsigned TausStep(unsigned &z, int S1, int S2, int S3, unsigned M) +{ + unsigned b=(((z << S1) ^ z) >> S2); + return z = (((z & M) << S3) ^ b); +} + +// A and C are constants +__device__ +static unsigned LCGStep(unsigned &z, unsigned A, unsigned C) +{ + return z=(A*z+C); +} + +template<typename T> +__device__ +static T HybridTaus(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4) +{ + // Combined period is lcm(p1,p2,p3,p4)~ 2^121 + T randval; + do { + randval = 2.3283064365387e-10 * ( // Periods + TausStep(z1, 13, 19, 12, 4294967294UL) ^ // p1=2^31-1 + TausStep(z2, 2, 25, 4, 4294967288UL) ^ // p2=2^30-1 + TausStep(z3, 3, 11, 17, 4294967280UL) ^ // p3=2^28-1 + LCGStep(z4, 1664525, 1013904223UL) // p4=2^32 + ); + } while (!(randval > 0.0 && randval < 1.0)); + return randval; +} + + + + +template<typename T> +__global__ +static void _rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + mat[index] = HybridTaus<T>(z1[index],z2[index],z3[index],z4[index]); + } +} + +/* +float2 BoxMuller() +{ + float u0=HybridTaus (), u1=HybridTaus (); + float r=sqrt(-2 log(u0)); + float theta=2*PI*u1; + return make_float2(r*sin(theta),r*cos(theta)); +} +*/ + +template<typename T> +__device__ +static T BoxMuller(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4) +{ + const T M_2PI = 6.283185307179586476925286766558; + + T u0 = HybridTaus<T>(z1,z2,z3,z4), u1 = HybridTaus<T>(z1,z2,z3,z4); + T r = sqrt(-2.0 * log(u0)); + T theta = M_2PI * u1; + return r*sin(theta); + +} + + +template<typename T> +__global__ +static void _gauss_rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + mat[index] = BoxMuller<T>(z1[index],z2[index],z3[index],z4[index]); + } +} + + +template<typename T> +__global__ +static void _binarize_probs(T* states, const T* probs, const T* rand, MatrixDim d) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int index = i + j*d.stride; + if( i < d.cols && j < d.rows ) { + states[index] = ((probs[index] > rand[index])? 1.0 : 0.0); + } +} + + + +/************ + * :FLOAT: + */ +void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d) +{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); } + + +/************ + * :DOUBLE: + */ +void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) +{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); } + +void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d) +{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); } + diff --git a/src/CuBaseLib/curandkernels.h b/src/CuBaseLib/curandkernels.h new file mode 100644 index 0000000..69b589f --- /dev/null +++ b/src/CuBaseLib/curandkernels.h @@ -0,0 +1,34 @@ +#ifndef _cuda_rand_kernels_h_ +#define _cuda_rand_kernels_h_ + + +#include "cukernels.h" + + +extern "C" { + //************** + //float + // + void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d); + + + void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d); + + + void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d); + + //************** + //double + // + void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d); + + + void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d); + + + void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d); + +} + + +#endif diff --git a/src/CuBaseLib/cuvector.h b/src/CuBaseLib/cuvector.h new file mode 100644 index 0000000..7bfe116 --- /dev/null +++ b/src/CuBaseLib/cuvector.h @@ -0,0 +1,121 @@ +#ifndef _CUVECTOR_H_ +#define _CUVECTOR_H_ + +#include "Vector.h" + +namespace TNet { + + template<typename _ElemT> class CuMatrix; + + /** + * \brief Vector for CUDA computing + * + * Vector is a matrix consist of a single row + */ + template<typename _ElemT> + class CuVector + { + typedef CuVector<_ElemT> ThisType; + + public: + + /// Default Constructor + CuVector<_ElemT>() + : mDim(0), mpCUData(NULL) + { } + /// Constructor with memory initialisation + CuVector<_ElemT>(size_t dim) + : mDim(0), mpCUData(NULL) + { Init(dim); } + + /// Destructor + ~CuVector() + { Destroy(); } + + /// Dimensions + size_t Dim() const + { return mDim; } + + /* + ::MatrixDim Dim() const + { ::MatrixDim d = { mDim, 1, 1 }; return d; } + */ + + /// Get raw pointer + const _ElemT* pCUData() const + { return mpCUData; } + _ElemT* pCUData() + { return mpCUData; } + + /// Allocate the memory + ThisType& Init(size_t dim); + + /// Deallocate the memory + void Destroy(); + + /// Copy functions (reallocates when needed) + ThisType& CopyFrom(const CuVector<_ElemT>& rSrc); + ThisType& CopyFrom(const Vector<_ElemT>& rSrc); + Vector<_ElemT>& CopyTo(Vector<_ElemT>& rDst) const; + + + + // Math operations + /// Set to All Zeros + void SetZero(); + + /// Set to Constant + /// + /// \param[in] value Desired constant value + void SetConst(_ElemT value) + { Error("__func__ Not implemented"); } + + /// Add scaled vector + /// + /// \param[in] alpha + /// \param[in] vec + /// \param[in] beta + /// Cal \f$ A=\alpha vec + \beta A \f$ + void AddScaled(_ElemT alpha, const CuVector<_ElemT>& vec, _ElemT beta) + { Error("__func__ Not implemented"); } + + //// Add scaled vector + /// + /// \param[in] alpha + /// \param[in] mat + /// \param[in] beta + /// Cal \f$ A=\alpha mat.ColSum() + \beta A \f$ + void AddColSum(_ElemT alpha, const CuMatrix<_ElemT>& mat, _ElemT beta) + { Error("__func__ Not implemented"); } + + void Print() const + { + Vector<_ElemT> vec(Dim()); + CopyTo(vec); + std::cout << vec << "\n"; + } + + + private: + size_t mDim; + _ElemT* mpCUData; + }; + + + /// Prints the matrix dimensions and pointer to stream + template<typename _ElemT> + inline std::ostream& operator << (std::ostream& out, const CuVector<_ElemT>& vec) + { + size_t d = vec.Dim(); + out << "[CuVector D" << d + << " PTR" << vec.pCUData() << "]" << std::flush; + return out; + } + + +} + + +#include "cuvector.tcc" + +#endif diff --git a/src/CuBaseLib/cuvector.h~ b/src/CuBaseLib/cuvector.h~ new file mode 100644 index 0000000..d118ec4 --- /dev/null +++ b/src/CuBaseLib/cuvector.h~ @@ -0,0 +1,104 @@ +#ifndef _CUVECTOR_H_ +#define _CUVECTOR_H_ + +#include "Vector.h" + +namespace TNet { + + template<typename _ElemT> class CuMatrix; + + /** + * \brief Vector for CUDA computing + */ + template<typename _ElemT> + class CuVector + { + typedef CuVector<_ElemT> ThisType; + + public: + + /// Default Constructor + CuVector<_ElemT>() + : mDim(0), mpCUData(NULL) + { } + /// Constructor with memory initialisation + CuVector<_ElemT>(size_t dim) + : mDim(0), mpCUData(NULL) + { Init(dim); } + + /// Destructor + ~CuVector() + { Destroy(); } + + /// Dimensions + size_t Dim() const + { return mDim; } + + /* + ::MatrixDim Dim() const + { ::MatrixDim d = { mDim, 1, 1 }; return d; } + */ + + /// Get raw pointer + const _ElemT* pCUData() const + { return mpCUData; } + _ElemT* pCUData() + { return mpCUData; } + + /// Allocate the memory + ThisType& Init(size_t dim); + + /// Deallocate the memory + void Destroy(); + + /// Copy functions (reallocates when needed) + ThisType& CopyFrom(const CuVector<_ElemT>& rSrc); + ThisType& CopyFrom(const Vector<_ElemT>& rSrc); + Vector<_ElemT>& CopyTo(Vector<_ElemT>& rDst) const; + + + + // Math operations + // + void SetZero(); + + void SetConst(_ElemT value) + { Error("__func__ Not implemented"); } + + void AddScaled(_ElemT alpha, const CuVector<_ElemT>& vec, _ElemT beta) + { Error("__func__ Not implemented"); } + + void AddColSum(_ElemT alpha, const CuMatrix<_ElemT>& mat, _ElemT beta) + { Error("__func__ Not implemented"); } + + void Print() const + { + Vector<_ElemT> vec(Dim()); + CopyTo(vec); + std::cout << vec << "\n"; + } + + + private: + size_t mDim; + _ElemT* mpCUData; + }; + + + /// Prints the matrix dimensions and pointer to stream + template<typename _ElemT> + inline std::ostream& operator << (std::ostream& out, const CuVector<_ElemT>& vec) + { + size_t d = vec.Dim(); + out << "[CuVector D" << d + << " PTR" << vec.pCUData() << "]" << std::flush; + return out; + } + + +} + + +#include "cuvector.tcc" + +#endif diff --git a/src/CuBaseLib/cuvector.tcc b/src/CuBaseLib/cuvector.tcc new file mode 100644 index 0000000..0107859 --- /dev/null +++ b/src/CuBaseLib/cuvector.tcc @@ -0,0 +1,254 @@ + +#include <cuda_runtime_api.h> + +#include "Timer.h" +#include "cucommon.h" +#include "cumatrix.h" +#include "cudevice.h" + +namespace TNet { + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuVector<_ElemT>& + CuVector<_ElemT>:: + Init(size_t dim) + { + if(mDim == dim) { + //SetZero(); + return *this; + } + + Destroy(); + + cuSafeCall(cudaMalloc((void**)&mpCUData, dim*sizeof(_ElemT))); + mDim = dim; + SetZero(); + + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuVector<_ElemT>:: + Destroy() + { + if(NULL != mpCUData) { + cuSafeCall(cudaFree(mpCUData)); + mpCUData = NULL; + } + mDim = 0; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuVector<_ElemT>& + CuVector<_ElemT>:: + CopyFrom(const CuVector<_ElemT>& rSrc) + { + Init(rSrc.Dim()); + + Timer tim; tim.Start(); + + cuSafeCall(cudaMemcpy(mpCUData, rSrc.pCUData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyDeviceToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromD2D",tim.Val()); + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuVector<_ElemT>& + CuVector<_ElemT>:: + CopyFrom(const Vector<_ElemT>& rSrc) + { + Init(rSrc.Dim()); + + Timer tim; tim.Start(); + + cuSafeCall(cudaMemcpy(mpCUData, rSrc.pData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyHostToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromH2D",tim.Val()); + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + Vector<_ElemT>& + CuVector<_ElemT>:: + CopyTo(Vector<_ElemT>& rDst) const + { + if(rDst.Dim() != mDim) { + rDst.Init(mDim); + } + + Timer tim; tim.Start(); + + cuSafeCall(cudaMemcpy(rDst.pData(), pCUData(), mDim*sizeof(_ElemT), cudaMemcpyDeviceToHost)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyToD2H",tim.Val()); + + return rDst; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + + template<typename _ElemT> + void + CuVector<_ElemT>:: + SetZero() + { + Timer tim; tim.Start(); + cuSafeCall(cudaMemset(mpCUData, 0, mDim*sizeof(_ElemT))); + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::SetZero",tim.Val()); + } + + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + + + + //////////////////////////////////////////////////////////////////////// + //// CuVector:: templeate specializations (float) + //// + template<> + inline void CuVector<float>::SetConst(float value) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaF_set_const(dimGrid,dimBlock,mpCUData,value,d); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<float>::AddScaled(float alpha, const CuVector<float>& vec, float beta) + { + Timer tim; tim.Start(); + + assert(vec.Dim() == Dim()); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaF_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<float>::AddColSum(float alpha, const CuMatrix<float>& mat, float beta) + { + Timer tim; tim.Start(); + + assert(mat.Cols() == Dim()); + + /** + * Rows()<=512 limit due to limited shared memory + * Cols()<=256 limit due to coalesced memory alignment: + * matrices with huge strides have slow access!!! + */ + if(mat.Rows() > 512 || mat.Cols() > 256) { + size_t dimBlock = CUBLOCK*2; + size_t dimGrid = n_blocks(Dim(),CUBLOCK*2); + + cudaF_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim()); + cuSafeCall(cudaGetLastError()); + } else { + dim3 dimBlock(mat.Rows(),1); + dim3 dimGrid(1,Dim()); + + cudaF_add_col_sum_reduce(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim()); + cuSafeCall(cudaGetLastError()); + } + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + //////////////////////////////////////////////////////////////////////// + //// CuVector:: templeate specializations (double) + //// + template<> + inline void CuVector<double>::SetConst(double value) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaD_set_const(dimGrid,dimBlock,mpCUData,value,d); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<double>::AddScaled(double alpha, const CuVector<double>& vec, double beta) + { + Timer tim; tim.Start(); + + assert(vec.Dim() == Dim()); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaD_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<double>::AddColSum(double alpha, const CuMatrix<double>& mat, double beta) + { + Timer tim; tim.Start(); + + assert(mat.Cols() == Dim()); + + size_t dimBlock = CUBLOCK*2; + size_t dimGrid = n_blocks(Dim(),CUBLOCK*2); + + cudaD_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + +} + + + |