// mathFunctions.h — CUDA / cuBLAS / cuRAND math helper declarations.
  1. #ifndef __MATH_FUNCTINS_H__
  2. #define __MATH_FUNCTINS_H__
  3. #include <stdint.h>
  4. #include <cmath> // for std::fabs and std::signbit
  5. #include <cblas.h>
  6. #include <cudnn.h>
  7. #include <cublas_v2.h>
  8. #include <cuda.h>
  9. #include <cuda_runtime.h>
  10. #include <curand.h>
  11. #include <driver_types.h> // cuda driver types
  12. #include <algorithm>
  13. #include <glog/logging.h>
  14. #define PERMUTELAYER_ORDERNUM 4
  15. #define BLOCK 512
  16. //
  17. // CUDA macros
  18. //
  19. // CUDA: various checks for different function calls.
  20. #define CUDA_CHECK(condition) \
  21. /* Code block avoids redefinition of cudaError_t error */ \
  22. do { \
  23. cudaError_t error = condition; \
  24. CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
  25. } while (0)
  26. #define CUBLAS_CHECK(condition) \
  27. do { \
  28. cublasStatus_t status = condition; \
  29. CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
  30. << cublasGetErrorString(status); \
  31. } while (0)
  32. #define CURAND_CHECK(condition) \
  33. do { \
  34. curandStatus_t status = condition; \
  35. CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
  36. << curandGetErrorString(status); \
  37. } while (0)
  38. // CUDA: grid stride looping
  39. #define CUDA_KERNEL_LOOP(i, n) \
  40. for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
  41. i < (n); \
  42. i += blockDim.x * gridDim.x)
// CUDA: check for error after kernel execution and exit loudly if there is one.
// Uses cudaPeekAtLastError(), which reads the sticky error state without
// clearing it (unlike cudaGetLastError()).
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())
// CUDA: library error reporting — map cuBLAS/cuRAND status codes to
// human-readable strings. Definitions live in the corresponding .cu/.cpp file.
const char* cublasGetErrorString(cublasStatus_t error);
const char* curandGetErrorString(curandStatus_t error);
  48. // CUDA: use 512 threads per block
  49. const int TENSORRT_CUDA_NUM_THREADS = 256;
  50. // CUDA: number of blocks for threads.
  51. inline int TENSORRT_GET_BLOCKS(const int N) {
  52. return (N + TENSORRT_CUDA_NUM_THREADS - 1) / TENSORRT_CUDA_NUM_THREADS;
  53. }
/*
 * tensorrt_gpu_set: X[i] = alpha — initialize X with the constant alpha.
 * @param N     number of elements to write
 * @param alpha value stored into every element
 * @param X     output array (presumably a device pointer — the kernel is
 *              defined in the matching .cu file; TODO confirm)
 */
template <typename Dtype>
void tensorrt_gpu_set(const int N, const Dtype alpha, Dtype *X);
/*
 * tensorrt_gpu_powx: y[index] = pow(a[index], alpha), element-wise.
 * @param n     number of elements in a (the original comment calls this
 *              "the dims of matrix a")
 * @param a     input array
 * @param alpha exponent applied to every element
 * @param y     output array
 */
template <typename Dtype>
void tensorrt_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y);
/*
 * tensorrt_gpu_gemv: y = alpha * op(A) * x + beta * y (matrix-vector product).
 * @param handle cuBLAS handle (set its stream before calling if needed)
 * @param TransA transpose flag for A (CBLAS enum; presumably translated to a
 *               cublasOperation_t in the implementation — TODO confirm)
 * @param M      rows of A
 * @param N      cols of A
 * @param alpha  coefficient of A*x
 * @param A      matrix [M x N]
 * @param x      input vector x
 * @param beta   coefficient of y
 * @param y      input/output vector y
 */
template <typename Dtype>
void tensorrt_gpu_gemv(cublasHandle_t handle,const CBLAS_TRANSPOSE TransA, const int M, const int N,
const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
Dtype* y);
// tensorrt_gpu_divbsx: presumably divides A element-wise by a broadcast
// vector v into B ("bsx" suggests broadcast / "by singleton expansion");
// rows/cols give A's shape and trans selects the broadcast axis —
// TODO confirm against the .cu implementation.
template <typename Dtype>
void tensorrt_gpu_divbsx(const int nthreads, const Dtype* A,
const Dtype* v, const int rows, const int cols, const CBLAS_TRANSPOSE trans,
Dtype* B);
// tensorrt_gpu_mulbsx: presumably the multiply counterpart of
// tensorrt_gpu_divbsx, with the same parameter conventions — TODO confirm.
template <typename Dtype>
void tensorrt_gpu_mulbsx(const int nthreads, const Dtype* A,
const Dtype* v, const int rows, const int cols, const CBLAS_TRANSPOSE trans,
Dtype* B);
// tensorrt_gpu_permute: permute the axes of bottom_data into top_data on the
// given stream; forward selects the direction. permute_order/old_steps/
// new_steps are presumably num_axes-long arrays describing the axis order and
// strides (num_axes likely PERMUTELAYER_ORDERNUM — TODO confirm). Returns the
// CUDA error status.
cudaError_t tensorrt_gpu_permute(const int nthreads,float* const bottom_data,const bool forward,
const int* permute_order,const int* old_steps,const int* new_steps,const int num_axes,float* const top_data,cudaStream_t stream);
// SoftmaxLayer: softmax over bottom_data into top_data on the given stream.
// Parameter names mirror Caffe's softmax layer (count/channels/outer_num_/
// inner_num_, scale_data as scratch) — presumably softmax along the channel
// axis; TODO confirm in the .cu implementation. Returns the CUDA error status.
cudaError_t SoftmaxLayer(const float *bottom_data, int count, int channels, int outer_num_, int inner_num_, float *scale_data, float *top_data, cudaStream_t stream);
// ConcatLayer: copy bottom_data into (or out of, per kForward) top_data at the
// slice described by the *_concat_axis/offset parameters — names mirror
// Caffe's concat layer; TODO confirm semantics in the .cu implementation.
// Returns the CUDA error status.
cudaError_t ConcatLayer(int nthreads, const float *bottom_data, bool kForward, int num_concats_, int concat_input_size_, int top_concat_axis, int bottom_concat_axis, int offset_concat_axis, float *top_data, cudaStream_t stream);
//cudaError_t cudaSoftmax(int n, int channels, float* x, float*y, cudaStream_t stream);
//virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
// cudaSoftmax_caffe: Caffe-style softmax of x into y on the given stream,
// over 'channels' entries for 'count' total elements. Returns the CUDA error
// status.
cudaError_t cudaSoftmax_caffe(int count,int channels,float* x,float* y, cudaStream_t stream);
// cudaDetectionOutput_caffe: presumably an SSD-style detection-output stage
// combining location predictions (loc_data) and priors (prior_data) into y.
// NOTE(review): bottom1/bottom3/bottom4 are unnamed blob inputs — their
// semantics must be confirmed against the .cu implementation. Returns the
// CUDA error status.
cudaError_t cudaDetectionOutput_caffe( int bottom0_count,
int bottom1_count,
float* loc_data,
float* bottom1,
float* prior_data,
float* bottom3,
float* bottom4,
float* y,
cudaStream_t stream);
  108. #endif