#ifndef __MATH_FUNCTIONS_H__
#define __MATH_FUNCTIONS_H__

#include <stdint.h>
#include <cmath>  // for std::fabs and std::signbit

#include <glog/logging.h>
#include <cblas.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>
#include <driver_types.h>  // cuda driver types

#define PERMUTELAYER_ORDERNUM 4
#define BLOCK 512

//
// CUDA macros
//

// CUDA: various checks for different function calls.
#define CUDA_CHECK(condition) \
  /* Code block avoids redefinition of cudaError_t error */ \
  do { \
    cudaError_t error = condition; \
    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
  } while (0)

#define CUBLAS_CHECK(condition) \
  do { \
    cublasStatus_t status = condition; \
    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
      << cublasGetErrorString(status); \
  } while (0)

#define CURAND_CHECK(condition) \
  do { \
    curandStatus_t status = condition; \
    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
      << curandGetErrorString(status); \
  } while (0)

// CUDA: grid-stride looping
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)

// CUDA: check for an error after kernel execution and exit loudly if there is one.
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())

// CUDA: library error reporting.
const char* cublasGetErrorString(cublasStatus_t error);
const char* curandGetErrorString(curandStatus_t error);

// CUDA: use 256 threads per block.
const int TENSORRT_CUDA_NUM_THREADS = 256;

// CUDA: number of blocks needed to cover N threads.
inline int TENSORRT_GET_BLOCKS(const int N) {
  return (N + TENSORRT_CUDA_NUM_THREADS - 1) / TENSORRT_CUDA_NUM_THREADS;
}

/*
 * function: X[i] = alpha; initialize X with the constant alpha.
 */
template <typename Dtype>
void tensorrt_gpu_set(const int N, const Dtype alpha, Dtype* X);

/*
 * function: y[index] = pow(a[index], alpha)
 * @params n: the number of elements in a
 * @params a: input vector
 * @params alpha: the exponent
 * @params y: output vector
 */
template <typename Dtype>
void tensorrt_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y);

/*
 * function: y = alpha*A*x + beta*y
 * @params handle: cuBLAS handle
 * @params TransA: transpose flag for A
 * @params M: the number of rows of A
 * @params N: the number of columns of A
 * @params alpha: the coefficient of A*x
 * @params A: matrix [M x N]
 * @params x: vector x
 * @params beta: the coefficient of y
 * @params y: vector y
 */
template <typename Dtype>
void tensorrt_gpu_gemv(cublasHandle_t handle, const CBLAS_TRANSPOSE TransA,
    const int M, const int N, const Dtype alpha, const Dtype* A,
    const Dtype* x, const Dtype beta, Dtype* y);

template <typename Dtype>
void tensorrt_gpu_divbsx(const int nthreads, const Dtype* A, const Dtype* v,
    const int rows, const int cols, const CBLAS_TRANSPOSE trans, Dtype* B);

template <typename Dtype>
void tensorrt_gpu_mulbsx(const int nthreads, const Dtype* A, const Dtype* v,
    const int rows, const int cols, const CBLAS_TRANSPOSE trans, Dtype* B);

cudaError_t tensorrt_gpu_permute(const int nthreads, float* const bottom_data,
    const bool forward, const int* permute_order, const int* old_steps,
    const int* new_steps, const int num_axes, float* const top_data,
    cudaStream_t stream);

cudaError_t SoftmaxLayer(const float* bottom_data, int count, int channels,
    int outer_num_, int inner_num_, float* scale_data, float* top_data,
    cudaStream_t stream);

cudaError_t ConcatLayer(int nthreads, const float* bottom_data, bool kForward,
    int num_concats_, int concat_input_size_, int top_concat_axis,
    int bottom_concat_axis, int offset_concat_axis, float* top_data,
    cudaStream_t stream);

// cudaError_t cudaSoftmax(int n, int channels, float* x, float* y, cudaStream_t stream);
// virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);

cudaError_t cudaSoftmax_caffe(int count, int channels, float* x, float* y,
    cudaStream_t stream);
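/*
 * Usage sketch (illustrative only, not part of this header's API): a minimal
 * grid-stride kernel showing how CUDA_KERNEL_LOOP, TENSORRT_CUDA_NUM_THREADS,
 * TENSORRT_GET_BLOCKS, and CUDA_POST_KERNEL_CHECK fit together. The kernel
 * `scale_kernel` and wrapper `scale_gpu` are hypothetical names.
 *
 *   __global__ void scale_kernel(const int n, const float alpha, float* x) {
 *     CUDA_KERNEL_LOOP(i, n) {   // each thread strides over the full range
 *       x[i] *= alpha;
 *     }
 *   }
 *
 *   void scale_gpu(const int n, const float alpha, float* x) {
 *     scale_kernel<<<TENSORRT_GET_BLOCKS(n), TENSORRT_CUDA_NUM_THREADS>>>(
 *         n, alpha, x);
 *     CUDA_POST_KERNEL_CHECK;  // surface any launch/runtime error loudly
 *   }
 */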
cudaError_t cudaDetectionOutput_caffe(int bottom0_count, int bottom1_count,
    float* loc_data, float* bottom1, float* prior_data, float* bottom3,
    float* bottom4, float* y, cudaStream_t stream);

#endif  // __MATH_FUNCTIONS_H__
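/*
 * Usage sketch (illustrative only): calling one of the stream-based layer
 * entry points declared above and checking its cudaError_t result with
 * CUDA_CHECK. The device buffers (`bottom`, `scale_buf`, `top`) and the
 * shape values are hypothetical.
 *
 *   cudaStream_t stream;
 *   CUDA_CHECK(cudaStreamCreate(&stream));
 *   CUDA_CHECK(SoftmaxLayer(bottom, count, channels, outer_num, inner_num,
 *                           scale_buf, top, stream));
 *   CUDA_CHECK(cudaStreamSynchronize(stream));
 *   CUDA_CHECK(cudaStreamDestroy(stream));
 */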