ADPilot
/
modularization


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
							#include "argsParser.h"
#include "buffers.h"
#include "common.h"
#include "logger.h"
#include "parserOnnxConfig.h"
#include "NvInfer.h"
#include <cuda_runtime_api.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <sys/time.h>
#include <chrono>
#include "preprocess.h"
#include "postprocess.h"
#include "scatter_cuda.h"

// The header files below are provided by TensorRT/samples/common
#include "EntropyCalibrator.h"
#include "BatchStream.h"

// Integer division of m by n, plus one when the remainder is positive.
// Both arguments are expanded twice, so avoid passing expressions with side effects.
#define DIVUP(m,n) ((m) / (n) + (((m) % (n)) > 0 ? 1 : 0))


// Configuration bundle handed to CenterPoint at construction time.
struct Params
{
    // Model locations: ONNX sources and serialized engine files for the
    // PFE and RPN networks. All default to the empty string.
    std::string pfeOnnxFilePath{};
    std::string rpnOnnxFilePath{};
    std::string pfeSerializedEnginePath{};
    std::string rpnSerializedEnginePath{};

    // Tensor names for the inputs and outputs of both networks.
    std::vector<std::string> pfeInputTensorNames;
    std::vector<std::string> rpnInputTensorNames;
    std::vector<std::string> pfeOutputTensorNames;
    std::map<std::string, std::vector<std::string>> rpnOutputTensorNames;

    // Where results are saved and which input files are processed.
    std::string savePath;
    std::vector<std::string> filePaths;

    // Build / runtime attributes.
    int dlaCore{-1};
    bool fp16{false};
    bool int8{false};
    bool load_engine{false};
    int batch_size{1};
};

class CenterPoint
{
    template <typename T>
    using SampleUniquePtr = std::unique_ptr<T, samplesCommon::InferDeleter>;

public:
    CenterPoint(const Params params)
        : mParams(params)
        ,BATCH_SIZE_(params.batch_size)
        , mEngine(nullptr)
        ,mEngineRPN(nullptr)
    {

        //const int NUM_THREADS, const int MAX_NUM_PILLARS, const int GRID_X_SIZE, const int GRID_Y_SIZE):
        scatter_cuda_ptr_.reset(new ScatterCuda(PFE_OUTPUT_DIM, PFE_OUTPUT_DIM, BEV_W, BEV_H ));
        // mallocate a global memory for pointer

        GPU_CHECK(cudaMalloc((void**)&dev_points_, MAX_POINTS * POINT_DIM * sizeof(float)));
        GPU_CHECK(cudaMemset(dev_points_,0, MAX_POINTS * POINT_DIM * sizeof(float)));

        GPU_CHECK(cudaMalloc((void**)&dev_indices_,MAX_PILLARS * sizeof(int)));
        GPU_CHECK(cudaMemset(dev_indices_,0,MAX_PILLARS * sizeof(int)));

        /**
         * @brief : Create and Init Variables for PreProcess
         * 
         */
        GPU_CHECK(cudaMalloc((void**)& p_bev_idx_, MAX_POINTS * sizeof(int)));
        GPU_CHECK(cudaMalloc((void**)& p_point_num_assigned_, MAX_POINTS * sizeof(int)));
        GPU_CHECK(cudaMalloc((void**)& p_mask_, MAX_POINTS * sizeof(bool)));
        GPU_CHECK(cudaMalloc((void**)& bev_voxel_idx_, BEV_H * BEV_W * sizeof(int)));

        GPU_CHECK(cudaMemset(p_bev_idx_, 0, MAX_POINTS * sizeof(int)));
        GPU_CHECK(cudaMemset(p_point_num_assigned_, 0, MAX_POINTS * sizeof(int)));
        GPU_CHECK(cudaMemset(p_mask_, 0, MAX_POINTS * sizeof(bool)));
        GPU_CHECK(cudaMemset(bev_voxel_idx_, 0, BEV_H * BEV_W * sizeof(int)));

        GPU_CHECK(cudaMalloc((void**)&v_point_sum_, MAX_PILLARS * 3 *sizeof(float)));
        GPU_CHECK(cudaMalloc((void**)&v_range_, MAX_PILLARS * sizeof(int)));
        GPU_CHECK(cudaMalloc((void**)&v_point_num_, MAX_PILLARS * sizeof(int)));


        GPU_CHECK(cudaMemset(v_range_,0, MAX_PILLARS * sizeof(int)));
        GPU_CHECK(cudaMemset(v_point_sum_, 0, MAX_PILLARS * 3 * sizeof(float)));

        /**
         * @brief : Create and Init Variables for PostProcess
         * 
         */
        GPU_CHECK(cudaMalloc((void**)&dev_score_idx_, OUTPUT_W * OUTPUT_H * sizeof(int)));
        GPU_CHECK(cudaMemset(dev_score_idx_, -1 , OUTPUT_W * OUTPUT_H * sizeof(int)));

        GPU_CHECK(cudaMallocHost((void**)& mask_cpu, INPUT_NMS_MAX_SIZE * DIVUP (INPUT_NMS_MAX_SIZE ,THREADS_PER_BLOCK_NMS) * sizeof(unsigned long long)));
        GPU_CHECK(cudaMemset(mask_cpu, 0 ,  INPUT_NMS_MAX_SIZE * DIVUP (INPUT_NMS_MAX_SIZE ,THREADS_PER_BLOCK_NMS) * sizeof(unsigned long long)));

        GPU_CHECK(cudaMallocHost((void**)& remv_cpu, THREADS_PER_BLOCK_NMS * sizeof(unsigned long long)));
        GPU_CHECK(cudaMemset(remv_cpu, 0 ,  THREADS_PER_BLOCK_NMS  * sizeof(unsigned long long)));

        GPU_CHECK(cudaMallocHost((void**)&host_score_idx_, OUTPUT_W * OUTPUT_H  * sizeof(int)));
        GPU_CHECK(cudaMemset(host_score_idx_, -1, OUTPUT_W * OUTPUT_H  * sizeof(int)));

        GPU_CHECK(cudaMallocHost((void**)&host_keep_data_, INPUT_NMS_MAX_SIZE * sizeof(long)));
        GPU_CHECK(cudaMemset(host_keep_data_, -1, INPUT_NMS_MAX_SIZE * sizeof(long)));

        GPU_CHECK(cudaMallocHost((void**)&host_boxes_, OUTPUT_NMS_MAX_SIZE * 9 * sizeof(float)));
        GPU_CHECK(cudaMemset(host_boxes_, 0 ,  OUTPUT_NMS_MAX_SIZE * 9 * sizeof(float)));

        GPU_CHECK(cudaMallocHost((void**)&host_label_, OUTPUT_NMS_MAX_SIZE * sizeof(int)));
        GPU_CHECK(cudaMemset(host_label_, -1, OUTPUT_NMS_MAX_SIZE * sizeof(int)));

    }

    ~CenterPoint() {
    // Free host pointers
    // Free global pointers 
    std::cout << "Free Variables . \n";
    GPU_CHECK(cudaFree(dev_indices_));
    GPU_CHECK(cudaFree(dev_points_));
    GPU_CHECK(cudaFree(dev_score_idx_));

    GPU_CHECK(cudaFree( p_bev_idx_)); 
    GPU_CHECK(cudaFree( p_point_num_assigned_));
    GPU_CHECK(cudaFree( p_mask_));
    GPU_CHECK(cudaFree( bev_voxel_idx_)); // H * W
    GPU_CHECK(cudaFree( v_point_sum_));
    GPU_CHECK(cudaFree( v_range_));
    GPU_CHECK(cudaFree( v_point_num_));


    GPU_CHECK(cudaFreeHost(host_keep_data_));
    GPU_CHECK(cudaFreeHost(host_boxes_));
    GPU_CHECK(cudaFreeHost(host_label_));
    GPU_CHECK(cudaFreeHost(host_score_idx_));
    GPU_CHECK(cudaFreeHost(remv_cpu));
    GPU_CHECK(cudaFreeHost(mask_cpu));


    // // Free engine 
    // std::cout << "Free PFE Engine  .\n";
    // mEngine->destroy();
    // std::cout << "Free RPN Engine  .\n";
    // mEngineRPN->destroy();
    }

    std::shared_ptr<nvinfer1::ICudaEngine>  build( std::string onnxFilePath,std::string saveEnginePath);
    std::shared_ptr<nvinfer1::ICudaEngine>  buildFromSerializedEngine(std::string serializedEngineFile);
    bool infer(float* lidarpoints,int pointsnum,std::vector<Box>& predResult);
    bool engineInitlization();
    bool testinfer();
    

private:
    // device pointers 
    float* dev_scattered_feature_;
    float* dev_points_ ;
    int* dev_indices_;
    int* dev_score_idx_;
    long* dev_keep_data_;
    SampleUniquePtr<ScatterCuda> scatter_cuda_ptr_;

    // device pointers for preprocess
    int* p_bev_idx_; 
    int* p_point_num_assigned_;
    bool* p_mask_;
    int* bev_voxel_idx_; // H * W
    float* v_point_sum_;
    int* v_range_;
    int* v_point_num_;
    

    // host  variables for post process
    long* host_keep_data_;
    float* host_boxes_;
    int* host_label_;
    int* host_score_idx_;
    unsigned long long* mask_cpu;
    unsigned long long* remv_cpu;

    Params mParams;
    int BATCH_SIZE_ = 1;
    nvinfer1::Dims mInputDims;  //!< The dimensions of the input to the network.
    nvinfer1::Dims mOutputDims; //!< The dimensions of the output to the network.
    int mNumber{0};             //!< The number to classify
    std::shared_ptr<nvinfer1::ICudaEngine> mEngine; //!< The TensorRT engine used to run the network
    std::shared_ptr<nvinfer1::ICudaEngine> mEngineRPN;

    samplesCommon::BufferManager * mbuffers;
    samplesCommon::BufferManager * mbuffersRPN;

    SampleUniquePtr<nvinfer1::IExecutionContext> mContext;
    SampleUniquePtr<nvinfer1::IExecutionContext> mContextRPN;

    //!
    //! \brief Parses an ONNX model for MNIST and creates a TensorRT network
    //!
    bool constructNetwork(SampleUniquePtr<nvinfer1::IBuilder>& builder,
        SampleUniquePtr<nvinfer1::INetworkDefinition>& network, SampleUniquePtr<nvinfer1::IBuilderConfig>& config,
        SampleUniquePtr<nvonnxparser::IParser>& parser,
       std::string onnxFilePath);
    //!
    //! \brief Reads the input  and stores the result in a managed buffer
    //!
    bool processInput(void*& points, std::string& pointFilePath, int& pointNum);
    //!
    //! \brief Classifies digits and verify result
    //!
    void saveOutput(std::vector<Box>& predResult, std::string& inputFileName, std::string savePath);
};