
add perception_lidar_* into api.

yuchuli committed 4 months ago
commit 35a612c68a
100 changed files with 13047 additions and 0 deletions
  1. src/api/commonfile/PyModuleCommModule.py (+91 -0)
  2. src/api/commonfile/libcaffe.so (+1 -0)
  3. src/api/commonfile/libcaffe.so.1.0.0 (BIN)
  4. src/api/commonfile/libmodulecomm.so (BIN)
  5. src/api/commonfile/libxmlparam.so (BIN)
  6. src/api/commonfile/modulecomm.h (+80 -0)
  7. src/api/commonfile/modulecommpython.so (BIN)
  8. src/api/commonfile/xmlparam.h (+47 -0)
  9. src/api/perception_lidar_PointPillars_MultiHead/Tracker/Ctracker.cpp (+294 -0)
  10. src/api/perception_lidar_PointPillars_MultiHead/Tracker/Ctracker.h (+292 -0)
  11. src/api/perception_lidar_PointPillars_MultiHead/Tracker/HungarianAlg.cpp (+723 -0)
  12. src/api/perception_lidar_PointPillars_MultiHead/Tracker/HungarianAlg.h (+39 -0)
  13. src/api/perception_lidar_PointPillars_MultiHead/Tracker/Kalman.cpp (+873 -0)
  14. src/api/perception_lidar_PointPillars_MultiHead/Tracker/Kalman.h (+120 -0)
  15. src/api/perception_lidar_PointPillars_MultiHead/Tracker/ShortPathCalculator.h (+65 -0)
  16. src/api/perception_lidar_PointPillars_MultiHead/Tracker/Tracking.hpp (+159 -0)
  17. src/api/perception_lidar_PointPillars_MultiHead/Tracker/defines.h (+147 -0)
  18. src/api/perception_lidar_PointPillars_MultiHead/Tracker/track.cpp (+492 -0)
  19. src/api/perception_lidar_PointPillars_MultiHead/Tracker/track.h (+303 -0)
  20. src/api/perception_lidar_PointPillars_MultiHead/cfgs/cbgs_pp_multihead.yaml (+246 -0)
  21. src/api/perception_lidar_PointPillars_MultiHead/common.h (+141 -0)
  22. src/api/perception_lidar_PointPillars_MultiHead/main.cpp (+377 -0)
  23. src/api/perception_lidar_PointPillars_MultiHead/nms.cu (+422 -0)
  24. src/api/perception_lidar_PointPillars_MultiHead/nms.h (+64 -0)
  25. src/api/perception_lidar_PointPillars_MultiHead/object.proto (+51 -0)
  26. src/api/perception_lidar_PointPillars_MultiHead/objectarray.proto (+11 -0)
  27. src/api/perception_lidar_PointPillars_MultiHead/perception_lidar_PointPillars_MultiHead.pro (+182 -0)
  28. src/api/perception_lidar_PointPillars_MultiHead/pointpillars.cc (+507 -0)
  29. src/api/perception_lidar_PointPillars_MultiHead/pointpillars.h (+287 -0)
  30. src/api/perception_lidar_PointPillars_MultiHead/postprocess.cu (+383 -0)
  31. src/api/perception_lidar_PointPillars_MultiHead/postprocess.h (+125 -0)
  32. src/api/perception_lidar_PointPillars_MultiHead/preprocess.cu (+410 -0)
  33. src/api/perception_lidar_PointPillars_MultiHead/preprocess.h (+138 -0)
  34. src/api/perception_lidar_PointPillars_MultiHead/scatter.cu (+73 -0)
  35. src/api/perception_lidar_PointPillars_MultiHead/scatter.h (+77 -0)
  36. src/api/perception_lidar_cnn_segmentation/caffe/blob.hpp (+282 -0)
  37. src/api/perception_lidar_cnn_segmentation/caffe/caffe.hpp (+22 -0)
  38. src/api/perception_lidar_cnn_segmentation/caffe/common.hpp (+193 -0)
  39. src/api/perception_lidar_cnn_segmentation/caffe/data_transformer.hpp (+154 -0)
  40. src/api/perception_lidar_cnn_segmentation/caffe/filler.hpp (+301 -0)
  41. src/api/perception_lidar_cnn_segmentation/caffe/internal_thread.hpp (+53 -0)
  42. src/api/perception_lidar_cnn_segmentation/caffe/layer.hpp (+477 -0)
  43. src/api/perception_lidar_cnn_segmentation/caffe/layer_factory.hpp (+141 -0)
  44. src/api/perception_lidar_cnn_segmentation/caffe/layers/absval_layer.hpp (+68 -0)
  45. src/api/perception_lidar_cnn_segmentation/caffe/layers/accuracy_layer.hpp (+99 -0)
  46. src/api/perception_lidar_cnn_segmentation/caffe/layers/argmax_layer.hpp (+77 -0)
  47. src/api/perception_lidar_cnn_segmentation/caffe/layers/base_conv_layer.hpp (+174 -0)
  48. src/api/perception_lidar_cnn_segmentation/caffe/layers/base_data_layer.hpp (+82 -0)
  49. src/api/perception_lidar_cnn_segmentation/caffe/layers/batch_norm_layer.hpp (+78 -0)
  50. src/api/perception_lidar_cnn_segmentation/caffe/layers/batch_reindex_layer.hpp (+83 -0)
  51. src/api/perception_lidar_cnn_segmentation/caffe/layers/bias_layer.hpp (+54 -0)
  52. src/api/perception_lidar_cnn_segmentation/caffe/layers/bnll_layer.hpp (+70 -0)
  53. src/api/perception_lidar_cnn_segmentation/caffe/layers/clip_layer.hpp (+75 -0)
  54. src/api/perception_lidar_cnn_segmentation/caffe/layers/concat_layer.hpp (+87 -0)
  55. src/api/perception_lidar_cnn_segmentation/caffe/layers/contrastive_loss_layer.hpp (+101 -0)
  56. src/api/perception_lidar_cnn_segmentation/caffe/layers/conv_layer.hpp (+84 -0)
  57. src/api/perception_lidar_cnn_segmentation/caffe/layers/crop_layer.hpp (+78 -0)
  58. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_conv_layer.hpp (+72 -0)
  59. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_deconv_layer.hpp (+68 -0)
  60. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_lcn_layer.hpp (+49 -0)
  61. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_lrn_layer.hpp (+44 -0)
  62. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_pooling_layer.hpp (+49 -0)
  63. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_relu_layer.hpp (+46 -0)
  64. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_sigmoid_layer.hpp (+46 -0)
  65. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_softmax_layer.hpp (+45 -0)
  66. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_tanh_layer.hpp (+46 -0)
  67. src/api/perception_lidar_cnn_segmentation/caffe/layers/data_layer.hpp (+40 -0)
  68. src/api/perception_lidar_cnn_segmentation/caffe/layers/deconv_layer.hpp (+51 -0)
  69. src/api/perception_lidar_cnn_segmentation/caffe/layers/dropout_layer.hpp (+80 -0)
  70. src/api/perception_lidar_cnn_segmentation/caffe/layers/dummy_data_layer.hpp (+47 -0)
  71. src/api/perception_lidar_cnn_segmentation/caffe/layers/eltwise_layer.hpp (+51 -0)
  72. src/api/perception_lidar_cnn_segmentation/caffe/layers/elu_layer.hpp (+86 -0)
  73. src/api/perception_lidar_cnn_segmentation/caffe/layers/embed_layer.hpp (+52 -0)
  74. src/api/perception_lidar_cnn_segmentation/caffe/layers/euclidean_loss_layer.hpp (+107 -0)
  75. src/api/perception_lidar_cnn_segmentation/caffe/layers/exp_layer.hpp (+80 -0)
  76. src/api/perception_lidar_cnn_segmentation/caffe/layers/filter_layer.hpp (+77 -0)
  77. src/api/perception_lidar_cnn_segmentation/caffe/layers/flatten_layer.hpp (+61 -0)
  78. src/api/perception_lidar_cnn_segmentation/caffe/layers/hdf5_data_layer.hpp (+64 -0)
  79. src/api/perception_lidar_cnn_segmentation/caffe/layers/hdf5_output_layer.hpp (+62 -0)
  80. src/api/perception_lidar_cnn_segmentation/caffe/layers/hinge_loss_layer.hpp (+104 -0)
  81. src/api/perception_lidar_cnn_segmentation/caffe/layers/im2col_layer.hpp (+65 -0)
  82. src/api/perception_lidar_cnn_segmentation/caffe/layers/image_data_layer.hpp (+47 -0)
  83. src/api/perception_lidar_cnn_segmentation/caffe/layers/infogain_loss_layer.hpp (+146 -0)
  84. src/api/perception_lidar_cnn_segmentation/caffe/layers/inner_product_layer.hpp (+52 -0)
  85. src/api/perception_lidar_cnn_segmentation/caffe/layers/input_layer.hpp (+42 -0)
  86. src/api/perception_lidar_cnn_segmentation/caffe/layers/log_layer.hpp (+82 -0)
  87. src/api/perception_lidar_cnn_segmentation/caffe/layers/loss_layer.hpp (+53 -0)
  88. src/api/perception_lidar_cnn_segmentation/caffe/layers/lrn_layer.hpp (+94 -0)
  89. src/api/perception_lidar_cnn_segmentation/caffe/layers/lstm_layer.hpp (+154 -0)
  90. src/api/perception_lidar_cnn_segmentation/caffe/layers/memory_data_layer.hpp (+63 -0)
  91. src/api/perception_lidar_cnn_segmentation/caffe/layers/multinomial_logistic_loss_layer.hpp (+92 -0)
  92. src/api/perception_lidar_cnn_segmentation/caffe/layers/mvn_layer.hpp (+48 -0)
  93. src/api/perception_lidar_cnn_segmentation/caffe/layers/neuron_layer.hpp (+32 -0)
  94. src/api/perception_lidar_cnn_segmentation/caffe/layers/parameter_layer.hpp (+45 -0)
  95. src/api/perception_lidar_cnn_segmentation/caffe/layers/pooling_layer.hpp (+61 -0)
  96. src/api/perception_lidar_cnn_segmentation/caffe/layers/power_layer.hpp (+89 -0)
  97. src/api/perception_lidar_cnn_segmentation/caffe/layers/prelu_layer.hpp (+101 -0)
  98. src/api/perception_lidar_cnn_segmentation/caffe/layers/python_layer.hpp (+55 -0)
  99. src/api/perception_lidar_cnn_segmentation/caffe/layers/recurrent_layer.hpp (+187 -0)
  100. src/api/perception_lidar_cnn_segmentation/caffe/layers/reduction_layer.hpp (+59 -0)

+ 91 - 0
src/api/commonfile/PyModuleCommModule.py

@@ -0,0 +1,91 @@
+
+import threading  
+import time  
+
+import modulecommpython
+import numpy as np  
+
+
+modulelock = threading.Lock()  
+nThread = 0
+  
+class PyModuleComm:  
+    def __init__(self,strname):  
+        # Initialization code...
+        print("name: ",strname)
+        self.strmemname = strname
+        self.mbRegister = False
+        global nThread
+        nThread = nThread+1
+        self.mnMode = 0
+        print("nThread = ",nThread)
+        self.obj = modulecommpython.get_ca_object()
+        pass  
+
+    def RegisterRecv(self,call):
+        if self.mbRegister:
+            print(" Have register, can't register other.")
+            return
+        print("Register: ",self.strmemname)
+        self.mpcall = call
+        self.mbRegister = True
+        self.mbRun = True
+        self.mpthread = threading.Thread(target=self.threadrecvdata, args=(self.strmemname,))  
+        self.mpthread.start()
+        print("complete create thread.")
+        self.mnMode = 1
+        self.obj.RegisterRecv(self.strmemname)
+
+    def RegiseterSend(self,nSize,nPacCount):
+        if self.mbRegister:
+            print(" Have register, can't register other.")
+            return
+        print("Register: ",self.strmemname)
+        self.mnsize = nSize
+        self.mnPacCount = nPacCount
+        self.mbRegister = True
+        self.mnMode = 2
+        self.obj.RegisterSend(self.strmemname,nSize,nPacCount)
+    
+    def SendData(self,arr,nsendsize):      
+    #    nrealsize = np.zeros(1, dtype=np.int32)  
+        nrtn = self.obj.SendData(arr,nsendsize)
+  
+    def threadrecvdata(self, arg):  
+        # This function runs on the receive thread.
+ #       print(f"thread started, argument is {arg}")
+        nBuffSize = int(1000)
+        arr = np.zeros(nBuffSize,dtype=np.int8)
+        recvtime = np.zeros(1,dtype=np.int64)
+        nrealsize = np.zeros(1,dtype=np.int32)
+        while self.mbRun:
+            nrtn = self.obj.RecvData(arr,nBuffSize,nrealsize,recvtime)
+            if nrtn > 0:
+                self.mpcall(arr,nrtn,recvtime)
+            else:
+                pass
+            if nrtn < 0:
+                nBuffSize = int(nrealsize[0] * 2)
+                arr = np.zeros(nBuffSize,dtype=np.int8)
+            else:
+                time.sleep(0.001)
+            
+        print("threadrecvdata complete.")
+
+    def stop_thread(self):
+        self.mbRun = False
+        self.mpthread.join()
+  
+    def start_thread(self, arg):
+        # Create the thread object; target is the function to run on the thread.
+        self.mbRun = True
+        self.mpthread = threading.Thread(target=self.threadrecvdata, args=(arg,))
+
+        # Start the thread.
+        self.mpthread.start()
+
+        # Other code can go here; the main thread keeps running.
+        print("main thread continues...")
+
+        # Call join() if the caller needs to wait for the thread to finish.
+        # self.mpthread.join()

+ 1 - 0
src/api/commonfile/libcaffe.so

@@ -0,0 +1 @@
+libcaffe.so.1.0.0

BIN
src/api/commonfile/libcaffe.so.1.0.0


BIN
src/api/commonfile/libmodulecomm.so


BIN
src/api/commonfile/libxmlparam.so


+ 80 - 0
src/api/commonfile/modulecomm.h

@@ -0,0 +1,80 @@
+#ifndef MODULECOMM_H
+#define MODULECOMM_H
+
+
+#include <QtCore/qglobal.h>
+#include <QDateTime>
+
+#include <functional>
+
+
+
+
+#if defined(MODULECOMM_LIBRARY)
+#  define MODULECOMMSHARED_EXPORT Q_DECL_EXPORT
+#else
+#  define MODULECOMMSHARED_EXPORT Q_DECL_IMPORT
+#endif
+
+
+
+
+
+
+//#include <iostream>
+//#include <thread>
+
+//using namespace std::placeholders;
+
+#ifndef IV_MODULE_FUN
+
+typedef std::function<void(const char * ,const unsigned int , const unsigned int , QDateTime * ,const char *)> ModuleFun;
+typedef void (* SMCallBack)(const char * strdata,const unsigned int nSize,const unsigned int index,const QDateTime * dt,const char * strmemname);
+#define IV_MODULE_FUN
+#endif
+
+namespace iv {
+namespace modulecomm {
+
+enum ModuleComm_TYPE
+{
+    ModuleComm_SHAREMEM = 0,
+    ModuleComm_INTERIOR = 1,
+    ModuleComm_FASTRTPS = 2,
+    ModuleComm_FASTRTPS_TCP = 3,
+    ModuleComm_UNDEFINE = 4
+};
+
+void *  RegisterSend(const char * strcommname);
+
+
+void *  RegisterSend(const char * strcommname,const unsigned int nBufSize,const unsigned int nMsgBufCount
+                                            ,ModuleComm_TYPE xmctype,const unsigned short nport);
+void *  RegisterRecv(const char * strcommname,SMCallBack pCall,
+                                            ModuleComm_TYPE xmctype ,const char * strip,const unsigned short );
+void *  RegisterRecvPlus(const char * strcommname,ModuleFun xFun,
+                                                ModuleComm_TYPE xmctype,const char * strip,const unsigned short );
+
+
+//void * MODULECOMMSHARED_EXPORT RegisterSend(const char * strcommname,const unsigned int nBufSize,const unsigned int nMsgBufCount
+//                                            ,ModuleComm_TYPE xmctype = ModuleComm_UNDEFINE,const unsigned short nport = 5100);
+//void * MODULECOMMSHARED_EXPORT RegisterRecv(const char * strcommname,SMCallBack pCall,
+//                                            ModuleComm_TYPE xmctype = ModuleComm_UNDEFINE,const char * strip = 0,const unsigned short = 5100);
+//void * MODULECOMMSHARED_EXPORT RegisterRecvPlus(const char * strcommname,ModuleFun xFun,
+//                                                ModuleComm_TYPE xmctype = ModuleComm_UNDEFINE,const char * strip = 0,const unsigned short = 5100);
+
+void MODULECOMMSHARED_EXPORT ModuleSendMsg(void * pHandle,const char * strdata,const unsigned int nDataLen);
+void MODULECOMMSHARED_EXPORT Unregister(void * pHandle);
+void MODULECOMMSHARED_EXPORT PauseComm(void * pHandle);
+void MODULECOMMSHARED_EXPORT ContintuComm(void * pHandle);
+
+void *  RegisterSend(const char * strcommname,const unsigned int nBufSize,const unsigned int nMsgBufCount);
+void *  RegisterRecv(const char * strcommname,SMCallBack pCall);
+void *  RegisterRecvPlus(const char * strcommname,ModuleFun xFun);
+
+
+}
+
+}
+
+#endif // MODULECOMM_H
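
A sender-side sketch of the same API; the channel name, buffer size, and queue depth below are placeholder values:

    #include <cstring>
    #include "modulecomm.h"

    int main()
    {
        // Register a send channel: name, per-message buffer size, message queue depth.
        void * h = iv::modulecomm::RegisterSend("lidar_pc", 1000000, 10);

        char payload[100];
        std::memset(payload, 0, sizeof(payload));

        // Publish one packet to every receiver registered on "lidar_pc".
        iv::modulecomm::ModuleSendMsg(h, payload, sizeof(payload));

        iv::modulecomm::Unregister(h);
        return 0;
    }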

BIN
src/api/commonfile/modulecommpython.so


+ 47 - 0
src/api/commonfile/xmlparam.h

@@ -0,0 +1,47 @@
+#ifndef XMLPARAM_H
+#define XMLPARAM_H
+
+
+#include <QtCore/qglobal.h>
+
+#include <string>
+
+#if defined(XMLPARAM_LIBRARY)
+#  define XMLPARAMSHARED_EXPORT Q_DECL_EXPORT
+#else
+#  define XMLPARAMSHARED_EXPORT Q_DECL_IMPORT
+#endif
+
+namespace iv {
+namespace xmlparam {
+
+class XMLPARAMSHARED_EXPORT Xmlparam
+{
+
+public:
+    Xmlparam(std::string filepath);
+    std::string GetParam(std::string paramname,std::string defaultvalue);
+    std::string GetParam(std::string paramname,const char *  strdefaultvalue);
+    int GetParam(std::string paramname, int ndefaultvalue);
+    double GetParam(std::string paramname, double dfdefvalue);
+    float GetParam(std::string paramname, float fdefvalue);
+    bool GetParam(std::string paramname, bool bdefValue);
+
+    void GetParam(std::string paramname,std::string & strvalue,std::string defaultvalue);
+    void GetParam(std::string paramname,std::string & strvalue,const char *  strdefaultvalue);
+    void GetParam(std::string paramname, int & nvalue, int ndefaultvalue);
+    void GetParam(std::string paramname, double & dfvalue,double dfdefvalue);
+    void GetParam(std::string paramname, float & fvalue, float fdefvalue);
+    void GetParam(std::string paramname, bool & bvalue,bool bdefvalue);
+
+    QString GetVersion();
+private:
+    int * mpx;
+
+};
+}
+
+}
+
+
+#endif // XMLPARAM_H
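
A usage sketch for this parameter reader; the file name and parameter names are made up for illustration. Each overload returns the stored value, or the supplied default when the parameter is missing from the file:

    #include <cstdio>
    #include "xmlparam.h"

    int main()
    {
        iv::xmlparam::Xmlparam xp("perception.xml");

        std::string msgname = xp.GetParam("lidar_msgname", "lidar_pc");
        int port            = xp.GetParam("port", 5100);
        double dt           = xp.GetParam("dt", 0.1);
        bool usetracker     = xp.GetParam("use_tracker", true);

        printf("%s %d %f %d\n", msgname.c_str(), port, dt, (int)usetracker);
        return 0;
    }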

+ 294 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/Ctracker.cpp

@@ -0,0 +1,294 @@
+#include "Ctracker.h"
+///
+/// \brief CTracker::CTracker
+/// Tracker. Manage tracks. Create, remove, update.
+/// \param settings
+///
+CTracker::CTracker(TrackerSettings& settings)
+{
+    m_settings = settings;
+}
+///
+/// \brief CTracker::CTracker
+/// Tracker. Manage tracks. Create, remove, update.
+/// \param settings
+///
+CTracker::CTracker(const TrackerSettings& settings)
+    :
+      m_settings(settings),
+      m_nextTrackID(0)
+{
+    ShortPathCalculator* spcalc = nullptr;
+    SPSettings spSettings = { settings.m_distThres, 12 };
+    switch (m_settings.m_matchType)
+    {
+    case tracking::MatchHungrian:
+        spcalc = new SPHungrian(spSettings);
+        break;
+        //    case tracking::MatchBipart:
+        //        spcalc = new SPBipart(spSettings);
+        //        break;
+    }
+    assert(spcalc != nullptr);
+    m_SPCalculator = std::unique_ptr<ShortPathCalculator>(spcalc);
+}
+
+///
+/// \brief CTracker::~CTracker
+///
+CTracker::~CTracker(void)
+{
+}
+///
+/// \brief CTracker::setSettings
+/// Replace the tracker settings and reset the track ID counter.
+/// \param settings
+///
+void CTracker::setSettings(TrackerSettings& settings)
+{
+    m_settings = settings;
+    m_nextTrackID = 0;
+    ShortPathCalculator* spcalc = nullptr;
+    SPSettings spSettings = { settings.m_distThres, 12 };
+    switch (m_settings.m_matchType)
+    {
+    case tracking::MatchHungrian:
+        spcalc = new SPHungrian(spSettings);
+        break;
+        //    case tracking::MatchBipart:
+        //        spcalc = new SPBipart(spSettings);
+        //        break;
+    }
+    assert(spcalc != nullptr);
+    m_SPCalculator = std::unique_ptr<ShortPathCalculator>(spcalc);
+}
+///
+/// \brief CTracker::Update
+/// \param regions
+/// \param currFrame
+/// \param fps
+///
+void CTracker::Update(
+        const regions_t& regions,
+        cv::UMat currFrame,
+        float fps
+        )
+{
+    UpdateTrackingState(regions, currFrame, fps);
+
+    currFrame.copyTo(m_prevFrame);
+}
+
+///
+/// \brief CTracker::UpdateTrackingState
+/// \param regions
+/// \param currFrame
+/// \param fps
+///
+void CTracker::UpdateTrackingState(
+        const regions_t& regions,
+        cv::UMat currFrame,
+        float fps
+        )
+{
+    const size_t N = m_tracks.size();	// Tracking objects
+    const size_t M = regions.size();	// Detections or regions
+
+    assignments_t assignment(N, -1); // Assignments: tracks -> regions
+
+    if (!m_tracks.empty())
+    {
+        // Distance matrix between all tracks and all regions
+        distMatrix_t costMatrix(N * M);
+        const track_t maxPossibleCost = 1e3;//static_cast<track_t>(currFrame.cols * currFrame.rows);
+        track_t maxCost = 0;
+        CreateDistaceMatrix(regions, costMatrix, maxPossibleCost, maxCost, currFrame);
+
+        // Solving assignment problem (shortest paths)
+        m_SPCalculator->Solve(costMatrix, N, M, assignment, maxCost);//row->col(trackid->regionid)
+
+        // clean assignment from pairs with large distance
+        for (size_t i = 0; i < assignment.size(); i++)
+        {
+            if (assignment[i] != -1)
+            {
+#ifdef DEBUG_SHOW
+                std::cout<<costMatrix[i + assignment[i] * N]<<", ";
+#endif
+                if (costMatrix[i + assignment[i] * N] > m_settings.m_distThres)
+                {
+                    assignment[i] = -1;
+                    m_tracks[i]->SkippedFrames()++;
+                }
+            }
+            else
+            {
+#ifdef DEBUG_SHOW
+                std::cout<<-1<<", ";
+#endif
+                // If the track has no assigned detection, then increment its skipped-frames counter.
+                m_tracks[i]->SkippedFrames()++;
+            }
+        }
+#ifdef DEBUG_SHOW
+                std::cout<<std::endl;
+#endif
+        // If a track has received no detections for a long time, remove it.
+        for (size_t i = 0; i < m_tracks.size();)
+        {
+            if (m_tracks[i]->SkippedFrames() > m_settings.m_maximumAllowedSkippedFrames ||
+                    m_tracks[i]->IsStaticTimeout(cvRound(fps * (m_settings.m_maxStaticTime - m_settings.m_minStaticTime))))
+            {
+                m_tracks.erase(m_tracks.begin() + i);
+                assignment.erase(assignment.begin() + i);
+            }
+            else
+            {
+                ++i;
+            }
+        }
+    }
+
+    // Search for unassigned detects and start new tracks for them.
+    for (size_t i = 0; i < regions.size(); ++i)
+    {
+        if (find(assignment.begin(), assignment.end(), i) == assignment.end())
+        {
+            m_tracks.push_back(std::make_unique<CTrack>(regions[i],
+                                                        m_settings.m_kalmanType,
+                                                        m_settings.m_dt,
+                                                        m_settings.m_accelNoiseMag,
+                                                        m_settings.m_useAcceleration,
+                                                        m_nextTrackID++%500,
+                                                        i,
+                                                        m_settings.m_filterGoal,
+                                                        m_settings.m_lostTrackType));
+        }
+    }
+
+    // Update Kalman Filters state
+    const ptrdiff_t stop_i = static_cast<ptrdiff_t>(assignment.size());
+#pragma omp parallel for
+    for (ptrdiff_t i = 0; i < stop_i; ++i)
+    {
+        // If the track has not yet been updated, its filter state is not correct.
+        if (assignment[i] != -1) // If we have assigned detect, then update using its coordinates,
+        {
+            m_tracks[i]->DetectedFrames()++;
+            m_tracks[i]->SkippedFrames() = 0;
+            m_tracks[i]->Update(
+                        regions[assignment[i]], true,
+                    m_settings.m_maxTraceLength,
+                    m_prevFrame, currFrame,
+                    m_settings.m_useAbandonedDetection ? cvRound(m_settings.m_minStaticTime * fps) : 0);
+        }
+        else				     // if not continue using predictions
+        {
+            m_tracks[i]->Update(CRegion(), false, m_settings.m_maxTraceLength, m_prevFrame, currFrame, 0);
+        }
+        m_tracks[i]->m_regionID = assignment[i];
+    }
+}
+
+///
+/// \brief CTracker::CreateDistaceMatrix
+/// \param regions
+/// \param costMatrix
+/// \param maxPossibleCost
+/// \param maxCost
+///
+void CTracker::CreateDistaceMatrix(const regions_t& regions, distMatrix_t& costMatrix, track_t maxPossibleCost, track_t& maxCost, cv::UMat currFrame)
+{
+    const size_t N = m_tracks.size();	// Tracking objects
+    maxCost = 0;
+
+    for (size_t i = 0; i < N; ++i)
+    {
+        const auto& track = m_tracks[i];
+
+        // Calc predicted area for track
+//        cv::Size_<track_t> minRadius;
+//        if (m_settings.m_minAreaRadiusPix < 0)
+//        {
+//            minRadius.width = m_settings.m_minAreaRadiusK * track->LastRegion().m_rect.size.width;
+//            minRadius.height = m_settings.m_minAreaRadiusK * track->LastRegion().m_rect.size.height;
+//        }
+//        else
+//        {
+//            minRadius.width = m_settings.m_minAreaRadiusPix;
+//            minRadius.height = m_settings.m_minAreaRadiusPix;
+//        }
+        //cv::RotatedRect predictedArea = track->CalcPredictionEllipse(minRadius);
+
+        // Calc distance between track and regions
+        for (size_t j = 0; j < regions.size(); ++j)
+        {
+            const auto& reg = regions[j];
+
+            auto dist = maxPossibleCost;
+            if (reg.m_type==-1?(m_settings.CheckType(m_tracks[i]->LastRegion().m_type_name, reg.m_type_name)):(m_settings.CheckType(m_tracks[i]->LastRegion().m_type, reg.m_type)))
+            {
+                dist = 0;
+                size_t ind = 0;
+                if (m_settings.m_distType[ind] > 0.0f && ind == tracking::DistCenters)
+                {
+#if 0
+                    track_t ellipseDist = track->IsInsideArea(reg.m_rrect.center, predictedArea);
+                    if (ellipseDist > 1)
+                        dist += m_settings.m_distType[ind];
+                    else
+                        dist += ellipseDist * m_settings.m_distType[ind];
+#else
+                    dist += m_settings.m_distType[ind] * track->CalcDistCenter(reg);
+#endif
+                }
+                ++ind;
+
+                if (m_settings.m_distType[ind] > 0.0f && ind == tracking::DistRects)
+                {
+#if 0
+                    track_t ellipseDist = track->IsInsideArea(reg.m_rect.center, predictedArea);
+                    if (ellipseDist < 1)
+                    {
+                        track_t dw = track->WidthDist(reg);
+                        track_t dh = track->HeightDist(reg);
+                        dist += m_settings.m_distType[ind] * (1 - (1 - ellipseDist) * (dw + dh) * 0.5f);
+                    }
+                    else
+                    {
+                        dist += m_settings.m_distType[ind];
+                    }
+                    //std::cout << "dist = " << dist << ", ed = " << ellipseDist << ", dw = " << dw << ", dh = " << dh << std::endl;
+#else
+                    dist += m_settings.m_distType[ind] * track->CalcDistRect(reg);
+#endif
+                }
+                ++ind;
+                if (m_settings.m_distType[ind] > 0.0f && ind == tracking::DistRect3Ds)
+                {
+#if 0
+                    track_t ellipseDist = track->IsInsideArea(reg.m_rect.center, predictedArea);
+                    if (ellipseDist < 1)
+                    {
+                        track_t dw = track->WidthDist(reg);
+                        track_t dh = track->HeightDist(reg);
+                        dist += m_settings.m_distType[ind] * (1 - (1 - ellipseDist) * (dw + dh) * 0.5f);
+                    }
+                    else
+                    {
+                        dist += m_settings.m_distType[ind];
+                    }
+                    //std::cout << "dist = " << dist << ", ed = " << ellipseDist << ", dw = " << dw << ", dh = " << dh << std::endl;
+#else
+                    dist += m_settings.m_distType[ind] * track->CalcDistRect3D(reg);
+#endif
+                }
+                ++ind;
+                assert(ind == tracking::DistsCount);
+            }
+            costMatrix[i + j * N] = dist;
+            if (dist > maxCost)
+                maxCost = dist;
+        }
+    }
+}
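
Note that CreateDistaceMatrix stores the cost of pairing track i with region j at costMatrix[i + j * N], i.e. column-major with N tracks per column; Solve and the threshold check above read it the same way. A toy illustration of that layout, assuming distMatrix_t is the std::vector<track_t> alias from defines.h:

    #include <cstddef>
    #include "Ctracker.h"

    // 2 tracks (rows) x 3 regions (columns), stored column by column.
    static distMatrix_t MakeExampleCostMatrix()
    {
        const size_t N = 2, M = 3;
        distMatrix_t cost(N * M);
        for (size_t i = 0; i < N; ++i)       // track index (row)
            for (size_t j = 0; j < M; ++j)   // region index (column)
                cost[i + j * N] = static_cast<track_t>(i * 10 + j);
        // Memory order: (0,0) (1,0) (0,1) (1,1) (0,2) (1,2)
        return cost;
    }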

+ 292 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/Ctracker.h

@@ -0,0 +1,292 @@
+#pragma once
+#include <iostream>
+#include <vector>
+#include <memory>
+#include <array>
+#include <deque>
+#include <numeric>
+#include <map>
+#include <set>
+
+#include "defines.h"
+#include "track.h"
+#include "ShortPathCalculator.h"
+
+// ----------------------------------------------------------------------
+
+///
+/// \brief The TrackerSettings struct
+///
+struct TrackerSettings
+{
+    tracking::KalmanType m_kalmanType = tracking::KalmanLinear;
+    tracking::FilterGoal m_filterGoal = tracking::FilterCenter;
+    tracking::LostTrackType m_lostTrackType = tracking::TrackNone;
+    tracking::MatchType m_matchType = tracking::MatchHungrian;
+
+	std::array<track_t, tracking::DistsCount> m_distType;
+
+    ///
+    /// \brief m_dt
+    /// Time step for Kalman
+    ///
+    track_t m_dt = 1.0f;
+
+    ///
+    /// \brief m_accelNoiseMag
+    /// Noise magnitude for Kalman
+    ///
+    track_t m_accelNoiseMag = 0.1f;
+
+	///
+	/// \brief m_useAcceleration
+	/// Constant velocity or constant acceleration motion model
+	///
+	bool m_useAcceleration = false;
+
+    ///
+    /// \brief m_distThres
+    /// Distance threshold for Assignment problem: from 0 to 1
+    ///
+    track_t m_distThres = 0.8f;
+
+    ///
+    /// \brief m_minAreaRadiusPix
+    /// Minimal area radius in pixels for objects centers
+    ///
+    track_t m_minAreaRadiusPix = 20.f;
+
+	///
+	/// \brief m_minAreaRadiusK
+	/// Minimal area radius as a ratio of the object size. Used if m_minAreaRadiusPix < 0
+	///
+	track_t m_minAreaRadiusK = 0.5f;
+
+    ///
+    /// \brief m_maximumAllowedSkippedFrames
+    /// If an object has no assignment for more than this many frames, it is removed
+    ///
+    size_t m_maximumAllowedSkippedFrames = 25;
+
+    ///
+    /// \brief m_maxTraceLength
+    /// The maximum trajectory length
+    ///
+    size_t m_maxTraceLength = 50;
+
+    ///
+    /// \brief m_useAbandonedDetection
+    /// Detection of abandoned objects
+    ///
+    bool m_useAbandonedDetection = false;
+
+    ///
+    /// \brief m_minStaticTime
+    /// After this time (in seconds) the object is considered abandoned
+    ///
+    int m_minStaticTime = 5;
+    ///
+    /// \brief m_maxStaticTime
+    /// After this time (in seconds) the abandoned object will be removed
+    ///
+    int m_maxStaticTime = 25;
+
+	///
+	/// \brief m_nearTypes
+	/// Object types that can be matched while tracking
+	///
+    std::map<int, std::set<int>> m_nearTypes;
+
+    std::map<std::string, std::set<std::string>> m_nearTypeNames;
+
+	///
+	TrackerSettings()
+	{
+        m_distType[tracking::DistCenters] = 0.5f;
+        m_distType[tracking::DistRects] = 0.5f;
+        m_distType[tracking::DistRect3Ds] = 0.f;
+
+		assert(CheckDistance());
+	}
+
+	///
+	bool CheckDistance() const
+	{
+		track_t sum = std::accumulate(m_distType.begin(), m_distType.end(), 0.0f);
+        track_t maxOne = std::max(1.0f, std::fabs(sum));
+		return std::fabs(sum - 1.0f) <= std::numeric_limits<track_t>::epsilon() * maxOne;
+	}
+
+	///
+	bool SetDistances(std::array<track_t, tracking::DistsCount> distType)
+	{
+		bool res = true;
+		auto oldDists = m_distType;
+		m_distType = distType;
+		if (!CheckDistance())
+		{
+			m_distType = oldDists;
+			res = false;
+		}
+		return res;
+	}
+
+	///
+	bool SetDistance(tracking::DistType distType)
+	{
+		std::fill(m_distType.begin(), m_distType.end(), 0.0f);
+		m_distType[distType] = 1.f;
+		return true;
+	}
+
+	///
+    void AddNearTypes(const int& type1, const int& type2, bool sym)
+	{
+        auto AddOne = [&](const int& type1, const int& type2)
+		{
+			auto it = m_nearTypes.find(type1);
+			if (it == std::end(m_nearTypes))
+			{
+                m_nearTypes[type1] = std::set<int>{ type2 };
+			}
+			else
+			{
+				it->second.insert(type2);
+			}
+		};
+		AddOne(type1, type2);
+		if (sym)
+        {
+			AddOne(type2, type1);
+        }
+	}
+
+	///
+    bool CheckType(const int& type1, const int& type2) const
+	{
+        bool res = type1==-1 || type2==-1 || (type1 == type2);
+		if (!res)
+		{
+			auto it = m_nearTypes.find(type1);
+			if (it != std::end(m_nearTypes))
+			{
+				res = it->second.find(type2) != std::end(it->second);
+			}
+		}
+		return res;
+	}
+
+    ///
+    void AddNearTypes(const std::string& type1, const std::string& type2, bool sym)
+    {
+        auto AddOne = [&](const std::string& type1, const std::string& type2)
+        {
+            auto it = m_nearTypeNames.find(type1);
+            if (it == std::end(m_nearTypeNames))
+            {
+                m_nearTypeNames[type1] = std::set<std::string>{ type2 };
+            }
+            else
+            {
+                it->second.insert(type2);
+            }
+        };
+        AddOne(type1, type2);
+        if (sym)
+        {
+            AddOne(type2, type1);
+        }
+    }
+
+    ///
+    bool CheckType(const std::string& type1, const std::string& type2) const
+    {
+        bool res = type1.empty() || type2.empty() || (type1 == type2);
+        if (!res)
+        {
+            auto it = m_nearTypeNames.find(type1);
+            if (it != std::end(m_nearTypeNames))
+            {
+                res = it->second.find(type2) != std::end(it->second);
+            }
+        }
+        return res;
+    }
+};
+
+///
+/// \brief The CTracker class
+///
+class CTracker
+{
+public:
+    CTracker(TrackerSettings& settings);
+    CTracker(const TrackerSettings& settings);
+    CTracker(const CTracker&) = delete;
+    CTracker(CTracker&&) = delete;
+	CTracker& operator=(const CTracker&) = delete;
+	CTracker& operator=(CTracker&&) = delete;
+	
+	~CTracker(void);
+    void setSettings(TrackerSettings& settings);
+    void Update(const regions_t& regions, cv::UMat currFrame, float fps);
+
+    ///
+    /// \brief CanGrayFrameToTrack
+    /// \return
+    ///
+    bool CanGrayFrameToTrack() const
+    {
+        bool needColor = (m_settings.m_lostTrackType == tracking::LostTrackType::TrackNone);
+        return !needColor;
+    }
+
+	///
+	/// \brief CanColorFrameToTrack
+	/// \return
+	///
+	bool CanColorFrameToTrack() const
+	{
+		return true;
+	}
+
+    ///
+    /// \brief GetTracksCount
+    /// \return
+    ///
+	size_t GetTracksCount() const
+	{
+		return m_tracks.size();
+	}
+    ///
+    /// \brief GetTracks
+    /// \return
+    ///
+	std::vector<TrackingObject> GetTracks() const
+	{
+		std::vector<TrackingObject> tracks;
+		if (!m_tracks.empty())
+		{
+			tracks.reserve(m_tracks.size());
+			for (const auto& track : m_tracks)
+			{
+                tracks.push_back(track->ConstructObject());
+			}
+		}
+		return tracks;
+	}
+
+private:
+    TrackerSettings m_settings;
+
+	tracks_t m_tracks;
+
+    size_t m_nextTrackID;
+
+    cv::UMat m_prevFrame;
+
+    std::unique_ptr<ShortPathCalculator> m_SPCalculator;
+
+    void CreateDistaceMatrix(const regions_t& regions, distMatrix_t& costMatrix, track_t maxPossibleCost, track_t& maxCost, cv::UMat currFrame);
+    void UpdateTrackingState(const regions_t& regions, cv::UMat currFrame, float fps);
+};
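
A minimal sketch of driving the tracker; the regions_t input and the frame are assumed to come from the detection pipeline, and the parameter values are placeholders:

    #include "Ctracker.h"

    void Demo(const std::vector<regions_t>& frames_regions, cv::UMat frame)
    {
        TrackerSettings s;
        s.SetDistance(tracking::DistCenters);    // match on center distance only
        s.m_distThres = 0.8f;                    // reject assignments above this cost
        s.m_maximumAllowedSkippedFrames = 25;    // prune tracks unseen this long

        // Use the const TrackerSettings overload: per Ctracker.cpp it also
        // builds the assignment solver (the non-const overload only copies settings).
        const TrackerSettings cs = s;
        CTracker tracker(cs);

        // The tracker is stateful: feed it one detection set per frame.
        for (const auto& regions : frames_regions)
        {
            tracker.Update(regions, frame, 30.f);
            for (const auto& obj : tracker.GetTracks())
            {
                (void)obj; // per-track id, trajectory, etc.
            }
        }
    }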

+ 723 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/HungarianAlg.cpp

@@ -0,0 +1,723 @@
+#include "HungarianAlg.h"
+#include <limits>
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+AssignmentProblemSolver::AssignmentProblemSolver()
+{
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+AssignmentProblemSolver::~AssignmentProblemSolver()
+{
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+track_t AssignmentProblemSolver::Solve(
+	const distMatrix_t& distMatrixIn,
+	size_t nOfRows,
+	size_t nOfColumns,
+	std::vector<int>& assignment,
+	TMethod Method
+	)
+{
+	assignment.resize(nOfRows, -1);
+
+	track_t cost = 0;
+
+	switch (Method)
+	{
+	case optimal:
+		assignmentoptimal(assignment, cost, distMatrixIn, nOfRows, nOfColumns);
+		break;
+
+	case many_forbidden_assignments:
+		assignmentsuboptimal1(assignment, cost, distMatrixIn, nOfRows, nOfColumns);
+		break;
+
+	case without_forbidden_assignments:
+		assignmentsuboptimal2(assignment, cost, distMatrixIn, nOfRows, nOfColumns);
+		break;
+	}
+
+	return cost;
+}
+// --------------------------------------------------------------------------
+// Computes the optimal assignment (minimum overall costs) using Munkres algorithm.
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::assignmentoptimal(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns)
+{
+	// Generate the distance matrix
+	// and check that its elements are non-negative :)
+
+	// Total elements number
+	size_t nOfElements = nOfRows * nOfColumns;
+	// Memory allocation
+	track_t* distMatrix = (track_t *)malloc(nOfElements * sizeof(track_t));
+
+    if (distMatrix == nullptr)
+    {
+        return;
+    }
+
+	// Pointer to last element
+	track_t* distMatrixEnd = distMatrix + nOfElements;
+
+	for (size_t row = 0; row < nOfElements; row++)
+	{
+        track_t value = distMatrixIn[row];
+		assert(value >= 0);
+		distMatrix[row] = value;
+	}
+
+	// Memory allocation
+	bool* coveredColumns = (bool*)calloc(nOfColumns, sizeof(bool));
+	bool* coveredRows = (bool*)calloc(nOfRows, sizeof(bool));
+	bool* starMatrix = (bool*)calloc(nOfElements, sizeof(bool));
+	bool* primeMatrix = (bool*)calloc(nOfElements, sizeof(bool));
+	bool* newStarMatrix = (bool*)calloc(nOfElements, sizeof(bool)); /* used in step4 */
+
+	/* preliminary steps */
+	if (nOfRows <= nOfColumns)
+	{
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			/* find the smallest element in the row */
+            track_t* distMatrixTemp = distMatrix + row;
+			track_t  minValue = *distMatrixTemp;
+			distMatrixTemp += nOfRows;
+			while (distMatrixTemp < distMatrixEnd)
+			{
+				track_t value = *distMatrixTemp;
+				if (value < minValue)
+				{
+					minValue = value;
+				}
+				distMatrixTemp += nOfRows;
+			}
+			/* subtract the smallest element from each element of the row */
+			distMatrixTemp = distMatrix + row;
+			while (distMatrixTemp < distMatrixEnd)
+			{
+				*distMatrixTemp -= minValue;
+				distMatrixTemp += nOfRows;
+			}
+		}
+		/* Steps 1 and 2a */
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			for (size_t col = 0; col < nOfColumns; col++)
+			{
+				if (distMatrix[row + nOfRows*col] == 0)
+				{
+					if (!coveredColumns[col])
+					{
+						starMatrix[row + nOfRows * col] = true;
+						coveredColumns[col] = true;
+						break;
+					}
+				}
+			}
+		}
+	}
+	else /* if(nOfRows > nOfColumns) */
+	{
+		for (size_t col = 0; col < nOfColumns; col++)
+		{
+			/* find the smallest element in the column */
+			track_t* distMatrixTemp = distMatrix + nOfRows*col;
+			track_t* columnEnd = distMatrixTemp + nOfRows;
+			track_t  minValue = *distMatrixTemp++;
+			while (distMatrixTemp < columnEnd)
+			{
+				track_t value = *distMatrixTemp++;
+				if (value < minValue)
+				{
+					minValue = value;
+				}
+			}
+			/* subtract the smallest element from each element of the column */
+			distMatrixTemp = distMatrix + nOfRows*col;
+			while (distMatrixTemp < columnEnd)
+			{
+				*distMatrixTemp++ -= minValue;
+			}
+		}
+		/* Steps 1 and 2a */
+		for (size_t col = 0; col < nOfColumns; col++)
+		{
+			for (size_t row = 0; row < nOfRows; row++)
+			{
+				if (distMatrix[row + nOfRows*col] == 0)
+				{
+					if (!coveredRows[row])
+					{
+						starMatrix[row + nOfRows*col] = true;
+						coveredColumns[col] = true;
+						coveredRows[row] = true;
+						break;
+					}
+				}
+			}
+		}
+
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			coveredRows[row] = false;
+		}
+	}
+	/* move to step 2b */
+	step2b(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, (nOfRows <= nOfColumns) ? nOfRows : nOfColumns);
+	/* compute cost and remove invalid assignments */
+	computeassignmentcost(assignment, cost, distMatrixIn, nOfRows);
+	/* free allocated memory */
+	free(distMatrix);
+	free(coveredColumns);
+	free(coveredRows);
+	free(starMatrix);
+	free(primeMatrix);
+	free(newStarMatrix);
+	return;
+}
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::buildassignmentvector(assignments_t& assignment, bool *starMatrix, size_t nOfRows, size_t nOfColumns)
+{
+    for (size_t row = 0; row < nOfRows; row++)
+	{
+        for (size_t col = 0; col < nOfColumns; col++)
+		{
+			if (starMatrix[row + nOfRows * col])
+			{
+				assignment[row] = static_cast<int>(col);
+				break;
+			}
+		}
+	}
+}
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::computeassignmentcost(const assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows)
+{
+	for (size_t row = 0; row < nOfRows; row++)
+	{
+		const int col = assignment[row];
+		if (col >= 0)
+		{
+			cost += distMatrixIn[row + nOfRows * col];
+		}
+	}
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::step2a(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim)
+{
+    bool *starMatrixTemp, *columnEnd;
+    /* cover every column containing a starred zero */
+    for (size_t col = 0; col < nOfColumns; col++)
+    {
+        starMatrixTemp = starMatrix + nOfRows * col;
+        columnEnd = starMatrixTemp + nOfRows;
+        while (starMatrixTemp < columnEnd)
+        {
+            if (*starMatrixTemp++)
+            {
+                coveredColumns[col] = true;
+                break;
+            }
+        }
+    }
+    /* move to step 3 */
+	step2b(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::step2b(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim)
+{
+	/* count covered columns */
+    size_t nOfCoveredColumns = 0;
+    for (size_t col = 0; col < nOfColumns; col++)
+	{
+		if (coveredColumns[col])
+		{
+			nOfCoveredColumns++;
+		}
+	}
+	if (nOfCoveredColumns == minDim)
+	{
+		/* algorithm finished */
+		buildassignmentvector(assignment, starMatrix, nOfRows, nOfColumns);
+	}
+	else
+	{
+		/* move to step 3 */
+		step3_5(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+	}
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::step3_5(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim)
+{
+	for (;;)
+	{
+		/* step 3 */
+		bool zerosFound = true;
+		while (zerosFound)
+		{
+			zerosFound = false;
+			for (size_t col = 0; col < nOfColumns; col++)
+			{
+				if (!coveredColumns[col])
+				{
+					for (size_t row = 0; row < nOfRows; row++)
+					{
+						if ((!coveredRows[row]) && (distMatrix[row + nOfRows*col] == 0))
+						{
+							/* prime zero */
+							primeMatrix[row + nOfRows*col] = true;
+							/* find starred zero in current row */
+							size_t starCol = 0;
+							for (; starCol < nOfColumns; starCol++)
+							{
+								if (starMatrix[row + nOfRows * starCol])
+								{
+									break;
+								}
+							}
+							if (starCol == nOfColumns) /* no starred zero found */
+							{
+								/* move to step 4 */
+								step4(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim, row, col);
+								return;
+							}
+							else
+							{
+								coveredRows[row] = true;
+								coveredColumns[starCol] = false;
+								zerosFound = true;
+								break;
+							}
+						}
+					}
+				}
+			}
+		}
+		/* step 5 */
+        track_t h = std::numeric_limits<track_t>::max();
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			if (!coveredRows[row])
+			{
+				for (size_t col = 0; col < nOfColumns; col++)
+				{
+					if (!coveredColumns[col])
+					{
+                        const track_t value = distMatrix[row + nOfRows*col];
+						if (value < h)
+						{
+							h = value;
+						}
+					}
+				}
+			}
+		}
+		/* add h to each covered row */
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			if (coveredRows[row])
+			{
+				for (size_t col = 0; col < nOfColumns; col++)
+				{
+					distMatrix[row + nOfRows*col] += h;
+				}
+			}
+		}
+		/* subtract h from each uncovered column */
+		for (size_t col = 0; col < nOfColumns; col++)
+		{
+			if (!coveredColumns[col])
+			{
+				for (size_t row = 0; row < nOfRows; row++)
+				{
+					distMatrix[row + nOfRows*col] -= h;
+				}
+			}
+		}
+	}
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::step4(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim, size_t row, size_t col)
+{
+	const size_t nOfElements = nOfRows * nOfColumns;
+	/* generate temporary copy of starMatrix */
+	for (size_t n = 0; n < nOfElements; n++)
+	{
+		newStarMatrix[n] = starMatrix[n];
+	}
+	/* star current zero */
+	newStarMatrix[row + nOfRows*col] = true;
+	/* find starred zero in current column */
+	size_t starCol = col;
+	size_t starRow = 0;
+	for (; starRow < nOfRows; starRow++)
+	{
+		if (starMatrix[starRow + nOfRows * starCol])
+		{
+			break;
+		}
+	}
+	while (starRow < nOfRows)
+	{
+		/* unstar the starred zero */
+		newStarMatrix[starRow + nOfRows*starCol] = false;
+		/* find primed zero in current row */
+		size_t primeRow = starRow;
+		size_t primeCol = 0;
+		for (; primeCol < nOfColumns; primeCol++)
+		{
+			if (primeMatrix[primeRow + nOfRows * primeCol])
+			{
+				break;
+			}
+		}
+		/* star the primed zero */
+		newStarMatrix[primeRow + nOfRows*primeCol] = true;
+		/* find starred zero in current column */
+		starCol = primeCol;
+		for (starRow = 0; starRow < nOfRows; starRow++)
+		{
+			if (starMatrix[starRow + nOfRows * starCol])
+			{
+				break;
+			}
+		}
+	}
+	/* use temporary copy as new starMatrix */
+	/* delete all primes, uncover all rows */
+    for (size_t n = 0; n < nOfElements; n++)
+	{
+		primeMatrix[n] = false;
+		starMatrix[n] = newStarMatrix[n];
+	}
+    for (size_t n = 0; n < nOfRows; n++)
+	{
+		coveredRows[n] = false;
+	}
+	/* move to step 2a */
+	step2a(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+}
+
+// --------------------------------------------------------------------------
+// Computes a suboptimal solution. Good for cases without forbidden assignments.
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::assignmentsuboptimal2(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns)
+{
+	/* make working copy of distance Matrix */
+	const size_t nOfElements = nOfRows * nOfColumns;
+	track_t* distMatrix = (track_t*)malloc(nOfElements * sizeof(track_t));
+	for (size_t n = 0; n < nOfElements; n++)
+	{
+		distMatrix[n] = distMatrixIn[n];
+	}
+
+	/* repeatedly search for the minimum element and do the assignment */
+	for (;;)
+	{
+		/* find minimum distance observation-to-track pair */
+		track_t minValue = std::numeric_limits<track_t>::max();
+		size_t tmpRow = 0;
+		size_t tmpCol = 0;
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			for (size_t col = 0; col < nOfColumns; col++)
+			{
+				const track_t value = distMatrix[row + nOfRows*col];
+				if (value != std::numeric_limits<track_t>::max() && (value < minValue))
+				{
+					minValue = value;
+					tmpRow = row;
+					tmpCol = col;
+				}
+			}
+		}
+
+		if (minValue != std::numeric_limits<track_t>::max())
+		{
+			assignment[tmpRow] = static_cast<int>(tmpCol);
+			cost += minValue;
+			for (size_t n = 0; n < nOfRows; n++)
+			{
+				distMatrix[n + nOfRows*tmpCol] = std::numeric_limits<track_t>::max();
+			}
+			for (size_t n = 0; n < nOfColumns; n++)
+			{
+				distMatrix[tmpRow + nOfRows*n] = std::numeric_limits<track_t>::max();
+			}
+		}
+		else
+		{
+			break;
+		}
+	}
+
+	free(distMatrix);
+}
+// --------------------------------------------------------------------------
+// Computes a suboptimal solution. Good for cases with many forbidden assignments.
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::assignmentsuboptimal1(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns)
+{
+	/* make working copy of distance Matrix */
+	const size_t nOfElements = nOfRows * nOfColumns;
+	track_t* distMatrix = (track_t *)malloc(nOfElements * sizeof(track_t));
+    for (size_t n = 0; n < nOfElements; n++)
+	{
+		distMatrix[n] = distMatrixIn[n];
+	}
+
+	/* allocate memory */
+	int* nOfValidObservations = (int *)calloc(nOfRows, sizeof(int));
+	int* nOfValidTracks = (int *)calloc(nOfColumns, sizeof(int));
+
+	/* compute number of validations */
+	bool infiniteValueFound = false;
+	bool finiteValueFound = false;
+	for (size_t row = 0; row < nOfRows; row++)
+	{
+		for (size_t col = 0; col < nOfColumns; col++)
+		{
+			if (distMatrix[row + nOfRows*col] != std::numeric_limits<track_t>::max())
+			{
+				nOfValidTracks[col] += 1;
+				nOfValidObservations[row] += 1;
+				finiteValueFound = true;
+			}
+			else
+			{
+				infiniteValueFound = true;
+			}
+		}
+	}
+
+	if (infiniteValueFound)
+	{
+		if (!finiteValueFound)
+		{
+            /* free allocated memory */
+            free(nOfValidObservations);
+            free(nOfValidTracks);
+            free(distMatrix);
+
+			return;
+		}
+		bool repeatSteps = true;
+
+		while (repeatSteps)
+		{
+			repeatSteps = false;
+
+			/* step 1: reject assignments of multiply validated tracks to singly validated observations		 */
+			for (size_t col = 0; col < nOfColumns; col++)
+			{
+				bool singleValidationFound = false;
+				for (size_t row = 0; row < nOfRows; row++)
+				{
+					if (distMatrix[row + nOfRows * col] != std::numeric_limits<track_t>::max() && (nOfValidObservations[row] == 1))
+					{
+						singleValidationFound = true;
+						break;
+					}
+				}
+				if (singleValidationFound)
+				{
+					for (size_t nestedRow = 0; nestedRow < nOfRows; nestedRow++)
+						if ((nOfValidObservations[nestedRow] > 1) && distMatrix[nestedRow + nOfRows * col] != std::numeric_limits<track_t>::max())
+						{
+							distMatrix[nestedRow + nOfRows * col] = std::numeric_limits<track_t>::max();
+							nOfValidObservations[nestedRow] -= 1;
+							nOfValidTracks[col] -= 1;
+							repeatSteps = true;
+						}
+				}
+			}
+
+			/* step 2: reject assignments of multiply validated observations to singly validated tracks */
+			if (nOfColumns > 1)
+			{
+				for (size_t row = 0; row < nOfRows; row++)
+				{
+					bool singleValidationFound = false;
+                    for (size_t col = 0; col < nOfColumns; col++)
+					{
+						if (distMatrix[row + nOfRows*col] != std::numeric_limits<track_t>::max() && (nOfValidTracks[col] == 1))
+						{
+							singleValidationFound = true;
+							break;
+						}
+					}
+
+					if (singleValidationFound)
+					{
+						for (size_t col = 0; col < nOfColumns; col++)
+						{
+							if ((nOfValidTracks[col] > 1) && distMatrix[row + nOfRows*col] != std::numeric_limits<track_t>::max())
+							{
+								distMatrix[row + nOfRows*col] = std::numeric_limits<track_t>::max();
+								nOfValidObservations[row] -= 1;
+								nOfValidTracks[col] -= 1;
+								repeatSteps = true;
+							}
+						}
+					}
+				}
+			}
+		} /* while(repeatSteps) */
+
+		/* for each multiply validated track that validates only with singly validated  */
+		/* observations, choose the observation with minimum distance */
+        for (size_t row = 0; row < nOfRows; row++)
+		{
+			if (nOfValidObservations[row] > 1)
+			{
+				bool allSinglyValidated = true;
+				track_t minValue = std::numeric_limits<track_t>::max();
+				size_t tmpCol = 0;
+                for (size_t col = 0; col < nOfColumns; col++)
+				{
+					const track_t value = distMatrix[row + nOfRows*col];
+					if (value != std::numeric_limits<track_t>::max())
+					{
+						if (nOfValidTracks[col] > 1)
+						{
+							allSinglyValidated = false;
+							break;
+						}
+						else if ((nOfValidTracks[col] == 1) && (value < minValue))
+						{
+							tmpCol = col;
+							minValue = value;
+						}
+					}
+				}
+
+				if (allSinglyValidated)
+				{
+					assignment[row] = static_cast<int>(tmpCol);
+					cost += minValue;
+					for (size_t n = 0; n < nOfRows; n++)
+					{
+						distMatrix[n + nOfRows*tmpCol] = std::numeric_limits<track_t>::max();
+					}
+					for (size_t n = 0; n < nOfColumns; n++)
+					{
+						distMatrix[row + nOfRows*n] = std::numeric_limits<track_t>::max();
+					}
+				}
+			}
+		}
+
+		// for each multiply validated observation that validates only with singly validated  track, choose the track with minimum distance
+        for (size_t col = 0; col < nOfColumns; col++)
+		{
+			if (nOfValidTracks[col] > 1)
+			{
+				bool allSinglyValidated = true;
+				track_t minValue = std::numeric_limits<track_t>::max();
+				size_t tmpRow = 0;
+				for (size_t row = 0; row < nOfRows; row++)
+				{
+					const track_t value = distMatrix[row + nOfRows*col];
+					if (value != std::numeric_limits<track_t>::max())
+					{
+						if (nOfValidObservations[row] > 1)
+						{
+							allSinglyValidated = false;
+							break;
+						}
+						else if ((nOfValidObservations[row] == 1) && (value < minValue))
+						{
+							tmpRow = row;
+							minValue = value;
+						}
+					}
+				}
+
+				if (allSinglyValidated)
+				{
+					assignment[tmpRow] = static_cast<int>(col);
+					cost += minValue;
+					for (size_t n = 0; n < nOfRows; n++)
+					{
+						distMatrix[n + nOfRows*col] = std::numeric_limits<track_t>::max();
+					}
+					for (size_t n = 0; n < nOfColumns; n++)
+					{
+						distMatrix[tmpRow + nOfRows*n] = std::numeric_limits<track_t>::max();
+					}
+				}
+			}
+		}
+	} /* if(infiniteValueFound) */
+
+
+	/* now, repeatedly search for the minimum element and do the assignment */
+	for (;;)
+	{
+		/* find minimum distance observation-to-track pair */
+		track_t minValue = std::numeric_limits<track_t>::max();
+		size_t tmpRow = 0;
+		size_t tmpCol = 0;
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			for (size_t col = 0; col < nOfColumns; col++)
+			{
+				const track_t value = distMatrix[row + nOfRows*col];
+				if (value != std::numeric_limits<track_t>::max() && (value < minValue))
+				{
+					minValue = value;
+					tmpRow = row;
+					tmpCol = col;
+				}
+			}
+		}
+
+		if (minValue != std::numeric_limits<track_t>::max())
+		{
+			assignment[tmpRow] = static_cast<int>(tmpCol);
+			cost += minValue;
+			for (size_t n = 0; n < nOfRows; n++)
+			{
+				distMatrix[n + nOfRows*tmpCol] = std::numeric_limits<track_t>::max();
+			}
+			for (size_t n = 0; n < nOfColumns; n++)
+			{
+				distMatrix[tmpRow + nOfRows*n] = std::numeric_limits<track_t>::max();
+			}
+		}
+		else
+		{
+			break;
+		}
+	}
+
+	/* free allocated memory */
+	free(nOfValidObservations);
+	free(nOfValidTracks);
+	free(distMatrix);
+}

+ 39 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/HungarianAlg.h

@@ -0,0 +1,39 @@
+#include <vector>
+#include <iostream>
+#include <limits>
+#include <time.h>
+#include "defines.h"
+// http://community.topcoder.com/tc?module=Static&d1=tutorials&d2=hungarianAlgorithm
+
+///
+/// \brief The AssignmentProblemSolver class
+///
+class AssignmentProblemSolver
+{
+private:
+	// Computes the optimal assignment (minimum overall costs) using Munkres algorithm.
+	void assignmentoptimal(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns);
+	void buildassignmentvector(assignments_t& assignment, bool *starMatrix, size_t nOfRows, size_t nOfColumns);
+	void computeassignmentcost(const assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows);
+	void step2a(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim);
+	void step2b(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim);
+	void step3_5(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim);
+	void step4(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim, size_t row, size_t col);
+
+	// Computes a suboptimal solution. Good for cases with many forbidden assignments.
+	void assignmentsuboptimal1(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns);
+	// Computes a suboptimal solution. Good for cases with many forbidden assignments.
+	void assignmentsuboptimal2(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns);
+
+public:
+	enum TMethod
+	{
+		optimal,
+		many_forbidden_assignments,
+		without_forbidden_assignments
+	};
+
+	AssignmentProblemSolver();
+	~AssignmentProblemSolver();
+	track_t Solve(const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns, assignments_t& assignment, TMethod Method = optimal);
+};
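+
+// A minimal usage sketch (ExampleSolveAssignment is illustrative and not part
+// of the tracker). The cost matrix is stored column-major: element (row, col)
+// lives at index row + nOfRows * col, as assumed by Solve(); assignment[row]
+// receives the matched column index, or -1 if the row stays unassigned.
+inline track_t ExampleSolveAssignment()
+{
+	AssignmentProblemSolver solver;
+	const size_t nOfRows = 2;    // e.g. existing tracks
+	const size_t nOfColumns = 3; // e.g. new detections
+	distMatrix_t cost = { 1.f, 4.f,  2.f, 3.f,  5.f, 6.f }; // 3 columns of 2 rows
+	assignments_t assignment;
+	return solver.Solve(cost, nOfRows, nOfColumns, assignment, AssignmentProblemSolver::optimal);
+}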

+ 873 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/Kalman.cpp

@@ -0,0 +1,873 @@
+#include "Kalman.h"
+#include <iostream>
+#include <vector>
+
+//---------------------------------------------------------------------------
+TKalmanFilter::TKalmanFilter(
+        tracking::KalmanType type,
+	    bool useAcceleration,
+        track_t deltaTime, // time increment (lower values make the target more "massive")
+        track_t accelNoiseMag
+        )
+    :
+      m_type(type),
+      m_initialized(false),
+      m_deltaTime(deltaTime),
+      m_deltaTimeMin(deltaTime),
+      m_deltaTimeMax(2 * deltaTime),
+      m_lastDist(0),
+      m_accelNoiseMag(accelNoiseMag),
+	  m_useAcceleration(useAcceleration)
+{
+    m_deltaStep = (m_deltaTimeMax - m_deltaTimeMin) / m_deltaStepsCount;
+}
+
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinear(cv::Point3f xy0, cv::Point3f xyv0)
+{
+    // We don't know the acceleration, so we model it as process noise.
+    // But we can guess the range of acceleration values the tracked object can achieve.
+    // Process noise (standard deviation of acceleration, m/s^2)
+    // describes how much the target can accelerate.
+
+    // 4 state variables, 2 measurements
+    m_linearKalman.init(4, 2, 0, El_t);
+    // Transition cv::Matrix
+    m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(4, 4) <<
+                                        1, 0, m_deltaTime, 0,
+                                        0, 1, 0, m_deltaTime,
+                                        0, 0, 1, 0,
+                                        0, 0, 0, 1);
+
+    // init...
+    m_lastPointResult = xy0;
+    m_linearKalman.statePre.at<track_t>(0) = xy0.x;  // x
+    m_linearKalman.statePre.at<track_t>(1) = xy0.y;  // y
+    m_linearKalman.statePre.at<track_t>(2) = xyv0.x; // vx
+    m_linearKalman.statePre.at<track_t>(3) = xyv0.y; // vy
+
+    m_linearKalman.statePost.at<track_t>(0) = xy0.x;
+    m_linearKalman.statePost.at<track_t>(1) = xy0.y;
+    m_linearKalman.statePost.at<track_t>(2) = xyv0.x;
+    m_linearKalman.statePost.at<track_t>(3) = xyv0.y;
+
+    cv::setIdentity(m_linearKalman.measurementMatrix);
+
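+    // Process noise for a piecewise-constant-acceleration model: per axis
+    // Q = G * G^T * accelNoiseMag with G = [dt^2/2, dt]^T, which yields the
+    // dt^4/4 (position), dt^3/2 (cross) and dt^2 (velocity) terms below.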
+    m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(4, 4) <<
+                                       pow(m_deltaTime, 4.0) / 4.0, 0,                           pow(m_deltaTime, 3.0) / 2.0, 0,
+                                       0,                           pow(m_deltaTime, 4.0) / 4.0, 0,                           pow(m_deltaTime, 3.0) / 2.0,
+                                       pow(m_deltaTime, 3.0) / 2.0, 0,                           pow(m_deltaTime, 2.0),       0,
+                                       0,                           pow(m_deltaTime, 3.0) / 2.0, 0,                           pow(m_deltaTime, 2.0));
+
+
+    m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+    cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+    cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+    m_initialized = true;
+}
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinear(cv::Rect_<track_t> rect0, Point_t rectv0)
+{
+    // We don't know the acceleration, so we model it as process noise.
+    // But we can guess the range of acceleration values the tracked object can achieve.
+    // Process noise (standard deviation of acceleration, m/s^2)
+    // describes how much the target can accelerate.
+
+    // 8 state variables (x, y, width, height, vx, vy, vw, vh), 4 measurements (x, y, width, height)
+    m_linearKalman.init(8, 4, 0, El_t);
+    // Transition cv::Matrix
+    m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(8, 8) <<
+                                        1, 0, 0, 0, m_deltaTime, 0,           0,           0,
+                                        0, 1, 0, 0, 0,           m_deltaTime, 0,           0,
+                                        0, 0, 1, 0, 0,           0,           m_deltaTime, 0,
+                                        0, 0, 0, 1, 0,           0,           0,           m_deltaTime,
+                                        0, 0, 0, 0, 1,           0,           0,           0,
+                                        0, 0, 0, 0, 0,           1,           0,           0,
+                                        0, 0, 0, 0, 0,           0,           1,           0,
+                                        0, 0, 0, 0, 0,           0,           0,           1);
+
+    // init...
+    m_linearKalman.statePre.at<track_t>(0) = rect0.x;      // x
+    m_linearKalman.statePre.at<track_t>(1) = rect0.y;      // y
+    m_linearKalman.statePre.at<track_t>(2) = rect0.width;  // width
+    m_linearKalman.statePre.at<track_t>(3) = rect0.height; // height
+    m_linearKalman.statePre.at<track_t>(4) = rectv0.x;     // dx
+    m_linearKalman.statePre.at<track_t>(5) = rectv0.y;     // dy
+    m_linearKalman.statePre.at<track_t>(6) = 0;            // dw
+    m_linearKalman.statePre.at<track_t>(7) = 0;            // dh
+
+    m_linearKalman.statePost.at<track_t>(0) = rect0.x;
+    m_linearKalman.statePost.at<track_t>(1) = rect0.y;
+    m_linearKalman.statePost.at<track_t>(2) = rect0.width;
+    m_linearKalman.statePost.at<track_t>(3) = rect0.height;
+    m_linearKalman.statePost.at<track_t>(4) = rectv0.x;
+    m_linearKalman.statePost.at<track_t>(5) = rectv0.y;
+    m_linearKalman.statePost.at<track_t>(6) = 0;
+    m_linearKalman.statePost.at<track_t>(7) = 0;
+
+    cv::setIdentity(m_linearKalman.measurementMatrix);
+
+    track_t n1 = pow(m_deltaTime, 4.f) / 4.f;
+    track_t n2 = pow(m_deltaTime, 3.f) / 2.f;
+    track_t n3 = pow(m_deltaTime, 2.f);
+    m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(8, 8) <<
+                                       n1, 0,  0,  0,  n2, 0,  0,  0,
+                                       0,  n1, 0,  0,  0,  n2, 0,  0,
+                                       0,  0,  n1, 0,  0,  0,  n2, 0,
+                                       0,  0,  0,  n1, 0,  0,  0,  n2,
+                                       n2, 0,  0,  0,  n3, 0,  0,  0,
+                                       0,  n2, 0,  0,  0,  n3, 0,  0,
+                                       0,  0,  n2, 0,  0,  0,  n3, 0,
+                                       0,  0,  0,  n2, 0,  0,  0,  n3);
+
+    m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+    cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+    cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+    m_initialized = true;
+}
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinear3D(Rect3D rect0, cv::Point3f rectv0)
+{
+    // We don't know the acceleration, so we model it as process noise.
+    // But we can guess the range of acceleration values the tracked object can achieve.
+    // Process noise (standard deviation of acceleration, m/s^2)
+    // describes how much the target can accelerate.
+
+    // 14 state variables (x, y, z, width, height, length, yaw and their rates), 7 measurements (x, y, z, width, height, length, yaw)
+    m_linearKalman.init(14, 7, 0, El_t);
+    // Transition cv::Matrix
+    m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(14, 14) <<
+                                        1, 0, 0, 0, 0, 0, 0, m_deltaTime, 0,           0,           0,           0,           0,           0,
+                                        0, 1, 0, 0, 0, 0, 0, 0,           m_deltaTime, 0,           0,           0,           0,           0,
+                                        0, 0, 1, 0, 0, 0, 0, 0,           0,           m_deltaTime, 0,           0,           0,           0,
+                                        0, 0, 0, 1, 0, 0, 0, 0,           0,           0,           m_deltaTime, 0,           0,           0,
+                                        0, 0, 0, 0, 1, 0, 0, 0,           0,           0,           0,           m_deltaTime, 0,           0,
+                                        0, 0, 0, 0, 0, 1, 0, 0,           0,           0,           0,           0,           m_deltaTime, 0,
+                                        0, 0, 0, 0, 0, 0, 1, 0,           0,           0,           0,           0,           0,           m_deltaTime,
+                                        0, 0, 0, 0, 0, 0, 0, 1,           0,           0,           0,           0,           0,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           1,           0,           0,           0,           0,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           0,           1,           0,           0,           0,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           0,           0,           1,           0,           0,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           0,           0,           0,           1,           0,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           0,           0,           0,           0,           1,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           0,           0,           0,           0,           0,           1);
+
+    // init...
+    m_linearKalman.statePre.at<track_t>(0) = rect0.center.x;      // x
+    m_linearKalman.statePre.at<track_t>(1) = rect0.center.y;      // y
+    m_linearKalman.statePre.at<track_t>(2) = rect0.center.z;      // z
+    m_linearKalman.statePre.at<track_t>(3) = rect0.size.width;  // width
+    m_linearKalman.statePre.at<track_t>(4) = rect0.size.height; // height
+    m_linearKalman.statePre.at<track_t>(5) = rect0.size.length; // length
+    m_linearKalman.statePre.at<track_t>(6) = rect0.yaw; // yaw
+    m_linearKalman.statePre.at<track_t>(7) = rectv0.x;     // dx
+    m_linearKalman.statePre.at<track_t>(8) = rectv0.y;     // dy
+    m_linearKalman.statePre.at<track_t>(9) = 0;            // dz
+    m_linearKalman.statePre.at<track_t>(10) = 0;            // dw
+    m_linearKalman.statePre.at<track_t>(11) = 0;            // dh
+    m_linearKalman.statePre.at<track_t>(12) = 0;            // dl
+    m_linearKalman.statePre.at<track_t>(13) = 0;            // dyaw
+
+    m_linearKalman.statePost.at<track_t>(0) = rect0.center.x;
+    m_linearKalman.statePost.at<track_t>(1) = rect0.center.y;
+    m_linearKalman.statePost.at<track_t>(2) = rect0.center.z;
+    m_linearKalman.statePost.at<track_t>(3) = rect0.size.width;
+    m_linearKalman.statePost.at<track_t>(4) = rect0.size.height;
+    m_linearKalman.statePost.at<track_t>(5) = rect0.size.length;
+    m_linearKalman.statePost.at<track_t>(6) = rect0.yaw;
+    m_linearKalman.statePost.at<track_t>(7) = rectv0.x;
+    m_linearKalman.statePost.at<track_t>(8) = rectv0.y;
+    m_linearKalman.statePost.at<track_t>(9) = 0;
+    m_linearKalman.statePost.at<track_t>(10) = 0;
+    m_linearKalman.statePost.at<track_t>(11) = 0;
+    m_linearKalman.statePost.at<track_t>(12) = 0;
+    m_linearKalman.statePost.at<track_t>(13) = 0;
+
+    cv::setIdentity(m_linearKalman.measurementMatrix);
+
+    track_t n1 = pow(m_deltaTime, 4.f) / 4.f;
+    track_t n2 = pow(m_deltaTime, 3.f) / 2.f;
+    track_t n3 = pow(m_deltaTime, 2.f);
+    m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(14, 14) <<
+                                       n1, 0,  0,  0,  0,  0,  0,  n2, 0,  0,  0,  0,  0,  0,
+                                       0,  n1, 0,  0,  0,  0,  0,  0,  n2, 0,  0,  0,  0,  0,
+                                       0,  0,  n1, 0,  0,  0,  0,  0,  0,  n2, 0,  0,  0,  0,
+                                       0,  0,  0,  n1, 0,  0,  0,  0,  0,  0,  n2, 0,  0,  0,
+                                       0,  0,  0,  0,  n1, 0,  0,  0,  0,  0,  0,  n2, 0,  0,
+                                       0,  0,  0,  0,  0,  n1, 0,  0,  0,  0,  0,  0,  n2, 0,
+                                       0,  0,  0,  0,  0,  0,  n1, 0,  0,  0,  0,  0,  0,  n2,
+                                       n2, 0,  0,  0,  0,  0,  0,  n3, 0,  0,  0,  0,  0,  0,
+                                       0,  n2, 0,  0,  0,  0,  0,  0,  n3, 0,  0,  0,  0,  0,
+                                       0,  0,  n2, 0,  0,  0,  0,  0,  0,  n3, 0,  0,  0,  0,
+                                       0,  0,  0,  n2, 0,  0,  0,  0,  0,  0,  n3, 0,  0,  0,
+                                       0,  0,  0,  0,  n2, 0,  0,  0,  0,  0,  0,  n3, 0,  0,
+                                       0,  0,  0,  0,  0,  n2, 0,  0,  0,  0,  0,  0,  n3, 0,
+                                       0,  0,  0,  0,  0,  0,  n2, 0,  0,  0,  0,  0,  0,  n3);
+
+    m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+    cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+    cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+    m_initialized = true;
+}
+
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinearAcceleration(cv::Point3f xy0, cv::Point3f xyv0)
+{
+	// 6 state variables, 2 measurements
+	m_linearKalman.init(6, 2, 0, El_t);
+	// Transition cv::Matrix
+	const track_t dt = m_deltaTime;
+	const track_t dt2 = 0.5f * m_deltaTime * m_deltaTime;
+	m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(6, 6) <<
+		1, 0, dt, 0,  dt2, 0,
+		0, 1, 0,  dt, 0,   dt2,
+		0, 0, 1,  0,  dt,  0,
+		0, 0, 0,  1,  0,   dt,
+	    0, 0, 0,  0,  1,   0,
+	    0, 0, 0,  0,  0,   1);
+
+	// init...
+	m_lastPointResult = xy0;
+	m_linearKalman.statePre.at<track_t>(0) = xy0.x;  // x
+	m_linearKalman.statePre.at<track_t>(1) = xy0.y;  // y
+	m_linearKalman.statePre.at<track_t>(2) = xyv0.x; // vx
+	m_linearKalman.statePre.at<track_t>(3) = xyv0.y; // vy
+	m_linearKalman.statePre.at<track_t>(4) = 0;      // ax
+	m_linearKalman.statePre.at<track_t>(5) = 0;      // ay
+
+	m_linearKalman.statePost.at<track_t>(0) = xy0.x;
+	m_linearKalman.statePost.at<track_t>(1) = xy0.y;
+	m_linearKalman.statePost.at<track_t>(2) = xyv0.x;
+	m_linearKalman.statePost.at<track_t>(3) = xyv0.y;
+	m_linearKalman.statePost.at<track_t>(4) = 0;
+	m_linearKalman.statePost.at<track_t>(5) = 0;
+
+	cv::setIdentity(m_linearKalman.measurementMatrix);
+
+	track_t n1 = pow(m_deltaTime, 4.f) / 4.f;
+	track_t n2 = pow(m_deltaTime, 3.f) / 2.f;
+	track_t n3 = pow(m_deltaTime, 2.f);
+	m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(6, 6) <<
+		n1, 0, n2, 0, n2, 0,
+		0, n1, 0, n2, 0, n2,
+		n2, 0, n3, 0, n3, 0,
+		0, n2, 0, n3, 0, n3,
+		0, 0, n2, 0, n3, 0,
+		0, 0, 0, n2, 0, n3);
+
+	m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+	cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+	cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+	m_initialized = true;
+}
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinearAcceleration(cv::Rect_<track_t> rect0, Point_t rectv0)
+{
+    // 12 state variables (x, y, width, height, vx, vy, vw, vh, ax, ay, aw, ah), 4 measurements (x, y, width, height)
+    m_linearKalman.init(12, 4, 0, El_t);
+    // Transition cv::Matrix
+    const track_t dt = m_deltaTime;
+    const track_t dt2 = 0.5f * m_deltaTime * m_deltaTime;
+    m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(12, 12) <<
+        1, 0, 0, 0, dt, 0,  0,  0,  dt2, 0,   0,   0,
+        0, 1, 0, 0, 0,  dt, 0,  0,  0,   dt2, 0,   0,
+        0, 0, 1, 0, 0,  0,  dt, 0,  0,   0,   dt2, 0,
+        0, 0, 0, 1, 0,  0,  0,  dt, 0,   0,   0,   dt2,
+        0, 0, 0, 0, 1,  0,  0,  0,  dt,  0,   0,   0,
+        0, 0, 0, 0, 0,  1,  0,  0,  0,   dt,  0,   0,
+        0, 0, 0, 0, 0,  0,  1,  0,  0,   0,   dt,  0,
+        0, 0, 0, 0, 0,  0,  0,  1,  0,   0,   0,   dt,
+        0, 0, 0, 0, 0,  0,  0,  0,  1,   0,   0,   0,
+        0, 0, 0, 0, 0,  0,  0,  0,  0,   1,   0,   0,
+        0, 0, 0, 0, 0,  0,  0,  0,  0,   0,   1,   0,
+        0, 0, 0, 0, 0,  0,  0,  0,  0,   0,   0,   1);
+
+    // init...
+    m_linearKalman.statePre.at<track_t>(0) = rect0.x;      // x
+    m_linearKalman.statePre.at<track_t>(1) = rect0.y;      // y
+    m_linearKalman.statePre.at<track_t>(2) = rect0.width;  // width
+    m_linearKalman.statePre.at<track_t>(3) = rect0.height; // height
+    m_linearKalman.statePre.at<track_t>(4) = rectv0.x;     // dx
+    m_linearKalman.statePre.at<track_t>(5) = rectv0.y;     // dy
+    m_linearKalman.statePre.at<track_t>(6) = 0;            // dw
+    m_linearKalman.statePre.at<track_t>(7) = 0;            // dh
+    m_linearKalman.statePre.at<track_t>(8) = 0;            // ax
+    m_linearKalman.statePre.at<track_t>(9) = 0;            // ay
+    m_linearKalman.statePre.at<track_t>(10) = 0;           // aw
+    m_linearKalman.statePre.at<track_t>(11) = 0;           // ah
+
+    m_linearKalman.statePost.at<track_t>(0) = rect0.x;
+    m_linearKalman.statePost.at<track_t>(1) = rect0.y;
+    m_linearKalman.statePost.at<track_t>(2) = rect0.width;
+    m_linearKalman.statePost.at<track_t>(3) = rect0.height;
+    m_linearKalman.statePost.at<track_t>(4) = rectv0.x;
+    m_linearKalman.statePost.at<track_t>(5) = rectv0.y;
+    m_linearKalman.statePost.at<track_t>(6) = 0;
+    m_linearKalman.statePost.at<track_t>(7) = 0;
+    m_linearKalman.statePost.at<track_t>(8) = 0;
+    m_linearKalman.statePost.at<track_t>(9) = 0;
+    m_linearKalman.statePost.at<track_t>(10) = 0;
+    m_linearKalman.statePost.at<track_t>(11) = 0;
+
+    cv::setIdentity(m_linearKalman.measurementMatrix);
+
+    track_t n1 = pow(m_deltaTime, 4.f) / 4.f;
+    track_t n2 = pow(m_deltaTime, 3.f) / 2.f;
+    track_t n3 = pow(m_deltaTime, 2.f);
+    m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(12, 12) <<
+        n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,  0,  0,
+        0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,  0,
+        0,  0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,
+        0,  0,  0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2,
+        n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,  0,
+        0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,
+        0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,
+        0,  0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3,
+        n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,  0,
+        0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,
+        0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,
+        0,  0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3);
+
+    m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+    cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+    cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+    m_initialized = true;
+}
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinearAcceleration3D(Rect3D rect0, cv::Point3f rectv0)
+{
+	// 12 state variables (x, y, width, height, vx, vy, vw, vh, ax, ay, aw, ah), 4 measurements (x, y, width, height);
+	// note: this acceleration variant mirrors the 2D rect model and filters only the box footprint (z, length and yaw are not in the state)
+	m_linearKalman.init(12, 4, 0, El_t);
+	// Transition cv::Matrix
+	const track_t dt = m_deltaTime;
+	const track_t dt2 = 0.5f * m_deltaTime * m_deltaTime;
+	m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(12, 12) <<
+		1, 0, 0, 0, dt, 0,  0,  0,  dt2, 0,   0,   0,
+		0, 1, 0, 0, 0,  dt, 0,  0,  0,   dt2, 0,   0,
+		0, 0, 1, 0, 0,  0,  dt, 0,  0,   0,   dt2, 0,
+		0, 0, 0, 1, 0,  0,  0,  dt, 0,   0,   0,   dt2,
+		0, 0, 0, 0, 1,  0,  0,  0,  dt,  0,   0,   0,
+		0, 0, 0, 0, 0,  1,  0,  0,  0,   dt,  0,   0,
+		0, 0, 0, 0, 0,  0,  1,  0,  0,   0,   dt,  0,
+		0, 0, 0, 0, 0,  0,  0,  1,  0,   0,   0,   dt,
+		0, 0, 0, 0, 0,  0,  0,  0,  1,   0,   0,   0,
+		0, 0, 0, 0, 0,  0,  0,  0,  0,   1,   0,   0,
+		0, 0, 0, 0, 0,  0,  0,  0,  0,   0,   1,   0,
+		0, 0, 0, 0, 0,  0,  0,  0,  0,   0,   0,   1);
+
+	// init...
+    m_linearKalman.statePre.at<track_t>(0) = rect0.center.x;      // x
+    m_linearKalman.statePre.at<track_t>(1) = rect0.center.y;      // y
+    m_linearKalman.statePre.at<track_t>(2) = rect0.size.width;  // width
+    m_linearKalman.statePre.at<track_t>(3) = rect0.size.height; // height
+	m_linearKalman.statePre.at<track_t>(4) = rectv0.x;     // dx
+	m_linearKalman.statePre.at<track_t>(5) = rectv0.y;     // dy
+	m_linearKalman.statePre.at<track_t>(6) = 0;            // dw
+	m_linearKalman.statePre.at<track_t>(7) = 0;            // dh
+	m_linearKalman.statePre.at<track_t>(8) = 0;            // ax
+	m_linearKalman.statePre.at<track_t>(9) = 0;            // ay
+	m_linearKalman.statePre.at<track_t>(10) = 0;           // aw
+	m_linearKalman.statePre.at<track_t>(11) = 0;           // ah
+
+    m_linearKalman.statePost.at<track_t>(0) = rect0.center.x;
+    m_linearKalman.statePost.at<track_t>(1) = rect0.center.y;
+    m_linearKalman.statePost.at<track_t>(2) = rect0.size.width;
+    m_linearKalman.statePost.at<track_t>(3) = rect0.size.height;
+	m_linearKalman.statePost.at<track_t>(4) = rectv0.x;
+	m_linearKalman.statePost.at<track_t>(5) = rectv0.y;
+	m_linearKalman.statePost.at<track_t>(6) = 0;
+	m_linearKalman.statePost.at<track_t>(7) = 0;
+	m_linearKalman.statePost.at<track_t>(8) = 0;
+	m_linearKalman.statePost.at<track_t>(9) = 0;
+	m_linearKalman.statePost.at<track_t>(10) = 0;
+	m_linearKalman.statePost.at<track_t>(11) = 0;
+
+	cv::setIdentity(m_linearKalman.measurementMatrix);
+
+	track_t n1 = pow(m_deltaTime, 4.f) / 4.f;
+	track_t n2 = pow(m_deltaTime, 3.f) / 2.f;
+	track_t n3 = pow(m_deltaTime, 2.f);
+	m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(12, 12) <<
+		n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,  0,  0,
+		0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,  0,
+		0,  0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,
+		0,  0,  0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2,
+		n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,  0,
+		0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,
+		0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,
+		0,  0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3,
+		n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,  0,
+		0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,
+		0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,
+		0,  0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3);
+
+	m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+	cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+	cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+	m_initialized = true;
+}
+
+//---------------------------------------------------------------------------
+cv::Point3f TKalmanFilter::GetPointPrediction()
+{
+    if (m_initialized)
+    {
+        cv::Mat prediction;
+
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+            prediction = m_linearKalman.predict();
+            break;
+        }
+
+        m_lastPointResult = cv::Point3f(prediction.at<track_t>(0), prediction.at<track_t>(1), prediction.at<track_t>(2));
+    }
+    else
+    {
+
+    }
+    return m_lastPointResult;
+}
+
+//---------------------------------------------------------------------------
+cv::Point3f TKalmanFilter::Update(cv::Point3f pt, bool dataCorrect)
+{
+    if (!m_initialized)
+    {
+        if (m_initialPoints.size() < MIN_INIT_VALS)
+        {
+            if (dataCorrect)
+            {
+                m_initialPoints.push_back(pt);
+                m_lastPointResult = pt;
+            }
+        }
+        if (m_initialPoints.size() == MIN_INIT_VALS)
+        {
+            track_t kx = 0;
+            track_t bx = 0;
+            track_t ky = 0;
+            track_t by = 0;
+            get_lin_regress_params(m_initialPoints, 0, MIN_INIT_VALS, kx, bx, ky, by);//predict p,v
+            cv::Point3f xy0(kx * (MIN_INIT_VALS - 1) + bx, ky * (MIN_INIT_VALS - 1) + by, m_lastPointResult.z);
+            cv::Point3f xyv0(kx, ky,0);
+
+            switch (m_type)
+            {
+            case tracking::KalmanLinear:
+                if (m_useAcceleration)
+                    CreateLinearAcceleration(xy0, xyv0);
+                else
+                    CreateLinear(xy0, xyv0);
+                break;
+            }
+            m_lastDist = 0;
+        }
+    }
+
+    if (m_initialized)
+    {
+        cv::Mat measurement(2, 1, Mat_t(1));
+        if (!dataCorrect)
+        {
+            measurement.at<track_t>(0) = m_lastPointResult.x;  //update using prediction
+            measurement.at<track_t>(1) = m_lastPointResult.y;
+        }
+        else
+        {
+            measurement.at<track_t>(0) = pt.x;  //update using measurements
+            measurement.at<track_t>(1) = pt.y;
+        }
+        // Correction
+        cv::Mat estimated;
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+        {
+            estimated = m_linearKalman.correct(measurement);
+
+            // Inertia correction
+			if (!m_useAcceleration)
+			{
+				track_t currDist = sqrtf(sqr(estimated.at<track_t>(0) - pt.x) + sqr(estimated.at<track_t>(1) - pt.y));
+				if (currDist > m_lastDist)
+				{
+					m_deltaTime = std::min(m_deltaTime + m_deltaStep, m_deltaTimeMax);
+				}
+				else
+				{
+					m_deltaTime = std::max(m_deltaTime - m_deltaStep, m_deltaTimeMin);
+				}
+				m_lastDist = currDist;
+
+				m_linearKalman.transitionMatrix.at<track_t>(0, 2) = m_deltaTime;
+				m_linearKalman.transitionMatrix.at<track_t>(1, 3) = m_deltaTime;
+			}
+            break;
+        }
+        }
+
+        m_lastPointResult.x = estimated.at<track_t>(0);   //update using measurements
+        m_lastPointResult.y = estimated.at<track_t>(1);
+    }
+    else
+    {
+        if (dataCorrect)
+        {
+            m_lastPointResult = pt;
+        }
+    }
+    return m_lastPointResult;
+}
+//---------------------------------------------------------------------------
+cv::Rect TKalmanFilter::GetRectPrediction()
+{
+    if (m_initialized)
+    {
+        cv::Mat prediction;
+
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+            prediction = m_linearKalman.predict();
+            break;
+        }
+
+        m_lastRectResult = cv::Rect_<track_t>(prediction.at<track_t>(0), prediction.at<track_t>(1), prediction.at<track_t>(2), prediction.at<track_t>(3));
+    }
+    else
+    {
+
+    }
+    return cv::Rect(static_cast<int>(m_lastRectResult.x), static_cast<int>(m_lastRectResult.y), static_cast<int>(m_lastRectResult.width), static_cast<int>(m_lastRectResult.height));
+}
+
+//---------------------------------------------------------------------------
+cv::Rect TKalmanFilter::Update(cv::Rect rect, bool dataCorrect)
+{
+    if (!m_initialized)
+    {
+        if (m_initialRects.size() < MIN_INIT_VALS)
+        {
+            if (dataCorrect)
+            {
+                m_initialRects.push_back(rect);
+                m_lastRectResult.x = static_cast<track_t>(rect.x);
+                m_lastRectResult.y = static_cast<track_t>(rect.y);
+                m_lastRectResult.width = static_cast<track_t>(rect.width);
+                m_lastRectResult.height = static_cast<track_t>(rect.height);
+            }
+        }
+        if (m_initialRects.size() == MIN_INIT_VALS)
+        {
+            std::vector<Point_t> initialPoints;
+            Point_t averageSize(0, 0);
+            for (const auto& r : m_initialRects)
+            {
+                initialPoints.emplace_back(static_cast<track_t>(r.x), static_cast<track_t>(r.y));
+                averageSize.x += r.width;
+                averageSize.y += r.height;
+            }
+            averageSize.x /= MIN_INIT_VALS;
+            averageSize.y /= MIN_INIT_VALS;
+
+            track_t kx = 0;
+            track_t bx = 0;
+            track_t ky = 0;
+            track_t by = 0;
+            get_lin_regress_params(initialPoints, 0, MIN_INIT_VALS, kx, bx, ky, by);
+            cv::Rect_<track_t> rect0(kx * (MIN_INIT_VALS - 1) + bx, ky * (MIN_INIT_VALS - 1) + by, averageSize.x, averageSize.y);
+            Point_t rectv0(kx, ky);
+
+            switch (m_type)
+            {
+            case tracking::KalmanLinear:
+                if (m_useAcceleration)
+                    CreateLinearAcceleration(rect0, rectv0);
+                else
+                    CreateLinear(rect0, rectv0);
+                break;
+            }
+        }
+    }
+
+    if (m_initialized)
+    {
+        cv::Mat measurement(4, 1, Mat_t(1));
+        if (!dataCorrect)
+        {
+            measurement.at<track_t>(0) = m_lastRectResult.x;  // update using prediction
+            measurement.at<track_t>(1) = m_lastRectResult.y;
+            measurement.at<track_t>(2) = m_lastRectResult.width;
+            measurement.at<track_t>(3) = m_lastRectResult.height;
+        }
+        else
+        {
+            measurement.at<track_t>(0) = static_cast<track_t>(rect.x);  // update using measurements
+            measurement.at<track_t>(1) = static_cast<track_t>(rect.y);
+            measurement.at<track_t>(2) = static_cast<track_t>(rect.width);
+            measurement.at<track_t>(3) = static_cast<track_t>(rect.height);
+        }
+        // Correction
+        cv::Mat estimated;
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+        {
+            estimated = m_linearKalman.correct(measurement);
+
+            m_lastRectResult.x = estimated.at<track_t>(0);   //update using measurements
+            m_lastRectResult.y = estimated.at<track_t>(1);
+            m_lastRectResult.width = estimated.at<track_t>(2);
+            m_lastRectResult.height = estimated.at<track_t>(3);
+
+            // Inertia correction
+            if (!m_useAcceleration)
+            {
+                track_t currDist = sqrtf(sqr(estimated.at<track_t>(0) - rect.x) + sqr(estimated.at<track_t>(1) - rect.y) + sqr(estimated.at<track_t>(2) - rect.width) + sqr(estimated.at<track_t>(3) - rect.height));
+                if (currDist > m_lastDist)
+                {
+                    m_deltaTime = std::min(m_deltaTime + m_deltaStep, m_deltaTimeMax);
+                }
+                else
+                {
+                    m_deltaTime = std::max(m_deltaTime - m_deltaStep, m_deltaTimeMin);
+                }
+                m_lastDist = currDist;
+
+                m_linearKalman.transitionMatrix.at<track_t>(0, 4) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(1, 5) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(2, 6) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(3, 7) = m_deltaTime;
+            }
+            break;
+        }
+        }
+    }
+    else
+    {
+        if (dataCorrect)
+        {
+            m_lastRectResult.x = static_cast<track_t>(rect.x);
+            m_lastRectResult.y = static_cast<track_t>(rect.y);
+            m_lastRectResult.width = static_cast<track_t>(rect.width);
+            m_lastRectResult.height = static_cast<track_t>(rect.height);
+        }
+    }
+    return cv::Rect(static_cast<int>(m_lastRectResult.x), static_cast<int>(m_lastRectResult.y), static_cast<int>(m_lastRectResult.width), static_cast<int>(m_lastRectResult.height));
+}
+//---------------------------------------------------------------------------
+Rect3D TKalmanFilter::GetRect3DPrediction()
+{
+    if (m_initialized)
+    {
+        cv::Mat prediction;
+
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+            prediction = m_linearKalman.predict();
+            break;
+        }
+
+        m_lastRect3DResult = Rect3D(cv::Point3f(prediction.at<track_t>(0), prediction.at<track_t>(1), prediction.at<track_t>(2)), Size3D(prediction.at<track_t>(3), prediction.at<track_t>(4), prediction.at<track_t>(5)), prediction.at<track_t>(6));
+    }
+    else
+    {
+
+    }
+    return m_lastRect3DResult;
+}
+
+//---------------------------------------------------------------------------
+Rect3D TKalmanFilter::Update(Rect3D rect, bool dataCorrect)
+{
+    if (!m_initialized)
+    {
+        if (m_initialRect3Ds.size() < MIN_INIT_VALS)
+        {
+            if (dataCorrect)
+            {
+                m_initialRect3Ds.push_back(rect);
+                m_lastRect3DResult.center.x = static_cast<track_t>(rect.center.x);
+                m_lastRect3DResult.center.y = static_cast<track_t>(rect.center.y);
+                m_lastRect3DResult.center.z = static_cast<track_t>(rect.center.z);
+                m_lastRect3DResult.size.width = static_cast<track_t>(rect.size.width);
+                m_lastRect3DResult.size.height = static_cast<track_t>(rect.size.height);
+                m_lastRect3DResult.size.length = static_cast<track_t>(rect.size.length);
+                m_lastRect3DResult.yaw = static_cast<track_t>(rect.yaw);
+            }
+        }
+        if (m_initialRect3Ds.size() == MIN_INIT_VALS)
+        {
+            std::vector<Point_t> initialPoints;
+            cv::Point3f averageSize(0, 0, 0);
+            float averageZ = 0;
+            float averageYaw = 0;
+            for (const auto& r : m_initialRect3Ds)
+            {
+                initialPoints.emplace_back(static_cast<track_t>(r.center.x), static_cast<track_t>(r.center.y));
+                averageZ += r.center.z;
+                averageSize.x += r.size.width;
+                averageSize.y += r.size.height;
+                averageSize.z += r.size.length;
+                averageYaw += r.yaw;
+            }
+            averageZ /= MIN_INIT_VALS;
+            averageSize.x /= MIN_INIT_VALS;
+            averageSize.y /= MIN_INIT_VALS;
+            averageSize.z /= MIN_INIT_VALS;
+            averageYaw /= MIN_INIT_VALS;
+
+            track_t kx = 0;
+            track_t bx = 0;
+            track_t ky = 0;
+            track_t by = 0;
+            get_lin_regress_params(initialPoints, 0, MIN_INIT_VALS, kx, bx, ky, by);
+            Rect3D rect0(cv::Point3f(kx * (MIN_INIT_VALS - 1) + bx, ky * (MIN_INIT_VALS - 1) + by, averageZ), Size3D(averageSize.x, averageSize.y,averageSize.z), averageYaw);
+            cv::Point3f rectv0(kx, ky, 0);
+
+            switch (m_type)
+            {
+            case tracking::KalmanLinear:
+                if (m_useAcceleration)
+                    CreateLinearAcceleration3D(rect0, rectv0);
+                else
+                    CreateLinear3D(rect0, rectv0);
+                break;
+            }
+        }
+    }
+
+    if (m_initialized)
+    {
+        cv::Mat measurement(7, 1, Mat_t(1));
+        if (!dataCorrect)
+        {
+            measurement.at<track_t>(0) = m_lastRect3DResult.center.x;  // update using prediction
+            measurement.at<track_t>(1) = m_lastRect3DResult.center.y;
+            measurement.at<track_t>(2) = m_lastRect3DResult.center.z;
+            measurement.at<track_t>(3) = m_lastRect3DResult.size.width;
+            measurement.at<track_t>(4) = m_lastRect3DResult.size.height;
+            measurement.at<track_t>(5) = m_lastRect3DResult.size.length;
+            measurement.at<track_t>(6) = m_lastRect3DResult.yaw;
+        }
+        else
+        {
+            measurement.at<track_t>(0) = static_cast<track_t>(rect.center.x);  // update using measurements
+            measurement.at<track_t>(1) = static_cast<track_t>(rect.center.y);
+            measurement.at<track_t>(2) = static_cast<track_t>(rect.center.z);
+            measurement.at<track_t>(3) = static_cast<track_t>(rect.size.width);
+            measurement.at<track_t>(4) = static_cast<track_t>(rect.size.height);
+            measurement.at<track_t>(5) = static_cast<track_t>(rect.size.length);
+            measurement.at<track_t>(6) = static_cast<track_t>(rect.yaw);
+        }
+        // Correction
+        cv::Mat estimated;
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+        {
+            estimated = m_linearKalman.correct(measurement);
+
+            m_lastRect3DResult.center.x = estimated.at<track_t>(0);   //update using measurements
+            m_lastRect3DResult.center.y = estimated.at<track_t>(1);
+            m_lastRect3DResult.center.z = estimated.at<track_t>(2);
+            m_lastRect3DResult.size.width = estimated.at<track_t>(3);
+            m_lastRect3DResult.size.height = estimated.at<track_t>(4);
+            m_lastRect3DResult.size.length = estimated.at<track_t>(5);
+            m_lastRect3DResult.yaw = estimated.at<track_t>(6);
+
+            // Inertia correction
+			if (!m_useAcceleration)
+			{
+                track_t currDist = sqrtf(sqr(estimated.at<track_t>(0) - rect.center.x) + sqr(estimated.at<track_t>(1) - rect.center.y)+ sqr(estimated.at<track_t>(2) - rect.center.z) + sqr(estimated.at<track_t>(3) - rect.size.width) + sqr(estimated.at<track_t>(4) - rect.size.height) + sqr(estimated.at<track_t>(5) - rect.size.length) + sqr(estimated.at<track_t>(6) - rect.yaw));
+				if (currDist > m_lastDist)
+				{
+					m_deltaTime = std::min(m_deltaTime + m_deltaStep, m_deltaTimeMax);
+				}
+				else
+				{
+					m_deltaTime = std::max(m_deltaTime - m_deltaStep, m_deltaTimeMin);
+				}
+				m_lastDist = currDist;
+
+                m_linearKalman.transitionMatrix.at<track_t>(0, 7) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(1, 8) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(2, 9) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(3, 10) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(4, 11) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(5, 12) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(6, 13) = m_deltaTime;
+			}
+            break;
+        }
+        }
+    }
+    else
+    {
+        if (dataCorrect)
+        {
+            m_lastRect3DResult.center.x = static_cast<track_t>(rect.center.x);
+            m_lastRect3DResult.center.y = static_cast<track_t>(rect.center.y);
+            m_lastRect3DResult.center.z = static_cast<track_t>(rect.center.z);
+            m_lastRect3DResult.size.width = static_cast<track_t>(rect.size.width);
+            m_lastRect3DResult.size.height = static_cast<track_t>(rect.size.height);
+            m_lastRect3DResult.size.length = static_cast<track_t>(rect.size.length);
+            m_lastRect3DResult.yaw = static_cast<track_t>(rect.yaw);
+        }
+    }
+    return m_lastRect3DResult;
+}
+
+//---------------------------------------------------------------------------
+cv::Vec<track_t, 2> TKalmanFilter::GetVelocity() const
+{
+    cv::Vec<track_t, 2> res(0, 0);
+    if (m_initialized)
+    {
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+        {
+            if (m_linearKalman.statePre.rows > 3)
+            {
+                // Velocity indices depend on the state layout: the 4-state point
+                // model stores (vx, vy) at 2..3, the 8-state rect model at 4..5,
+                // the 14-state Rect3D model at 7..8.
+                int indX = 2;
+                int indY = 3;
+                if (m_linearKalman.statePre.rows > 5)
+                {
+                    indX = 4;
+                    indY = 5;
+                    if (m_linearKalman.statePre.rows > 8)
+                    {
+                        indX = 7;
+                        indY = 8;
+                    }
+                }
+                res[0] = m_linearKalman.statePre.at<track_t>(indX);
+                res[1] = m_linearKalman.statePre.at<track_t>(indY);
+            }
+            break;
+        }
+
+        }
+    }
+    return res;
+}

+ 120 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/Kalman.h

@@ -0,0 +1,120 @@
+#pragma once
+#include "defines.h"
+#include <memory>
+#include <deque>
+
+#include <opencv2/opencv.hpp>
+
+///
+/// \brief The TKalmanFilter class
+/// http://www.morethantechnical.com/2011/06/17/simple-kalman-filter-for-tracking-using-opencv-2-2-w-code/
+///
+class TKalmanFilter
+{
+public:
+    TKalmanFilter(tracking::KalmanType type, bool useAcceleration, track_t deltaTime, track_t accelNoiseMag);
+    ~TKalmanFilter() = default;
+
+    cv::Point3f GetPointPrediction();
+    cv::Point3f Update(cv::Point3f pt, bool dataCorrect);
+
+    cv::Rect GetRectPrediction();
+    cv::Rect Update(cv::Rect rect, bool dataCorrect);
+
+    Rect3D GetRect3DPrediction();
+    Rect3D Update(Rect3D rect, bool dataCorrect);
+
+	cv::Vec<track_t, 2> GetVelocity() const;
+
+private:
+    tracking::KalmanType m_type = tracking::KalmanLinear;
+    cv::KalmanFilter m_linearKalman;
+    static const size_t MIN_INIT_VALS = 4;
+    std::deque<cv::Point3f> m_initialPoints;
+    cv::Point3f m_lastPointResult;
+
+    std::deque<cv::Rect> m_initialRects;
+    cv::Rect_<track_t> m_lastRectResult;
+    cv::Rect_<track_t> m_lastRect;
+
+    std::deque<Rect3D> m_initialRect3Ds;
+    Rect3D m_lastRect3DResult;
+    cv::Rect_<track_t> m_lastRect3D;
+
+    bool m_initialized = false;
+    track_t m_deltaTime = 0.2f;
+    track_t m_deltaTimeMin = 0.2f;
+    track_t m_deltaTimeMax = 2 * 0.2f;
+    track_t m_lastDist = 0;
+    track_t m_deltaStep = 0;
+    static const int m_deltaStepsCount = 20;
+    track_t m_accelNoiseMag = 0.5f;
+	bool m_useAcceleration = false; // If true, the constant-acceleration motion model x(t) = x0 + v0*t + a*t^2/2 is used
+
+	// Constant velocity model
+    void CreateLinear(cv::Point3f xy0, cv::Point3f xyv0);
+    void CreateLinear(cv::Rect_<track_t> rect0, Point_t rectv0);
+    void CreateLinear3D(Rect3D rect0, cv::Point3f rectv0);
+
+	// Constant acceleration model
+	// https://www.mathworks.com/help/driving/ug/linear-kalman-filters.html
+    void CreateLinearAcceleration(cv::Point3f xy0, cv::Point3f xyv0);
+    void CreateLinearAcceleration(cv::Rect_<track_t> rect0, Point_t rectv0);
+    void CreateLinearAcceleration3D(Rect3D rect0, cv::Point3f rectv0);
+};
+
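+// A minimal usage sketch, assuming one Rect3D detection per lidar frame. The
+// filter bootstraps from the first MIN_INIT_VALS correct updates (via linear
+// regression) and only then starts to predict:
+//
+//   TKalmanFilter kf(tracking::KalmanLinear, false, 0.2f, 0.5f);
+//   Rect3D det(cv::Point3f(1.f, 2.f, 0.f), Size3D(1.8f, 1.5f, 4.2f), 0.f);
+//   Rect3D corrected = kf.Update(det, true);  // echoes the input until initialized
+//   Rect3D predicted = kf.GetRect3DPrediction();
+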
+//---------------------------------------------------------------------------
+///
+/// \brief sqr
+/// \param val
+/// \return
+///
+template<class T> inline
+T sqr(T val)
+{
+    return val * val;
+}
+
+///
+/// \brief get_lin_regress_params
+/// \param in_data
+/// \param start_pos
+/// \param in_data_size
+/// \param kx
+/// \param bx
+/// \param ky
+/// \param by
+///
+template<typename T, typename CONT>
+void get_lin_regress_params(
+        const CONT& in_data,
+        size_t start_pos,
+        size_t in_data_size,
+        T& kx, T& bx, T& ky, T& by)
+{
+    T m1(0.), m2(0.);
+    T m3_x(0.), m4_x(0.);
+    T m3_y(0.), m4_y(0.);
+
+    const T el_count = static_cast<T>(in_data_size - start_pos);
+    for (size_t i = start_pos; i < in_data_size; ++i)
+    {
+        m1 += i;
+        m2 += sqr(i);
+
+        m3_x += in_data[i].x;
+        m4_x += i * in_data[i].x;
+
+        m3_y += in_data[i].y;
+        m4_y += i * in_data[i].y;
+    }
+    T det_1 = 1 / (el_count * m2 - sqr(m1));
+
+    m1 *= -1;
+
+    kx = det_1 * (m1 * m3_x + el_count * m4_x);
+    bx = det_1 * (m2 * m3_x + m1 * m4_x);
+
+    ky = det_1 * (m1 * m3_y + el_count * m4_y);
+    by = det_1 * (m2 * m3_y + m1 * m4_y);
+}
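+
+// A minimal usage sketch: fit x(t) = kx*t + bx and y(t) = ky*t + by to a few
+// points, then extrapolate one step ahead (this is how TKalmanFilter seeds its
+// state from the first MIN_INIT_VALS measurements); note the third argument is
+// the exclusive end index:
+//
+//   std::vector<Point_t> pts = { {0.f, 0.f}, {1.f, 2.f}, {2.f, 4.f}, {3.f, 6.f} };
+//   track_t kx = 0, bx = 0, ky = 0, by = 0;
+//   get_lin_regress_params(pts, 0, pts.size(), kx, bx, ky, by);
+//   Point_t next(kx * pts.size() + bx, ky * pts.size() + by); // ~(4, 8)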

+ 65 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/ShortPathCalculator.h

@@ -0,0 +1,65 @@
+#pragma once
+#include "defines.h"
+#include "HungarianAlg.h"
+
+///
+/// \brief The SPSettings struct
+///
+struct SPSettings
+{
+    track_t m_distThres = 0.8f;
+    size_t m_maxHistory = 10;
+};
+
+///
+/// \brief The ShortPathCalculator class
+///
+class ShortPathCalculator
+{
+public:
+    ShortPathCalculator(const SPSettings& settings)
+        : m_settings(settings)
+    {
+    }
+    virtual ~ShortPathCalculator()
+    {
+    }
+
+    virtual void Solve(const distMatrix_t& costMatrix, size_t N, size_t M, assignments_t& assignment, track_t maxCost) = 0;
+protected:
+    SPSettings m_settings;
+};
+
+///
+/// \brief The SPHungrian class
+///
+class SPHungrian : public ShortPathCalculator
+{
+public:
+    SPHungrian(const SPSettings& settings)
+        : ShortPathCalculator(settings)
+    {
+    }
+
+    void Solve(const distMatrix_t& costMatrix, size_t N, size_t M, assignments_t& assignment, track_t /*maxCost*/)
+    {
+        m_solver.Solve(costMatrix, N, M, assignment, AssignmentProblemSolver::optimal);
+    }
+
+private:
+    AssignmentProblemSolver m_solver;
+};
+
+///
+/// \brief The SPBipart class
+///
+//class SPBipart : public ShortPathCalculator
+//{
+//public:
+//    SPBipart(const SPSettings& settings)
+//        : ShortPathCalculator(settings)
+//    {
+//    }
+
+//    void Solve(const distMatrix_t& costMatrix, size_t N, size_t M, assignments_t& assignment, track_t maxCost);
+//};
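+
+// A minimal usage sketch: SPHungrian hides AssignmentProblemSolver behind the
+// ShortPathCalculator interface, so the matcher can be swapped without touching
+// the caller (costMatrix is column-major, as in HungarianAlg.h; N rows, M columns):
+//
+//   SPSettings settings;
+//   SPHungrian matcher(settings);
+//   assignments_t assignment;
+//   matcher.Solve(costMatrix, N, M, assignment, /*maxCost*/ 0.f);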

+ 159 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/Tracking.hpp

@@ -0,0 +1,159 @@
+#pragma once
+#include "objectarray.pb.h"
+#include "Ctracker.h"
+#include <iostream>
+#include <vector>
+#define FPS 8 
+#define MIN_DETECTEDFRAMES 5
+///
+/// \brief Tracker parameter setup
+///
+bool InitTracker(CTracker& tracker)
+{
+    TrackerSettings settings;
+    settings.SetDistance(tracking::DistRect3Ds); // Cost matrix: distance between the two box centers
+    settings.m_kalmanType = tracking::KalmanLinear; // Filter type: linear Kalman filter
+    settings.m_filterGoal = tracking::FilterRect3D; // Filtered state: Rect3D
+    settings.m_lostTrackType = tracking::TrackNone; // Lost targets are not tracked any further
+    settings.m_matchType = tracking::MatchHungrian; // Matching algorithm: Hungarian
+    settings.m_dt = 1.f; // Time step of the Kalman filter
+    settings.m_accelNoiseMag = 0.5f; // Process (acceleration) noise magnitude of the Kalman filter
+    settings.m_distThres = 30.f; // Distance threshold for the matching step
+    settings.m_minAreaRadiusPix = -1.f;//frame.rows / 20.f; // Minimum area radius of a target, in pixels
+    settings.m_maximumAllowedSkippedFrames = 3; // Maximum number of frames a track may go unmatched before it is removed
+    settings.m_maxTraceLength = 5; // Maximum trace length, i.e. how much trajectory history is kept
+
+    tracker.setSettings(settings);
+
+    return true;
+}
+
+///
+/// \brief Track the fused objects and update their states with the filtered best estimates
+///
+iv::lidar::objectarray Tracking(iv::lidar::objectarray& lidarobjvec, CTracker& tracker)
+{
+#ifdef DEBUG_SHOW
+    std::cout<<"-------------------------------------------------"<<std::endl;
+#endif
+    iv::lidar::objectarray trackedobjvec;
+    trackedobjvec.clear_obj();
+    trackedobjvec.set_timestamp(lidarobjvec.timestamp());
+    regions_t regions;
+    cv::Point3f pointXYZ;
+    for(int i = 0;i<lidarobjvec.obj_size();i++)
+    {
+        pointXYZ.x = lidarobjvec.obj(i).centroid().x();
+        pointXYZ.y = lidarobjvec.obj(i).centroid().y();
+        pointXYZ.z = lidarobjvec.obj(i).centroid().z();
+        Rect3D rect;
+        rect.center = pointXYZ;
+        rect.size.width = lidarobjvec.obj(i).dimensions().x();//w
+        rect.size.height = lidarobjvec.obj(i).dimensions().y();//l
+        rect.size.length = lidarobjvec.obj(i).dimensions().z();//h
+        rect.yaw = lidarobjvec.obj(i).tyaw();
+        CRegion region = CRegion(rect,lidarobjvec.obj(i).mntype(),lidarobjvec.obj(i).score());
+        regions.push_back(region);
+#ifdef DEBUG_SHOW
+        std::cout<<"old id:"<<i<<std::endl;
+        std::cout<<"old type:"<<lidarobjvec.obj(i).mntype()<<std::endl;
+        std::cout<<"old x,y,z,w,h,l,yaw:"<<rect.center.x<<","<<rect.center.y<<","<<rect.center.z<<"  "<<rect.size.width<<","<<rect.size.height<<","<<rect.size.length<<"  "<<rect.yaw<<std::endl;
+#endif
+
+    }
+    tracker.Update(regions, cv::UMat(), 30);
+    auto tracks = tracker.GetTracks();
+#ifdef DEBUG_SHOW
+    std::cout<<"detect size, tracker size:"<<regions.size()<<","<<tracks.size()<<std::endl;
+#endif
+    for (size_t i = 0; i < tracks.size(); i++)
+    {
+        const auto& track = tracks[i];
+        if(track.m_detectedFrames < MIN_DETECTEDFRAMES) continue;
+        int obj_id = track.m_regionID;
+        iv::lidar::lidarobject lidar_object;
+        if(obj_id != -1) // This detection matched an existing track: update the object state with the track's best estimate
+        {
+            lidar_object = lidarobjvec.obj(obj_id);
+            lidar_object.set_id(track.m_ID);
+
+            iv::lidar::PointXYZ centroid;
+            iv::lidar::PointXYZ *centerpoint;
+            centroid.set_x(track.m_rect.center.x);
+            centroid.set_y(track.m_rect.center.y);
+            centroid.set_z(track.m_rect.center.z);
+            centerpoint=lidar_object.mutable_centroid();
+            centerpoint->CopyFrom(centroid);
+/* not update */
+//            iv::lidar::Dimension dimension;
+//            iv::lidar::Dimension *obj_dimension;
+//            dimension.set_x(track.m_rect.size.width);
+//            dimension.set_y(track.m_rect.size.height);
+//            dimension.set_z(track.m_rect.size.length);
+//            obj_dimension=lidar_object.mutable_dimensions();
+//            obj_dimension->CopyFrom(dimension);
+
+//            lidar_object.set_tyaw(track.m_rect.yaw);
+
+            iv::lidar::VelXY vel_relative;
+            iv::lidar::VelXY *velrelative;
+            vel_relative.set_x(track.m_velocity[0]*FPS);
+            vel_relative.set_y(track.m_velocity[1]*FPS);
+            velrelative = lidar_object.mutable_vel_relative();
+            velrelative->CopyFrom(vel_relative);
+
+            iv::lidar::lidarobject *pe = trackedobjvec.add_obj();
+            pe->CopyFrom(lidar_object);
+        }else{ // No detection matched this track in the current frame: append the track's prediction to the output
+            lidar_object.set_id(track.m_ID);
+            lidar_object.set_mntype(track.m_region.m_type);
+            lidar_object.set_score(track.m_region.m_confidence);
+
+            iv::lidar::PointXYZ centroid;
+            iv::lidar::PointXYZ *centerpoint;
+            centroid.set_x(track.m_rect.center.x);
+            centroid.set_y(track.m_rect.center.y);
+            centroid.set_z(track.m_rect.center.z);
+            centerpoint=lidar_object.mutable_centroid();
+            centerpoint->CopyFrom(centroid);
+
+            iv::lidar::Dimension dimension;
+            iv::lidar::Dimension *obj_dimension;
+            dimension.set_x(track.m_region.m_rect.size.width);
+            dimension.set_y(track.m_region.m_rect.size.height);
+            dimension.set_z(track.m_region.m_rect.size.length);
+            obj_dimension=lidar_object.mutable_dimensions();
+            obj_dimension->CopyFrom(dimension);
+
+            lidar_object.set_tyaw(track.m_region.m_rect.yaw);
+
+            iv::lidar::VelXY vel_relative;
+            iv::lidar::VelXY *velrelative;
+            vel_relative.set_x(track.m_velocity[0]*FPS);
+            vel_relative.set_y(track.m_velocity[1]*FPS);
+            velrelative = lidar_object.mutable_vel_relative();
+            velrelative->CopyFrom(vel_relative);
+
+            iv::lidar::lidarobject *pe = trackedobjvec.add_obj();
+            pe->CopyFrom(lidar_object);
+        }
+#ifdef DEBUG_SHOW
+        std::cout<<"id:"<<lidar_object.id()<<"  "<<obj_id<<std::endl;
+        std::cout<<"type:"<<lidar_object.mntype()<<std::endl;
+        std::cout<<"update x,y,z,w,h,l,yaw,vx,vy:"<<lidar_object.centroid().x()<<","<<lidar_object.centroid().y()<<","<<lidar_object.centroid().z()<<"  "<<lidar_object.dimensions().x()<<","<<lidar_object.dimensions().y()<<","<<lidar_object.dimensions().z()<<"  "<<lidar_object.tyaw()<<"  "<<lidar_object.vel_relative().x()<<","<<lidar_object.vel_relative().y()<<std::endl;
+#endif
+    }
+#ifdef DEBUG_SHOW
+    std::cout<<"trackedobjvec size:"<<trackedobjvec.obj_size()<<std::endl;
+#endif
+    //    for (size_t i = 0; i < trackedobjvec.obj_size(); i++)
+    //    {
+    //        iv::lidar::lidarobject lidar_object;
+    //        lidar_object = trackedobjvec.obj(i);
+    //        std::cout<<"historical size:"<<lidar_object.point_historical_size()<<std::endl;
+    //    }
+    regions.clear();
+    tracks.clear();
+    return trackedobjvec;
+}
+
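+// A minimal usage sketch, assuming one detection array per lidar frame
+// (GetDetections and Publish are hypothetical stand-ins for this module's
+// input and output plumbing):
+//
+//   CTracker tracker;
+//   InitTracker(tracker);
+//   while (true)
+//   {
+//       iv::lidar::objectarray detections = GetDetections();
+//       iv::lidar::objectarray tracked = Tracking(detections, tracker);
+//       Publish(tracked);
+//   }
+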

+ 147 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/defines.h

@@ -0,0 +1,147 @@
+#pragma once
+
+#include <vector>
+#include <string>
+#include <map>
+#include <opencv2/opencv.hpp>
+
+
+// ---------------------------------------------------------------------------
+//
+// ---------------------------------------------------------------------------
+typedef float track_t;
+typedef cv::Point_<track_t> Point_t;
+#define El_t CV_32F
+#define Mat_t CV_32FC
+
+typedef std::vector<int> assignments_t;
+typedef std::vector<track_t> distMatrix_t;
+typedef struct Size3D{
+    Size3D()
+    {
+    }
+    Size3D(float w, float h, float l)
+        :
+          width(w),
+          height(h),
+          length(l)
+    {
+    }
+    float length;
+    float width;
+    float height;
+} Size3D;
+typedef struct Rect3D{
+    Rect3D()
+    {
+    }
+    Rect3D(cv::Point3f c, Size3D s, float y)
+        :
+          center(c),
+          size(s),
+          yaw(y)
+    {
+    }
+    cv::Point3f center;
+    Size3D size;
+    float yaw;
+} Rect3D;
+///
+/// \brief config_t
+///
+typedef std::multimap<std::string, std::string> config_t;
+
+///
+/// \brief The CRegion class
+///
+class CRegion
+{
+public:
+    CRegion()
+        : m_type(-1), m_type_name(""), m_confidence(-1)
+    {
+    }
+
+    CRegion(const Rect3D& rect, const int& type, float confidence)
+        : m_rect(rect), m_type(type), m_confidence(confidence)
+    {
+        RBRect();
+    }
+
+    CRegion(const Rect3D& rect, const std::string& type, float confidence)
+        : m_rect(rect), m_type_name(type), m_confidence(confidence)
+    {
+        RBRect();
+    }
+    Rect3D m_rect;
+    cv::RotatedRect m_rrect;
+    cv::Rect m_brect;
+    int m_type = -1;
+    std::string m_type_name = "";
+    float m_confidence = -1;
+private:
+    ///
+    /// \brief RBRect: derive the 2D rotated rect and bounding rect from the 3D box
+    /// \return
+    ///
+    cv::RotatedRect RBRect()
+    {
+        m_rrect = cv::RotatedRect(Point_t(m_rect.center.x,m_rect.center.y), cv::Size(m_rect.size.width,m_rect.size.height), 0);
+        m_brect = cv::Rect(m_rect.center.x-m_rect.size.width/2,m_rect.center.y-m_rect.size.height/2, m_rect.size.width,m_rect.size.height);
+        return m_rrect;
+    }
+};
+
+typedef std::vector<CRegion> regions_t;
+
+///
+///
+///
+namespace tracking
+{
+
+///
+/// \brief The DistType enum
+///
+enum DistType
+{
+    DistCenters,   // Euclidean distance between centers, pixels
+    DistRects,     // Euclidean distance between bounding rectangles, pixels
+    DistRect3Ds,   // Euclidean distance between 3D bounding boxes
+	DistsCount
+};
+
+///
+/// \brief The FilterGoal enum
+///
+enum FilterGoal
+{
+    FilterCenter, // x,y
+    FilterRect,   // x,y,w,h
+    FilterRect3D  // x,y,z,w,h,l,yaw
+};
+
+///
+/// \brief The KalmanType enum
+///
+enum KalmanType
+{
+    KalmanLinear
+};
+
+///
+/// \brief The MatchType enum
+///
+enum MatchType
+{
+    MatchHungrian
+};
+
+///
+/// \brief The LostTrackType enum
+///
+enum LostTrackType
+{
+    TrackNone
+};
+}
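+
+// A minimal usage sketch: wrap a detection into the structures above before
+// handing it to the tracker (the values are illustrative):
+//
+//   Rect3D box(cv::Point3f(10.f, 5.f, -0.5f), Size3D(1.8f, 1.5f, 4.2f), 0.1f);
+//   CRegion region(box, /*type*/ 0, /*confidence*/ 0.9f);
+//   regions_t regions{ region };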

+ 492 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/track.cpp

@@ -0,0 +1,492 @@
+#include "track.h"
+
+//#include "dat/dat_tracker.hpp"
+
+///
+/// \brief CTrack
+/// \param region
+/// \param kalmanType
+/// \param deltaTime
+/// \param accelNoiseMag
+/// \param useAcceleration
+/// \param trackID
+/// \param regionID
+/// \param filterGoal
+/// \param externalTrackerForLost
+///
+CTrack::CTrack(const CRegion& region,
+        tracking::KalmanType kalmanType,
+        track_t deltaTime,
+        track_t accelNoiseMag,
+        bool useAcceleration,
+        size_t trackID,
+        size_t regionID,
+        tracking::FilterGoal filterGoal,
+        tracking::LostTrackType externalTrackerForLost
+        )
+    :
+      m_trackID(trackID),
+      m_regionID(regionID),
+      m_skippedFrames(0),
+      m_lastRegion(region),
+      m_predictionPoint(region.m_rect.center),
+      m_predictionRect(region.m_rrect),
+      m_predictionRect3D(region.m_rect),
+      m_kalman(kalmanType, useAcceleration, deltaTime, accelNoiseMag),
+      m_filterGoal(filterGoal),
+      m_outOfTheFrame(false),
+      m_externalTrackerForLost(externalTrackerForLost)
+{
+    if (filterGoal == tracking::FilterRect)
+        m_kalman.Update(region.m_brect, true);
+    else if(filterGoal == tracking::FilterRect3D)
+        m_kalman.Update(region.m_rect, true);
+    else
+        m_kalman.Update(m_predictionPoint, true);
+
+    m_trace.push_back(m_predictionPoint, m_predictionPoint);
+}
+
+///
+/// \brief CTrack::CalcDistCenter
+/// \param reg
+/// \return
+///
+track_t CTrack::CalcDistCenter(const CRegion& reg) const
+{
+    cv::Point3f diff = m_predictionPoint - reg.m_rect.center;
+    return sqrtf(sqr(diff.x) + sqr(diff.y));
+}
+///
+/// \brief CTrack::CalcDistRect
+/// \param reg
+/// \return
+///
+track_t CTrack::CalcDistRect(const CRegion& reg) const
+{
+    std::array<track_t, 5> diff;
+    diff[0] = reg.m_rrect.center.x - m_lastRegion.m_rrect.center.x;
+    diff[1] = reg.m_rrect.center.y - m_lastRegion.m_rrect.center.y;
+    diff[2] = static_cast<track_t>(m_lastRegion.m_rrect.size.width - reg.m_rrect.size.width);
+    diff[3] = static_cast<track_t>(m_lastRegion.m_rrect.size.height - reg.m_rrect.size.height);
+    diff[4] = static_cast<track_t>(m_lastRegion.m_rrect.angle - reg.m_rrect.angle);
+
+    track_t dist = 0;
+    for (size_t i = 0; i < diff.size(); ++i)
+    {
+        dist += sqr(diff[i]);
+    }
+    return sqrtf(dist);
+}
+///
+/// \brief CTrack::CalcDistRect3D
+/// \param reg
+/// \return
+///
+track_t CTrack::CalcDistRect3D(const CRegion& reg) const
+{
+    std::array<track_t, 7> diff;
+    diff[0] = reg.m_rect.center.x - m_lastRegion.m_rect.center.x;
+    diff[1] = reg.m_rect.center.y - m_lastRegion.m_rect.center.y;
+    diff[2] = 0;//reg.m_rect.center.z - m_lastRegion.m_rect.center.z;
+    diff[3] = static_cast<track_t>(m_lastRegion.m_rect.size.width - reg.m_rect.size.width);
+    diff[4] = static_cast<track_t>(m_lastRegion.m_rect.size.height - reg.m_rect.size.height);
+    diff[5] = static_cast<track_t>(m_lastRegion.m_rect.size.length - reg.m_rect.size.length);
+    diff[6] = 0;//static_cast<track_t>(m_lastRegion.m_rect.yaw - reg.m_rect.yaw);
+
+    track_t dist = 0;
+    for (size_t i = 0; i < diff.size(); ++i)
+    {
+        dist += sqr(diff[i]);
+    }
+    return sqrtf(dist);
+}
+
+///
+/// \brief CTrack::Update
+/// \param region
+/// \param dataCorrect
+/// \param max_trace_length
+/// \param prevFrame
+/// \param currFrame
+/// \param trajLen
+///
+void CTrack::Update(
+        const CRegion& region,
+        bool dataCorrect,
+        size_t max_trace_length,
+        cv::UMat prevFrame,
+        cv::UMat currFrame,
+        int trajLen
+        )
+{
+    if (m_filterGoal == tracking::FilterRect) // Kalman filter for object coordinates and size
+        RectUpdate(region, dataCorrect, prevFrame, currFrame);
+    else if (m_filterGoal == tracking::FilterRect3D) // Kalman filter for 3D box: coordinates, size and yaw
+        Rect3DUpdate(region, dataCorrect, prevFrame, currFrame);
+    else // Kalman filter only for object center
+        PointUpdate(region.m_rect.center, region.m_rrect.size, dataCorrect, currFrame.size());
+
+    if (dataCorrect)
+    {
+        //std::cout << m_lastRegion.m_brect << " - " << region.m_brect << std::endl;
+
+        m_lastRegion = region;
+        m_trace.push_back(m_predictionPoint, region.m_rect.center);
+
+        CheckStatic(trajLen, currFrame, region);
+    }
+    else
+    {
+        m_trace.push_back(m_predictionPoint);
+    }
+
+    if (m_trace.size() > max_trace_length)
+        m_trace.pop_front(m_trace.size() - max_trace_length);
+}
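+// Usage sketch (values illustrative, not project defaults): a lidar-only
+// pipeline such as this one would typically pass empty frames and disable the
+// image-based static check, e.g.
+//   track->Update(region, true /*dataCorrect*/, 50 /*max_trace_length*/,
+//                 cv::UMat(), cv::UMat(), 0 /*trajLen: static check off*/);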
+
+///
+/// \brief CTrack::IsStatic
+/// \return
+///
+bool CTrack::IsStatic() const
+{
+    return m_isStatic;
+}
+
+///
+/// \brief CTrack::IsStaticTimeout
+/// \param framesTime
+/// \return
+///
+bool CTrack::IsStaticTimeout(int framesTime) const
+{
+    return (m_staticFrames > framesTime);
+}
+
+///
+/// \brief CTrack::IsOutOfTheFrame
+/// \return
+///
+bool CTrack::IsOutOfTheFrame() const
+{
+    return m_outOfTheFrame;
+}
+
+///
+//cv::RotatedRect CTrack::CalcPredictionEllipse(cv::Size_<track_t> minRadius) const
+//{
+//    // Move ellipse to velocity
+//    auto velocity = m_kalman.GetVelocity();
+//    Point_t d(3.f * velocity[0], 3.f * velocity[1]);
+
+//    cv::RotatedRect rrect(m_predictionPoint, cv::Size2f(std::max(minRadius.width, fabs(d.x)), std::max(minRadius.height, fabs(d.y))), 0);
+
+//    if (fabs(d.x) + fabs(d.y) > 4) // pix
+//    {
+//        if (fabs(d.x) > 0.0001f)
+//        {
+//            track_t l = std::min(rrect.size.width, rrect.size.height) / 3;
+
+//            track_t p2_l = sqrtf(sqr(d.x) + sqr(d.y));
+//            rrect.center.x = l * d.x / p2_l + m_predictionPoint.x;
+//            rrect.center.y = l * d.y / p2_l + m_predictionPoint.y;
+
+//            rrect.angle = atanf(d.y / d.x);
+//        }
+//        else
+//        {
+//            rrect.center.y += d.y / 3;
+//            rrect.angle = static_cast<float>(CV_PI / 2.);
+//        }
+//    }
+//    return rrect;
+//}
+
+///
+/// \brief CTrack::IsInsideArea
+///        If the result is <= 1, the point lies inside the prediction ellipse (built from prediction and velocity)
+/// \param pt
+/// \return
+///
+track_t CTrack::IsInsideArea(const Point_t& pt, const cv::RotatedRect& rrect) const
+{
+    Point_t pt_(pt.x - rrect.center.x, pt.y - rrect.center.y);
+    track_t r = sqrtf(sqr(pt_.x) + sqr(pt_.y));
+    track_t t = (r > 1) ? acosf(pt_.x / r) : 0;
+    track_t t_ = t - rrect.angle;
+    Point_t pt_rotated(r * cosf(t_), r * sinf(t_));
+
+    return sqr(pt_rotated.x) / sqr(rrect.size.width) + sqr(pt_rotated.y) / sqr(rrect.size.height);
+}
+
+///
+/// \brief CTrack::WidthDist
+/// \param reg
+/// \return
+///
+track_t CTrack::WidthDist(const CRegion& reg) const
+{
+    if (m_lastRegion.m_rect.size.width < reg.m_rect.size.width)
+        return m_lastRegion.m_rect.size.width / reg.m_rect.size.width;
+    else
+        return reg.m_rect.size.width / m_lastRegion.m_rect.size.width;
+}
+
+///
+/// \brief CTrack::HeightDist
+/// \param reg
+/// \return
+///
+track_t CTrack::HeightDist(const CRegion& reg) const
+{
+    if (m_lastRegion.m_rect.size.height < reg.m_rect.size.height)
+        return m_lastRegion.m_rect.size.height / reg.m_rect.size.height;
+    else
+        return reg.m_rect.size.height / m_lastRegion.m_rect.size.height;
+}
+
+///
+/// \brief CTrack::CheckStatic
+/// \param trajLen
+/// \return
+///
+bool CTrack::CheckStatic(int trajLen, cv::UMat currFrame, const CRegion& region)
+{
+    if (!trajLen || static_cast<int>(m_trace.size()) < trajLen)
+    {
+        m_isStatic = false;
+        m_staticFrames = 0;
+        m_staticFrame = cv::UMat();
+    }
+    else
+    {
+        track_t kx = 0;
+        track_t bx = 0;
+        track_t ky = 0;
+        track_t by = 0;
+        get_lin_regress_params(m_trace, m_trace.size() - trajLen, m_trace.size(), kx, bx, ky, by);
+        track_t speed = sqrt(sqr(kx * trajLen) + sqr(ky * trajLen));
+        const track_t speedThresh = 10;
+        if (speed < speedThresh)
+        {
+            if (!m_isStatic)
+            {
+                m_staticFrame = currFrame.clone();
+                m_staticRect = region.m_rect;
+            }
+
+            ++m_staticFrames;
+            m_isStatic = true;
+        }
+        else
+        {
+            m_isStatic = false;
+            m_staticFrames = 0;
+            m_staticFrame = cv::UMat();
+        }
+    }
+
+    return m_isStatic;
+}
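+// Note: get_lin_regress_params (declared in defines.h) presumably fits linear
+// models x(t) = kx*t + bx and y(t) = ky*t + by over the last trajLen trace
+// points, so the speed computed above approximates the displacement across
+// that window; a track below speedThresh is flagged as static.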
+
+///
+/// \brief GetLastRect
+/// \return
+///
+Rect3D CTrack::GetLastRect() const
+{
+    if (m_filterGoal == tracking::FilterRect)
+        return Rect3D(cv::Point3f(m_predictionRect.center.x,m_predictionRect.center.y,0),Size3D(m_predictionRect.boundingRect().width,m_predictionRect.boundingRect().height,0),0);
+    else if(m_filterGoal == tracking::FilterRect3D)
+        return m_predictionRect3D;
+    else
+        return Rect3D(cv::Point3f(m_predictionPoint.x, m_predictionPoint.y,0),Size3D(0,0,0),0);
+
+}
+///
+/// \brief CTrack::LastRegion
+/// \return
+///
+const CRegion& CTrack::LastRegion() const
+{
+    return m_lastRegion;
+}
+
+///
+/// \brief CTrack::ConstructObject
+/// \return
+///
+TrackingObject CTrack::ConstructObject() const
+{
+    return TrackingObject(GetLastRect(), m_lastRegion, m_trackID, m_regionID, m_trace, IsStatic(), IsOutOfTheFrame(), m_kalman.GetVelocity(), m_detectedFrames);
+}
+
+///
+/// \brief CTrack::SkippedFrames
+/// \return
+///
+size_t CTrack::SkippedFrames() const
+{
+    return m_skippedFrames;
+}
+
+///
+/// \brief CTrack::SkippedFrames
+/// \return
+///
+size_t& CTrack::SkippedFrames()
+{
+    return m_skippedFrames;
+}
+///
+/// \brief CTrack::DetectedFrames
+/// \return
+///
+size_t& CTrack::DetectedFrames()
+{
+    return m_detectedFrames;
+}
+///
+/// \brief RectUpdate
+/// \param region
+/// \param dataCorrect
+/// \param prevFrame
+/// \param currFrame
+///
+void CTrack::RectUpdate(
+        const CRegion& region,
+        bool dataCorrect,
+        cv::UMat prevFrame,
+        cv::UMat currFrame
+        )
+{
+    m_kalman.GetRectPrediction();
+
+    bool recalcPrediction = true;
+
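+    // Clamp shifts v so that the span [v, v + size) fits inside [0, hi) and
+    // returns old_v - new_v; a non-zero result therefore signals that the
+    // predicted rect crossed the frame border.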
+    auto Clamp = [](int& v, int& size, int hi) -> int
+    {
+        int res = 0;
+
+        if (size < 1)
+            size = 0;
+
+        if (v < 0)
+        {
+            res = v;
+            v = 0;
+            return res;
+        }
+        else if (v + size > hi - 1)
+        {
+            res = v;
+            v = hi - 1 - size;
+            if (v < 0)
+            {
+                size += v;
+                v = 0;
+            }
+            res -= v;
+            return res;
+        }
+        return res;
+    };
+
+    auto UpdateRRect = [&](cv::Rect prevRect, cv::Rect newRect)
+    {
+        m_predictionRect.center.x += newRect.x - prevRect.x;
+        m_predictionRect.center.y += newRect.y - prevRect.y;
+        m_predictionRect.size.width *= newRect.width / static_cast<float>(prevRect.width);
+        m_predictionRect.size.height *= newRect.height / static_cast<float>(prevRect.height);
+    };
+
+    switch (m_externalTrackerForLost)
+    {
+    case tracking::TrackNone:
+        break;
+    }
+
+    if (recalcPrediction)
+        UpdateRRect(m_predictionRect.boundingRect(), m_kalman.Update(region.m_brect, dataCorrect));
+
+    cv::Rect brect = m_predictionRect.boundingRect();
+    int dx = Clamp(brect.x, brect.width, currFrame.cols);
+    int dy = Clamp(brect.y, brect.height, currFrame.rows);
+
+    m_outOfTheFrame = (dx != 0) || (dy != 0) || (brect.width < 2) || (brect.height < 2);
+
+    m_predictionPoint = cv::Point3f(m_predictionRect.center.x,m_predictionRect.center.y,0);
+
+    //std::cout << "brect = " << brect << ", dx = " << dx << ", dy = " << dy << ", outOfTheFrame = " << m_outOfTheFrame << ", predictionPoint = " << m_predictionPoint << std::endl;
+}
+///
+/// \brief Rect3DUpdate
+/// \param region
+/// \param dataCorrect
+/// \param prevFrame
+/// \param currFrame
+///
+void CTrack::Rect3DUpdate(
+        const CRegion& region,
+        bool dataCorrect,
+        cv::UMat prevFrame,
+        cv::UMat currFrame
+        )
+{
+    m_kalman.GetRect3DPrediction();
+
+    switch (m_externalTrackerForLost)
+    {
+    case tracking::TrackNone:
+        break;
+    }
+
+    m_predictionRect3D = m_kalman.Update(region.m_rect, dataCorrect);
+
+    m_predictionPoint = m_predictionRect3D.center;
+
+    //std::cout << "brect = " << brect << ", dx = " << dx << ", dy = " << dy << ", outOfTheFrame = " << m_outOfTheFrame << ", predictionPoint = " << m_predictionPoint << std::endl;
+}
+
+///
+/// \brief PointUpdate
+/// \param pt
+/// \param dataCorrect
+///
+void CTrack::PointUpdate(
+        const cv::Point3f& pt,
+        const cv::Size& newObjSize,
+        bool dataCorrect,
+        const cv::Size& frameSize
+        )
+{
+    m_kalman.GetPointPrediction();
+
+    m_predictionPoint = m_kalman.Update(pt, dataCorrect);
+
+    if (dataCorrect)
+    {
+        const int a1 = 1;
+        const int a2 = 9;
+        m_predictionRect.size.width = (a1 * newObjSize.width + a2 * m_predictionRect.size.width) / (a1 + a2);
+        m_predictionRect.size.height = (a1 * newObjSize.height + a2 * m_predictionRect.size.height) / (a1 + a2);
+    }
+
+    auto Clamp = [](track_t& v, int hi) -> bool
+    {
+        if (v < 0)
+        {
+            v = 0;
+            return true;
+        }
+        else if (hi && v > hi - 1)
+        {
+            v = static_cast<track_t>(hi - 1);
+            return true;
+        }
+        return false;
+    };
+    auto p = m_predictionPoint;
+    m_outOfTheFrame = Clamp(p.x, frameSize.width) || Clamp(p.y, frameSize.height) || (m_predictionRect.size.width < 2) || (m_predictionRect.size.height < 2);
+
+    //std::cout << "predictionRect = " << m_predictionRect.boundingRect() << ", outOfTheFrame = " << m_outOfTheFrame << ", predictionPoint = " << m_predictionPoint << std::endl;
+}

+ 303 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/track.h

@@ -0,0 +1,303 @@
+#pragma once
+#include <iostream>
+#include <vector>
+#include <deque>
+#include <memory>
+#include <array>
+
+#ifdef USE_OCV_KCF
+#include <opencv2/tracking.hpp>
+#endif
+
+#include "defines.h"
+#include "Kalman.h"
+
+// --------------------------------------------------------------------------
+///
+/// \brief The TrajectoryPoint struct
+///
+struct TrajectoryPoint
+{
+    ///
+    /// \brief TrajectoryPoint
+    ///
+    TrajectoryPoint()
+        : m_hasRaw(false)
+    {
+    }
+
+    ///
+    /// \brief TrajectoryPoint
+    /// \param prediction
+    ///
+    TrajectoryPoint(const cv::Point3f& prediction)
+        :
+          m_hasRaw(false),
+          m_prediction(prediction)
+    {
+    }
+
+    ///
+    /// \brief TrajectoryPoint
+    /// \param prediction
+    /// \param raw
+    ///
+    TrajectoryPoint(const cv::Point3f& prediction, const cv::Point3f& raw)
+        :
+          m_hasRaw(true),
+          m_prediction(prediction),
+          m_raw(raw)
+    {
+    }
+
+    bool m_hasRaw = false;
+    cv::Point3f m_prediction;
+    cv::Point3f m_raw;
+};
+
+// --------------------------------------------------------------------------
+///
+/// \brief The Trace class
+///
+class Trace
+{
+public:
+    ///
+    /// \brief operator []
+    /// \param i
+    /// \return
+    ///
+    const cv::Point3f& operator[](size_t i) const
+    {
+        return m_trace[i].m_prediction;
+    }
+
+    ///
+    /// \brief operator []
+    /// \param i
+    /// \return
+    ///
+    cv::Point3f& operator[](size_t i)
+    {
+        return m_trace[i].m_prediction;
+    }
+
+    ///
+    /// \brief at
+    /// \param i
+    /// \return
+    ///
+    const TrajectoryPoint& at(size_t i) const
+    {
+        return m_trace[i];
+    }
+
+    ///
+    /// \brief size
+    /// \return
+    ///
+    size_t size() const
+    {
+        return m_trace.size();
+    }
+
+    ///
+    /// \brief push_back
+    /// \param prediction
+    ///
+    void push_back(const cv::Point3f& prediction)
+    {
+        m_trace.emplace_back(prediction);
+    }
+    void push_back(const cv::Point3f& prediction, const cv::Point3f& raw)
+    {
+        m_trace.emplace_back(prediction, raw);
+    }
+
+    ///
+    /// \brief pop_front
+    /// \param count
+    ///
+    void pop_front(size_t count)
+    {
+        if (count < size())
+            m_trace.erase(m_trace.begin(), m_trace.begin() + count);
+        else
+            m_trace.clear();
+    }
+
+    ///
+    /// \brief GetRawCount
+    /// \param lastPeriod
+    /// \return
+    ///
+    size_t GetRawCount(size_t lastPeriod) const
+    {
+        size_t res = 0;
+
+        size_t i = 0;
+        if (lastPeriod < m_trace.size())
+            i = m_trace.size() - lastPeriod;
+
+        for (; i < m_trace.size(); ++i)
+        {
+            if (m_trace[i].m_hasRaw)
+                ++res;
+        }
+
+        return res;
+    }
+
+private:
+    std::deque<TrajectoryPoint> m_trace;
+};
+
+// --------------------------------------------------------------------------
+///
+/// \brief The TrackingObject class
+///
+struct TrackingObject
+{
+    Rect3D m_rect;                     // Coordinates
+    CRegion m_region;                  // Detection region
+    Trace m_trace;                     // Trajectory
+    size_t m_ID = 0;                   // Object ID
+    size_t m_regionID = 0;             // Region ID
+    bool m_isStatic = false;           // Object is abandoned
+    bool m_outOfTheFrame = false;      // Is object out of frame
+    cv::Vec<track_t, 2> m_velocity;    // pixels/sec
+    size_t m_detectedFrames = 0;       // Detected frames count
+    ///
+    TrackingObject(const Rect3D& rect, const CRegion& region, size_t ID, size_t regionID, const Trace& trace,
+                   bool isStatic, bool outOfTheFrame, cv::Vec<track_t, 2> velocity, size_t detectedFrames)
+        :
+          m_rect(rect), m_region(region), m_ID(ID), m_regionID(regionID), m_isStatic(isStatic), m_outOfTheFrame(outOfTheFrame), m_velocity(velocity), m_detectedFrames(detectedFrames)
+    {
+        for (size_t i = 0; i < trace.size(); ++i)
+        {
+            auto tp = trace.at(i);
+            if (tp.m_hasRaw)
+                m_trace.push_back(tp.m_prediction, tp.m_raw);
+            else
+                m_trace.push_back(tp.m_prediction);
+        }
+    }
+    ///
+    bool IsRobust(int minTraceSize, float minRawRatio, cv::Size2f sizeRatio) const
+    {
+        bool res = m_trace.size() > static_cast<size_t>(minTraceSize);
+        res &= m_trace.GetRawCount(m_trace.size() - 1) / static_cast<float>(m_trace.size()) > minRawRatio;
+        if (sizeRatio.width + sizeRatio.height > 0)
+        {
+            float sr = m_rect.size.width / m_rect.size.height;
+            if (sizeRatio.width > 0)
+                res &= (sr > sizeRatio.width);
+
+            if (sizeRatio.height > 0)
+                res &= (sr < sizeRatio.height);
+        }
+        if (m_outOfTheFrame)
+            res = false;
+
+        return res;
+    }
+};
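+// Usage sketch (thresholds illustrative, not project defaults):
+//   for (const auto& track : tracks)
+//   {
+//       TrackingObject obj = track->ConstructObject();
+//       if (obj.IsRobust(4, 0.5f, cv::Size2f(0.f, 0.f))) // >= 5 trace points, > 50% raw detections
+//           consume(obj); // hypothetical downstream consumer
+//   }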
+
+// --------------------------------------------------------------------------
+///
+/// \brief The CTrack class
+///
+class CTrack
+{
+public:
+    CTrack(const CRegion& region,
+           tracking::KalmanType kalmanType,
+           track_t deltaTime,
+           track_t accelNoiseMag,
+           bool useAcceleration,
+           size_t trackID,
+           size_t regionID,
+           tracking::FilterGoal filterObjectGoal,
+           tracking::LostTrackType externalTrackerForLost);
+
+    ///
+    /// \brief CalcDistCenter
+    /// Euclidean distance between object centers on frames N and N+1
+    /// \param reg
+    /// \return
+    ///
+    track_t CalcDistCenter(const CRegion& reg) const;
+    ///
+    /// \brief CalcDistRect
+    /// Euclidean distance between rotated rects (center, size, angle) on frames N and N+1
+    /// \param reg
+    /// \return
+    ///
+    track_t CalcDistRect(const CRegion& reg) const;
+    ///
+    /// \brief CalcDistRect3D
+    /// Euclidean distance between 3D boxes (center, size) on frames N and N+1
+    /// \param reg
+    /// \return
+    ///
+    track_t CalcDistRect3D(const CRegion& reg) const;
+
+    //cv::RotatedRect CalcPredictionEllipse(cv::Size_<track_t> minRadius) const;
+    ///
+    /// \brief IsInsideArea
+    /// Tests whether a point lies inside the prediction area (prediction ellipse grown by object velocity)
+    /// \param pt
+    /// \param minVal
+    /// \return
+    ///
+    track_t IsInsideArea(const Point_t& pt, const cv::RotatedRect& rrect) const;
+    track_t WidthDist(const CRegion& reg) const;
+    track_t HeightDist(const CRegion& reg) const;
+
+    void Update(const CRegion& region, bool dataCorrect, size_t max_trace_length, cv::UMat prevFrame, cv::UMat currFrame, int trajLen);
+
+    bool IsStatic() const;
+    bool IsStaticTimeout(int framesTime) const;
+    bool IsOutOfTheFrame() const;
+
+    Rect3D GetLastRect() const;
+
+    const cv::Point3f& AveragePoint() const;
+    cv::Point3f& AveragePoint();
+    const CRegion& LastRegion() const;
+    size_t SkippedFrames() const;
+    size_t& SkippedFrames();
+    size_t& DetectedFrames();
+
+    TrackingObject ConstructObject() const;
+public:
+    size_t m_regionID = 0;
+private:
+    Trace m_trace;
+    size_t m_trackID = 0;
+    size_t m_skippedFrames = 0;
+    size_t m_detectedFrames = 0;
+    CRegion m_lastRegion;
+
+    cv::Point3f m_predictionPoint;
+    cv::RotatedRect m_predictionRect;
+    Rect3D m_predictionRect3D;
+    TKalmanFilter m_kalman;
+    tracking::FilterGoal m_filterGoal = tracking::FilterCenter;
+    bool m_outOfTheFrame = false;
+
+    tracking::LostTrackType m_externalTrackerForLost;
+
+    void RectUpdate(const CRegion& region, bool dataCorrect, cv::UMat prevFrame, cv::UMat currFrame);
+    void Rect3DUpdate(const CRegion& region, bool dataCorrect, cv::UMat prevFrame, cv::UMat currFrame);
+    void PointUpdate(const cv::Point3f& pt, const cv::Size& newObjSize, bool dataCorrect, const cv::Size& frameSize);
+
+    bool CheckStatic(int trajLen, cv::UMat currFrame, const CRegion& region);
+    bool m_isStatic = false;
+    int m_staticFrames = 0;
+    cv::UMat m_staticFrame;
+    Rect3D m_staticRect;
+};
+
+typedef std::vector<std::unique_ptr<CTrack>> tracks_t;
+

+ 246 - 0
src/api/perception_lidar_PointPillars_MultiHead/cfgs/cbgs_pp_multihead.yaml

@@ -0,0 +1,246 @@
+CLASS_NAMES: ['car','truck', 'construction_vehicle', 'bus', 'trailer',
+              'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']
+
+DATA_CONFIG:
+    _BASE_CONFIG_: cfgs/dataset_configs/nuscenes_dataset.yaml
+
+    POINT_CLOUD_RANGE: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+    DATA_PROCESSOR:
+        -   NAME: mask_points_and_boxes_outside_range
+            REMOVE_OUTSIDE_BOXES: True
+
+        -   NAME: shuffle_points
+            SHUFFLE_ENABLED: {
+                'train': True,
+                'test': True
+            }
+
+        -   NAME: transform_points_to_voxels
+            VOXEL_SIZE: [0.2, 0.2, 8.0]
+            MAX_POINTS_PER_VOXEL: 20
+            MAX_NUMBER_OF_VOXELS: {
+                'train': 30000,
+                'test': 30000
+            }
+
+MODEL:
+    NAME: PointPillar
+
+    VFE:
+        NAME: PillarVFE
+        WITH_DISTANCE: False
+        USE_ABSLOTE_XYZ: True
+        USE_NORM: True
+        NUM_FILTERS: [64]
+
+    MAP_TO_BEV:
+        NAME: PointPillarScatter
+        NUM_BEV_FEATURES: 64
+
+    BACKBONE_2D:
+        NAME: BaseBEVBackbone
+        LAYER_NUMS: [3, 5, 5]
+        LAYER_STRIDES: [2, 2, 2]
+        NUM_FILTERS: [64, 128, 256]
+        UPSAMPLE_STRIDES: [0.5, 1, 2]
+        NUM_UPSAMPLE_FILTERS: [128, 128, 128]
+
+    DENSE_HEAD:
+        NAME: AnchorHeadMulti
+        CLASS_AGNOSTIC: False
+
+        DIR_OFFSET: 0.78539
+        DIR_LIMIT_OFFSET: 0.0
+        NUM_DIR_BINS: 2
+
+        USE_MULTIHEAD: True
+        SEPARATE_MULTIHEAD: True
+        ANCHOR_GENERATOR_CONFIG: [
+            {
+                'class_name': car,
+                'anchor_sizes': [[4.63, 1.97, 1.74]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.95],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.45
+            },
+            {
+                'class_name': truck,
+                'anchor_sizes': [[6.93, 2.51, 2.84]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.6],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': construction_vehicle,
+                'anchor_sizes': [[6.37, 2.85, 3.19]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.225],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': bus,
+                'anchor_sizes': [[10.5, 2.94, 3.47]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.085],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': trailer,
+                'anchor_sizes': [[12.29, 2.90, 3.87]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [0.115],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': barrier,
+                'anchor_sizes': [[0.50, 2.53, 0.98]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.33],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': motorcycle,
+                'anchor_sizes': [[2.11, 0.77, 1.47]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.085],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.3
+            },
+            {
+                'class_name': bicycle,
+                'anchor_sizes': [[1.70, 0.60, 1.28]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.18],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': pedestrian,
+                'anchor_sizes': [[0.73, 0.67, 1.77]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.935],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': traffic_cone,
+                'anchor_sizes': [[0.41, 0.41, 1.07]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.285],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.4
+            },
+        ]
+
+        SHARED_CONV_NUM_FILTER: 64
+
+        RPN_HEAD_CFGS: [
+            {
+                'HEAD_CLS_NAME': ['car'],
+            },
+            {
+                'HEAD_CLS_NAME': ['truck', 'construction_vehicle'],
+            },
+            {
+                'HEAD_CLS_NAME': ['bus', 'trailer'],
+            },
+            {
+                'HEAD_CLS_NAME': ['barrier'],
+            },
+            {
+                'HEAD_CLS_NAME': ['motorcycle', 'bicycle'],
+            },
+            {
+                'HEAD_CLS_NAME': ['pedestrian', 'traffic_cone'],
+            },
+        ]
+        SEPARATE_REG_CONFIG: 
+            NUM_MIDDLE_CONV: 1
+            NUM_MIDDLE_FILTER: 64
+            REG_LIST: ['reg:2', 'height:1', 'size:3', 'angle:2', 'velo:2']
+
+        TARGET_ASSIGNER_CONFIG:
+            NAME: AxisAlignedTargetAssigner
+            POS_FRACTION: -1.0
+            SAMPLE_SIZE: 512
+            NORM_BY_NUM_EXAMPLES: False
+            MATCH_HEIGHT: False
+            BOX_CODER: ResidualCoder
+            BOX_CODER_CONFIG: {
+                'code_size': 9,
+                'encode_angle_by_sincos': True
+            }
+
+
+        LOSS_CONFIG:
+            REG_LOSS_TYPE: WeightedL1Loss
+            LOSS_WEIGHTS: {
+                'pos_cls_weight': 1.0,
+                'neg_cls_weight': 2.0,
+                'cls_weight': 1.0,
+                'loc_weight': 0.25,
+                'dir_weight': 0.2,
+                'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
+            }
+
+    POST_PROCESSING:
+        RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
+        SCORE_THRESH: 0.1
+        OUTPUT_RAW_SCORE: False
+
+        EVAL_METRIC: kitti
+
+        NMS_CONFIG:
+            MULTI_CLASSES_NMS: True
+            NMS_TYPE: nms_gpu
+            NMS_THRESH: 0.2
+            NMS_PRE_MAXSIZE: 1000
+            NMS_POST_MAXSIZE: 83
+
+
+OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 20
+
+    OPTIMIZER: adam_onecycle
+    LR: 0.001
+    WEIGHT_DECAY: 0.01
+    MOMENTUM: 0.9
+
+    MOMS: [0.95, 0.85]
+    PCT_START: 0.4
+    DIV_FACTOR: 10
+    DECAY_STEP_LIST: [35, 45]
+    LR_DECAY: 0.1
+    LR_CLIP: 0.0000001
+
+    LR_WARMUP: False
+    WARMUP_EPOCH: 1
+
+    GRAD_NORM_CLIP: 10
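A short sketch of how a consumer could read a few of these fields with yaml-cpp; this is an assumption for illustration only, since the actual parsing lives in pointpillars.cc and may differ:

    #include <yaml-cpp/yaml.h>
    #include <iostream>
    #include <string>
    #include <vector>

    int main()
    {
        YAML::Node cfg = YAML::LoadFile("cfgs/cbgs_pp_multihead.yaml");
        auto classes = cfg["CLASS_NAMES"].as<std::vector<std::string>>();
        auto post = cfg["MODEL"]["POST_PROCESSING"];
        float score_thresh = post["SCORE_THRESH"].as<float>();            // 0.1 above
        float nms_thresh = post["NMS_CONFIG"]["NMS_THRESH"].as<float>();  // 0.2 above
        std::cout << classes.size() << " classes, score >= " << score_thresh
                  << ", NMS IoU " << nms_thresh << std::endl;
        return 0;
    }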

+ 141 - 0
src/api/perception_lidar_PointPillars_MultiHead/common.h

@@ -0,0 +1,141 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+#pragma once
+
+// headers in STL
+#include <stdio.h>
+#include <assert.h>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+// headers in CUDA
+#include "cuda_runtime_api.h"
+
+using namespace std;
+// using MACRO to allocate memory inside CUDA kernel
+#define NUM_3D_BOX_CORNERS_MACRO 8
+
+#define NUM_2D_BOX_CORNERS_MACRO 4
+
+// need to be changed when num_threads_ is changed
+#define NUM_THREADS_MACRO 64
+
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+#define GPU_CHECK(ans)                    \
+  {                                       \
+    GPUAssert((ans), __FILE__, __LINE__); \
+  }
+inline void GPUAssert(cudaError_t code, const char *file, int line,
+                      bool abort = true)
+{
+  if (code != cudaSuccess)
+  {
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
+            line);
+    if (abort)
+      exit(code);
+  }
+}
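+// Usage: wrap every CUDA runtime call so failures surface immediately
+// (buffer names below are illustrative), e.g.
+//   GPU_CHECK(cudaMalloc(&dev_points, num_points * 5 * sizeof(float)));
+//   GPU_CHECK(cudaMemcpy(dev_points, host_points, num_points * 5 * sizeof(float), cudaMemcpyHostToDevice));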
+
+template <typename T>
+void HOST_SAVE(T *array, int size, string filename, string root = "../test/result", string postfix = ".txt")
+{
+  string filepath = root + "/" + filename + postfix;
+  if (postfix == ".bin")
+  {
+    fstream file(filepath, ios::out | ios::binary);
+    file.write(reinterpret_cast<char *>(array), size * sizeof(T));
+    file.close();
+    std::cout << "|>>>|  Data has been written in " << filepath << "  |<<<|" << std::endl;
+    return;
+  }
+  else if (postfix == ".txt")
+  {
+    ofstream file(filepath, ios::out);
+    for (int i = 0; i < size; ++i)
+      file << array[i] << " ";
+    file.close();
+    std::cout << "|>>>|  Data has been written in " << filepath << "  |<<<|" << std::endl;
+    return;
+  }
+}
+
+template <typename T>
+void DEVICE_SAVE(T *array, int size, string filename, string root = "../test/result", string postfix = ".txt")
+{
+  T *temp_ = new T[size];
+  cudaMemcpy(temp_, array, size * sizeof(T), cudaMemcpyDeviceToHost);
+  HOST_SAVE<T>(temp_, size, filename, root, postfix);
+  delete[] temp_;
+}
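+// Debug usage sketch (buffer name and size illustrative): dump an intermediate
+// device tensor to ../test/result/ for offline inspection, e.g.
+//   DEVICE_SAVE<float>(dev_scattered_feature, 64 * 512 * 512, "scattered_feature");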
+
+
+// int TXTtoArrary( float* &points_array , string file_name , int num_feature = 4)
+// {
+//   ifstream InFile;
+//   InFile.open(file_name.data());
+//   assert(InFile.is_open());
+
+//   vector<float> temp_points;
+//   string c;
+
+//   while (!InFile.eof())
+//   {
+//       InFile >> c;
+
+//       temp_points.push_back(atof(c.c_str()));
+//   }
+//   points_array = new float[temp_points.size()];
+//   for (int i = 0 ; i < temp_points.size() ; ++i) {
+//     points_array[i] = temp_points[i];
+//   }
+
+//   InFile.close();  
+//   return temp_points.size() / num_feature;
+// };

+ 377 - 0
src/api/perception_lidar_PointPillars_MultiHead/main.cpp

@@ -0,0 +1,377 @@
+#include <QCoreApplication>
+#include <QDateTime>
+#include <iostream>
+#include "pointpillars.h"
+#include <pcl/point_cloud.h>
+#include <pcl/point_types.h>
+#include <pcl/io/io.h>
+#include <pcl/io/pcd_io.h>
+#include "xmlparam.h"
+#include "modulecomm.h"
+#include <thread>
+#include "objectarray.pb.h"
+#include "Tracking.hpp"
+
+std::thread * gpthread;
+PointPillars * pPillars = nullptr ;
+void * gpa;
+void * gpdetect;
+int gnothavedatatime = 0;
+const int kNumPointFeature = 5;
+const int kOutputNumBoxFeature = 7;
+std::string gstrinput;
+std::string gstroutput;
+
+TrackerSettings settings;
+CTracker tracker(settings);
+bool m_isTrackerInitialized = false;
+
+void PclToArray(
+        const pcl::PointCloud<pcl::PointXYZI>::Ptr& in_pcl_pc_ptr,
+        float* out_points_array, const float normalizing_factor) {
+    for (size_t i = 0; i < in_pcl_pc_ptr->size(); ++i) {
+        pcl::PointXYZI point = in_pcl_pc_ptr->at(i);
+        out_points_array[i * 4 + 0] = point.x;
+        out_points_array[i * 4 + 1] = point.y;
+        out_points_array[i * 4 + 2] = point.z;
+        out_points_array[i * 4 + 3] =
+                static_cast<float>(point.intensity / normalizing_factor);
+    }
+}
+
+void PclXYZITToArray(
+        const pcl::PointCloud<pcl::PointXYZI>::Ptr& in_pcl_pc_ptr,
+        float* out_points_array, const float normalizing_factor) {
+    for (size_t i = 0; i < in_pcl_pc_ptr->size(); ++i) {
+        pcl::PointXYZI point = in_pcl_pc_ptr->at(i);
+        out_points_array[i * 5 + 0] = point.x;
+        out_points_array[i * 5 + 1] = point.y;
+        out_points_array[i * 5 + 2] = point.z;
+        out_points_array[i * 5 + 3] =
+                static_cast<float>(point.intensity / normalizing_factor);
+        out_points_array[i * 5 + 4] = 0;
+    }
+}
+
+void GetLidarObj(std::vector<float> out_detections,std::vector<int> out_labels,
+                 std::vector<float> out_scores,iv::lidar::objectarray & lidarobjvec)
+{
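+    // out_detections is packed as 7 floats per box -- [x, y, z, dx(w), dy(l), dz(h), yaw] --
+    // matching kOutputNumBoxFeature; out_labels and out_scores are parallel per-box arrays.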
+    int i;
+    int obj_size = out_detections.size()/kOutputNumBoxFeature;
+    //    givlog->verbose("OBJ","object size is %d",obj_size);
+    for(i=0;i<obj_size;i++)
+    {
+        iv::lidar::lidarobject lidarobj;
+        if (out_scores.at(i) < 0.10) continue;
+
+        lidarobj.set_tyaw(out_detections.at(i*7+6));
+        iv::lidar::PointXYZ centroid;
+        iv::lidar::PointXYZ * _centroid;
+        centroid.set_x(out_detections.at(i*7));
+        centroid.set_y(out_detections.at(i*7+1));
+        centroid.set_z(out_detections.at(i*7+2));
+        _centroid = lidarobj.mutable_centroid();
+        _centroid->CopyFrom(centroid);
+
+        iv::lidar::PointXYZ min_point;
+        iv::lidar::PointXYZ * _min_point;
+        min_point.set_x(0);
+        min_point.set_y(0);
+        min_point.set_z(0);
+        _min_point = lidarobj.mutable_min_point();
+        _min_point->CopyFrom(min_point);
+
+        iv::lidar::PointXYZ max_point;
+        iv::lidar::PointXYZ * _max_point;
+        max_point.set_x(0);
+        max_point.set_y(0);
+        max_point.set_z(0);
+        _max_point = lidarobj.mutable_max_point();
+        _max_point->CopyFrom(max_point);
+
+        iv::lidar::PointXYZ position;
+        iv::lidar::PointXYZ * _position;
+        position.set_x(out_detections.at(i*7));
+        position.set_y(out_detections.at(i*7+1));
+        position.set_z(out_detections.at(i*7+2));
+        _position = lidarobj.mutable_position();
+        _position->CopyFrom(position);
+        lidarobj.set_mntype(out_labels.at(i));
+        // swap class labels 2 and 8
+        if(out_labels.at(i)==2){
+            lidarobj.set_mntype(8);
+        }else if(out_labels.at(i)==8){
+            lidarobj.set_mntype(2);
+        }
+        lidarobj.set_score(out_scores.at(i));
+        lidarobj.add_type_probs(out_scores.at(i));
+
+        iv::lidar::PointXYZI point_cloud;
+        iv::lidar::PointXYZI * _point_cloud;
+        point_cloud.set_x(out_detections.at(i*7));
+        point_cloud.set_y(out_detections.at(i*7+1));
+        point_cloud.set_z(out_detections.at(i*7+2));
+        point_cloud.set_i(0);
+
+        _point_cloud = lidarobj.add_cloud();
+        _point_cloud->CopyFrom(point_cloud);
+
+        iv::lidar::Dimension ld;
+        iv::lidar::Dimension * pld;
+        ld.set_x(out_detections.at(i*7+3));// w
+        ld.set_y(out_detections.at(i*7+4));// l
+        ld.set_z(out_detections.at(i*7+5));// h
+        pld = lidarobj.mutable_dimensions();
+        pld->CopyFrom(ld);
+
+        //        std::cout<<"x y z   :  "<<out_detections.at(i*7+3)<<"    "<< out_detections.at(i*7+4)<<"    "<<out_detections.at(i*7+5)<<std::endl;
+        iv::lidar::lidarobject * po = lidarobjvec.add_obj();
+        po->CopyFrom(lidarobj);
+    }
+
+}
+
+void DetectOnePCD(const pcl::PointCloud<pcl::PointXYZI>::Ptr &pc_ptr)
+{
+    std::shared_ptr<float> points_array_ptr(new float[pc_ptr->size() * kNumPointFeature], std::default_delete<float[]>()); // array deleter so delete[] is used
+    //   float* points_array = new float[pc_ptr->size() * kNumPointFeature];
+    PclXYZITToArray(pc_ptr, points_array_ptr.get(), 1.0);
+
+    int    in_num_points = pc_ptr->width;
+
+    std::vector<float> out_detections;
+    std::vector<int> out_labels;
+    std::vector<float> out_scores;
+
+    QTime xTime;
+
+    xTime.start();
+
+    cudaDeviceSynchronize();
+    pPillars->DoInference(points_array_ptr.get(), in_num_points, &out_detections, &out_labels , &out_scores);
+    cudaDeviceSynchronize();
+
+    //    givlog->verbose("obj size is %d", num_objects);
+    //    std::cout<<"obj size is "<<num_objects<<std::endl;
+
+    //    std::vector<iv::lidar::lidarobject> lidarobjvec;
+    iv::lidar::objectarray lidarobjvec;
+    GetLidarObj(out_detections,out_labels,out_scores,lidarobjvec);
+
+    double timex = pc_ptr->header.stamp;
+    timex = timex/1000.0;
+    lidarobjvec.set_timestamp(pc_ptr->header.stamp);
+    //---------------------------------------------  init tracker  -------------------------------------------------
+    if (!m_isTrackerInitialized)
+    {
+        m_isTrackerInitialized = InitTracker(tracker);
+        if (!m_isTrackerInitialized)
+        {
+            std::cerr << "Tracker initialization error!" << std::endl;
+        }
+    }
+    iv::lidar::objectarray trackedobjvec = Tracking(lidarobjvec, tracker);
+    //    std::<<"track    end"<<std::endl;
+
+    //    --------------------------------------------  end tracking  --------------------------------------------------
+    std::string out = trackedobjvec.SerializeAsString();
+    iv::modulecomm::ModuleSendMsg(gpdetect,out.data(),out.length());
+
+    //    givlog->verbose("lenth is %d",out.length());
+}
+
+void ListenPointCloud(const char *strdata,const unsigned int nSize,
+                      const unsigned int index,const QDateTime * dt,const char * strmemname)
+{
+    (void)index;(void)dt;(void)strmemname;
+    //    std::cout<<" is  ok  ------------  "<<std::endl;
+    if(nSize <=16)return;
+    unsigned int * pHeadSize = (unsigned int *)strdata;
+    if(*pHeadSize > nSize)
+    {
+        std::cout<<"ListenPointCloud: invalid packet, head size = "<<*pHeadSize<<" but data size is "<<nSize<<std::endl;
+        return;
+    }
+
+    gnothavedatatime = 0;
+    QTime xTime;
+    xTime.start();
+
+    pcl::PointCloud<pcl::PointXYZI>::Ptr point_cloud(
+                new pcl::PointCloud<pcl::PointXYZI>());
+    int nNameSize;
+    nNameSize = *pHeadSize - 4-4-8;
+    char * strName = new char[nNameSize+1];
+    strName[nNameSize] = 0;
+    std::shared_ptr<char> str_ptr(strName, std::default_delete<char[]>()); // array deleter so delete[] is used
+    memcpy(strName,(char *)((char *)strdata +4),nNameSize);
+    point_cloud->header.frame_id = strName;
+    memcpy(&point_cloud->header.seq,(char *)strdata+4+nNameSize,4);
+    memcpy(&point_cloud->header.stamp,(char *)strdata+4+nNameSize+4,8);
+    int nPCount = (nSize - *pHeadSize)/sizeof(pcl::PointXYZI);
+    int i;
+    pcl::PointXYZI * p;
+    p = (pcl::PointXYZI *)((char *)strdata + *pHeadSize);
+    for(i=0;i<nPCount;i++)
+    {
+        pcl::PointXYZI xp;
+        memcpy(&xp,p,sizeof(pcl::PointXYZI));
+        point_cloud->push_back(xp);
+        p++;
+    }
+
+    DetectOnePCD(point_cloud);
+    std::cout<<"time is "<<(QDateTime::currentMSecsSinceEpoch() % 1000)<<" "<<xTime.elapsed()<<std::endl;
+
+}
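+// For reference, the shared-memory packet layout ListenPointCloud expects,
+// as implied by the parsing above (a sender must build the buffer the same way):
+//   bytes [0, 4)               uint32 head size = 4 + name_len + 4 + 8
+//   bytes [4, 4 + name_len)    frame_id characters (no terminator)
+//   next 4 bytes               uint32 header.seq
+//   next 8 bytes               uint64 header.stamp
+//   bytes [head size, nSize)   densely packed pcl::PointXYZI records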
+
+bool gbstate = true;
+void statethread()
+{
+    int nstate = 0;
+    int nlaststate = 0;
+    while (gbstate)
+    {
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        if(gnothavedatatime < 100000) gnothavedatatime++;
+
+        if (gnothavedatatime  < 100){
+            nstate = 0;
+        }
+        if (gnothavedatatime > 1000)
+        {
+            nstate = 1;
+        }
+        if (gnothavedatatime > 6000)
+        {
+            nstate = 2;
+        }
+        if (nstate != nlaststate) {
+            switch (nstate) {
+            case 0:
+                break;
+            case 1:
+            {
+                static int64_t nLastLog = 0;
+                int64_t nnow = std::chrono::system_clock::now().time_since_epoch().count()/1e6;
+                if(abs(nnow - nLastLog) > 1e3)
+                {
+                    std::cout<<"no lidar point cloud received for more than 10 seconds."<<std::endl;
+                    nLastLog = nnow;
+                }
+            }
+                break;
+            case 2:
+            {
+                static int64_t nLastLog = 0;
+                int64_t nnow = std::chrono::system_clock::now().time_since_epoch().count()/1e6;
+                if(abs(nnow - nLastLog) > 1e3)
+                {
+                    std::cout<<"no lidar point cloud received for more than 60 seconds."<<std::endl;
+                    nLastLog = nnow;
+                }
+            }
+                break;
+            default:
+                break;
+            }
+        }
+    }
+}
+
+void exitfunc()
+{
+    gbstate = false;
+    gpthread->join();
+    std::cout<<" state thread closed."<<std::endl;
+    iv::modulecomm::Unregister(gpa);
+    iv::modulecomm::Unregister(gpdetect);
+    std::cout<<"exit func complete"<<std::endl;
+}
+#include <QFile>
+bool trtisexist(std::string strpfe,std::string strbackbone)
+{
+    QFile xFile;
+    xFile.setFileName(strpfe.data());
+    if(xFile.exists() == false)
+    {
+        return false;
+    }
+    xFile.setFileName(strbackbone.data());
+    if(xFile.exists() == false)
+    {
+        return false;
+    }
+    return true;
+}
+int main(int argc, char *argv[])
+{
+    QCoreApplication a(argc, argv);
+
+    //    RegisterIVBackTrace();
+    tracker.setSettings(settings);
+
+
+    char * strhome = getenv("HOME");
+    std::string pfe_file = strhome;
+    pfe_file += "/models/lidar/cbgs_pp_multihead_pfe.onnx";
+    std::string pfe_trt_file = pfe_file.substr(0, pfe_file.find_last_of('.')) + ".trt";
+
+    std::string backbone_file = strhome;
+    backbone_file += "/models/lidar/cbgs_pp_multihead_backbone.onnx";
+    std::string backbone_trt_file = backbone_file.substr(0, backbone_file.find_last_of('.')) + ".trt";
+
+    bool btrtexist = trtisexist(pfe_trt_file,backbone_trt_file);
+
+
+    QString strpath = QCoreApplication::applicationDirPath();
+    std::string pp_config = strpath.toStdString() ;
+    pp_config += "/cfgs/cbgs_pp_multihead.yaml";
+    if (argc < 2)
+        strpath = strpath + "/detection_lidar_pointpillar.xml";
+    else
+        strpath = argv[1];
+
+    std::cout<<pp_config<<std::endl;
+
+    iv::xmlparam::Xmlparam xparam(strpath.toStdString());
+    pfe_file = xparam.GetParam("pfe_file",pfe_file.data());
+    backbone_file = xparam.GetParam("backbone_file",backbone_file.data());
+    gstrinput = xparam.GetParam("input","lidar_pc");
+    gstroutput = xparam.GetParam("output","lidar_pointpillar");
+
+    if(btrtexist == false)
+    {
+
+        std::cout<<"use onnx model."<<std::endl;
+        pPillars = new PointPillars(
+                    0.4,
+                    0.2,
+                    true,
+                    pfe_file,
+                    backbone_file,
+                    pp_config
+                    );
+    }
+    else
+    {
+        std::cout<<"use trt model."<<std::endl;
+        pPillars = new PointPillars(
+                    0.1,
+                    0.2,
+                    false,
+                    pfe_trt_file,
+                    backbone_trt_file,
+                    pp_config
+                    );
+    }
+    std::cout<<"PointPillars Init OK."<<std::endl;
+
+    gpa = iv::modulecomm::RegisterRecv(gstrinput.data(),ListenPointCloud);
+    gpdetect = iv::modulecomm::RegisterSend(gstroutput.data(), 10000000,1);
+    gpthread = new std::thread(statethread);
+
+    return a.exec();
+}

+ 422 - 0
src/api/perception_lidar_PointPillars_MultiHead/nms.cu

@@ -0,0 +1,422 @@
+/*
+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+#include <stdio.h>
+#include "nms.h"
+#include "common.h"
+#define THREADS_PER_BLOCK 16
+#ifndef DIVUP  // already defined in common.h; guard avoids a redefinition warning
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+#endif
+
+// #define DEBUG
+const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
+const float EPS = 1e-8;
+struct Point {
+    float x, y;
+    __device__ Point() {}
+    __device__ Point(double _x, double _y){
+        x = _x, y = _y;
+    }
+
+    __device__ void set(float _x, float _y){
+        x = _x; y = _y;
+    }
+
+    __device__ Point operator +(const Point &b)const{
+        return Point(x + b.x, y + b.y);
+    }
+
+    __device__ Point operator -(const Point &b)const{
+        return Point(x - b.x, y - b.y);
+    }
+};
+
+__device__ inline float cross(const Point &a, const Point &b){
+    return a.x * b.y - a.y * b.x;
+}
+
+__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){
+    return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+__device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){
+    int ret = min(p1.x,p2.x) <= max(q1.x,q2.x)  &&
+              min(q1.x,q2.x) <= max(p1.x,p2.x) &&
+              min(p1.y,p2.y) <= max(q1.y,q2.y) &&
+              min(q1.y,q2.y) <= max(p1.y,p2.y);
+    return ret;
+}
+
+__device__ inline int check_in_box2d(const float *box, const Point &p){
+    //params: (7) [x, y, z, dx, dy, dz, heading]
+    const float MARGIN = 1e-2;
+
+    float center_x = box[0], center_y = box[1];
+    float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);  // rotate the point in the opposite direction of box
+    float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+    float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+    return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN);
+}
+
+__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){
+    // fast exclusion
+    if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
+
+    // check cross standing
+    float s1 = cross(q0, p1, p0);
+    float s2 = cross(p1, q1, p0);
+    float s3 = cross(p0, q1, q0);
+    float s4 = cross(q1, p1, q0);
+
+    if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
+
+    // calculate intersection of two lines
+    float s5 = cross(q1, p1, p0);
+    if(fabs(s5 - s1) > EPS){
+        ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
+        ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
+
+    }
+    else{
+        float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
+        float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
+        float D = a0 * b1 - a1 * b0;
+
+        ans.x = (b0 * c1 - b1 * c0) / D;
+        ans.y = (a1 * c0 - a0 * c1) / D;
+    }
+
+    return 1;
+}
+
+__device__ inline void rotate_around_center(const Point &center, const float angle_cos, const float angle_sin, Point &p){
+    float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
+    float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+    p.set(new_x, new_y);
+}
+
+__device__ inline int point_cmp(const Point &a, const Point &b, const Point &center){
+    return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x);
+}
+
+__device__ inline float box_overlap(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+
+    float a_angle = box_a[6], b_angle = box_b[6];
+    float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
+    float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
+    float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
+    float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
+    float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
+
+    Point center_a(box_a[0], box_a[1]);
+    Point center_b(box_b[0], box_b[1]);
+
+#ifdef DEBUG
+    printf("a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle,
+           b_x1, b_y1, b_x2, b_y2, b_angle);
+    printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y);
+#endif
+
+    Point box_a_corners[5];
+    box_a_corners[0].set(a_x1, a_y1);
+    box_a_corners[1].set(a_x2, a_y1);
+    box_a_corners[2].set(a_x2, a_y2);
+    box_a_corners[3].set(a_x1, a_y2);
+
+    Point box_b_corners[5];
+    box_b_corners[0].set(b_x1, b_y1);
+    box_b_corners[1].set(b_x2, b_y1);
+    box_b_corners[2].set(b_x2, b_y2);
+    box_b_corners[3].set(b_x1, b_y2);
+
+    // get oriented corners
+    float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
+    float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
+
+    for (int k = 0; k < 4; k++){
+#ifdef DEBUG
+        printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+        rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
+        rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
+#ifdef DEBUG
+        printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+    }
+
+    box_a_corners[4] = box_a_corners[0];
+    box_b_corners[4] = box_b_corners[0];
+
+    // get intersection of lines
+    Point cross_points[16];
+    Point poly_center;
+    int cnt = 0, flag = 0;
+
+    poly_center.set(0, 0);
+    for (int i = 0; i < 4; i++){
+        for (int j = 0; j < 4; j++){
+            flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]);
+            if (flag){
+                poly_center = poly_center + cross_points[cnt];
+                cnt++;
+#ifdef DEBUG
+                printf("Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, %.3f)->(%.3f, %.3f) \n",
+                    cross_points[cnt - 1].x, cross_points[cnt - 1].y,
+                    box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x, box_a_corners[i + 1].y,
+                    box_b_corners[i].x, box_b_corners[i].y, box_b_corners[i + 1].x, box_b_corners[i + 1].y);
+#endif
+            }
+        }
+    }
+
+    // check corners
+    for (int k = 0; k < 4; k++){
+        if (check_in_box2d(box_a, box_b_corners[k])){
+            poly_center = poly_center + box_b_corners[k];
+            cross_points[cnt] = box_b_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+        if (check_in_box2d(box_b, box_a_corners[k])){
+            poly_center = poly_center + box_a_corners[k];
+            cross_points[cnt] = box_a_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+    }
+
+    poly_center.x /= cnt;
+    poly_center.y /= cnt;
+
+    // sort the points of polygon
+    Point temp;
+    for (int j = 0; j < cnt - 1; j++){
+        for (int i = 0; i < cnt - j - 1; i++){
+            if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){
+                temp = cross_points[i];
+                cross_points[i] = cross_points[i + 1];
+                cross_points[i + 1] = temp;
+            }
+        }
+    }
+
+#ifdef DEBUG
+    printf("cnt=%d\n", cnt);
+    for (int i = 0; i < cnt; i++){
+        printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y);
+    }
+#endif
+
+    // get the overlap areas
+    float area = 0;
+    for (int k = 0; k < cnt - 1; k++){
+        area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]);
+    }
+
+    return fabs(area) / 2.0;
+}
+
+__device__ inline float iou_bev(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+    float sa = box_a[3] * box_a[4];
+    float sb = box_b[3] * box_b[4];
+    float s_overlap = box_overlap(box_a, box_b);
+    return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
+}
+
+__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+    float s_overlap = box_overlap(cur_box_a, cur_box_b);
+    ans_overlap[a_idx * num_b + b_idx] = s_overlap;
+}
+
+__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+    float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
+    ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
+}
+
+__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
+                           const float *boxes, unsigned long long *mask){
+    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_size) {
+        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+        const float *cur_box = boxes + cur_box_idx * 7;
+
+        int i = 0;
+        unsigned long long t = 0;
+        int start = 0;
+        if (row_start == col_start) {
+          start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
+                t |= 1ULL << i;
+            }
+        }
+        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+        mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+}
+
+
+__device__ inline float iou_normal(float const * const a, float const * const b) {
+    //params: a: [x, y, z, dx, dy, dz, heading]
+    //params: b: [x, y, z, dx, dy, dz, heading]
+
+    float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
+    float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
+    float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
+    float interS = width * height;
+    float Sa = a[3] * a[4];
+    float Sb = b[3] * b[4];
+    return interS / fmaxf(Sa + Sb - interS, EPS);
+}
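
For a quick sanity check of the axis-aligned IoU above: two 2 m x 2 m boxes whose centres sit 1 m apart in x overlap in a 1 x 2 strip, so IoU = 2 / (4 + 4 - 2) = 1/3. A host-side mirror of the device function (an illustrative test helper, not part of this commit):

// iou_normal_host mirrors the __device__ iou_normal for CPU testing.
#include <algorithm>
#include <cstdio>

static float iou_normal_host(const float* a, const float* b) {
    const float eps = 1e-8f;
    float left   = std::max(a[0] - a[3] / 2, b[0] - b[3] / 2);
    float right  = std::min(a[0] + a[3] / 2, b[0] + b[3] / 2);
    float top    = std::max(a[1] - a[4] / 2, b[1] - b[4] / 2);
    float bottom = std::min(a[1] + a[4] / 2, b[1] + b[4] / 2);
    float inter  = std::max(right - left, 0.f) * std::max(bottom - top, 0.f);
    return inter / std::max(a[3] * a[4] + b[3] * b[4] - inter, eps);
}

int main() {
    float a[7] = {0, 0, 0, 2, 2, 2, 0};
    float b[7] = {1, 0, 0, 2, 2, 2, 0};
    std::printf("IoU = %f\n", iou_normal_host(a, b));  // prints ~0.333333
    return 0;
}
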
+
+
+__global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh,
+                           const float *boxes, unsigned long long *mask){
+    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_size) {
+        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+        const float *cur_box = boxes + cur_box_idx * 7;
+
+        int i = 0;
+        unsigned long long t = 0;
+        int start = 0;
+        if (row_start == col_start) {
+          start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
+                t |= 1ULL << i;
+            }
+        }
+        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+        mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+}
+
+
+NmsCuda::NmsCuda(const int num_threads, const int num_box_corners,
+    const float nms_overlap_threshold)
+: num_threads_(num_threads),
+  num_box_corners_(num_box_corners),
+  nms_overlap_threshold_(nms_overlap_threshold) {}
+
+void NmsCuda::DoNmsCuda(const int host_filter_count,
+    float *dev_sorted_box_for_nms, long *out_keep_inds,
+    int *out_num_to_keep) {
+
+    const int col_blocks = DIVUP(host_filter_count, num_threads_);
+    unsigned long long *dev_mask = NULL;
+    GPU_CHECK(cudaMalloc(&dev_mask, host_filter_count * col_blocks * sizeof(unsigned long long)));    
+
+    dim3 blocks(DIVUP(host_filter_count, num_threads_),
+                DIVUP(host_filter_count, num_threads_));
+    dim3 threads(num_threads_);
+
+    nms_kernel<<<blocks, threads>>>(host_filter_count, nms_overlap_threshold_, dev_sorted_box_for_nms, dev_mask);
+    // postprocess for nms output
+    std::vector<unsigned long long> host_mask(host_filter_count * col_blocks);
+    GPU_CHECK(cudaMemcpy(&host_mask[0], dev_mask,
+            sizeof(unsigned long long) * host_filter_count * col_blocks,
+            cudaMemcpyDeviceToHost));
+    std::vector<unsigned long long> remv(col_blocks);
+    memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+    for (int i = 0; i < host_filter_count; ++i) {
+        int nblock = i / num_threads_;
+        int inblock = i % num_threads_;
+
+        if (!(remv[nblock] & (1ULL << inblock))) {
+            out_keep_inds[(*out_num_to_keep)++] = i;
+            unsigned long long *p = &host_mask[0] + i * col_blocks;
+            for (int j = nblock; j < col_blocks; ++j) {
+                remv[j] |= p[j];
+            }
+        }
+    }
+    GPU_CHECK(cudaFree(dev_mask));
+}
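
DoNmsCuda implements the classic bitmask NMS: each kernel block compares one row tile of boxes against one column tile and sets bit i of mask[row][col_block] when the IoU exceeds the threshold; the host then walks the boxes in score order, keeping a box only if no earlier kept box has marked it. A minimal host-side usage sketch (illustrative only; assumes dev_boxes already holds `count` score-sorted boxes of 7 floats each on the GPU):

#include <vector>

void nms_example(float* dev_boxes, int count) {
    // 64 threads matches the kNumThreads value set in InitParams elsewhere in this commit.
    NmsCuda nms(/*num_threads=*/64, /*num_box_corners=*/8,
                /*nms_overlap_threshold=*/0.5f);
    std::vector<long> keep_inds(count, 0);
    int num_to_keep = 0;
    nms.DoNmsCuda(count, dev_boxes, keep_inds.data(), &num_to_keep);
    // keep_inds[0 .. num_to_keep-1] now indexes the surviving boxes.
}
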

+ 64 - 0
src/api/perception_lidar_PointPillars_MultiHead/nms.h

@@ -0,0 +1,64 @@
+
+/*
+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+class NmsCuda {
+ private:
+  const int num_threads_;
+  const int num_box_corners_;
+  const float nms_overlap_threshold_;
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads when launching cuda kernel
+   * @param[in] num_box_corners Number of corners for 2D box
+   * @param[in] nms_overlap_threshold IOU threshold for NMS
+   * @details Capital variables never change after compilation; non-capital
+   * variables can be changed through rosparam
+   */
+  NmsCuda(const int num_threads, const int num_box_corners,
+          const float nms_overlap_threshold);
+
+  /**
+   * @brief GPU Non-Maximum Suppression for network output
+   * @param[in] host_filter_count Number of filtered outputs
+   * @param[in] dev_sorted_box_for_nms Bounding box output sorted by score
+   * @param[out] out_keep_inds Indexes of the selected bounding boxes
+   * @param[out] out_num_to_keep Number of kept bounding boxes
+   * @details NMS runs on the GPU; box selection happens on the CPU afterwards
+   */
+  void DoNmsCuda(const int host_filter_count, float* dev_sorted_box_for_nms,
+                 long* out_keep_inds, int* out_num_to_keep);
+};

+ 51 - 0
src/api/perception_lidar_PointPillars_MultiHead/object.proto

@@ -0,0 +1,51 @@
+syntax = "proto2";
+
+package iv.lidar;
+
+message PointXYZI {
+  optional float x = 1 [default = nan];
+  optional float y = 2 [default = nan];
+  optional float z = 3 [default = nan];
+  optional float i = 4 [default = 0];
+};
+
+message PointXYZ {
+  optional float x = 1 [default = nan];
+  optional float y = 2 [default = nan];
+  optional float z = 3 [default = nan];
+};
+
+message Dimension {
+  optional float x = 1 [default = 0];
+  optional float y = 2 [default = 0];
+  optional float z = 3 [default = 0];
+};
+
+message VelXY {
+  optional float x = 1 [default = 0];
+  optional float y = 2 [default = 0];
+};
+
+message lidarobject
+{
+  optional PointXYZ min_point = 1;
+  optional PointXYZ max_point = 2;
+  optional PointXYZ centroid = 3;
+  optional Dimension dimensions = 4;
+  optional PointXYZ position = 5;
+  optional double angle = 6;
+  optional double velocity_linear_x = 7;
+  optional double acceleration_linear_y = 8;
+  optional double timestamp = 9;
+  optional double tyaw = 10;
+  required int32 mnType = 11;
+  optional int32 id = 12;
+  optional int32 behavior_state = 13;
+  optional float score = 14;
+  optional bool velocity_reliable = 15;
+  optional bool pose_reliable = 16;
+  repeated float type_probs = 17;
+  repeated PointXYZI cloud = 18;
+  optional string type_name = 19;
+  optional VelXY vel_relative = 20; // relative velocity
+};

+ 11 - 0
src/api/perception_lidar_PointPillars_MultiHead/objectarray.proto

@@ -0,0 +1,11 @@
+syntax = "proto2";
+
+package iv.lidar;
+
+import "object.proto";
+
+message objectarray
+{
+  repeated lidarobject obj = 1;
+  optional double timestamp = 2;
+};
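
A sketch of how the protoc-generated C++ API for these messages can be used (accessor names follow protoc's lower-casing of field names, e.g. mnType becomes set_mntype; the helper function is illustrative, not part of this commit):

#include "objectarray.pb.h"
#include <string>

std::string serialize_one_detection() {
    iv::lidar::objectarray arr;
    arr.set_timestamp(1234.5);
    iv::lidar::lidarobject* obj = arr.add_obj();
    obj->set_mntype(0);              // the only required field
    obj->set_score(0.91f);
    obj->mutable_centroid()->set_x(12.3f);
    obj->mutable_centroid()->set_y(-4.5f);
    obj->mutable_dimensions()->set_x(4.2f);
    std::string out;
    arr.SerializeToString(&out);     // wire format, ready for sharing
    return out;
}
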

+ 182 - 0
src/api/perception_lidar_PointPillars_MultiHead/perception_lidar_PointPillars_MultiHead.pro

@@ -0,0 +1,182 @@
+QT -= gui
+
+CONFIG += c++14 console
+CONFIG -= app_bundle
+
+QMAKE_CXXFLAGS += -std=gnu++17
+QMAKE_LFLAGS += -no-pie  -Wl,--no-as-needed
+
+
+system("protoc *.proto -I=./ --cpp_out=./")
+# The following define makes your compiler emit warnings if you use
+# any feature of Qt which as been marked deprecated (the exact warnings
+# depend on your compiler). Please consult the documentation of the
+# deprecated API in order to know how to port your code away from it.
+DEFINES += QT_DEPRECATED_WARNINGS
+#DEFINES += DEBUG_SHOW
+# You can also make your code fail to compile if you use deprecated APIs.
+# In order to do so, uncomment the following line.
+# You can also select to disable deprecated APIs only up to a certain version of Qt.
+#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
+
+SOURCES += main.cpp \
+    pointpillars.cc \
+    object.pb.cc \
+    objectarray.pb.cc \
+    Tracker/Ctracker.cpp \
+    Tracker/HungarianAlg.cpp \
+    Tracker/Kalman.cpp \
+    Tracker/track.cpp
+
+DISTFILES += \
+    nms.cu \
+    postprocess.cu \
+    preprocess.cu \
+    scatter.cu
+
+HEADERS += \
+    common.h \
+    nms.h \
+    pointpillars.h \
+    postprocess.h \
+    preprocess.h \
+    scatter.h \
+    object.pb.h \
+    objectarray.pb.h \
+    Tracker/Ctracker.h \
+    Tracker/defines.h \
+    Tracker/HungarianAlg.h \
+    Tracker/Kalman.h \
+    Tracker/ShortPathCalculator.h \
+    Tracker/track.h \
+    Tracker/Tracking.hpp
+
+INCLUDEPATH+=Tracker
+
+CUDA_SOURCES +=  \
+    nms.cu \
+    postprocess.cu \
+    preprocess.cu \
+    scatter.cu
+
+CUDA_SDK = "/usr/local/cuda/"   # cudaSDK路径
+
+CUDA_DIR = "/usr/local/cuda/"            # CUDA tookit路径
+
+SYSTEM_NAME = linux         # 自己系统环境 'Win32', 'x64', or 'Win64'
+
+SYSTEM_TYPE = 64           #操作系统位数 '32' or '64',
+
+CUDA_ARCH = sm_72         # cuda架构, for example 'compute_10', 'compute_11', 'sm_10'
+
+NVCC_OPTIONS = --use_fast_math --compiler-options "-fPIC"
+
+
+# include paths
+
+INCLUDEPATH += $$CUDA_DIR/include
+#INCLUDEPATH += /usr/local/cuda-10.0/targets/aarch64-linux/include/crt
+
+# library directories
+
+QMAKE_LIBDIR += $$CUDA_DIR/lib/
+
+CUDA_OBJECTS_DIR = ./
+
+# The following library conflicts with something in Cuda
+
+#QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib
+
+#QMAKE_LFLAGS_DEBUG   = /NODEFAULTLIB:msvcrtd.lib
+
+# Add the necessary libraries
+
+CUDA_LIBS =  cudart cufft
+
+# The following makes sure all path names (which often include spaces) are put between quotation marks
+
+CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
+
+NVCC_LIBS = $$join(CUDA_LIBS,' -l','-l', '')
+
+#LIBS += $$join(CUDA_LIBS,'.so ', '', '.so')
+
+# Configuration of the Cuda compiler
+
+CONFIG(debug, debug|release) {
+
+    # Debug mode
+
+    cuda_d.input = CUDA_SOURCES
+
+    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
+
+    cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+
+    cuda_d.dependency_type = TYPE_C
+
+    QMAKE_EXTRA_COMPILERS += cuda_d
+
+}
+
+else {
+
+    # Release mode
+
+    cuda.input = CUDA_SOURCES
+
+    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
+
+    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -O3 -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+
+    cuda.dependency_type = TYPE_C
+
+    QMAKE_EXTRA_COMPILERS += cuda
+
+}
+
+
+LIBS += -L/usr/local/cuda-10.2/targets/aarch64-linux/lib
+LIBS += -L/usr/local/cuda-11.4/targets/aarch64-linux/lib
+
+LIBS += -lcudart -lcufft -lyaml-cpp
+
+#LIBS += -L/home/adc/soft/cudnn-10.2-linux-x64-v7.6.5.32/cuda/lib64 -lcudnn
+
+LIBS +=  -lnvinfer -lnvonnxparser -lnvcaffe_parser  # -lmyelin
+
+#LIBS += -L/home/nvidia/git/libtorch_gpu-1.6.0-linux-aarch64/lib -ltorch_cuda  -ltorch -lc10 -ltorch_cpu
+
+unix:INCLUDEPATH += /usr/include/eigen3
+unix:INCLUDEPATH += /usr/include/pcl-1.7
+unix:INCLUDEPATH += /usr/include/pcl-1.8
+unix:INCLUDEPATH += /usr/include/pcl-1.10
+unix:INCLUDEPATH += /usr/include/pcl-1.12
+unix:INCLUDEPATH += /usr/include/pcl-1.14
+
+
+LIBS += -lboost_system -lprotobuf
+
+LIBS += -L$$PWD -lmodulecomm -lxmlparam
+
+unix:LIBS +=  -lpcl_common\
+        -lpcl_features\
+        -lpcl_filters\
+        -lpcl_io\
+        -lpcl_io_ply\
+        -lpcl_kdtree\
+        -lpcl_keypoints\
+        -lpcl_octree\
+        -lpcl_outofcore\
+        -lpcl_people\
+        -lpcl_recognition\
+        -lpcl_registration\
+        -lpcl_sample_consensus\
+        -lpcl_search\
+        -lpcl_segmentation\
+        -lpcl_surface\
+        -lpcl_tracking\
+        -lpcl_visualization
+
+INCLUDEPATH += /usr/include/opencv4/
+LIBS += /usr/lib/aarch64-linux-gnu/libopencv*.so

+ 507 - 0
src/api/perception_lidar_PointPillars_MultiHead/pointpillars.cc

@@ -0,0 +1,507 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+
+#include "pointpillars.h"
+
+#include <chrono>
+#include <iostream>
+
+PointPillars::PointPillars(const float score_threshold,
+                           const float nms_overlap_threshold,
+                           const bool use_onnx,
+                           const std::string pfe_file,
+                           const std::string backbone_file,
+                           const std::string pp_config)
+    : score_threshold_(score_threshold),
+      nms_overlap_threshold_(nms_overlap_threshold),
+      use_onnx_(use_onnx),
+      pfe_file_(pfe_file),
+      backbone_file_(backbone_file),
+      pp_config_(pp_config)
+{
+    InitParams();
+    InitTRT(use_onnx_);
+    DeviceMemoryMalloc();
+
+    preprocess_points_cuda_ptr_.reset(new PreprocessPointsCuda(
+        kNumThreads,
+        kMaxNumPillars,
+        kMaxNumPointsPerPillar,
+        kNumPointFeature,
+        kNumIndsForScan,
+        kGridXSize,kGridYSize, kGridZSize,
+        kPillarXSize,kPillarYSize, kPillarZSize,
+        kMinXRange, kMinYRange, kMinZRange));
+
+    scatter_cuda_ptr_.reset(new ScatterCuda(kNumThreads, kGridXSize, kGridYSize));
+
+    const float float_min = std::numeric_limits<float>::lowest();
+    const float float_max = std::numeric_limits<float>::max();
+    postprocess_cuda_ptr_.reset(
+      new PostprocessCuda(kNumThreads,
+                          float_min, float_max, 
+                          kNumClass,kNumAnchorPerCls,
+                          kMultiheadLabelMapping,
+                          score_threshold_, 
+                          nms_overlap_threshold_,
+                          kNmsPreMaxsize, 
+                          kNmsPostMaxsize,
+                          kNumBoxCorners, 
+                          kNumInputBoxFeature,
+                          7));  /*kNumOutputBoxFeature*/
+    
+}
+
+PointPillars::~PointPillars() {
+    // for pillars 
+    GPU_CHECK(cudaFree(dev_num_points_per_pillar_));
+    GPU_CHECK(cudaFree(dev_x_coors_));
+    GPU_CHECK(cudaFree(dev_y_coors_));
+    GPU_CHECK(cudaFree(dev_pillar_point_feature_));
+    GPU_CHECK(cudaFree(dev_pillar_coors_));
+    // for sparse map
+    GPU_CHECK(cudaFree(dev_sparse_pillar_map_));    
+    GPU_CHECK(cudaFree(dev_cumsum_along_x_));
+    GPU_CHECK(cudaFree(dev_cumsum_along_y_));
+    // for pfe forward
+    GPU_CHECK(cudaFree(dev_pfe_gather_feature_));
+      
+    GPU_CHECK(cudaFree(pfe_buffers_[0]));
+    GPU_CHECK(cudaFree(pfe_buffers_[1]));
+
+    GPU_CHECK(cudaFree(rpn_buffers_[0]));
+    GPU_CHECK(cudaFree(rpn_buffers_[1]));
+    GPU_CHECK(cudaFree(rpn_buffers_[2]));
+    GPU_CHECK(cudaFree(rpn_buffers_[3]));
+    GPU_CHECK(cudaFree(rpn_buffers_[4]));
+    GPU_CHECK(cudaFree(rpn_buffers_[5]));
+    GPU_CHECK(cudaFree(rpn_buffers_[6]));
+    GPU_CHECK(cudaFree(rpn_buffers_[7]));
+    pfe_context_->destroy();
+    backbone_context_->destroy();
+    pfe_engine_->destroy();
+    backbone_engine_->destroy();
+    // for post process
+    GPU_CHECK(cudaFree(dev_scattered_feature_));
+    GPU_CHECK(cudaFree(dev_filtered_box_));
+    GPU_CHECK(cudaFree(dev_filtered_score_));
+    GPU_CHECK(cudaFree(dev_filtered_label_));
+    GPU_CHECK(cudaFree(dev_filtered_dir_));
+    GPU_CHECK(cudaFree(dev_box_for_nms_));
+    GPU_CHECK(cudaFree(dev_filter_count_));
+
+
+}
+
+void PointPillars::InitParams()
+{
+    YAML::Node params = YAML::LoadFile(pp_config_);
+    kPillarXSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][0].as<float>();
+    kPillarYSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][1].as<float>();
+    kPillarZSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][2].as<float>();
+    kMinXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][0].as<float>();
+    kMinYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][1].as<float>();
+    kMinZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][2].as<float>();
+    kMaxXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][3].as<float>();
+    kMaxYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][4].as<float>();
+    kMaxZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][5].as<float>();
+    kNumClass = params["CLASS_NAMES"].size();
+    kMaxNumPillars = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_NUMBER_OF_VOXELS"]["test"].as<int>();
+    kMaxNumPointsPerPillar = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_POINTS_PER_VOXEL"].as<int>();
+    kNumPointFeature = 5; // [x, y, z, i, 0]
+    kNumInputBoxFeature = 7;
+    kNumOutputBoxFeature = params["MODEL"]["DENSE_HEAD"]["TARGET_ASSIGNER_CONFIG"]["BOX_CODER_CONFIG"]["code_size"].as<int>();
+    kBatchSize = 1;
+    kNumIndsForScan = 1024;
+    kNumThreads = 64;
+    kNumBoxCorners = 8;
+    kAnchorStrides = 4;
+    kNmsPreMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_PRE_MAXSIZE"].as<int>();
+    kNmsPostMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_POST_MAXSIZE"].as<int>();
+    //params for initialize anchors
+    //Adapt to OpenPCDet
+    kAnchorNames = params["CLASS_NAMES"].as<std::vector<std::string>>();
+    for (int i = 0; i < kAnchorNames.size(); ++i)
+    {
+        kAnchorDxSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][0].as<float>());
+        kAnchorDySizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][1].as<float>());
+        kAnchorDzSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][2].as<float>());
+        kAnchorBottom.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_bottom_heights"][0].as<float>());
+    }
+    for (int idx_head = 0; idx_head < params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"].size(); ++idx_head)
+    {
+        int num_cls_per_head = params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"][idx_head]["HEAD_CLS_NAME"].size();
+        std::vector<int> value;
+        for (int i = 0; i < num_cls_per_head; ++i)
+        {
+            value.emplace_back(idx_head + i);
+        }
+        kMultiheadLabelMapping.emplace_back(value);
+    }
+
+    // Generate secondary parameters based on above.
+    kGridXSize = static_cast<int>((kMaxXRange - kMinXRange) / kPillarXSize); //512
+    kGridYSize = static_cast<int>((kMaxYRange - kMinYRange) / kPillarYSize); //512
+    kGridZSize = static_cast<int>((kMaxZRange - kMinZRange) / kPillarZSize); //1
+    kRpnInputSize = 64 * kGridYSize * kGridXSize;
+
+    kNumAnchorXinds = static_cast<int>(kGridXSize / kAnchorStrides); //Width
+    kNumAnchorYinds = static_cast<int>(kGridYSize / kAnchorStrides); //Height
+    kNumAnchor = kNumAnchorXinds * kNumAnchorYinds * 2 * kNumClass;  // H * W * Ro * N = 196608
+
+    kNumAnchorPerCls = kNumAnchorXinds * kNumAnchorYinds * 2; //H * W * Ro = 32768
+    kRpnBoxOutputSize = kNumAnchor * kNumOutputBoxFeature;
+    kRpnClsOutputSize = kNumAnchor * kNumClass;
+    kRpnDirOutputSize = kNumAnchor * 2;
+}
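
To make the derived sizes concrete (assuming the nuScenes-style values the inline comments imply: x/y range [-51.2, 51.2] m and 0.2 m pillars):

// Worked check of InitParams' derived quantities under assumed config values.
#include <cassert>

int main() {
    const float min_r = -51.2f, max_r = 51.2f, pillar = 0.2f;
    const int grid = static_cast<int>((max_r - min_r) / pillar);  // 512
    const int stride = 4;
    const int per_axis = grid / stride;                           // 128
    const int anchors_per_cls = per_axis * per_axis * 2;          // 2 rotations
    assert(grid == 512);
    assert(anchors_per_cls == 32768);   // matches the //32768 comment above
    return 0;
}
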
+
+void PointPillars::DeviceMemoryMalloc() {
+    // for pillars 
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_num_points_per_pillar_), kMaxNumPillars * sizeof(float))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_x_coors_), kMaxNumPillars * sizeof(int))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_y_coors_), kMaxNumPillars * sizeof(int))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_), kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float))); // [M , m , 4]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_coors_),  kMaxNumPillars * 4 * sizeof(float))); // [M , 4]
+    // for sparse map
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_sparse_pillar_map_), kNumIndsForScan * kNumIndsForScan * sizeof(int))); // [1024 , 1024]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_x_), kNumIndsForScan * kNumIndsForScan * sizeof(int))); // [1024 , 1024]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_y_), kNumIndsForScan * kNumIndsForScan * sizeof(int)));// [1024 , 1024]
+
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pfe_gather_feature_),
+                        kMaxNumPillars * kMaxNumPointsPerPillar *
+                            kNumGatherPointFeature * sizeof(float)));
+    // for trt inference
+    // create GPU buffers and a stream
+
+    GPU_CHECK(
+        cudaMalloc(&pfe_buffers_[0], kMaxNumPillars * kMaxNumPointsPerPillar *
+                                        kNumGatherPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&pfe_buffers_[1], kMaxNumPillars * 64 * sizeof(float)));
+
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[0],  kRpnInputSize * sizeof(float)));
+
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[1],  kNumAnchorPerCls  * sizeof(float)));  //classes
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[2],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[3],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[4],  kNumAnchorPerCls  * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[5],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[6],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[7],  kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature * sizeof(float))); //boxes
+
+    // for scatter kernel
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_scattered_feature_),
+                        kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
+    // for filter
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_box_),
+                        kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_score_),
+                        kNumAnchor * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_label_),
+                        kNumAnchor * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_dir_),
+                        kNumAnchor * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_box_for_nms_),
+                        kNumAnchor * kNumBoxCorners * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filter_count_), kNumClass * sizeof(int)));
+
+}
+
+void PointPillars::SetDeviceMemoryToZero() {
+
+    GPU_CHECK(cudaMemset(dev_num_points_per_pillar_, 0, kMaxNumPillars * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_x_coors_,               0, kMaxNumPillars * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_y_coors_,               0, kMaxNumPillars * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pillar_point_feature_,  0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_pillar_coors_,          0, kMaxNumPillars * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(dev_sparse_pillar_map_,     0, kNumIndsForScan * kNumIndsForScan * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pfe_gather_feature_,    0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
+    
+    // GPU_CHECK(cudaMemset(pfe_buffers_[0],       0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(pfe_buffers_[1],       0, kMaxNumPillars * 64 * sizeof(float)));
+
+    GPU_CHECK(cudaMemset(dev_scattered_feature_,    0, kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
+    
+    // GPU_CHECK(cudaMemset(rpn_buffers_[0],    0, kRpnInputSize * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[1],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[2],    0, kNumAnchorPerCls     * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[3],    0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[4],    0, kNumAnchorPerCls * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[5],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[6],    0, kNumAnchorPerCls     * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[7],    0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[8],    0, kNumAnchorPerCls * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[9],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[10],   0, kNumAnchorPerCls     * sizeof(float)));
+
+    GPU_CHECK(cudaMemset(dev_filtered_box_,     0, kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_filtered_score_,   0, kNumAnchor * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_filter_count_,     0, kNumClass * sizeof(int)));
+}
+
+void PointPillars::InitTRT(const bool use_onnx) {
+  if (use_onnx_) {
+    // create a TensorRT model from the onnx model and load it into an engine
+    OnnxToTRTModel(pfe_file_, &pfe_engine_);
+    // rfind: take the last '.', so dots in directory names don't truncate the path
+    SaveEngine(pfe_engine_, pfe_file_.substr(0, pfe_file_.rfind('.')) + ".trt");
+    OnnxToTRTModel(backbone_file_, &backbone_engine_);
+    SaveEngine(backbone_engine_, backbone_file_.substr(0, backbone_file_.rfind('.')) + ".trt");
+  }else {
+    EngineToTRTModel(pfe_file_, &pfe_engine_);
+    EngineToTRTModel(backbone_file_, &backbone_engine_);
+  }
+    if (pfe_engine_ == nullptr || backbone_engine_ == nullptr) {
+        std::cerr << "Failed to load ONNX file.";
+    }
+
+    // create execution context from the engine
+    pfe_context_ = pfe_engine_->createExecutionContext();
+    backbone_context_ = backbone_engine_->createExecutionContext();
+    if (pfe_context_ == nullptr || backbone_context_ == nullptr) {
+        std::cerr << "Failed to create TensorRT Execution Context.";
+    }
+  
+}
+
+void PointPillars::OnnxToTRTModel(
+    const std::string& model_file,  // name of the onnx model
+    nvinfer1::ICudaEngine** engine_ptr) {
+    int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
+
+    // create the builder
+    const auto explicit_batch =
+        static_cast<uint32_t>(kBatchSize) << static_cast<uint32_t>(
+            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(g_logger_);
+    nvinfer1::INetworkDefinition* network =
+        builder->createNetworkV2(explicit_batch);
+
+    // parse onnx model
+    auto parser = nvonnxparser::createParser(*network, g_logger_);
+    if (!parser->parseFromFile(model_file.c_str(), verbosity)) {
+        std::string msg("failed to parse onnx file");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    // Build the engine
+    builder->setMaxBatchSize(kBatchSize);
+ //   builder->setHalf2Mode(true);
+    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
+    config->setMaxWorkspaceSize(1 << 25);
+    nvinfer1::ICudaEngine* engine =
+        builder->buildEngineWithConfig(*network, *config);
+
+    *engine_ptr = engine;
+    parser->destroy();
+    network->destroy();
+    config->destroy();
+    builder->destroy();
+}
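
The commented-out setHalf2Mode call above is the legacy FP16 switch; on TensorRT 7+ the same effect comes from a builder-config flag. A hypothetical variant of the engine-build step (an assumption, not part of this commit):

#include "NvInfer.h"

nvinfer1::ICudaEngine* build_engine_fp16(nvinfer1::IBuilder* builder,
                                         nvinfer1::INetworkDefinition* network) {
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(1 << 25);
    if (builder->platformHasFastFp16()) {
        config->setFlag(nvinfer1::BuilderFlag::kFP16);  // permit FP16 kernels
    }
    nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    config->destroy();
    return engine;
}
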
+
+void PointPillars::SaveEngine(const nvinfer1::ICudaEngine* engine, const std::string& engine_filepath)
+{
+    // serialize the engine, then close everything down
+    nvinfer1::IHostMemory& trtModelStream = *(engine->serialize());
+    std::ofstream file;
+    file.open(engine_filepath, std::ios::binary | std::ios::out);
+    if(!file.is_open())
+    {
+        std::cout << "read create engine file" << engine_filepath <<" failed" << std::endl;
+        return;
+    }
+    file.write((const char*)trtModelStream.data(), std::streamsize(trtModelStream.size()));
+    file.close();
+}
+
+void PointPillars::EngineToTRTModel(
+    const std::string &engine_file ,     
+    nvinfer1::ICudaEngine** engine_ptr)  {
+    int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
+    std::stringstream gieModelStream; 
+    gieModelStream.seekg(0, gieModelStream.beg); 
+
+    std::ifstream cache(engine_file, std::ios::binary); 
+    gieModelStream << cache.rdbuf();
+    cache.close(); 
+    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(g_logger_); 
+
+    if (runtime == nullptr) {
+        std::string msg("failed to build runtime parser");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+    gieModelStream.seekg(0, std::ios::end);
+    const int modelSize = gieModelStream.tellg(); 
+
+    gieModelStream.seekg(0, std::ios::beg);
+    void* modelMem = malloc(modelSize); 
+    gieModelStream.read((char*)modelMem, modelSize);
+
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << "------------------------------------------------------------------"<< std::endl;
+    std::cout << ">>>>                                                          >>>>"<< std::endl;
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << "Input filename:   " << engine_file << std::endl;
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << ">>>>                                                          >>>>"<< std::endl;
+    std::cout << "------------------------------------------------------------------"<< std::endl;
+    std::cout << "                                                                  "<< std::endl;
+
+    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, NULL); 
+    if (engine == nullptr) {
+        std::string msg("failed to build engine parser");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+    free(modelMem);  // deserializeCudaEngine copies the data, so the buffer can be released
+    *engine_ptr = engine;
+
+}
+
+void PointPillars::DoInference(const float* in_points_array,
+                                const int in_num_points,
+                                std::vector<float>* out_detections,
+                                std::vector<int>* out_labels,
+                                std::vector<float>* out_scores) 
+{
+    SetDeviceMemoryToZero();
+    cudaDeviceSynchronize();
+    // [STEP 1] : load pointcloud
+    float* dev_points;
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points),
+                        in_num_points * kNumPointFeature * sizeof(float))); // in_num_points , 5
+    GPU_CHECK(cudaMemset(dev_points, 0, in_num_points * kNumPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMemcpy(dev_points, in_points_array,
+                        in_num_points * kNumPointFeature * sizeof(float),
+                        cudaMemcpyHostToDevice));
+    
+    // [STEP 2] : preprocess
+    host_pillar_count_[0] = 0;
+    auto preprocess_start = std::chrono::high_resolution_clock::now();
+    preprocess_points_cuda_ptr_->DoPreprocessPointsCuda(
+          dev_points, in_num_points, dev_x_coors_, dev_y_coors_,
+          dev_num_points_per_pillar_, dev_pillar_point_feature_, dev_pillar_coors_,
+          dev_sparse_pillar_map_, host_pillar_count_ ,
+          dev_pfe_gather_feature_ );
+    cudaDeviceSynchronize();
+    auto preprocess_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(dev_pfe_gather_feature_,  kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature  , "0_Model_pfe_input_gather_feature");
+
+    // [STEP 3] : pfe forward
+    cudaStream_t stream;
+    GPU_CHECK(cudaStreamCreate(&stream));
+    auto pfe_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[0], dev_pfe_gather_feature_,
+                            kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float), ///kNumGatherPointFeature
+                            cudaMemcpyDeviceToDevice, stream));
+    pfe_context_->enqueueV2(pfe_buffers_, stream, nullptr);
+    cudaDeviceSynchronize();
+    auto pfe_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(pfe_buffers_[1]),  kMaxNumPillars * 64 , "1_Model_pfe_output_buffers_[1]");
+
+    // [STEP 4] : scatter pillar feature
+    auto scatter_start = std::chrono::high_resolution_clock::now();
+    scatter_cuda_ptr_->DoScatterCuda(
+        host_pillar_count_[0], dev_x_coors_, dev_y_coors_,
+        reinterpret_cast<float*>(pfe_buffers_[1]), dev_scattered_feature_);
+    cudaDeviceSynchronize();
+    auto scatter_end = std::chrono::high_resolution_clock::now();   
+    // DEVICE_SAVE<float>(dev_scattered_feature_ ,  kRpnInputSize,"2_Model_backbone_input_dev_scattered_feature");
+
+    // [STEP 5] : backbone forward
+    auto backbone_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemcpyAsync(rpn_buffers_[0], dev_scattered_feature_,
+                            kBatchSize * kRpnInputSize * sizeof(float),
+                            cudaMemcpyDeviceToDevice, stream));
+    backbone_context_->enqueueV2(rpn_buffers_, stream, nullptr);
+    cudaDeviceSynchronize();
+    auto backbone_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[1]) ,  kNumAnchorPerCls    ,"3_rpn_buffers_[1]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[2]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[2]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[3]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[3]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[4]) ,  kNumAnchorPerCls    ,"3_rpn_buffers_[4]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[5]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[5]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[6]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[6]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[7]) ,  kNumAnchorPerCls * kNumClass * 9 ,"3_rpn_buffers_[7]");
+
+    // [STEP 6]: postprocess (multihead)
+    auto postprocess_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemset(dev_filter_count_, 0, kNumClass * sizeof(int)));
+    postprocess_cuda_ptr_->DoPostprocessCuda(
+        reinterpret_cast<float*>(rpn_buffers_[1]), // [cls]   kNumAnchorPerCls 
+        reinterpret_cast<float*>(rpn_buffers_[2]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[3]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[4]), // [cls]   kNumAnchorPerCls 
+        reinterpret_cast<float*>(rpn_buffers_[5]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[6]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[7]), // [boxes] kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature
+        dev_filtered_box_, dev_filtered_score_, dev_filter_count_,
+        *out_detections, *out_labels , *out_scores);
+    cudaDeviceSynchronize();
+    auto postprocess_end = std::chrono::high_resolution_clock::now();
+
+    // release the stream and the buffers
+//    std::chrono::duration<double> preprocess_cost = preprocess_end - preprocess_start;
+//    std::chrono::duration<double> pfe_cost = pfe_end - pfe_start;
+//    std::chrono::duration<double> scatter_cost = scatter_end - scatter_start;
+//    std::chrono::duration<double> backbone_cost = backbone_end - backbone_start;
+//    std::chrono::duration<double> postprocess_cost = postprocess_end - postprocess_start;
+
+//    std::chrono::duration<double> pointpillars_cost = postprocess_end - preprocess_start;
+//    std::cout << "------------------------------------" << std::endl;
+//    std::cout << setiosflags(ios::left)  << setw(14) << "Module" << setw(12)  << "Time"  << resetiosflags(ios::left) << std::endl;
+//    std::cout << "------------------------------------" << std::endl;
+//    std::string Modules[] = {"Preprocess" , "Pfe" , "Scatter" , "Backbone" , "Postprocess" , "Summary"};
+//    double Times[] = {preprocess_cost.count() , pfe_cost.count() , scatter_cost.count() , backbone_cost.count() , postprocess_cost.count() , pointpillars_cost.count()};
+
+//    for (int i =0 ; i < 6 ; ++i) {
+//        std::cout << setiosflags(ios::left) << setw(14) << Modules[i]  << setw(8)  << Times[i] * 1000 << " ms" << resetiosflags(ios::left) << std::endl;
+//    }
+//    std::cout << "------------------------------------" << std::endl;
+    cudaStreamDestroy(stream);
+
+}
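
For orientation, a minimal standalone driver for the class (main.cpp in this commit does the real wiring; the file paths below are placeholders):

#include "pointpillars.h"
#include <vector>

int main() {
    PointPillars pp(/*score_threshold=*/0.4f,
                    /*nms_overlap_threshold=*/0.5f,
                    /*use_onnx=*/false,
                    "pfe.trt", "backbone.trt",
                    "cbgs_pp_multihead.yaml");
    // One point per row: [x, y, z, intensity, 0] (kNumPointFeature == 5).
    std::vector<float> points = {10.f, 0.f, -1.f, 0.3f, 0.f};
    std::vector<float> detections;   // 7 floats per box
    std::vector<int>   labels;
    std::vector<float> scores;
    pp.DoInference(points.data(), static_cast<int>(points.size() / 5),
                   &detections, &labels, &scores);
    return 0;
}
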

+ 287 - 0
src/api/perception_lidar_PointPillars_MultiHead/pointpillars.h

@@ -0,0 +1,287 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+// headers in STL
+#include <algorithm>
+#include <cmath>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+// headers in TensorRT
+#include "NvInfer.h"
+#include "NvOnnxParser.h"
+
+// headers in local files
+// #include "params.h"
+#include "common.h"
+#include <yaml-cpp/yaml.h>
+#include "preprocess.h"
+#include "scatter.h"
+#include "postprocess.h"
+
+using namespace std;
+
+// Logger for TensorRT info/warning/errors
+class Logger : public nvinfer1::ILogger {
+ public:
+  explicit Logger(Severity severity = Severity::kWARNING)
+      : reportable_severity(severity) {}
+
+  void log(Severity severity, const char* msg)   noexcept {
+    // suppress messages with severity enum value greater than the reportable
+    if (severity > reportable_severity) return;
+
+    switch (severity) {
+      case Severity::kINTERNAL_ERROR:
+        std::cerr << "INTERNAL_ERROR: ";
+        break;
+      case Severity::kERROR:
+        std::cerr << "ERROR: ";
+        break;
+      case Severity::kWARNING:
+        std::cerr << "WARNING: ";
+        break;
+      case Severity::kINFO:
+        std::cerr << "INFO: ";
+        break;
+      default:
+        std::cerr << "UNKNOWN: ";
+        break;
+    }
+    std::cerr << msg << std::endl;
+  }
+
+  Severity reportable_severity;
+};
+
+
+
+class PointPillars {
+ private:
+    // initialize in initializer list
+    const float score_threshold_;
+    const float nms_overlap_threshold_;
+    const bool use_onnx_;
+    const std::string pfe_file_;
+    const std::string backbone_file_;
+    const std::string pp_config_;
+    // end initializer list
+    // voxel size
+    float kPillarXSize;
+    float kPillarYSize;
+    float kPillarZSize;
+    // point cloud range
+    float kMinXRange;
+    float kMinYRange;
+    float kMinZRange;
+    float kMaxXRange;
+    float kMaxYRange;
+    float kMaxZRange;
+    // hyper parameters
+    int kNumClass;
+    int kMaxNumPillars;
+    int kMaxNumPointsPerPillar;
+    int kNumPointFeature;
+    int kNumGatherPointFeature = 11;
+    int kGridXSize;
+    int kGridYSize;
+    int kGridZSize;
+    int kNumAnchorXinds;
+    int kNumAnchorYinds;
+    int kRpnInputSize;
+    int kNumAnchor;
+    int kNumInputBoxFeature;
+    int kNumOutputBoxFeature;
+    int kRpnBoxOutputSize;
+    int kRpnClsOutputSize;
+    int kRpnDirOutputSize;
+    int kBatchSize;
+    int kNumIndsForScan;
+    int kNumThreads;
+    // if you change kNumThreads, need to modify NUM_THREADS_MACRO in
+    // common.h
+    int kNumBoxCorners;
+    int kNmsPreMaxsize;
+    int kNmsPostMaxsize;
+    //params for initialize anchors
+    //Adapt to OpenPCDet
+    int kAnchorStrides;
+    std::vector<string> kAnchorNames;
+    std::vector<float> kAnchorDxSizes;
+    std::vector<float> kAnchorDySizes;
+    std::vector<float> kAnchorDzSizes;
+    std::vector<float> kAnchorBottom;
+    std::vector<std::vector<int>> kMultiheadLabelMapping;
+    int kNumAnchorPerCls;
+    int host_pillar_count_[1];
+
+    int* dev_x_coors_;
+    int* dev_y_coors_;
+    float* dev_num_points_per_pillar_;
+    int* dev_sparse_pillar_map_;
+    int* dev_cumsum_along_x_;
+    int* dev_cumsum_along_y_;
+
+    float* dev_pillar_point_feature_;
+    float* dev_pillar_coors_;
+    float* dev_points_mean_;
+
+    float* dev_pfe_gather_feature_;
+    void* pfe_buffers_[2];
+    //variable for doPostprocessCudaMultiHead
+    void* rpn_buffers_[8];
+    
+    std::vector<float*> rpn_box_output_; 
+    std::vector<float*> rpn_cls_output_;
+
+    float* dev_scattered_feature_;
+
+    float* dev_filtered_box_;
+    float* dev_filtered_score_;
+    int*   dev_filtered_label_;
+    int*   dev_filtered_dir_;
+    float* dev_box_for_nms_;
+    int*   dev_filter_count_;
+
+    std::unique_ptr<PreprocessPointsCuda> preprocess_points_cuda_ptr_;
+    std::unique_ptr<ScatterCuda> scatter_cuda_ptr_;
+    std::unique_ptr<PostprocessCuda> postprocess_cuda_ptr_;
+
+    Logger g_logger_;
+    nvinfer1::ICudaEngine* pfe_engine_;
+    nvinfer1::ICudaEngine* backbone_engine_;
+    nvinfer1::IExecutionContext* pfe_context_;
+    nvinfer1::IExecutionContext* backbone_context_;
+
+    /**
+     * @brief Memory allocation for device memory
+     * @details Called in the constructor
+     */
+    void DeviceMemoryMalloc();
+
+    /**
+     * @brief Memory set to 0 for device memory
+     * @details Called in the DoInference
+     */
+    void SetDeviceMemoryToZero();
+
+    /**
+     * @brief Initializing parameters from the pointpillars YAML config
+     * @details Called in the constructor
+     */
+    void InitParams();
+    /**
+     * @brief Initializing TensorRT instances
+     * @param[in] use_onnx if true, parse the ONNX file
+     * @details Called in the constructor
+     */
+    void InitTRT(const bool use_onnx);
+    void SaveEngine(const nvinfer1::ICudaEngine* engine, const std::string& engine_filepath);
+    /**
+     * @brief Convert ONNX to TensorRT model
+     * @param[in] model_file ONNX model file path
+     * @param[out] engine_ptr TensorRT model engine made out of ONNX model
+     * @details Load ONNX model, and convert it to TensorRT model
+     */
+    void OnnxToTRTModel(const std::string& model_file,
+                        nvinfer1::ICudaEngine** engine_ptr);
+
+    /**
+     * @brief Convert Engine to TensorRT model
+     * @param[in] model_file Engine(TensorRT) model file path
+     * @param[out] engine_ptr TensorRT model engine made 
+     * @details Load Engine model, and convert it to TensorRT model
+     */
+    void EngineToTRTModel(const std::string &engine_file ,     
+                        nvinfer1::ICudaEngine** engine_ptr) ;
+
+    /**
+     * @brief Preproces points
+     * @param[in] in_points_array Point cloud array
+     * @param[in] in_num_points Number of points
+     * @details Call CPU or GPU preprocess
+     */
+    void Preprocess(const float* in_points_array, const int in_num_points);
+
+    public:
+    /**
+     * @brief Constructor
+     * @param[in] score_threshold Score threshold for filtering output
+     * @param[in] nms_overlap_threshold IOU threshold for NMS
+     * @param[in] use_onnx if true, use the ONNX files; otherwise load serialized engine files
+     * @param[in] pfe_file Pillar Feature Extractor ONNX file path
+     * @param[in] rpn_file Region Proposal Network ONNX file path
+     * @details Variables could be changed through point_pillars_detection
+     */
+    PointPillars(const float score_threshold,
+                const float nms_overlap_threshold,
+                const bool use_onnx,
+                const std::string pfe_file,
+                const std::string rpn_file,
+                const std::string pp_config);
+    ~PointPillars();
+
+    /**
+     * @brief Call PointPillars for the inference
+     * @param[in] in_points_array Point cloud array
+     * @param[in] in_num_points Number of points
+     * @param[out] out_detections Network output bounding box
+     * @param[out] out_labels Network output object's label
+     * @details This is an interface for the algorithm
+     */
+    void DoInference(const float* in_points_array,
+                    const int in_num_points,
+                    std::vector<float>* out_detections,
+                    std::vector<int>* out_labels,
+                    std::vector<float>* out_scores);
+};
+

+ 383 - 0
src/api/perception_lidar_PointPillars_MultiHead/postprocess.cu

@@ -0,0 +1,383 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+// headers in CUDA
+#include <thrust/sort.h>
+
+// headers in local files
+#include "common.h"
+#include "postprocess.h"
+#include <stdio.h>
+
+
+// copy one decoded box (the first 7 of the 9 regressed values) into the filtered list
+__device__ void box_decode_warp(int head_offset, const float* box_pred, 
+    int tid , int num_anchors_per_head , int counter, float* filtered_box) 
+{
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 0] = box_pred[ head_offset + tid * 9 + 0];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 1] = box_pred[ head_offset + tid * 9 + 1];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 2] = box_pred[ head_offset + tid * 9 + 2];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 3] = box_pred[ head_offset + tid * 9 + 3];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 4] = box_pred[ head_offset + tid * 9 + 4];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 5] = box_pred[ head_offset + tid * 9 + 5];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 6] = box_pred[ head_offset + tid * 9 + 6];
+}
+
+
+__global__ void sigmoid_filter_kernel(
+
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+
+    const float* box_pred_0,
+
+    const float* box_pred_1,
+    const float* box_pred_2,
+
+    const float* box_pred_3,
+    const float* box_pred_4,
+
+    const float* box_pred_5,
+
+    const float* box_pred_6,
+    const float* box_pred_7,
+
+    const float* box_pred_8,
+    const float* box_pred_9,
+
+    float* filtered_box, 
+    float* filtered_score, 
+    int* filter_count,
+
+    const float score_threshold) {   
+
+    // two-class heads (e.g. cls_pred_34) are laid out as (32768*2, 2):
+    // 2 * num_anchors_per_head anchors, two class scores per anchor
+
+    int num_anchors_per_head = gridDim.x * gridDim.y * blockDim.x;
+    // 16 * 4 * 512 = 32768
+    extern __shared__ float cls_score[];
+    cls_score[threadIdx.x + blockDim.x] = -1.0f;
+
+    int tid = blockIdx.x * gridDim.y * blockDim.x + blockIdx.y *  blockDim.x + threadIdx.x; 
+
+
+    if ( blockIdx.z == 0) cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_0[ tid ]));
+    if ( blockIdx.z == 1) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_12[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_12[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 2) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_12[ tid * 2 + 1]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_12[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 3) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_34[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_34[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 4) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_34[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_34[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 5) cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_5[ tid ]));
+
+    if ( blockIdx.z == 6) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_67[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_67[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 7) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_67[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_67[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 8) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_89[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_89[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 9) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_89[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_89[ (num_anchors_per_head + tid) * 2 + 1]));}
+    
+    __syncthreads();
+    
+    if( cls_score[ threadIdx.x ] > score_threshold) 
+    {
+        int counter = atomicAdd(&filter_count[blockIdx.z], 1);
+        if ( blockIdx.z == 0) {
+            box_decode_warp(0 ,box_pred_0 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 1) {
+            box_decode_warp(0 ,box_pred_1 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 2) {
+            box_decode_warp(0 ,box_pred_1 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 3) {
+            box_decode_warp(0 ,box_pred_3 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else 
+        if (blockIdx.z == 4) {
+            box_decode_warp(0 ,box_pred_3 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];            
+        }else
+        if ( blockIdx.z == 5) {
+            box_decode_warp(0 ,box_pred_5 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 6) {
+            box_decode_warp(0 ,box_pred_6 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 7) {
+            box_decode_warp(0 ,box_pred_6 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 8) {
+            box_decode_warp(0 ,box_pred_8 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 9) {
+            box_decode_warp(0 ,box_pred_8 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }
+    }
+    __syncthreads();  
+    // Heads 0 and 5 predict a single class and never write the second half of
+    // cls_score, so they must be skipped here; for the other heads, store the
+    // second-anchor-set score that was actually tested against the threshold.
+    if( blockIdx.z != 0 && blockIdx.z != 5 && cls_score[ threadIdx.x + blockDim.x ] > score_threshold)  {
+            int counter = atomicAdd(&filter_count[blockIdx.z], 1);
+            // printf("counter : %d \n" , counter);
+            if (blockIdx.z == 1) {
+                box_decode_warp(0 ,box_pred_2 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 2) {
+                box_decode_warp(0 ,box_pred_2 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 3) {
+                box_decode_warp(0 ,box_pred_4 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 4) {
+                box_decode_warp(0 ,box_pred_4 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 6) {
+                box_decode_warp(0 ,box_pred_7 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 7) {
+                box_decode_warp(0 ,box_pred_7 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 8) {
+                box_decode_warp(0 ,box_pred_9 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 9) {
+                box_decode_warp(0 ,box_pred_9 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }
+    }
+}
+
+__global__ void sort_boxes_by_indexes_kernel(float* filtered_box, float* filtered_scores, int* indexes, int filter_count,
+    float* sorted_filtered_boxes, float* sorted_filtered_scores,
+    const int num_output_box_feature)
+{
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    if(tid < filter_count)  {
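+        // indexes[] was permuted by thrust::sort_by_key on the scores, so
+        // indexes[tid] is the original position of the tid-th highest score.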
+
+        int sort_index = indexes[tid];
+        sorted_filtered_boxes[tid * num_output_box_feature + 0] = filtered_box[sort_index * num_output_box_feature + 0];
+        sorted_filtered_boxes[tid * num_output_box_feature + 1] = filtered_box[sort_index * num_output_box_feature + 1];
+        sorted_filtered_boxes[tid * num_output_box_feature + 2] = filtered_box[sort_index * num_output_box_feature + 2];
+        sorted_filtered_boxes[tid * num_output_box_feature + 3] = filtered_box[sort_index * num_output_box_feature + 3];
+        sorted_filtered_boxes[tid * num_output_box_feature + 4] = filtered_box[sort_index * num_output_box_feature + 4];
+        sorted_filtered_boxes[tid * num_output_box_feature + 5] = filtered_box[sort_index * num_output_box_feature + 5];
+        sorted_filtered_boxes[tid * num_output_box_feature + 6] = filtered_box[sort_index * num_output_box_feature + 6];
+
+        // sorted_filtered_dir[tid] = filtered_dir[sort_index];
+        sorted_filtered_scores[tid] = filtered_scores[sort_index];
+    }
+}
+
+
+
+PostprocessCuda::PostprocessCuda(const int num_threads, const float float_min, const float float_max,
+    const int num_class,const int num_anchor_per_cls,
+    const std::vector<std::vector<int>> multihead_label_mapping,
+    const float score_threshold,  const float nms_overlap_threshold, 
+    const int nms_pre_maxsize, const int nms_post_maxsize,
+    const int num_box_corners, 
+    const int num_input_box_feature,
+    const int num_output_box_feature)
+: num_threads_(num_threads),
+  float_min_(float_min),
+  float_max_(float_max),
+  num_class_(num_class),
+  num_anchor_per_cls_(num_anchor_per_cls),
+  multihead_label_mapping_(multihead_label_mapping),
+  score_threshold_(score_threshold),
+  nms_overlap_threshold_(nms_overlap_threshold),
+  nms_pre_maxsize_(nms_pre_maxsize),
+  nms_post_maxsize_(nms_post_maxsize),
+  num_box_corners_(num_box_corners),
+  num_input_box_feature_(num_input_box_feature),
+  num_output_box_feature_(num_output_box_feature) {
+    nms_cuda_ptr_.reset(
+    new NmsCuda(num_threads_, num_box_corners_, nms_overlap_threshold_));
+
+}
+
+
+void PostprocessCuda::DoPostprocessCuda(
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+
+    const float* box_preds,
+   
+    float* dev_filtered_box, 
+    float* dev_filtered_score, 
+    int* dev_filter_count,
+    std::vector<float>& out_detection, std::vector<int>& out_label , std::vector<float>& out_score) {
+    // Before this step, the rpn box outputs have already been concatenated into box_preds.
+    // On the 128x128 feature map, the cls_pred shapes are (32768,1), (32768,1), (32768,1), (65536,2), (32768,1).
+    dim3 gridsize(16, 4 , 10);  // 16 * 4 * 512 = 32768 anchors per head; blockIdx.z indexes the 10 classes
+    sigmoid_filter_kernel<<< gridsize, 512 , 512 * 2 * sizeof(float)>>>(
+        cls_pred_0,
+        cls_pred_12, 
+        cls_pred_34, 
+        cls_pred_5, 
+        cls_pred_67, 
+        cls_pred_89,
+
+        &box_preds[0 * 32768 * 9],
+        &box_preds[1 * 32768 * 9],
+        &box_preds[2 * 32768 * 9],
+        &box_preds[3 * 32768 * 9],
+        &box_preds[4 * 32768 * 9],
+        &box_preds[5 * 32768 * 9],
+        &box_preds[6 * 32768 * 9],
+        &box_preds[7 * 32768 * 9],
+        &box_preds[8 * 32768 * 9],
+        &box_preds[9 * 32768 * 9],
+
+        dev_filtered_box, 
+        dev_filtered_score,  
+        dev_filter_count, 
+    
+        score_threshold_);
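+    // make sure the filter kernel has finished before the per-class counts are read back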
+    cudaDeviceSynchronize();
+    
+    std::vector<int> host_filter_count(num_class_, 0);  // avoid a variable-length array, which is non-standard C++
+    GPU_CHECK(cudaMemcpy(host_filter_count.data(), dev_filter_count, num_class_ * sizeof(int), cudaMemcpyDeviceToHost));
+    
+    for (int i = 0; i < num_class_; ++ i) {
+        if(host_filter_count[i] <= 0) continue;
+
+        int* dev_indexes;
+        float* dev_sorted_filtered_box;
+        float* dev_sorted_filtered_scores;
+        GPU_CHECK(cudaMalloc((void**)&dev_indexes, host_filter_count[i] * sizeof(int)));
+        GPU_CHECK(cudaMalloc((void**)&dev_sorted_filtered_box, host_filter_count[i] * num_output_box_feature_ * sizeof(float)));
+        GPU_CHECK(cudaMalloc((void**)&dev_sorted_filtered_scores, host_filter_count[i]*sizeof(float)));
+        // GPU_CHECK(cudaMalloc((void**)&dev_sorted_box_for_nms, NUM_BOX_CORNERS_*host_filter_count[i]*sizeof(float)));
+        thrust::sequence(thrust::device, dev_indexes, dev_indexes + host_filter_count[i]);
+        thrust::sort_by_key(thrust::device, 
+                            &dev_filtered_score[i * num_anchor_per_cls_], 
+                            &dev_filtered_score[i * num_anchor_per_cls_ + host_filter_count[i]],
+                            dev_indexes, 
+                            thrust::greater<float>());
+
+        const int num_blocks = DIVUP(host_filter_count[i], num_threads_);
+
+        sort_boxes_by_indexes_kernel<<<num_blocks, num_threads_>>>(
+            &dev_filtered_box[i * num_anchor_per_cls_ * num_output_box_feature_], 
+            &dev_filtered_score[i * num_anchor_per_cls_], 
+            dev_indexes, 
+            host_filter_count[i],
+            dev_sorted_filtered_box, 
+            dev_sorted_filtered_scores,
+            num_output_box_feature_);
+
+        int num_box_for_nms = min(nms_pre_maxsize_, host_filter_count[i]);
+        long* keep_inds = new long[num_box_for_nms];  // indexes of boxes kept after NMS
+        memset(keep_inds, 0, num_box_for_nms * sizeof(long));
+        int num_out = 0;
+        nms_cuda_ptr_->DoNmsCuda(num_box_for_nms, dev_sorted_filtered_box, keep_inds, &num_out);
+
+        num_out = min(num_out, nms_post_maxsize_);
+
+        float* host_filtered_box = new float[host_filter_count[i] * num_output_box_feature_]();
+        float* host_filtered_scores = new float[host_filter_count[i]]();
+
+
+        GPU_CHECK(cudaMemcpy(host_filtered_box, dev_sorted_filtered_box, host_filter_count[i] * num_output_box_feature_ * sizeof(float), cudaMemcpyDeviceToHost));
+        GPU_CHECK(cudaMemcpy(host_filtered_scores, dev_sorted_filtered_scores, host_filter_count[i] * sizeof(float), cudaMemcpyDeviceToHost));
+        for (int j = 0; j < num_out; ++j)  {
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 0]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 1]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 2]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 3]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 4]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 5]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 6]);
+            out_score.emplace_back(host_filtered_scores[keep_inds[j]]);
+            out_label.emplace_back(i);
+        }
+        delete[] keep_inds;
+        delete[] host_filtered_scores;
+        delete[] host_filtered_box;
+
+        GPU_CHECK(cudaFree(dev_indexes));
+        GPU_CHECK(cudaFree(dev_sorted_filtered_box));
+        GPU_CHECK(cudaFree(dev_sorted_filtered_scores));  // free the scores buffer as well to avoid a per-frame leak
+    }
+}
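
The two-part filter above is easier to follow off-device. Below is a minimal CPU sketch of how one two-class head's logits map onto its two anchor sets; the function and variable names are illustrative, not part of this commit:

    #include <cmath>
    #include <vector>

    // For a two-class head such as cls_pred_12, the logits of the two classes
    // are interleaved with stride 2, and each class covers two anchor sets of
    // num_anchors_per_head anchors each.
    std::vector<float> scores_for_class(const float* cls_pred_two_class,
                                        int num_anchors_per_head,
                                        int class_slot /* 0 or 1 */) {
      std::vector<float> scores(2 * num_anchors_per_head);
      for (int tid = 0; tid < num_anchors_per_head; ++tid) {
        // anchor set A: mirrors cls_score[threadIdx.x] in sigmoid_filter_kernel
        scores[tid] =
            1.0f / (1.0f + std::exp(-cls_pred_two_class[tid * 2 + class_slot]));
        // anchor set B: mirrors cls_score[threadIdx.x + blockDim.x]
        scores[num_anchors_per_head + tid] =
            1.0f / (1.0f + std::exp(
                -cls_pred_two_class[(num_anchors_per_head + tid) * 2 + class_slot]));
      }
      return scores;
    }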

+ 125 - 0
src/api/perception_lidar_PointPillars_MultiHead/postprocess.h

@@ -0,0 +1,125 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+#pragma once
+#include <memory>
+#include <vector>
+#include "nms.h"
+
+class PostprocessCuda {
+ private:
+    // initializer list
+
+    const int num_threads_;
+    const float float_min_;
+    const float float_max_;
+    const int num_class_;
+    const int num_anchor_per_cls_;
+    const float score_threshold_;
+    const float nms_overlap_threshold_;
+    const int nms_pre_maxsize_;
+    const int nms_post_maxsize_;
+    const int num_box_corners_;
+    const int num_input_box_feature_;
+    const int num_output_box_feature_;
+    const std::vector<std::vector<int>> multihead_label_mapping_;
+    // end initializer list
+
+    std::unique_ptr<NmsCuda> nms_cuda_ptr_;
+  public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads when launching cuda kernel
+   * @param[in] float_min The lowest float value
+   * @param[in] float_max The maximum float value
+   * @param[in] num_class Number of classes 
+   * @param[in] num_anchor_per_cls Number of anchors per class
+   * @param[in] multihead_label_mapping Mapping from each detection head to its class labels
+   * @param[in] score_threshold Score threshold for filtering output
+   * @param[in] nms_overlap_threshold IOU threshold for NMS
+   * @param[in] nms_pre_maxsize Maximum number of boxes fed into NMS
+   * @param[in] nms_post_maxsize Maximum number of boxes kept after NMS
+   * @param[in] num_box_corners Number of corners per box
+   * @param[in] num_input_box_feature Number of input box features
+   * @param[in] num_output_box_feature Number of output box features
+   * @details Capital variables never change after compile time; non-capital
+   * variables can be changed through rosparam
+   */
+  PostprocessCuda(const int num_threads, 
+                  const float float_min, const float float_max,
+                  const int num_class, const int num_anchor_per_cls, 
+                  const std::vector<std::vector<int>> multihead_label_mapping,
+                  const float score_threshold,  
+                  const float nms_overlap_threshold, 
+                  const int nms_pre_maxsize, 
+                  const int nms_post_maxsize,
+                  const int num_box_corners,
+                  const int num_input_box_feature, 
+                  const int num_output_box_feature);
+  ~PostprocessCuda(){}
+
+  /**
+   * @brief Postprocessing for the network output
+   * @param[in] cls_pred_0 ... cls_pred_89 Class predictions from the six detection heads
+   * @param[in] box_preds Box predictions from the network output
+   * @param[in] dev_filtered_box Filtered box predictions
+   * @param[in] dev_filtered_score Filtered score predictions
+   * @param[in] dev_filter_count The number of filtered outputs per class
+   * @param[out] out_detection Output bounding boxes
+   * @param[out] out_label Output labels of objects
+   * @param[out] out_score Output scores of objects
+   * @details dev_* represents device memory allocated variables
+   */
+  void DoPostprocessCuda(
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+    
+    const float* box_preds,
+    float* dev_filtered_box, 
+    float* dev_filtered_score, 
+    int* dev_filter_count,
+    std::vector<float>& out_detection, std::vector<int>& out_label , std::vector<float>& out_score);
+};
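
A hypothetical construction of this class for reference. The 10-class and 32768-anchors-per-class figures follow the comments in postprocess.cu; the thresholds, NMS limits, and box-feature counts are placeholders, not values read from the shipped cbgs_pp_multihead.yaml:

    #include <vector>
    #include "postprocess.h"

    void BuildPostprocessExample() {
      // one label-mapping entry per detection head
      std::vector<std::vector<int>> label_mapping =
          {{0}, {1, 2}, {3, 4}, {5}, {6, 7}, {8, 9}};
      PostprocessCuda postprocess(
          /*num_threads=*/64,
          /*float_min=*/-3.402823466e38f, /*float_max=*/3.402823466e38f,
          /*num_class=*/10, /*num_anchor_per_cls=*/32768,
          label_mapping,
          /*score_threshold=*/0.1f, /*nms_overlap_threshold=*/0.2f,
          /*nms_pre_maxsize=*/1000, /*nms_post_maxsize=*/83,
          /*num_box_corners=*/4,
          /*num_input_box_feature=*/7, /*num_output_box_feature=*/7);
    }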

+ 410 - 0
src/api/perception_lidar_PointPillars_MultiHead/preprocess.cu

@@ -0,0 +1,410 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+// headers in STL
+#include <stdio.h>
+
+// headers in local files
+#include "common.h"
+#include "preprocess.h"
+
+__global__ void make_pillar_histo_kernel(
+    const float* dev_points, float* dev_pillar_point_feature_in_coors,
+    int* pillar_count_histo, const int num_points,
+    const int max_points_per_pillar, const int grid_x_size,
+    const int grid_y_size, const int grid_z_size, const float min_x_range,
+    const float min_y_range, const float min_z_range, const float pillar_x_size,
+    const float pillar_y_size, const float pillar_z_size,
+    const int num_point_feature) {
+  int th_i = blockIdx.x * blockDim.x +  threadIdx.x ;
+  if (th_i >= num_points) {
+    return;
+  }
+  int x_coor = floor((dev_points[th_i * num_point_feature + 0] - min_x_range) / pillar_x_size);
+  int y_coor = floor((dev_points[th_i * num_point_feature + 1] - min_y_range) / pillar_y_size);
+  int z_coor = floor((dev_points[th_i * num_point_feature + 2] - min_z_range) / pillar_z_size);
+
+  if (x_coor >= 0 && x_coor < grid_x_size && y_coor >= 0 &&
+      y_coor < grid_y_size && z_coor >= 0 && z_coor < grid_z_size) {
+    int count =
+        atomicAdd(&pillar_count_histo[y_coor * grid_x_size + x_coor], 1);
+    if (count < max_points_per_pillar) {
+      int ind =
+          y_coor * grid_x_size * max_points_per_pillar * num_point_feature +
+          x_coor * max_points_per_pillar * num_point_feature +
+          count * num_point_feature;
+ 
+      for (int i = 0; i < num_point_feature; ++i) {
+        dev_pillar_point_feature_in_coors[ind + i] =
+            dev_points[th_i * num_point_feature + i];
+      }
+    }
+  }
+}
+
+__global__ void make_pillar_index_kernel(
+    int* dev_pillar_count_histo, int* dev_counter, int* dev_pillar_count,
+    int* dev_x_coors, int* dev_y_coors, float* dev_num_points_per_pillar,
+    int* dev_sparse_pillar_map, const int max_pillars,
+    const int max_points_per_pillar, const int grid_x_size,
+    const int num_inds_for_scan) {
+  int x = blockIdx.x;
+  int y = threadIdx.x;
+  int num_points_at_this_pillar = dev_pillar_count_histo[y * grid_x_size + x];
+  if (num_points_at_this_pillar == 0) {
+    return;
+  }
+
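+  // dev_counter counts every non-empty grid cell; dev_pillar_count only the
+  // ones actually kept once max_pillars is reached.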
+  int count = atomicAdd(dev_counter, 1);
+  if (count < max_pillars) {
+    atomicAdd(dev_pillar_count, 1);
+    if (num_points_at_this_pillar >= max_points_per_pillar) {
+      dev_num_points_per_pillar[count] = max_points_per_pillar;
+    } else {
+      dev_num_points_per_pillar[count] = num_points_at_this_pillar;
+    }
+    dev_x_coors[count] = x;
+    dev_y_coors[count] = y;
+    dev_sparse_pillar_map[y * num_inds_for_scan + x] = 1;
+  }
+}
+
+__global__ void make_pillar_feature_kernel(
+    float* dev_pillar_point_feature_in_coors, float* dev_pillar_point_feature,
+    float* dev_pillar_coors, int* dev_x_coors, int* dev_y_coors,
+    float* dev_num_points_per_pillar, const int max_points,
+    const int num_point_feature, const int grid_x_size) {
+  int ith_pillar = blockIdx.x;
+  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+  int ith_point = threadIdx.x;
+  if (ith_point >= num_points_at_this_pillar) {
+    return;
+  }
+  int x_ind = dev_x_coors[ith_pillar];
+  int y_ind = dev_y_coors[ith_pillar];
+  int pillar_ind = ith_pillar * max_points * num_point_feature +
+                   ith_point * num_point_feature;
+  int coors_ind = y_ind * grid_x_size * max_points * num_point_feature +
+                  x_ind * max_points * num_point_feature +
+                  ith_point * num_point_feature;
+  #pragma unroll 
+  for (int i = 0; i < num_point_feature; ++i) {
+    dev_pillar_point_feature[pillar_ind + i] =
+        dev_pillar_point_feature_in_coors[coors_ind + i];
+  }
+
+  float coor_x = static_cast<float>(x_ind);
+  float coor_y = static_cast<float>(y_ind);
+  dev_pillar_coors[ith_pillar * 4 + 0] = 0;  // batch idx
+  dev_pillar_coors[ith_pillar * 4 + 1] = 0;  // z
+  dev_pillar_coors[ith_pillar * 4 + 2] = coor_y;
+  dev_pillar_coors[ith_pillar * 4 + 3] = coor_x;
+}
+
+
+
+__global__ void pillar_mean_kernel(
+  float* dev_points_mean, 
+  const int num_point_feature,
+  const float* dev_pillar_point_feature, 
+  const float* dev_num_points_per_pillar, 
+  int max_pillars , 
+  int max_points_per_pillar) {
+
+    extern __shared__ float temp[];
+    int ith_pillar = blockIdx.x; 
+    int ith_point  = threadIdx.x;
+    int axis = threadIdx.y;
+  
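+    // pad the reduction width up to the next power of two (32 or 64 slots)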
+    int reduce_size = max_points_per_pillar > 32 ? 64 : 32;
+    temp[threadIdx.x * 3 + axis] =  dev_pillar_point_feature[ith_pillar * max_points_per_pillar * num_point_feature + ith_point * num_point_feature + axis];  
+    if (threadIdx.x < reduce_size - max_points_per_pillar) {
+        temp[(threadIdx.x + max_points_per_pillar) * 3 + axis] = 0.0f; // zero-pad the unused reduction slots
+    }
+    __syncthreads();
+    int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+    // No early return here: every thread must reach the __syncthreads() in the
+    // reduction loop, and the slots past num_points_at_this_pillar hold zeros,
+    // so they do not change the sum.
+    for (unsigned int d = reduce_size >> 1; d > 0; d >>= 1) {
+        if (ith_point < d) {
+            temp[ith_point * 3 + axis] += temp[(ith_point + d) * 3 + axis];
+        }
+        __syncthreads();
+    }
+
+    if (ith_point == 0) {
+        dev_points_mean[ith_pillar * 3 + axis] = temp[axis] / num_points_at_this_pillar;
+    }
+}
+
+
+__device__ void warpReduce(volatile float* sdata , int ith_point , int axis) {
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 8) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 4) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 2) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 1) * blockDim.y + axis];
+}
+
+
+__global__ void make_pillar_mean_kernel(
+  float* dev_points_mean, 
+  const int num_point_feature,
+  const float* dev_pillar_point_feature, 
+  const float* dev_num_points_per_pillar, 
+  int max_pillars , 
+  int max_points_per_pillar) {
+    extern __shared__ float temp[];
+    unsigned int ith_pillar = blockIdx.x;  // { 0 , 1, 2, ... , 10000+}
+    unsigned int ith_point  = threadIdx.x; // { 0 , 1, 2, ...,9}
+    unsigned int axis = threadIdx.y; 
+    unsigned int idx_pre  = ith_pillar * max_points_per_pillar * num_point_feature
+                     + ith_point  * num_point_feature;
+    unsigned int idx_post = ith_pillar * max_points_per_pillar * num_point_feature
+                     + (ith_point + blockDim.x)  * num_point_feature;
+
+    temp[ith_point * blockDim.y + axis] = 0.0;
+    unsigned int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+    // if (ith_point < num_points_at_this_pillar / 2) {
+      temp[ith_point * blockDim.y + axis] = dev_pillar_point_feature[idx_pre  + axis] 
+                                          + dev_pillar_point_feature[idx_post + axis];
+    // }
+    __syncthreads();
+
+    // do reduction in shared mem
+    // Sequential addressing. This solves the bank conflicts as
+    // the threads now access shared memory with a stride of one
+    // 32-bit word (unsigned int) now, which does not cause bank 
+    // conflicts
+    warpReduce(temp , ith_point , axis);
+
+    // write the result for this block to global memory
+    if (ith_point == 0)
+        dev_points_mean[ith_pillar * blockDim.y + axis] = temp[ith_point * blockDim.y + axis] / num_points_at_this_pillar;
+}
+
+
+__global__ void gather_point_feature_kernel(
+  const int max_num_pillars_,const int max_num_points_per_pillar,const int num_point_feature,
+  const float min_x_range, const float min_y_range, const float min_z_range, 
+  const float pillar_x_size,  const float pillar_y_size, const float pillar_z_size,
+  const float* dev_pillar_point_feature, const float* dev_num_points_per_pillar, 
+  const float* dev_pillar_coors,
+  float* dev_points_mean, 
+  float* dev_pfe_gather_feature_){
+
+  int ith_pillar = blockIdx.x; 
+  int ith_point = threadIdx.x;
+  // int kNumPointFeature = 5;
+  int num_gather_feature = 11;
+  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+  if (ith_point >= num_points_at_this_pillar){
+        return;
+    }
+
+
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 0] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0]; 
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 1]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 2]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 3]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 3];
+
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 4]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 4];
+  
+    // dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 4]  =  0.0f;
+    //   f_cluster = voxel_features[:, :, :3] - points_mean
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 5]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0] - dev_points_mean[ith_pillar * 3 + 0 ];
+
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 6] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1] - dev_points_mean[ith_pillar * 3 + 1 ];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 7]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2] - dev_points_mean[ith_pillar * 3 + 2 ];
+
+    // f_center[:, :, 0] = voxel_features[:, :, 0] - (coords[:, 3].to(voxel_features.dtype).unsqueeze(1) * self.voxel_x + self.x_offset)
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 8]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0] - (dev_pillar_coors[ith_pillar * 4 + 3] * pillar_x_size + (pillar_x_size/2 + min_x_range));
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 9]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1] - (dev_pillar_coors[ith_pillar * 4 + 2] * pillar_y_size + (pillar_y_size/2 + min_y_range));
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 10] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2] - (dev_pillar_coors[ith_pillar * 4 + 1] * pillar_z_size + (pillar_z_size/2 + min_z_range));
+
+}
+
+
+
+
+PreprocessPointsCuda::PreprocessPointsCuda(
+    // parameter order must match the declaration in preprocess.h
+    const int num_threads, const int num_point_feature,
+    const int num_inds_for_scan, const int max_num_pillars,
+    const int max_points_per_pillar, const int grid_x_size,
+    const int grid_y_size, const int grid_z_size, const float pillar_x_size,
+    const float pillar_y_size, const float pillar_z_size,
+    const float min_x_range, const float min_y_range, const float min_z_range)
+    : num_threads_(num_threads),
+      max_num_pillars_(max_num_pillars),
+      max_num_points_per_pillar_(max_points_per_pillar),
+      num_point_feature_(num_point_feature),
+      num_inds_for_scan_(num_inds_for_scan),
+      grid_x_size_(grid_x_size),
+      grid_y_size_(grid_y_size),
+      grid_z_size_(grid_z_size),
+      pillar_x_size_(pillar_x_size),
+      pillar_y_size_(pillar_y_size),
+      pillar_z_size_(pillar_z_size),
+      min_x_range_(min_x_range),
+      min_y_range_(min_y_range),
+      min_z_range_(min_z_range) {
+    
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_in_coors_),
+        grid_y_size_ * grid_x_size_ * max_num_points_per_pillar_ *  num_point_feature_ * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_count_histo_),
+        grid_y_size_ * grid_x_size_ * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_counter_), sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_count_), sizeof(int)));    
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points_mean_), max_num_pillars_ * 3 *sizeof(float)));  
+    }
+
+PreprocessPointsCuda::~PreprocessPointsCuda() {
+    GPU_CHECK(cudaFree(dev_pillar_point_feature_in_coors_));
+    GPU_CHECK(cudaFree(dev_pillar_count_histo_));
+    GPU_CHECK(cudaFree(dev_counter_));
+    GPU_CHECK(cudaFree(dev_pillar_count_));
+    GPU_CHECK(cudaFree(dev_points_mean_));
+  }
+
+
+void PreprocessPointsCuda::DoPreprocessPointsCuda(
+    const float* dev_points, const int in_num_points, 
+    int* dev_x_coors,int* dev_y_coors, 
+    float* dev_num_points_per_pillar,
+    float* dev_pillar_point_feature, float* dev_pillar_coors,
+    int* dev_sparse_pillar_map, int* host_pillar_count , float* dev_pfe_gather_feature) {
+    // initialize parameters
+    GPU_CHECK(cudaMemset(dev_pillar_point_feature_in_coors_, 0 , grid_y_size_ * grid_x_size_ * max_num_points_per_pillar_ *  num_point_feature_ * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_pillar_count_histo_, 0 , grid_y_size_ * grid_x_size_ * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_counter_, 0, sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pillar_count_, 0, sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_points_mean_, 0,  max_num_pillars_ * 3 * sizeof(float)));
+    int num_block = DIVUP(in_num_points , num_threads_);
+    make_pillar_histo_kernel<<<num_block , num_threads_>>>(
+        dev_points, dev_pillar_point_feature_in_coors_, dev_pillar_count_histo_,
+        in_num_points, max_num_points_per_pillar_, grid_x_size_, grid_y_size_,
+        grid_z_size_, min_x_range_, min_y_range_, min_z_range_, pillar_x_size_,
+        pillar_y_size_, pillar_z_size_, num_point_feature_);
+    
+    make_pillar_index_kernel<<<grid_x_size_, grid_y_size_>>>(
+        dev_pillar_count_histo_, dev_counter_, dev_pillar_count_, dev_x_coors,
+        dev_y_coors, dev_num_points_per_pillar, dev_sparse_pillar_map,
+        max_num_pillars_, max_num_points_per_pillar_, grid_x_size_,
+        num_inds_for_scan_);  
+
+    GPU_CHECK(cudaMemcpy(host_pillar_count, dev_pillar_count_, 1 * sizeof(int),
+        cudaMemcpyDeviceToHost));
+    make_pillar_feature_kernel<<<host_pillar_count[0],max_num_points_per_pillar_>>>(
+        dev_pillar_point_feature_in_coors_, dev_pillar_point_feature,
+        dev_pillar_coors, dev_x_coors, dev_y_coors, dev_num_points_per_pillar,
+        max_num_points_per_pillar_, num_point_feature_, grid_x_size_);
+    
+
+    dim3 mean_block(max_num_points_per_pillar_,3); //(32,3)
+
+    pillar_mean_kernel<<<host_pillar_count[0],mean_block,64 * 3 *sizeof(float)>>>(
+      dev_points_mean_  ,num_point_feature_, dev_pillar_point_feature, dev_num_points_per_pillar, 
+        max_num_pillars_ , max_num_points_per_pillar_);
+
+    // dim3 mean_block(10,3); // Unrolling the Last Warp
+    // make_pillar_mean_kernel<<<host_pillar_count[0], mean_block , 32 * 3 *sizeof(float)>>>(
+    //       dev_points_mean_  ,num_point_feature_, dev_pillar_point_feature, dev_num_points_per_pillar, 
+    //       max_num_pillars_ , max_num_points_per_pillar_);
+
+    gather_point_feature_kernel<<<max_num_pillars_, max_num_points_per_pillar_>>>(
+      max_num_pillars_,max_num_points_per_pillar_,num_point_feature_,
+      min_x_range_, min_y_range_, min_z_range_,
+      pillar_x_size_, pillar_y_size_, pillar_z_size_, 
+      dev_pillar_point_feature, dev_num_points_per_pillar, dev_pillar_coors,
+      dev_points_mean_,
+      dev_pfe_gather_feature);
+
+    // DEVICE_SAVE<float>(dev_pillar_point_feature , \
+    //     max_num_pillars_ * max_num_points_per_pillar_ * num_point_feature_ , "dev_pillar_point_feature");
+    // DEVICE_SAVE<float>(dev_num_points_per_pillar , \
+    //   max_num_pillars_ , "dev_num_points_per_pillar");
+    // DEVICE_SAVE<float>(dev_pfe_gather_feature , \
+    //   max_num_pillars_ * 11, "dev_pfe_gather_feature");
+}
+
+
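
The 11-dimensional gather layout above, restated as a CPU reference for a single point (names illustrative): raw features, offsets from the pillar mean (f_cluster), and offsets from the pillar center (f_center):

    struct GatheredPoint { float f[11]; };

    // p = {x, y, z, intensity, dt}; mean = per-pillar mean of x, y, z;
    // (coor_x, coor_y) = the pillar's grid coordinates.
    GatheredPoint GatherOnePoint(const float p[5], const float mean[3],
                                 float coor_x, float coor_y,
                                 float pillar_x, float pillar_y, float pillar_z,
                                 float min_x, float min_y, float min_z) {
      GatheredPoint g;
      for (int k = 0; k < 5; ++k) g.f[k] = p[k];                 // raw point features
      for (int k = 0; k < 3; ++k) g.f[5 + k] = p[k] - mean[k];   // f_cluster
      g.f[8]  = p[0] - (coor_x * pillar_x + (pillar_x / 2 + min_x));  // f_center x
      g.f[9]  = p[1] - (coor_y * pillar_y + (pillar_y / 2 + min_y));  // f_center y
      g.f[10] = p[2] - (0.0f * pillar_z + (pillar_z / 2 + min_z));    // z pillar coord is always 0
      return g;
    }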

+ 138 - 0
src/api/perception_lidar_PointPillars_MultiHead/preprocess.h

@@ -0,0 +1,138 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file preprocess_points_cuda.h
+ * @brief GPU version of preprocess points
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+
+class PreprocessPointsCuda {
+ private:
+    // initializer list
+    const int num_threads_;
+    const int max_num_pillars_;
+    const int max_num_points_per_pillar_;
+    const int num_point_feature_;
+    const int num_inds_for_scan_;
+    const int grid_x_size_;
+    const int grid_y_size_;
+    const int grid_z_size_;
+    const float pillar_x_size_;
+    const float pillar_y_size_;
+    const float pillar_z_size_;
+    const float min_x_range_;
+    const float min_y_range_;
+    const float min_z_range_;
+    // end initializer list
+
+    float* dev_pillar_point_feature_in_coors_;
+    int* dev_pillar_count_histo_;
+
+    int* dev_counter_;
+    int* dev_pillar_count_;
+    float* dev_points_mean_;
+
+
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads when launching cuda kernel
+   * @param[in] num_point_feature Number of features in a point
+   * @param[in] num_inds_for_scan Number of indexes for scan (cumsum)
+   * 
+   * @param[in] max_num_pillars Maximum number of pillars
+   * @param[in] max_points_per_pillar Maximum number of points per pillar
+   * 
+   * @param[in] grid_x_size Number of pillars in x-coordinate
+   * @param[in] grid_y_size Number of pillars in y-coordinate
+   * @param[in] grid_z_size Number of pillars in z-coordinate
+   * 
+   * @param[in] pillar_x_size Size of x-dimension for a pillar
+   * @param[in] pillar_y_size Size of y-dimension for a pillar
+   * @param[in] pillar_z_size Size of z-dimension for a pillar
+   * 
+   * @param[in] min_x_range Minimum x value for point cloud
+   * @param[in] min_y_range Minimum y value for point cloud
+   * @param[in] min_z_range Minimum z value for point cloud
+   * @details Capital variables never change after compile time
+   */
+  PreprocessPointsCuda(const int num_threads, const int num_point_feature, const int num_inds_for_scan,
+                       const int max_num_pillars, const int max_points_per_pillar,  
+                       const int grid_x_size, const int grid_y_size, const int grid_z_size,  // grid size
+                       const float pillar_x_size, const float pillar_y_size, const float pillar_z_size, //voxel size
+                       const float min_x_range, const float min_y_range, const float min_z_range); // point cloud range
+  ~PreprocessPointsCuda();
+
+  /**
+   * @brief CUDA preprocessing for input point cloud
+   * @param[in] dev_points Point cloud array
+   * @param[in] in_num_points The number of points
+   * @param[mid] dev_x_coors X-coordinate indexes for corresponding pillars
+   * @param[mid] dev_y_coors Y-coordinate indexes for corresponding pillars
+   * @param[mid] dev_num_points_per_pillar
+   *   Number of points in corresponding pillars
+   * @param[mid] pillar_point_feature
+   *   Values of point feature in each pillar
+   * @param[mid] pillar_coors Array for coors of pillars
+   * @param[mid] dev_points_mean Array for calculate the point center
+   * @param[out] dev_sparse_pillar_map
+   *   Grid map representation for pillar-occupancy
+   * @param[out] host_pillar_count
+   *   The number of valid pillars for an input point cloud
+   * @param[out] dev_pfe_gather_feature
+   *   11 dimensions feature for pfe input channel
+   * @details Convert point cloud to pillar representation
+   */
+  void DoPreprocessPointsCuda(const float* dev_points,
+                              const int in_num_points,
+                              int* dev_x_coors,
+                              int* dev_y_coors,
+                              float* dev_num_points_per_pillar,
+                              float* dev_pillar_point_feature,
+                              float* dev_pillar_coors,
+                              int* dev_sparse_pillar_map,
+                              int* host_pillar_count,
+                              float* dev_pfe_gather_feature);
+};
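
A hypothetical construction matching the declaration above; the numeric values are illustrative nuScenes-style settings and are not guaranteed to match cbgs_pp_multihead.yaml:

    #include "preprocess.h"

    void BuildPreprocessExample() {
      PreprocessPointsCuda preprocess(
          /*num_threads=*/64, /*num_point_feature=*/5, /*num_inds_for_scan=*/1024,
          /*max_num_pillars=*/30000, /*max_points_per_pillar=*/20,
          /*grid_x_size=*/512, /*grid_y_size=*/512, /*grid_z_size=*/1,
          /*pillar_x_size=*/0.2f, /*pillar_y_size=*/0.2f, /*pillar_z_size=*/8.0f,
          /*min_x_range=*/-51.2f, /*min_y_range=*/-51.2f, /*min_z_range=*/-5.0f);
    }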

+ 73 - 0
src/api/perception_lidar_PointPillars_MultiHead/scatter.cu

@@ -0,0 +1,73 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+//headers in local files
+#include "scatter.h"
+
+__global__ void scatter_kernel(int *x_coors, int *y_coors, float *pfe_output,
+                               float *scattered_feature, const int grid_x_size,
+                               const int grid_y_size) {
+  int i_pillar = blockIdx.x;
+  int i_feature = threadIdx.x;
+  int x_ind = x_coors[i_pillar];
+  int y_ind = y_coors[i_pillar];
+  // 64 is the PFE output channel count; the kernel assumes it is launched
+  // with 64 threads per block, one thread per channel of one pillar.
+  float feature = pfe_output[i_pillar * 64 + i_feature];
+  scattered_feature[i_feature * grid_y_size * grid_x_size +
+                    y_ind * grid_x_size + x_ind] = feature;
+}
+
+ScatterCuda::ScatterCuda(const int num_threads, const int grid_x_size,
+                         const int grid_y_size)
+    : num_threads_(num_threads),
+      grid_x_size_(grid_x_size),
+      grid_y_size_(grid_y_size) {}
+
+void ScatterCuda::DoScatterCuda(const int pillar_count, int *x_coors,
+                                int *y_coors, float *pfe_output,
+                                float *scattered_feature) {
+  scatter_kernel<<<pillar_count, num_threads_>>>(x_coors, y_coors, pfe_output,
+                                                 scattered_feature,
+                                                 grid_x_size_, grid_y_size_);
+}
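
For reference, the scatter step restated on the CPU; this mirrors scatter_kernel above, including its hard-coded 64-channel stride:

    // Each pillar's 64-channel PFE output is written into a dense
    // (64, grid_y, grid_x) canvas at the pillar's (y, x) cell.
    void ScatterCpu(int pillar_count, const int* x_coors, const int* y_coors,
                    const float* pfe_output, float* scattered_feature,
                    int grid_x_size, int grid_y_size) {
      for (int p = 0; p < pillar_count; ++p) {
        for (int c = 0; c < 64; ++c) {
          scattered_feature[c * grid_y_size * grid_x_size +
                            y_coors[p] * grid_x_size + x_coors[p]] =
              pfe_output[p * 64 + c];
        }
      }
    }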

+ 77 - 0
src/api/perception_lidar_PointPillars_MultiHead/scatter.h

@@ -0,0 +1,77 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+class ScatterCuda {
+ private:
+  const int num_threads_;
+  const int grid_x_size_;
+  const int grid_y_size_;
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads The number of threads to launch cuda kernel
+   * @param[in] grid_x_size Number of pillars in x-coordinate
+   * @param[in] grid_y_size Number of pillars in y-coordinate
+   * @details Capital variables never change after compile time
+   */
+  ScatterCuda(const int num_threads, const int grid_x_size,
+              const int grid_y_size);
+
+  /**
+   * @brief Call scatter cuda kernel
+   * @param[in] pillar_count The valid number of pillars
+   * @param[in] x_coors X-coordinate indexes for corresponding pillars
+   * @param[in] y_coors Y-coordinate indexes for corresponding pillars
+   * @param[in] pfe_output Output from Pillar Feature Extractor
+   * @param[out] scattered_feature Gridmap representation for pillars' feature
+   * @details Allocate pillars in the gridmap based on index (coordinate)
+   * information
+   */
+  void DoScatterCuda(const int pillar_count, int* x_coors, int* y_coors,
+                     float* pfe_output, float* scattered_feature);
+};
+

+ 282 - 0
src/api/perception_lidar_cnn_segmentation/caffe/blob.hpp

@@ -0,0 +1,282 @@
+#ifndef CAFFE_BLOB_HPP_
+#define CAFFE_BLOB_HPP_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/syncedmem.hpp"
+
+const int kMaxBlobAxes = 32;
+
+namespace caffe {
+
+/**
+ * @brief A wrapper around SyncedMemory holders serving as the basic
+ *        computational unit through which Layer%s, Net%s, and Solver%s
+ *        interact.
+ *
+ * TODO(dox): more thorough description.
+ */
+template <typename Dtype>
+class Blob {
+ public:
+  Blob()
+       : data_(), diff_(), count_(0), capacity_(0) {}
+
+  /// @brief Deprecated; use <code>Blob(const vector<int>& shape)</code>.
+  explicit Blob(const int num, const int channels, const int height,
+      const int width);
+  explicit Blob(const vector<int>& shape);
+
+  /// @brief Deprecated; use <code>Reshape(const vector<int>& shape)</code>.
+  void Reshape(const int num, const int channels, const int height,
+      const int width);
+  /**
+   * @brief Change the dimensions of the blob, allocating new memory if
+   *        necessary.
+   *
+   * This function can be called both to create an initial allocation
+   * of memory, and to adjust the dimensions of a top blob during Layer::Reshape
+   * or Layer::Forward. When changing the size of blob, memory will only be
+   * reallocated if sufficient memory does not already exist, and excess memory
+   * will never be freed.
+   *
+   * Note that reshaping an input blob and immediately calling Net::Backward is
+   * an error; either Net::Forward or Net::Reshape need to be called to
+   * propagate the new input shape to higher layers.
+   */
+  void Reshape(const vector<int>& shape);
+  void Reshape(const BlobShape& shape);
+  void ReshapeLike(const Blob& other);
+  inline string shape_string() const {
+    ostringstream stream;
+    for (int i = 0; i < shape_.size(); ++i) {
+      stream << shape_[i] << " ";
+    }
+    stream << "(" << count_ << ")";
+    return stream.str();
+  }
+  inline const vector<int>& shape() const { return shape_; }
+  /**
+   * @brief Returns the dimension of the index-th axis (or the negative index-th
+   *        axis from the end, if index is negative).
+   *
+   * @param index the axis index, which may be negative as it will be
+   *        "canonicalized" using CanonicalAxisIndex.
+   *        Dies on out of range index.
+   */
+  inline int shape(int index) const {
+    return shape_[CanonicalAxisIndex(index)];
+  }
+  inline int num_axes() const { return shape_.size(); }
+  inline int count() const { return count_; }
+
+  /**
+   * @brief Compute the volume of a slice; i.e., the product of dimensions
+   *        among a range of axes.
+   *
+   * @param start_axis The first axis to include in the slice.
+   *
+   * @param end_axis The first axis to exclude from the slice.
+   */
+  inline int count(int start_axis, int end_axis) const {
+    CHECK_LE(start_axis, end_axis);
+    CHECK_GE(start_axis, 0);
+    CHECK_GE(end_axis, 0);
+    CHECK_LE(start_axis, num_axes());
+    CHECK_LE(end_axis, num_axes());
+    int count = 1;
+    for (int i = start_axis; i < end_axis; ++i) {
+      count *= shape(i);
+    }
+    return count;
+  }
+  /**
+   * @brief Compute the volume of a slice spanning from a particular first
+   *        axis to the final axis.
+   *
+   * @param start_axis The first axis to include in the slice.
+   */
+  inline int count(int start_axis) const {
+    return count(start_axis, num_axes());
+  }
+
+  /**
+   * @brief Returns the 'canonical' version of a (usually) user-specified axis,
+   *        allowing for negative indexing (e.g., -1 for the last axis).
+   *
+   * @param axis_index the axis index.
+   *        If 0 <= index < num_axes(), return index.
+   *        If -num_axes <= index <= -1, return (num_axes() - (-index)),
+   *        e.g., the last axis index (num_axes() - 1) if index == -1,
+   *        the second to last if index == -2, etc.
+   *        Dies on out of range index.
+   */
+  inline int CanonicalAxisIndex(int axis_index) const {
+    CHECK_GE(axis_index, -num_axes())
+        << "axis " << axis_index << " out of range for " << num_axes()
+        << "-D Blob with shape " << shape_string();
+    CHECK_LT(axis_index, num_axes())
+        << "axis " << axis_index << " out of range for " << num_axes()
+        << "-D Blob with shape " << shape_string();
+    if (axis_index < 0) {
+      return axis_index + num_axes();
+    }
+    return axis_index;
+  }
+
+  /// @brief Deprecated legacy shape accessor num: use shape(0) instead.
+  inline int num() const { return LegacyShape(0); }
+  /// @brief Deprecated legacy shape accessor channels: use shape(1) instead.
+  inline int channels() const { return LegacyShape(1); }
+  /// @brief Deprecated legacy shape accessor height: use shape(2) instead.
+  inline int height() const { return LegacyShape(2); }
+  /// @brief Deprecated legacy shape accessor width: use shape(3) instead.
+  inline int width() const { return LegacyShape(3); }
+  inline int LegacyShape(int index) const {
+    CHECK_LE(num_axes(), 4)
+        << "Cannot use legacy accessors on Blobs with > 4 axes.";
+    CHECK_LT(index, 4);
+    CHECK_GE(index, -4);
+    if (index >= num_axes() || index < -num_axes()) {
+      // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse
+      // indexing) -- this special case simulates the one-padding used to fill
+      // extraneous axes of legacy blobs.
+      return 1;
+    }
+    return shape(index);
+  }
+
+  inline int offset(const int n, const int c = 0, const int h = 0,
+      const int w = 0) const {
+    CHECK_GE(n, 0);
+    CHECK_LE(n, num());
+    CHECK_GE(channels(), 0);
+    CHECK_LE(c, channels());
+    CHECK_GE(height(), 0);
+    CHECK_LE(h, height());
+    CHECK_GE(width(), 0);
+    CHECK_LE(w, width());
+    return ((n * channels() + c) * height() + h) * width() + w;
+  }
+
+  inline int offset(const vector<int>& indices) const {
+    CHECK_LE(indices.size(), num_axes());
+    int offset = 0;
+    for (int i = 0; i < num_axes(); ++i) {
+      offset *= shape(i);
+      if (indices.size() > i) {
+        CHECK_GE(indices[i], 0);
+        CHECK_LT(indices[i], shape(i));
+        offset += indices[i];
+      }
+    }
+    return offset;
+  }
+  /**
+   * @brief Copy from a source Blob.
+   *
+   * @param source the Blob to copy from
+   * @param copy_diff if false, copy the data; if true, copy the diff
+   * @param reshape if false, require this Blob to be pre-shaped to the shape
+   *        of other (and die otherwise); if true, Reshape this Blob to other's
+   *        shape if necessary
+   */
+  void CopyFrom(const Blob<Dtype>& source, bool copy_diff = false,
+      bool reshape = false);
+
+  inline Dtype data_at(const int n, const int c, const int h,
+      const int w) const {
+    return cpu_data()[offset(n, c, h, w)];
+  }
+
+  inline Dtype diff_at(const int n, const int c, const int h,
+      const int w) const {
+    return cpu_diff()[offset(n, c, h, w)];
+  }
+
+  inline Dtype data_at(const vector<int>& index) const {
+    return cpu_data()[offset(index)];
+  }
+
+  inline Dtype diff_at(const vector<int>& index) const {
+    return cpu_diff()[offset(index)];
+  }
+
+  inline const shared_ptr<SyncedMemory>& data() const {
+    CHECK(data_);
+    return data_;
+  }
+
+  inline const shared_ptr<SyncedMemory>& diff() const {
+    CHECK(diff_);
+    return diff_;
+  }
+
+  const Dtype* cpu_data() const;
+  void set_cpu_data(Dtype* data);
+  const int* gpu_shape() const;
+  const Dtype* gpu_data() const;
+  void set_gpu_data(Dtype* data);
+  const Dtype* cpu_diff() const;
+  const Dtype* gpu_diff() const;
+  Dtype* mutable_cpu_data();
+  Dtype* mutable_gpu_data();
+  Dtype* mutable_cpu_diff();
+  Dtype* mutable_gpu_diff();
+  void Update();
+  void FromProto(const BlobProto& proto, bool reshape = true);
+  void ToProto(BlobProto* proto, bool write_diff = false) const;
+
+  /// @brief Compute the sum of absolute values (L1 norm) of the data.
+  Dtype asum_data() const;
+  /// @brief Compute the sum of absolute values (L1 norm) of the diff.
+  Dtype asum_diff() const;
+  /// @brief Compute the sum of squares (L2 norm squared) of the data.
+  Dtype sumsq_data() const;
+  /// @brief Compute the sum of squares (L2 norm squared) of the diff.
+  Dtype sumsq_diff() const;
+
+  /// @brief Scale the blob data by a constant factor.
+  void scale_data(Dtype scale_factor);
+  /// @brief Scale the blob diff by a constant factor.
+  void scale_diff(Dtype scale_factor);
+
+  /**
+   * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the
+   *        data_ of Blob other -- useful in Layer%s which simply perform a copy
+   *        in their Forward pass.
+   *
+   * This deallocates the SyncedMemory holding this Blob's data_, as
+   * shared_ptr calls its destructor when reset with the "=" operator.
+   */
+  void ShareData(const Blob& other);
+  /**
+   * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the
+   *        diff_ of Blob other -- useful in Layer%s which simply perform a copy
+   *        in their Forward pass.
+   *
+   * This deallocates the SyncedMemory holding this Blob's diff_, as
+   * shared_ptr calls its destructor when reset with the "=" operator.
+   */
+  void ShareDiff(const Blob& other);
+
+  bool ShapeEquals(const BlobProto& other);
+
+ protected:
+  shared_ptr<SyncedMemory> data_;
+  shared_ptr<SyncedMemory> diff_;
+  shared_ptr<SyncedMemory> shape_data_;
+  vector<int> shape_;
+  int count_;
+  int capacity_;
+
+  DISABLE_COPY_AND_ASSIGN(Blob);
+};  // class Blob
+
+}  // namespace caffe
+
+#endif  // CAFFE_BLOB_HPP_
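
The offset() accessors above use row-major (N, C, H, W) indexing; a small standalone check (illustrative only, not part of Caffe):

    #include <cassert>

    // Row-major (N, C, H, W) offset, as computed by Blob<Dtype>::offset(n, c, h, w).
    int NchwOffset(int n, int c, int h, int w, int C, int H, int W) {
      return ((n * C + c) * H + h) * W + w;
    }

    void OffsetCheck() {
      // For a blob shaped (2, 3, 4, 5), element (1, 2, 3, 4) is the last of
      // the 120 elements, i.e. index 119.
      assert(NchwOffset(1, 2, 3, 4, /*C=*/3, /*H=*/4, /*W=*/5) == 119);
    }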

+ 22 - 0
src/api/perception_lidar_cnn_segmentation/caffe/caffe.hpp

@@ -0,0 +1,22 @@
+// caffe.hpp is the header file that you need to include in your code. It wraps
+// all the internal caffe header files into one for simpler inclusion.
+
+#ifndef CAFFE_CAFFE_HPP_
+#define CAFFE_CAFFE_HPP_
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layer_factory.hpp"
+#include "caffe/net.hpp"
+#include "caffe/parallel.hpp"
+//#include "caffe/proto/caffe.pb.h"
+#include "caffe.pb.h"
+#include "caffe/solver.hpp"
+#include "caffe/solver_factory.hpp"
+#include "caffe/util/benchmark.hpp"
+#include "caffe/util/io.hpp"
+#include "caffe/util/upgrade_proto.hpp"
+
+#endif  // CAFFE_CAFFE_HPP_

+ 193 - 0
src/api/perception_lidar_cnn_segmentation/caffe/common.hpp

@@ -0,0 +1,193 @@
+#ifndef CAFFE_COMMON_HPP_
+#define CAFFE_COMMON_HPP_
+
+#include <boost/shared_ptr.hpp>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include <climits>
+#include <cmath>
+#include <fstream>  // NOLINT(readability/streams)
+#include <iostream>  // NOLINT(readability/streams)
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>  // pair
+#include <vector>
+
+#include "caffe/util/device_alternate.hpp"
+
+// Convert macro to string
+#define STRINGIFY(m) #m
+#define AS_STRING(m) STRINGIFY(m)
+
+// gflags 2.1 issue: namespace google was changed to gflags without warning.
+// Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version
+// 2.1. If yes, we will add a temporary solution to redirect the namespace.
+// TODO(Yangqing): Once gflags solves the problem in a more elegant way, let's
+// remove the following hack.
+#ifndef GFLAGS_GFLAGS_H_
+namespace gflags = google;
+#endif  // GFLAGS_GFLAGS_H_
+
+// Disable the copy and assignment operator for a class.
+#define DISABLE_COPY_AND_ASSIGN(classname) \
+private:\
+  classname(const classname&);\
+  classname& operator=(const classname&)
+
+// Instantiate a class with float and double specifications.
+#define INSTANTIATE_CLASS(classname) \
+  char gInstantiationGuard##classname; \
+  template class classname<float>; \
+  template class classname<double>
+
+#define INSTANTIATE_LAYER_GPU_FORWARD(classname) \
+  template void classname<float>::Forward_gpu( \
+      const std::vector<Blob<float>*>& bottom, \
+      const std::vector<Blob<float>*>& top); \
+  template void classname<double>::Forward_gpu( \
+      const std::vector<Blob<double>*>& bottom, \
+      const std::vector<Blob<double>*>& top);
+
+#define INSTANTIATE_LAYER_GPU_BACKWARD(classname) \
+  template void classname<float>::Backward_gpu( \
+      const std::vector<Blob<float>*>& top, \
+      const std::vector<bool>& propagate_down, \
+      const std::vector<Blob<float>*>& bottom); \
+  template void classname<double>::Backward_gpu( \
+      const std::vector<Blob<double>*>& top, \
+      const std::vector<bool>& propagate_down, \
+      const std::vector<Blob<double>*>& bottom)
+
+#define INSTANTIATE_LAYER_GPU_FUNCS(classname) \
+  INSTANTIATE_LAYER_GPU_FORWARD(classname); \
+  INSTANTIATE_LAYER_GPU_BACKWARD(classname)
+
+// A simple macro to mark code that is not implemented, so that a fatal log
+// is emitted if it is ever executed.
+#define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet"
+
+// See PR #1236
+namespace cv { class Mat; }
+
+namespace caffe {
+
+// We will use the boost shared_ptr instead of the new C++11 one mainly
+// because cuda does not (at least for now) work well with C++11 features.
+using boost::shared_ptr;
+
+// Common functions and classes from std that caffe often uses.
+using std::fstream;
+using std::ios;
+using std::isnan;
+using std::isinf;
+using std::iterator;
+using std::make_pair;
+using std::map;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+// A global initialization function that you should call in your main function.
+// Currently it initializes google flags and google logging.
+void GlobalInit(int* pargc, char*** pargv);
+
+// A singleton class to hold common caffe stuff, such as the handler that
+// caffe is going to use for cublas, curand, etc.
+class Caffe {
+ public:
+  ~Caffe();
+
+  // Thread local context for Caffe. Moved to common.cpp instead of
+  // including boost/thread.hpp to avoid boost/NVCC issues (#1009, #1010)
+  // on OSX. Also fails on Linux with CUDA 7.0.18.
+  static Caffe& Get();
+
+  enum Brew { CPU, GPU };
+
+  // This random number generator facade hides boost and CUDA rng
+  // implementation from one another (for cross-platform compatibility).
+  class RNG {
+   public:
+    RNG();
+    explicit RNG(unsigned int seed);
+    explicit RNG(const RNG&);
+    RNG& operator=(const RNG&);
+    void* generator();
+   private:
+    class Generator;
+    shared_ptr<Generator> generator_;
+  };
+
+  // Getters for boost rng, curand, and cublas handles
+  inline static RNG& rng_stream() {
+    if (!Get().random_generator_) {
+      Get().random_generator_.reset(new RNG());
+    }
+    return *(Get().random_generator_);
+  }
+#ifndef CPU_ONLY
+  inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; }
+  inline static curandGenerator_t curand_generator() {
+    return Get().curand_generator_;
+  }
+#endif
+
+  // Returns the mode: running on CPU or GPU.
+  inline static Brew mode() { return Get().mode_; }
+  // The setters for the variables
+  // Sets the mode. It is recommended that you don't change the mode halfway
+  // through the program, since that may cause pinned memory allocated earlier
+  // to be freed in a non-pinned way, which may cause problems - I haven't
+  // verified it personally, but better to note it here in the header file.
+  inline static void set_mode(Brew mode) { Get().mode_ = mode; }
+  // Sets the random seed of both boost and curand
+  static void set_random_seed(const unsigned int seed);
+  // Sets the device. Since we have cublas and curand stuff, set device also
+  // requires us to reset those values.
+  static void SetDevice(const int device_id);
+  // Prints the current GPU status.
+  static void DeviceQuery();
+  // Check if the specified device is available
+  static bool CheckDevice(const int device_id);
+  // Search from start_id to the highest possible device ordinal,
+  // return the ordinal of the first available device.
+  static int FindDevice(const int start_id = 0);
+  // Parallel training
+  inline static int solver_count() { return Get().solver_count_; }
+  inline static void set_solver_count(int val) { Get().solver_count_ = val; }
+  inline static int solver_rank() { return Get().solver_rank_; }
+  inline static void set_solver_rank(int val) { Get().solver_rank_ = val; }
+  inline static bool multiprocess() { return Get().multiprocess_; }
+  inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; }
+  inline static bool root_solver() { return Get().solver_rank_ == 0; }
+
+ protected:
+#ifndef CPU_ONLY
+  cublasHandle_t cublas_handle_;
+  curandGenerator_t curand_generator_;
+#endif
+  shared_ptr<RNG> random_generator_;
+
+  Brew mode_;
+
+  // Parallel training
+  int solver_count_;
+  int solver_rank_;
+  bool multiprocess_;
+
+ private:
+  // The private constructor to avoid duplicate instantiation.
+  Caffe();
+
+  DISABLE_COPY_AND_ASSIGN(Caffe);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_COMMON_HPP_
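
A short sketch of the singleton accessors declared above, assuming a CUDA-enabled build; device id 0 and the seed are arbitrary choices.

    #include "caffe/common.hpp"

    void select_device() {
      if (caffe::Caffe::CheckDevice(0)) {
        caffe::Caffe::SetDevice(0);   // also resets cublas/curand handles
        caffe::Caffe::set_mode(caffe::Caffe::GPU);
        caffe::Caffe::DeviceQuery();  // logs the current GPU status
      } else {
        caffe::Caffe::set_mode(caffe::Caffe::CPU);
      }
      caffe::Caffe::set_random_seed(1701);  // seeds both boost and curand
    }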

+ 154 - 0
src/api/perception_lidar_cnn_segmentation/caffe/data_transformer.hpp

@@ -0,0 +1,154 @@
+#ifndef CAFFE_DATA_TRANSFORMER_HPP
+#define CAFFE_DATA_TRANSFORMER_HPP
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Applies common transformations to the input data, such as
+ * scaling, mirroring, subtracting the image mean...
+ */
+template <typename Dtype>
+class DataTransformer {
+ public:
+  explicit DataTransformer(const TransformationParameter& param, Phase phase);
+  virtual ~DataTransformer() {}
+
+  /**
+   * @brief Initialize the Random number generations if needed by the
+   *    transformation.
+   */
+  void InitRand();
+
+  /**
+   * @brief Applies the transformation defined in the data layer's
+   * transform_param block to the data.
+   *
+   * @param datum
+   *    Datum containing the data to be transformed.
+   * @param transformed_blob
+   *    This is the destination blob. It can be part of the top blob's data if
+   *    set_cpu_data() is used. See data_layer.cpp for an example.
+   */
+  void Transform(const Datum& datum, Blob<Dtype>* transformed_blob);
+
+  /**
+   * @brief Applies the transformation defined in the data layer's
+   * transform_param block to a vector of Datum.
+   *
+   * @param datum_vector
+   *    A vector of Datum containing the data to be transformed.
+   * @param transformed_blob
+   *    This is the destination blob. It can be part of the top blob's data if
+   *    set_cpu_data() is used. See memory_layer.cpp for an example.
+   */
+  void Transform(const vector<Datum> & datum_vector,
+                Blob<Dtype>* transformed_blob);
+
+#ifdef USE_OPENCV
+  /**
+   * @brief Applies the transformation defined in the data layer's
+   * transform_param block to a vector of Mat.
+   *
+   * @param mat_vector
+   *    A vector of Mat containing the data to be transformed.
+   * @param transformed_blob
+   *    This is the destination blob. It can be part of the top blob's data if
+   *    set_cpu_data() is used. See memory_layer.cpp for an example.
+   */
+  void Transform(const vector<cv::Mat> & mat_vector,
+                Blob<Dtype>* transformed_blob);
+
+  /**
+   * @brief Applies the transformation defined in the data layer's
+   * transform_param block to a cv::Mat
+   *
+   * @param cv_img
+   *    cv::Mat containing the data to be transformed.
+   * @param transformed_blob
+   *    This is the destination blob. It can be part of the top blob's data if
+   *    set_cpu_data() is used. See image_data_layer.cpp for an example.
+   */
+  void Transform(const cv::Mat& cv_img, Blob<Dtype>* transformed_blob);
+#endif  // USE_OPENCV
+
+  /**
+   * @brief Applies the same transformation defined in the data layer's
+   * transform_param block to all the num images in an input_blob.
+   *
+   * @param input_blob
+   *    A Blob containing the data to be transformed. It applies the same
+   *    transformation to all the num images in the blob.
+   * @param transformed_blob
+   *    This is the destination blob; it will contain as many images as the
+   *    input blob. It can be part of the top blob's data.
+   */
+  void Transform(Blob<Dtype>* input_blob, Blob<Dtype>* transformed_blob);
+
+  /**
+   * @brief Infers the shape the transformed_blob will have when
+   *    the transformation is applied to the data.
+   *
+   * @param datum
+   *    Datum containing the data to be transformed.
+   */
+  vector<int> InferBlobShape(const Datum& datum);
+  /**
+   * @brief Infers the shape the transformed_blob will have when
+   *    the transformation is applied to the data.
+   *    It uses the first element to infer the shape of the blob.
+   *
+   * @param datum_vector
+   *    A vector of Datum containing the data to be transformed.
+   */
+  vector<int> InferBlobShape(const vector<Datum> & datum_vector);
+  /**
+   * @brief Infers the shape the transformed_blob will have when
+   *    the transformation is applied to the data.
+   *    It uses the first element to infer the shape of the blob.
+   *
+   * @param mat_vector
+   *    A vector of Mat containing the data to be transformed.
+   */
+#ifdef USE_OPENCV
+  vector<int> InferBlobShape(const vector<cv::Mat> & mat_vector);
+  /**
+   * @brief Infers the shape the transformed_blob will have when
+   *    the transformation is applied to the data.
+   *
+   * @param cv_img
+   *    cv::Mat containing the data to be transformed.
+   */
+  vector<int> InferBlobShape(const cv::Mat& cv_img);
+#endif  // USE_OPENCV
+
+ protected:
+   /**
+   * @brief Generates a random integer from Uniform({0, 1, ..., n-1}).
+   *
+   * @param n
+   *    The upper bound (exclusive) of the random number.
+   * @return
+   *    A uniformly random integer value from {0, 1, ..., n-1}.
+   */
+  virtual int Rand(int n);
+
+  void Transform(const Datum& datum, Dtype* transformed_data);
+  // Transformation parameters
+  TransformationParameter param_;
+
+
+  shared_ptr<Caffe::RNG> rng_;
+  Phase phase_;
+  Blob<Dtype> data_mean_;
+  vector<Dtype> mean_values_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DATA_TRANSFORMER_HPP_
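
A hedged sketch of driving the transformer declared above. The mirror and crop_size fields are standard TransformationParameter fields from caffe.proto; the Datum is assumed to be populated already.

    #include "caffe/blob.hpp"
    #include "caffe/data_transformer.hpp"

    void transform_one(const caffe::Datum& datum) {
      caffe::TransformationParameter param;
      param.set_mirror(true);    // random horizontal flips in TRAIN phase
      param.set_crop_size(224);  // random crops in TRAIN phase

      caffe::DataTransformer<float> transformer(param, caffe::TRAIN);
      transformer.InitRand();    // mirror/crop consume the RNG

      // Shape the destination blob from the datum, then transform into it.
      caffe::Blob<float> transformed(transformer.InferBlobShape(datum));
      transformer.Transform(datum, &transformed);
    }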

+ 301 - 0
src/api/perception_lidar_cnn_segmentation/caffe/filler.hpp

@@ -0,0 +1,301 @@
+// Fillers are random number generators that fill a blob using the specified
+// algorithm. The expectation is that they are only going to be used during
+// initialization time and will not involve any GPUs.
+
+#ifndef CAFFE_FILLER_HPP
+#define CAFFE_FILLER_HPP
+
+#include <string>
+
+#include "caffe/blob.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/syncedmem.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+/// @brief Fills a Blob with constant or randomly-generated data.
+template <typename Dtype>
+class Filler {
+ public:
+  explicit Filler(const FillerParameter& param) : filler_param_(param) {}
+  virtual ~Filler() {}
+  virtual void Fill(Blob<Dtype>* blob) = 0;
+ protected:
+  FillerParameter filler_param_;
+};  // class Filler
+
+
+/// @brief Fills a Blob with constant values @f$ x = c @f$, where @f$ c @f$ is
+/// the filler's value parameter (default 0).
+template <typename Dtype>
+class ConstantFiller : public Filler<Dtype> {
+ public:
+  explicit ConstantFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    Dtype* data = blob->mutable_cpu_data();
+    const int count = blob->count();
+    const Dtype value = this->filler_param_.value();
+    CHECK(count);
+    for (int i = 0; i < count; ++i) {
+      data[i] = value;
+    }
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$.
+template <typename Dtype>
+class UniformFiller : public Filler<Dtype> {
+ public:
+  explicit UniformFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    CHECK(blob->count());
+    caffe_rng_uniform<Dtype>(blob->count(), Dtype(this->filler_param_.min()),
+        Dtype(this->filler_param_.max()), blob->mutable_cpu_data());
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/// @brief Fills a Blob with Gaussian-distributed values @f$ x \sim N(\mu, \sigma^2) @f$.
+template <typename Dtype>
+class GaussianFiller : public Filler<Dtype> {
+ public:
+  explicit GaussianFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    Dtype* data = blob->mutable_cpu_data();
+    CHECK(blob->count());
+    caffe_rng_gaussian<Dtype>(blob->count(), Dtype(this->filler_param_.mean()),
+        Dtype(this->filler_param_.std()), blob->mutable_cpu_data());
+    int sparse = this->filler_param_.sparse();
+    CHECK_GE(sparse, -1);
+    if (sparse >= 0) {
+      // Sparse initialization is implemented for "weight" blobs; i.e. matrices.
+      // These have num == channels == 1; width is number of inputs; height is
+      // number of outputs.  The 'sparse' variable specifies the mean number
+      // of non-zero input weights for a given output.
+      CHECK_GE(blob->num_axes(), 1);
+      const int num_outputs = blob->shape(0);
+      Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs);
+      rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int)));
+      int* mask = reinterpret_cast<int*>(rand_vec_->mutable_cpu_data());
+      caffe_rng_bernoulli(blob->count(), non_zero_probability, mask);
+      for (int i = 0; i < blob->count(); ++i) {
+        data[i] *= mask[i];
+      }
+    }
+  }
+
+ protected:
+  shared_ptr<SyncedMemory> rand_vec_;
+};
+
+/** @brief Fills a Blob with values @f$ x \in [0, 1] @f$
+ *         such that @f$ \forall i \sum_j x_{ij} = 1 @f$.
+ */
+template <typename Dtype>
+class PositiveUnitballFiller : public Filler<Dtype> {
+ public:
+  explicit PositiveUnitballFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    Dtype* data = blob->mutable_cpu_data();
+    DCHECK(blob->count());
+    caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());
+    // We expect the filler to not be called very frequently, so we will
+    // just use a simple implementation
+    int dim = blob->count() / blob->shape(0);
+    CHECK(dim);
+    for (int i = 0; i < blob->shape(0); ++i) {
+      Dtype sum = 0;
+      for (int j = 0; j < dim; ++j) {
+        sum += data[i * dim + j];
+      }
+      for (int j = 0; j < dim; ++j) {
+        data[i * dim + j] /= sum;
+      }
+    }
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/**
+ * @brief Fills a Blob with values @f$ x \sim U(-a, +a) @f$ where @f$ a @f$ is
+ *        set inversely proportional to number of incoming nodes, outgoing
+ *        nodes, or their average.
+ *
+ * A Filler based on the paper [Bengio and Glorot 2010]: Understanding
+ * the difficulty of training deep feedforward neural networks.
+ *
+ * It fills the incoming matrix by randomly sampling uniform data from [-scale,
+ * scale] where scale = sqrt(3 / n) where n is the fan_in, fan_out, or their
+ * average, depending on the variance_norm option. You should make sure the
+ * input blob has shape (num, a, b, c) where a * b * c = fan_in and num * b * c
+ * = fan_out. Note that this is currently not the case for inner product layers.
+ *
+ * TODO(dox): make notation in above comment consistent with rest & use LaTeX.
+ */
+template <typename Dtype>
+class XavierFiller : public Filler<Dtype> {
+ public:
+  explicit XavierFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    CHECK(blob->count());
+    int fan_in = blob->count() / blob->shape(0);
+    // Compatibility with ND blobs
+    int fan_out = blob->num_axes() > 1 ?
+                  blob->count() / blob->shape(1) :
+                  blob->count();
+    Dtype n = fan_in;  // default to fan_in
+    if (this->filler_param_.variance_norm() ==
+        FillerParameter_VarianceNorm_AVERAGE) {
+      n = (fan_in + fan_out) / Dtype(2);
+    } else if (this->filler_param_.variance_norm() ==
+        FillerParameter_VarianceNorm_FAN_OUT) {
+      n = fan_out;
+    }
+    Dtype scale = sqrt(Dtype(3) / n);
+    caffe_rng_uniform<Dtype>(blob->count(), -scale, scale,
+        blob->mutable_cpu_data());
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/**
+ * @brief Fills a Blob with values @f$ x \sim N(0, \sigma^2) @f$ where
+ *        @f$ \sigma^2 @f$ is set inversely proportional to number of incoming
+ *        nodes, outgoing nodes, or their average.
+ *
+ * A Filler based on the paper [He, Zhang, Ren and Sun 2015]: Specifically
+ * accounts for ReLU nonlinearities.
+ *
+ * Aside: for another perspective on the scaling factor, see the derivation of
+ * [Saxe, McClelland, and Ganguli 2013 (v3)].
+ *
+ * It fills the incoming matrix by randomly sampling Gaussian data with std =
+ * sqrt(2 / n) where n is the fan_in, fan_out, or their average, depending on
+ * the variance_norm option. You should make sure the input blob has shape (num,
+ * a, b, c) where a * b * c = fan_in and num * b * c = fan_out. Note that this
+ * is currently not the case for inner product layers.
+ */
+template <typename Dtype>
+class MSRAFiller : public Filler<Dtype> {
+ public:
+  explicit MSRAFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    CHECK(blob->count());
+    int fan_in = blob->count() / blob->shape(0);
+    // Compatibility with ND blobs
+    int fan_out = blob->num_axes() > 1 ?
+                  blob->count() / blob->shape(1) :
+                  blob->count();
+    Dtype n = fan_in;  // default to fan_in
+    if (this->filler_param_.variance_norm() ==
+        FillerParameter_VarianceNorm_AVERAGE) {
+      n = (fan_in + fan_out) / Dtype(2);
+    } else if (this->filler_param_.variance_norm() ==
+        FillerParameter_VarianceNorm_FAN_OUT) {
+      n = fan_out;
+    }
+    Dtype std = sqrt(Dtype(2) / n);
+    caffe_rng_gaussian<Dtype>(blob->count(), Dtype(0), std,
+        blob->mutable_cpu_data());
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/*!
+@brief Fills a Blob with coefficients for bilinear interpolation.
+
+A common use case is with the DeconvolutionLayer acting as upsampling.
+You can upsample a feature map with shape (B, C, H, W) by any integer factor
+using the following proto.
+\code
+layer {
+  name: "upsample", type: "Deconvolution"
+  bottom: "{{bottom_name}}" top: "{{top_name}}"
+  convolution_param {
+    kernel_size: {{2 * factor - factor % 2}} stride: {{factor}}
+    num_output: {{C}} group: {{C}}
+    pad: {{ceil((factor - 1) / 2.)}}
+    weight_filler: { type: "bilinear" } bias_term: false
+  }
+  param { lr_mult: 0 decay_mult: 0 }
+}
+\endcode
+Please use this by replacing `{{}}` with your values. By specifying
+`num_output: {{C}} group: {{C}}`, it behaves as
+channel-wise convolution. The filter shape of this deconvolution layer will be
+(C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K)
+interpolation kernel for every channel of the filter identically. The resulting
+shape of the top feature map will be (B, C, factor * H, factor * W).
+Note that the learning rate and the
+weight decay are set to 0 in order to keep coefficient values of bilinear
+interpolation unchanged during training. If you apply this to an image, this
+operation is equivalent to the following call in Python with scikit-image.
+\code{.py}
+out = skimage.transform.rescale(img, factor, mode='constant', cval=0)
+\endcode
+ */
+template <typename Dtype>
+class BilinearFiller : public Filler<Dtype> {
+ public:
+  explicit BilinearFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim.";
+    CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";
+    Dtype* data = blob->mutable_cpu_data();
+    int f = ceil(blob->width() / 2.);
+    Dtype c = (blob->width() - 1) / (2. * f);
+    for (int i = 0; i < blob->count(); ++i) {
+      Dtype x = i % blob->width();
+      Dtype y = (i / blob->width()) % blob->height();
+      data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
+    }
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/**
+ * @brief Get a specific filler from the specification given in FillerParameter.
+ *
+ * Ideally this would be replaced by a factory pattern, but we will leave it
+ * this way for now.
+ */
+template <typename Dtype>
+Filler<Dtype>* GetFiller(const FillerParameter& param) {
+  const std::string& type = param.type();
+  if (type == "constant") {
+    return new ConstantFiller<Dtype>(param);
+  } else if (type == "gaussian") {
+    return new GaussianFiller<Dtype>(param);
+  } else if (type == "positive_unitball") {
+    return new PositiveUnitballFiller<Dtype>(param);
+  } else if (type == "uniform") {
+    return new UniformFiller<Dtype>(param);
+  } else if (type == "xavier") {
+    return new XavierFiller<Dtype>(param);
+  } else if (type == "msra") {
+    return new MSRAFiller<Dtype>(param);
+  } else if (type == "bilinear") {
+    return new BilinearFiller<Dtype>(param);
+  } else {
+    CHECK(false) << "Unknown filler name: " << param.type();
+  }
+  return (Filler<Dtype>*)(NULL);
+}
+
+}  // namespace caffe
+
+#endif  // CAFFE_FILLER_HPP_
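
GetFiller returns a raw owning pointer, so callers usually wrap it immediately. A small sketch; "xavier" is one of the type strings dispatched above.

    #include <boost/shared_ptr.hpp>
    #include "caffe/filler.hpp"

    void fill_weights(caffe::Blob<float>* weights) {
      caffe::FillerParameter param;
      param.set_type("xavier");  // also: constant, uniform, gaussian, msra, ...

      boost::shared_ptr<caffe::Filler<float> > filler(
          caffe::GetFiller<float>(param));
      filler->Fill(weights);     // writes into weights->mutable_cpu_data()
    }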

+ 53 - 0
src/api/perception_lidar_cnn_segmentation/caffe/internal_thread.hpp

@@ -0,0 +1,53 @@
+#ifndef CAFFE_INTERNAL_THREAD_HPP_
+#define CAFFE_INTERNAL_THREAD_HPP_
+
+#include "caffe/common.hpp"
+
+/**
+ Forward declare boost::thread instead of including boost/thread.hpp
+ to avoid boost/NVCC issues (#1009, #1010) on OSX.
+ */
+namespace boost { class thread; }
+
+namespace caffe {
+
+/**
+ * Virtual class encapsulating boost::thread, for use as a base class.
+ * A child class acquires the ability to run a single internal thread
+ * by reimplementing the virtual function InternalThreadEntry.
+ */
+class InternalThread {
+ public:
+  InternalThread() : thread_() {}
+  virtual ~InternalThread();
+
+  /**
+   * Caffe's thread local state will be initialized using the current
+   * thread values, e.g. device id, solver index etc. The random seed
+   * is initialized using caffe_rng_rand.
+   */
+  void StartInternalThread();
+
+  /** Will not return until the internal thread has exited. */
+  void StopInternalThread();
+
+  bool is_started() const;
+
+ protected:
+  /* Implement this method in your subclass
+      with the code you want your thread to run. */
+  virtual void InternalThreadEntry() {}
+
+  /* Should be tested when running loops to exit when requested. */
+  bool must_stop();
+
+ private:
+  void entry(int device, Caffe::Brew mode, int rand_seed,
+      int solver_count, int solver_rank, bool multiprocess);
+
+  shared_ptr<boost::thread> thread_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_INTERNAL_THREAD_HPP_
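
A hedged sketch of a subclass (PrefetchThread is a hypothetical name). The worker loop polls must_stop() so that StopInternalThread() can interrupt and join cleanly.

    #include "caffe/internal_thread.hpp"

    class PrefetchThread : public caffe::InternalThread {
     protected:
      // Runs on the internal thread after StartInternalThread().
      virtual void InternalThreadEntry() {
        while (!must_stop()) {
          // ... produce one batch and hand it off ...
        }
      }
    };

    // Usage:
    //   PrefetchThread t;
    //   t.StartInternalThread();
    //   ...
    //   t.StopInternalThread();  // blocks until the thread exits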

+ 477 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layer.hpp

@@ -0,0 +1,477 @@
+#ifndef CAFFE_LAYER_H_
+#define CAFFE_LAYER_H_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer_factory.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/math_functions.hpp"
+
+/**
+ Forward declare boost::thread instead of including boost/thread.hpp
+ to avoid boost/NVCC issues (#1009, #1010) on OSX.
+ */
+namespace boost { class mutex; }
+
+namespace caffe {
+
+/**
+ * @brief An interface for the units of computation which can be composed into a
+ *        Net.
+ *
+ * Layer%s must implement a Forward function, in which they take their input
+ * (bottom) Blob%s (if any) and compute their output Blob%s (if any).
+ * They may also implement a Backward function, in which they compute the error
+ * gradients with respect to their input Blob%s, given the error gradients
+ * with respect to their output Blob%s.
+ */
+template <typename Dtype>
+class Layer {
+ public:
+  /**
+   * You should not implement your own constructor. Any set up code should go
+   * to SetUp(), where the dimensions of the bottom blobs are provided to the
+   * layer.
+   */
+  explicit Layer(const LayerParameter& param)
+    : layer_param_(param) {
+      // Set phase and copy blobs (if there are any).
+      phase_ = param.phase();
+      if (layer_param_.blobs_size() > 0) {
+        blobs_.resize(layer_param_.blobs_size());
+        for (int i = 0; i < layer_param_.blobs_size(); ++i) {
+          blobs_[i].reset(new Blob<Dtype>());
+          blobs_[i]->FromProto(layer_param_.blobs(i));
+        }
+      }
+    }
+  virtual ~Layer() {}
+
+  /**
+   * @brief Implements common layer setup functionality.
+   *
+   * @param bottom the preshaped input blobs
+   * @param top
+   *     the allocated but unshaped output blobs, to be shaped by Reshape
+   *
+   * Checks that the number of bottom and top blobs is correct.
+   * Calls LayerSetUp to do special layer setup for individual layer types,
+   * followed by Reshape to set up sizes of top blobs and internal buffers.
+   * Sets up the loss weight multiplier blobs for any non-zero loss weights.
+   * This method may not be overridden.
+   */
+  void SetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    CheckBlobCounts(bottom, top);
+    LayerSetUp(bottom, top);
+    Reshape(bottom, top);
+    SetLossWeights(top);
+  }
+
+  /**
+   * @brief Does layer-specific setup: your layer should implement this function
+   *        as well as Reshape.
+   *
+   * @param bottom
+   *     the preshaped input blobs, whose data fields store the input data for
+   *     this layer
+   * @param top
+   *     the allocated but unshaped output blobs
+   *
+   * This method should do one-time layer specific setup. This includes reading
+   * and processing relevant parameters from the <code>layer_param_</code>.
+   * Setting up the shapes of top blobs and internal buffers should be done in
+   * <code>Reshape</code>, which will be called before the forward pass to
+   * adjust the top blob sizes.
+   */
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  /**
+   * @brief Adjust the shapes of top blobs and internal buffers to accommodate
+   *        the shapes of the bottom blobs.
+   *
+   * @param bottom the input blobs, with the requested input shapes
+   * @param top the top blobs, which should be reshaped as needed
+   *
+   * This method should reshape top blobs as needed according to the shapes
+   * of the bottom (input) blobs, as well as reshaping any internal buffers
+   * and making any other necessary adjustments so that the layer can
+   * accommodate the bottom blobs.
+   */
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) = 0;
+
+  /**
+   * @brief Given the bottom blobs, compute the top blobs and the loss.
+   *
+   * @param bottom
+   *     the input blobs, whose data fields store the input data for this layer
+   * @param top
+   *     the preshaped output blobs, whose data fields will store this layer's
+   *     outputs
+   * \return The total loss from the layer.
+   *
+   * The Forward wrapper calls the relevant device wrapper function
+   * (Forward_cpu or Forward_gpu) to compute the top blob values given the
+   * bottom blobs.  If the layer has any non-zero loss_weights, the wrapper
+   * then computes and returns the loss.
+   *
+   * Your layer should implement Forward_cpu and (optionally) Forward_gpu.
+   */
+  inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Given the top blob error gradients, compute the bottom blob error
+   *        gradients.
+   *
+   * @param top
+   *     the output blobs, whose diff fields store the gradient of the error
+   *     with respect to themselves
+   * @param propagate_down
+   *     a vector with equal length to bottom, with each index indicating
+   *     whether to propagate the error gradients down to the bottom blob at
+   *     the corresponding index
+   * @param bottom
+   *     the input blobs, whose diff fields will store the gradient of the error
+   *     with respect to themselves after Backward is run
+   *
+   * The Backward wrapper calls the relevant device wrapper function
+   * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the
+   * top blob diffs.
+   *
+   * Your layer should implement Backward_cpu and (optionally) Backward_gpu.
+   */
+  inline void Backward(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down,
+      const vector<Blob<Dtype>*>& bottom);
+
+  /**
+   * @brief Returns the vector of learnable parameter blobs.
+   */
+  vector<shared_ptr<Blob<Dtype> > >& blobs() {
+    return blobs_;
+  }
+
+  /**
+   * @brief Returns the layer parameter.
+   */
+  const LayerParameter& layer_param() const { return layer_param_; }
+
+  /**
+   * @brief Writes the layer parameter to a protocol buffer
+   */
+  virtual void ToProto(LayerParameter* param, bool write_diff = false);
+
+  /**
+   * @brief Returns the scalar loss associated with a top blob at a given index.
+   */
+  inline Dtype loss(const int top_index) const {
+    return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0);
+  }
+
+  /**
+   * @brief Sets the loss associated with a top blob at a given index.
+   */
+  inline void set_loss(const int top_index, const Dtype value) {
+    if (loss_.size() <= top_index) {
+      loss_.resize(top_index + 1, Dtype(0));
+    }
+    loss_[top_index] = value;
+  }
+
+  /**
+   * @brief Returns the layer type.
+   */
+  virtual inline const char* type() const { return ""; }
+
+  /**
+   * @brief Returns the exact number of bottom blobs required by the layer,
+   *        or -1 if no exact number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some exact number of bottom blobs.
+   */
+  virtual inline int ExactNumBottomBlobs() const { return -1; }
+  /**
+   * @brief Returns the minimum number of bottom blobs required by the layer,
+   *        or -1 if no minimum number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some minimum number of bottom blobs.
+   */
+  virtual inline int MinBottomBlobs() const { return -1; }
+  /**
+   * @brief Returns the maximum number of bottom blobs required by the layer,
+   *        or -1 if no maximum number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some maximum number of bottom blobs.
+   */
+  virtual inline int MaxBottomBlobs() const { return -1; }
+  /**
+   * @brief Returns the exact number of top blobs required by the layer,
+   *        or -1 if no exact number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some exact number of top blobs.
+   */
+  virtual inline int ExactNumTopBlobs() const { return -1; }
+  /**
+   * @brief Returns the minimum number of top blobs required by the layer,
+   *        or -1 if no minimum number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some minimum number of top blobs.
+   */
+  virtual inline int MinTopBlobs() const { return -1; }
+  /**
+   * @brief Returns the maximum number of top blobs required by the layer,
+   *        or -1 if no maximum number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some maximum number of top blobs.
+   */
+  virtual inline int MaxTopBlobs() const { return -1; }
+  /**
+   * @brief Returns true if the layer requires an equal number of bottom and
+   *        top blobs.
+   *
+   * This method should be overridden to return true if your layer expects an
+   * equal number of bottom and top blobs.
+   */
+  virtual inline bool EqualNumBottomTopBlobs() const { return false; }
+
+  /**
+   * @brief Return whether "anonymous" top blobs are created automatically
+   *        by the layer.
+   *
+   * If this method returns true, Net::Init will create enough "anonymous" top
+   * blobs to fulfill the requirement specified by ExactNumTopBlobs() or
+   * MinTopBlobs().
+   */
+  virtual inline bool AutoTopBlobs() const { return false; }
+
+  /**
+   * @brief Return whether to allow force_backward for a given bottom blob
+   *        index.
+   *
+   * If AllowForceBackward(i) == false, we will ignore the force_backward
+   * setting and backpropagate to blob i only if it needs gradient information
+   * (as is done when force_backward == false).
+   */
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    return true;
+  }
+
+  /**
+   * @brief Specifies whether the layer should compute gradients w.r.t. a
+   *        parameter at a particular index given by param_id.
+   *
+   * You can safely ignore false values and always compute gradients
+   * for all parameters, but possibly with wasteful computation.
+   */
+  inline bool param_propagate_down(const int param_id) {
+    return (param_propagate_down_.size() > param_id) ?
+        param_propagate_down_[param_id] : false;
+  }
+  /**
+   * @brief Sets whether the layer should compute gradients w.r.t. a
+   *        parameter at a particular index given by param_id.
+   */
+  inline void set_param_propagate_down(const int param_id, const bool value) {
+    if (param_propagate_down_.size() <= param_id) {
+      param_propagate_down_.resize(param_id + 1, true);
+    }
+    param_propagate_down_[param_id] = value;
+  }
+
+
+ protected:
+  /** The protobuf that stores the layer parameters */
+  LayerParameter layer_param_;
+  /** The phase: TRAIN or TEST */
+  Phase phase_;
+  /** The vector that stores the learnable parameters as a set of blobs. */
+  vector<shared_ptr<Blob<Dtype> > > blobs_;
+  /** Vector indicating whether to compute the diff of each param blob. */
+  vector<bool> param_propagate_down_;
+
+  /** The vector that indicates whether each top blob has a non-zero weight in
+   *  the objective function. */
+  vector<Dtype> loss_;
+
+  /** @brief Using the CPU device, compute the layer output. */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) = 0;
+  /**
+   * @brief Using the GPU device, compute the layer output.
+   *        Fall back to Forward_cpu() if unavailable.
+   */
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    // LOG(WARNING) << "Using CPU code as backup.";
+    return Forward_cpu(bottom, top);
+  }
+
+  /**
+   * @brief Using the CPU device, compute the gradients for any parameters and
+   *        for the bottom blobs if propagate_down is true.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down,
+      const vector<Blob<Dtype>*>& bottom) = 0;
+  /**
+   * @brief Using the GPU device, compute the gradients for any parameters and
+   *        for the bottom blobs if propagate_down is true.
+   *        Fall back to Backward_cpu() if unavailable.
+   */
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down,
+      const vector<Blob<Dtype>*>& bottom) {
+    // LOG(WARNING) << "Using CPU code as backup.";
+    Backward_cpu(top, propagate_down, bottom);
+  }
+
+  /**
+   * Called by the parent Layer's SetUp to check that the number of bottom
+   * and top Blobs provided as input match the expected numbers specified by
+   * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions.
+   */
+  virtual void CheckBlobCounts(const vector<Blob<Dtype>*>& bottom,
+                               const vector<Blob<Dtype>*>& top) {
+    if (ExactNumBottomBlobs() >= 0) {
+      CHECK_EQ(ExactNumBottomBlobs(), bottom.size())
+          << type() << " Layer takes " << ExactNumBottomBlobs()
+          << " bottom blob(s) as input.";
+    }
+    if (MinBottomBlobs() >= 0) {
+      CHECK_LE(MinBottomBlobs(), bottom.size())
+          << type() << " Layer takes at least " << MinBottomBlobs()
+          << " bottom blob(s) as input.";
+    }
+    if (MaxBottomBlobs() >= 0) {
+      CHECK_GE(MaxBottomBlobs(), bottom.size())
+          << type() << " Layer takes at most " << MaxBottomBlobs()
+          << " bottom blob(s) as input.";
+    }
+    if (ExactNumTopBlobs() >= 0) {
+      CHECK_EQ(ExactNumTopBlobs(), top.size())
+          << type() << " Layer produces " << ExactNumTopBlobs()
+          << " top blob(s) as output.";
+    }
+    if (MinTopBlobs() >= 0) {
+      CHECK_LE(MinTopBlobs(), top.size())
+          << type() << " Layer produces at least " << MinTopBlobs()
+          << " top blob(s) as output.";
+    }
+    if (MaxTopBlobs() >= 0) {
+      CHECK_GE(MaxTopBlobs(), top.size())
+          << type() << " Layer produces at most " << MaxTopBlobs()
+          << " top blob(s) as output.";
+    }
+    if (EqualNumBottomTopBlobs()) {
+      CHECK_EQ(bottom.size(), top.size())
+          << type() << " Layer produces one top blob as output for each "
+          << "bottom blob input.";
+    }
+  }
+
+  /**
+   * Called by SetUp to initialize the weights associated with any top blobs in
+   * the loss function. Store non-zero loss weights in the diff blob.
+   */
+  inline void SetLossWeights(const vector<Blob<Dtype>*>& top) {
+    const int num_loss_weights = layer_param_.loss_weight_size();
+    if (num_loss_weights) {
+      CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be "
+          "unspecified or specified once per top blob.";
+      for (int top_id = 0; top_id < top.size(); ++top_id) {
+        const Dtype loss_weight = layer_param_.loss_weight(top_id);
+        if (loss_weight == Dtype(0)) { continue; }
+        this->set_loss(top_id, loss_weight);
+        const int count = top[top_id]->count();
+        Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff();
+        caffe_set(count, loss_weight, loss_multiplier);
+      }
+    }
+  }
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(Layer);
+};  // class Layer
+
+// Forward and backward wrappers. You should implement the cpu and
+// gpu specific implementations instead, and should not change these
+// functions.
+template <typename Dtype>
+inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  Dtype loss = 0;
+  Reshape(bottom, top);
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    Forward_cpu(bottom, top);
+    for (int top_id = 0; top_id < top.size(); ++top_id) {
+      if (!this->loss(top_id)) { continue; }
+      const int count = top[top_id]->count();
+      const Dtype* data = top[top_id]->cpu_data();
+      const Dtype* loss_weights = top[top_id]->cpu_diff();
+      loss += caffe_cpu_dot(count, data, loss_weights);
+    }
+    break;
+  case Caffe::GPU:
+    Forward_gpu(bottom, top);
+#ifndef CPU_ONLY
+    for (int top_id = 0; top_id < top.size(); ++top_id) {
+      if (!this->loss(top_id)) { continue; }
+      const int count = top[top_id]->count();
+      const Dtype* data = top[top_id]->gpu_data();
+      const Dtype* loss_weights = top[top_id]->gpu_diff();
+      Dtype blob_loss = 0;
+      caffe_gpu_dot(count, data, loss_weights, &blob_loss);
+      loss += blob_loss;
+    }
+#endif
+    break;
+  default:
+    LOG(FATAL) << "Unknown caffe mode.";
+  }
+  return loss;
+}
+
+template <typename Dtype>
+inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    Backward_cpu(top, propagate_down, bottom);
+    break;
+  case Caffe::GPU:
+    Backward_gpu(top, propagate_down, bottom);
+    break;
+  default:
+    LOG(FATAL) << "Unknown caffe mode.";
+  }
+}
+
+// Serialize LayerParameter to protocol buffer
+template <typename Dtype>
+void Layer<Dtype>::ToProto(LayerParameter* param, bool write_diff) {
+  param->Clear();
+  param->CopyFrom(layer_param_);
+  param->clear_blobs();
+  for (int i = 0; i < blobs_.size(); ++i) {
+    blobs_[i]->ToProto(param->add_blobs(), write_diff);
+  }
+}
+
+}  // namespace caffe
+
+#endif  // CAFFE_LAYER_H_
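
To make the contract above concrete, here is a hedged sketch of a minimal element-wise layer (ScaleByTwoLayer is hypothetical). Only Reshape, Forward_cpu and Backward_cpu must be implemented; the GPU variants fall back to the CPU ones by default.

    #include <vector>
    #include "caffe/layer.hpp"

    namespace caffe {

    template <typename Dtype>
    class ScaleByTwoLayer : public Layer<Dtype> {
     public:
      explicit ScaleByTwoLayer(const LayerParameter& param)
          : Layer<Dtype>(param) {}
      virtual inline const char* type() const { return "ScaleByTwo"; }
      virtual inline int ExactNumBottomBlobs() const { return 1; }
      virtual inline int ExactNumTopBlobs() const { return 1; }

      virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top) {
        top[0]->ReshapeLike(*bottom[0]);  // element-wise: same shape as input
      }

     protected:
      // y = 2x
      virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top) {
        const Dtype* x = bottom[0]->cpu_data();
        Dtype* y = top[0]->mutable_cpu_data();
        for (int i = 0; i < bottom[0]->count(); ++i) { y[i] = Dtype(2) * x[i]; }
      }
      // dE/dx = 2 * dE/dy
      virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
          const vector<bool>& propagate_down,
          const vector<Blob<Dtype>*>& bottom) {
        if (!propagate_down[0]) { return; }
        const Dtype* dy = top[0]->cpu_diff();
        Dtype* dx = bottom[0]->mutable_cpu_diff();
        for (int i = 0; i < top[0]->count(); ++i) { dx[i] = Dtype(2) * dy[i]; }
      }
    };

    }  // namespace caffe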

+ 141 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layer_factory.hpp

@@ -0,0 +1,141 @@
+/**
+ * @brief A layer factory that allows one to register layers.
 * During runtime, registered layers can be created by passing a LayerParameter
+ * protobuffer to the CreateLayer function:
+ *
+ *     LayerRegistry<Dtype>::CreateLayer(param);
+ *
+ * There are two ways to register a layer. Assuming that we have a layer like:
+ *
+ *   template <typename Dtype>
+ *   class MyAwesomeLayer : public Layer<Dtype> {
+ *     // your implementations
+ *   };
+ *
+ * and its type is its C++ class name, but without the "Layer" at the end
+ * ("MyAwesomeLayer" -> "MyAwesome").
+ *
+ * If the layer is going to be created simply by its constructor, in your c++
+ * file, add the following line:
+ *
+ *    REGISTER_LAYER_CLASS(MyAwesome);
+ *
+ * Or, if the layer is going to be created by another creator function, in the
+ * format of:
+ *
+ *    template <typename Dtype>
+ *    shared_ptr<Layer<Dtype> > GetMyAwesomeLayer(const LayerParameter& param) {
+ *      // your implementation
+ *    }
+ *
+ * (for example, when your layer has multiple backends, see GetConvolutionLayer
+ * for a use case), then you can register the creator function instead, like
+ *
+ * REGISTER_LAYER_CREATOR(MyAwesome, GetMyAwesomeLayer)
+ *
+ * Note that each layer type should only be registered once.
+ */
+
+#ifndef CAFFE_LAYER_FACTORY_H_
+#define CAFFE_LAYER_FACTORY_H_
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+template <typename Dtype>
+class Layer;
+
+template <typename Dtype>
+class LayerRegistry {
+ public:
+  typedef shared_ptr<Layer<Dtype> > (*Creator)(const LayerParameter&);
+  typedef std::map<string, Creator> CreatorRegistry;
+
+  static CreatorRegistry& Registry() {
+    static CreatorRegistry* g_registry_ = new CreatorRegistry();
+    return *g_registry_;
+  }
+
+  // Adds a creator.
+  static void AddCreator(const string& type, Creator creator) {
+    CreatorRegistry& registry = Registry();
+    CHECK_EQ(registry.count(type), 0)
+        << "Layer type " << type << " already registered.";
+    registry[type] = creator;
+  }
+
+  // Get a layer using a LayerParameter.
+  static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "Creating layer " << param.name();
+    }
+    const string& type = param.type();
+    CreatorRegistry& registry = Registry();
+    CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
+        << " (known types: " << LayerTypeListString() << ")";
+    return registry[type](param);
+  }
+
+  static vector<string> LayerTypeList() {
+    CreatorRegistry& registry = Registry();
+    vector<string> layer_types;
+    for (typename CreatorRegistry::iterator iter = registry.begin();
+         iter != registry.end(); ++iter) {
+      layer_types.push_back(iter->first);
+    }
+    return layer_types;
+  }
+
+ private:
+  // Layer registry should never be instantiated - everything is done with its
+  // static variables.
+  LayerRegistry() {}
+
+  static string LayerTypeListString() {
+    vector<string> layer_types = LayerTypeList();
+    string layer_types_str;
+    for (vector<string>::iterator iter = layer_types.begin();
+         iter != layer_types.end(); ++iter) {
+      if (iter != layer_types.begin()) {
+        layer_types_str += ", ";
+      }
+      layer_types_str += *iter;
+    }
+    return layer_types_str;
+  }
+};
+
+
+template <typename Dtype>
+class LayerRegisterer {
+ public:
+  LayerRegisterer(const string& type,
+                  shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) {
+    // LOG(INFO) << "Registering layer type: " << type;
+    LayerRegistry<Dtype>::AddCreator(type, creator);
+  }
+};
+
+
+#define REGISTER_LAYER_CREATOR(type, creator)                                  \
+  static LayerRegisterer<float> g_creator_f_##type(#type, creator<float>);     \
+  static LayerRegisterer<double> g_creator_d_##type(#type, creator<double>)    \
+
+#define REGISTER_LAYER_CLASS(type)                                             \
+  template <typename Dtype>                                                    \
+  shared_ptr<Layer<Dtype> > Creator_##type##Layer(const LayerParameter& param) \
+  {                                                                            \
+    return shared_ptr<Layer<Dtype> >(new type##Layer<Dtype>(param));           \
+  }                                                                            \
+  REGISTER_LAYER_CREATOR(type, Creator_##type##Layer)
+
+}  // namespace caffe
+
+#endif  // CAFFE_LAYER_FACTORY_H_
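
Continuing the hypothetical ScaleByTwoLayer sketch from layer.hpp above, registration in the layer's .cpp is two macro invocations, after which the type string is usable from prototxt and CreateLayer.

    #include "caffe/layer_factory.hpp"
    // plus the hypothetical ScaleByTwoLayer declaration from above

    namespace caffe {
    INSTANTIATE_CLASS(ScaleByTwoLayer);  // float and double instantiations
    REGISTER_LAYER_CLASS(ScaleByTwo);    // registers type string "ScaleByTwo"
    }  // namespace caffe

    // Later, anywhere:
    //   caffe::LayerParameter p;
    //   p.set_type("ScaleByTwo");
    //   caffe::shared_ptr<caffe::Layer<float> > layer =
    //       caffe::LayerRegistry<float>::CreateLayer(p);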

+ 68 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/absval_layer.hpp

@@ -0,0 +1,68 @@
+#ifndef CAFFE_ABSVAL_LAYER_HPP_
+#define CAFFE_ABSVAL_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes @f$ y = |x| @f$
+ *
+ * @param bottom input Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the inputs @f$ x @f$
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the computed outputs @f$ y = |x| @f$
+ */
+template <typename Dtype>
+class AbsValLayer : public NeuronLayer<Dtype> {
+ public:
+  explicit AbsValLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "AbsVal"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /// @copydoc AbsValLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the absolute value inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} =
+   *            \mathrm{sign}(x) \frac{\partial E}{\partial y}
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_ABSVAL_LAYER_HPP_
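
For reference, the forward pass of this layer is a single call into the math wrappers; the sketch below mirrors the upstream BVLC implementation of the CPU path (the vendored .cpp may differ in detail).

    #include "caffe/layers/absval_layer.hpp"
    #include "caffe/util/math_functions.hpp"

    namespace caffe {

    template <typename Dtype>
    void AbsValLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      const int count = top[0]->count();
      // y = |x|, element-wise.
      caffe_abs(count, bottom[0]->cpu_data(), top[0]->mutable_cpu_data());
    }

    }  // namespace caffe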

+ 99 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/accuracy_layer.hpp

@@ -0,0 +1,99 @@
+#ifndef CAFFE_ACCURACY_LAYER_HPP_
+#define CAFFE_ACCURACY_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes the classification accuracy for a one-of-many
+ *        classification task.
+ */
+template <typename Dtype>
+class AccuracyLayer : public Layer<Dtype> {
+ public:
+  /**
+   * @param param provides AccuracyParameter accuracy_param,
+   *     with AccuracyLayer options:
+   *   - top_k (\b optional, default 1).
+   *     Sets the maximum rank @f$ k @f$ at which a prediction is considered
+   *     correct.  For example, if @f$ k = 5 @f$, a prediction is counted
+   *     correct if the correct label is among the top 5 predicted labels.
+   */
+  explicit AccuracyLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Accuracy"; }
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+
+  // If there are two top blobs, then the second blob will contain
+  // accuracies per class.
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 2; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the predictions @f$ x @f$, a Blob with values in
+   *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
+   *      the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted
+   *      label @f$ \hat{l}_n @f$ given by its maximal index:
+   *      @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$
+   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+   *      the labels @f$ l @f$, an integer-valued Blob with values
+   *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+   *      indicating the correct class label among the @f$ K @f$ classes
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      the computed accuracy: @f$
+   *        \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \}
+   *      @f$, where @f$
+   *      \delta\{\mathrm{condition}\} = \left\{
+   *         \begin{array}{lr}
+   *            1 & \mbox{if condition} \\
+   *            0 & \mbox{otherwise}
+   *         \end{array} \right.
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+
+  /// @brief Not implemented -- AccuracyLayer cannot be used as a loss.
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    for (int i = 0; i < propagate_down.size(); ++i) {
+      if (propagate_down[i]) { NOT_IMPLEMENTED; }
+    }
+  }
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int label_axis_, outer_num_, inner_num_;
+
+  int top_k_;
+
+  /// Whether to ignore instances with a certain label.
+  bool has_ignore_label_;
+  /// The label indicating that an instance should be ignored.
+  int ignore_label_;
+  /// Keeps counts of the number of samples per class.
+  Blob<Dtype> nums_buffer_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_ACCURACY_LAYER_HPP_
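
The top_k rule documented above amounts to: a sample counts as correct if fewer than k classes score strictly higher than its true label. A self-contained sketch of that rule (not the layer's actual code path):

    #include <vector>

    // Fraction of samples whose true label is among the top-k scores.
    // scores is num x num_classes, row-major; labels has num entries.
    float topk_accuracy(const std::vector<float>& scores,
                        const std::vector<int>& labels,
                        int num_classes, int top_k) {
      int correct = 0;
      const int num = static_cast<int>(labels.size());
      for (int n = 0; n < num; ++n) {
        const float* row = &scores[n * num_classes];
        const float true_score = row[labels[n]];
        int beaten_by = 0;  // classes scoring strictly above the true label
        for (int c = 0; c < num_classes; ++c) {
          if (row[c] > true_score) { ++beaten_by; }
        }
        if (beaten_by < top_k) { ++correct; }
      }
      return num ? static_cast<float>(correct) / num : 0.f;
    }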

+ 77 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/argmax_layer.hpp

@@ -0,0 +1,77 @@
+#ifndef CAFFE_ARGMAX_LAYER_HPP_
+#define CAFFE_ARGMAX_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Compute the index of the @f$ K @f$ max values for each datum across
+ *        all dimensions @f$ (C \times H \times W) @f$.
+ *
+ * Intended for use after a classification layer to produce a prediction.
+ * If parameter out_max_val is set to true, output is a vector of pairs
+ * (max_ind, max_val) for each image. The axis parameter specifies an axis
+ * along which to maximise.
+ *
+ * NOTE: does not implement Backwards operation.
+ */
+template <typename Dtype>
+class ArgMaxLayer : public Layer<Dtype> {
+ public:
+  /**
+   * @param param provides ArgMaxParameter argmax_param,
+   *     with ArgMaxLayer options:
+   *   - top_k (\b optional uint, default 1).
+   *     the number @f$ K @f$ of maximal items to output.
+   *   - out_max_val (\b optional bool, default false).
+   *     if set, output a vector of pairs (max_ind, max_val); if axis is also
+   *     set, output max_val along the specified axis.
+   *   - axis (\b optional int).
+   *     if set, maximise along the specified axis; otherwise maximise over the
+   *     flattened trailing dimensions for each index of the first / num dimension.
+   */
+  explicit ArgMaxLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "ArgMax"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times 1 \times K) @f$ or, if out_max_val is set,
+   *      @f$ (N \times 2 \times K) @f$; if axis is set, then e.g.
+   *      @f$ (N \times K \times H \times W) @f$ for axis == 1
+   *      the computed outputs @f$
+   *       y_n = \arg\max\limits_i x_{ni}
+   *      @f$ (for @f$ K = 1 @f$).
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  /// @brief Not implemented (non-differentiable function)
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    NOT_IMPLEMENTED;
+  }
+  bool out_max_val_;
+  size_t top_k_;
+  bool has_axis_;
+  int axis_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_ARGMAX_LAYER_HPP_
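
With axis unset and top_k = 1, the layer reduces to a flat argmax over the trailing C*H*W scores of each datum; a hedged standalone sketch of that default case:

    #include <vector>

    // Per-datum argmax over dim flattened scores (axis unset, K = 1).
    std::vector<int> flat_argmax(const std::vector<float>& scores,
                                 int num, int dim) {
      std::vector<int> max_ind(num, 0);
      for (int n = 0; n < num; ++n) {
        const float* row = &scores[n * dim];
        int best = 0;
        for (int i = 1; i < dim; ++i) {
          if (row[i] > row[best]) { best = i; }
        }
        max_ind[n] = best;
      }
      return max_ind;
    }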

+ 174 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/base_conv_layer.hpp

@@ -0,0 +1,174 @@
+#ifndef CAFFE_BASE_CONVOLUTION_LAYER_HPP_
+#define CAFFE_BASE_CONVOLUTION_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/im2col.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Abstract base class that factors out the BLAS code common to
+ *        ConvolutionLayer and DeconvolutionLayer.
+ */
+template <typename Dtype>
+class BaseConvolutionLayer : public Layer<Dtype> {
+ public:
+  explicit BaseConvolutionLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline int MinBottomBlobs() const { return 1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline bool EqualNumBottomTopBlobs() const { return true; }
+
+ protected:
+  // Helper functions that abstract away the column buffer and gemm arguments.
+  // The last argument in forward_cpu_gemm is so that we can skip the im2col if
+  // we just called weight_cpu_gemm with the same input.
+  void forward_cpu_gemm(const Dtype* input, const Dtype* weights,
+      Dtype* output, bool skip_im2col = false);
+  void forward_cpu_bias(Dtype* output, const Dtype* bias);
+  void backward_cpu_gemm(const Dtype* input, const Dtype* weights,
+      Dtype* output);
+  void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype*
+      weights);
+  void backward_cpu_bias(Dtype* bias, const Dtype* input);
+
+#ifndef CPU_ONLY
+  void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
+      Dtype* output, bool skip_im2col = false);
+  void forward_gpu_bias(Dtype* output, const Dtype* bias);
+  void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
+      Dtype* col_output);
+  void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype*
+      weights);
+  void backward_gpu_bias(Dtype* bias, const Dtype* input);
+#endif
+
+  /// @brief The spatial dimensions of the input.
+  inline int input_shape(int i) {
+    return (*bottom_shape_)[channel_axis_ + i];
+  }
+  // reverse_dimensions should return true iff we are implementing deconv, so
+  // that conv helpers know which dimensions are which.
+  virtual bool reverse_dimensions() = 0;
+  // Compute height_out_ and width_out_ from other parameters.
+  virtual void compute_output_shape() = 0;
+
+  /// @brief The spatial dimensions of a filter kernel.
+  Blob<int> kernel_shape_;
+  /// @brief The spatial dimensions of the stride.
+  Blob<int> stride_;
+  /// @brief The spatial dimensions of the padding.
+  Blob<int> pad_;
+  /// @brief The spatial dimensions of the dilation.
+  Blob<int> dilation_;
+  /// @brief The spatial dimensions of the convolution input.
+  Blob<int> conv_input_shape_;
+  /// @brief The spatial dimensions of the col_buffer.
+  vector<int> col_buffer_shape_;
+  /// @brief The spatial dimensions of the output.
+  vector<int> output_shape_;
+  const vector<int>* bottom_shape_;
+
+  int num_spatial_axes_;
+  int bottom_dim_;
+  int top_dim_;
+
+  int channel_axis_;
+  int num_;
+  int channels_;
+  int group_;
+  int out_spatial_dim_;
+  int weight_offset_;
+  int num_output_;
+  bool bias_term_;
+  bool is_1x1_;
+  bool force_nd_im2col_;
+
+ private:
+  // wrap im2col/col2im so we don't have to remember the (long) argument lists
+  inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      im2col_cpu(data, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
+    } else {
+      im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),
+          col_buffer_shape_.data(), kernel_shape_.cpu_data(),
+          pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), col_buff);
+    }
+  }
+  inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      col2im_cpu(col_buff, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
+    } else {
+      col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(),
+          col_buffer_shape_.data(), kernel_shape_.cpu_data(),
+          pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), data);
+    }
+  }
+#ifndef CPU_ONLY
+  inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      im2col_gpu(data, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
+    } else {
+      im2col_nd_gpu(data, num_spatial_axes_, num_kernels_im2col_,
+          conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
+          kernel_shape_.gpu_data(), pad_.gpu_data(),
+          stride_.gpu_data(), dilation_.gpu_data(), col_buff);
+    }
+  }
+  inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      col2im_gpu(col_buff, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
+    } else {
+      col2im_nd_gpu(col_buff, num_spatial_axes_, num_kernels_col2im_,
+          conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
+          kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
+          dilation_.gpu_data(), data);
+    }
+  }
+#endif
+
+  int num_kernels_im2col_;
+  int num_kernels_col2im_;
+  int conv_out_channels_;
+  int conv_in_channels_;
+  int conv_out_spatial_dim_;
+  int kernel_dim_;
+  int col_offset_;
+  int output_offset_;
+
+  Blob<Dtype> col_buffer_;
+  Blob<Dtype> bias_multiplier_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_BASE_CONVOLUTION_LAYER_HPP_
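
The conv_im2col_cpu()/conv_col2im_cpu() wrappers above hide the core trick: every filter-sized input window is unrolled into a column, so convolution reduces to a single GEMM. A simplified 2D sketch of the same transformation (illustrative only, not Caffe's optimized im2col_cpu):

```cpp
// Simplified 2D im2col: each output column holds one filter-sized input
// window; convolution then becomes a (num_output x C*kh*kw) GEMM against
// this buffer. Illustrative sketch only.
void im2col_simple(const float* data, int channels, int height, int width,
                   int kh, int kw, int pad, int stride, float* col) {
  const int out_h = (height + 2 * pad - kh) / stride + 1;
  const int out_w = (width + 2 * pad - kw) / stride + 1;
  for (int c = 0; c < channels; ++c)
    for (int ki = 0; ki < kh; ++ki)
      for (int kj = 0; kj < kw; ++kj)           // one row per (c, ki, kj)
        for (int oi = 0; oi < out_h; ++oi)
          for (int oj = 0; oj < out_w; ++oj) {  // one column per output pixel
            const int ii = oi * stride - pad + ki;
            const int jj = oj * stride - pad + kj;
            *col++ = (ii >= 0 && ii < height && jj >= 0 && jj < width)
                         ? data[(c * height + ii) * width + jj]
                         : 0.0f;                // implicit zero padding
          }
}
```

With the column buffer laid out this way, forward_cpu_gemm() amounts to one matrix product of the (num_output x kernel_dim_) weights with the (kernel_dim_ x out_spatial_dim_) buffer.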

+ 82 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/base_data_layer.hpp

@@ -0,0 +1,82 @@
+#ifndef CAFFE_DATA_LAYERS_HPP_
+#define CAFFE_DATA_LAYERS_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/data_transformer.hpp"
+#include "caffe/internal_thread.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/blocking_queue.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Provides base for data layers that feed blobs to the Net.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class BaseDataLayer : public Layer<Dtype> {
+ public:
+  explicit BaseDataLayer(const LayerParameter& param);
+  // LayerSetUp: implements common data layer setup functionality, and calls
+  // DataLayerSetUp to do special data layer setup for individual layer types.
+  // This method may not be overridden except by the BasePrefetchingDataLayer.
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+  // Data layers have no bottoms, so reshaping is trivial.
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+
+ protected:
+  TransformationParameter transform_param_;
+  shared_ptr<DataTransformer<Dtype> > data_transformer_;
+  bool output_labels_;
+};
+
+template <typename Dtype>
+class Batch {
+ public:
+  Blob<Dtype> data_, label_;
+};
+
+template <typename Dtype>
+class BasePrefetchingDataLayer :
+    public BaseDataLayer<Dtype>, public InternalThread {
+ public:
+  explicit BasePrefetchingDataLayer(const LayerParameter& param);
+  // LayerSetUp: implements common data layer setup functionality, and calls
+  // DataLayerSetUp to do special data layer setup for individual layer types.
+  // This method may not be overridden.
+  void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+ protected:
+  virtual void InternalThreadEntry();
+  virtual void load_batch(Batch<Dtype>* batch) = 0;
+
+  vector<shared_ptr<Batch<Dtype> > > prefetch_;
+  BlockingQueue<Batch<Dtype>*> prefetch_free_;
+  BlockingQueue<Batch<Dtype>*> prefetch_full_;
+  Batch<Dtype>* prefetch_current_;
+
+  Blob<Dtype> transformed_data_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DATA_LAYERS_HPP_
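
The prefetch_free_/prefetch_full_ pair above implements a classic double-buffered producer/consumer: the internal thread pops an empty Batch from the free queue, fills it, and pushes it onto the full queue, while Forward does the reverse. A stand-alone sketch of that pattern using std:: primitives (the names here are hypothetical stand-ins, not Caffe's BlockingQueue):

```cpp
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

// Minimal blocking queue standing in for caffe::BlockingQueue.
template <typename T>
class SimpleBlockingQueue {
 public:
  void push(T t) {
    { std::lock_guard<std::mutex> g(m_); q_.push(std::move(t)); }
    cv_.notify_one();
  }
  T pop() {
    std::unique_lock<std::mutex> l(m_);
    cv_.wait(l, [this] { return !q_.empty(); });
    T t = std::move(q_.front());
    q_.pop();
    return t;
  }
 private:
  std::mutex m_;
  std::condition_variable cv_;
  std::queue<T> q_;
};

struct Batch { std::vector<float> data; };

int main() {
  SimpleBlockingQueue<Batch*> free_q, full_q;
  std::vector<Batch> pool(3);               // fixed buffer pool, like prefetch_
  for (Batch& b : pool) free_q.push(&b);    // all buffers start out free

  std::thread producer([&] {                // InternalThreadEntry analogue
    for (int i = 0; i < 5; ++i) {
      Batch* b = free_q.pop();              // wait for a free buffer
      b->data.assign(4, static_cast<float>(i));  // load_batch analogue
      full_q.push(b);                       // hand it to the consumer
    }
  });

  for (int i = 0; i < 5; ++i) {             // Forward analogue
    Batch* b = full_q.pop();                // wait for a loaded buffer
    std::cout << "batch " << b->data[0] << '\n';
    free_q.push(b);                         // recycle the buffer
  }
  producer.join();
}
```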

+ 78 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/batch_norm_layer.hpp

@@ -0,0 +1,78 @@
+#ifndef CAFFE_BATCHNORM_LAYER_HPP_
+#define CAFFE_BATCHNORM_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Normalizes the input to have 0-mean and/or unit (1) variance across
+ *        the batch.
+ *
+ * This layer computes Batch Normalization as described in [1]. For each channel
+ * in the data (i.e. axis 1), it subtracts the mean and divides by the variance,
+ * where both statistics are computed across both spatial dimensions and across
+ * the different examples in the batch.
+ *
+ * By default, during training time, the network is computing global
+ * mean/variance statistics via a running average, which is then used at test
+ * time to allow deterministic outputs for each input. You can manually toggle
+ * whether the network is accumulating or using the statistics via the
+ * use_global_stats option. For reference, these statistics are kept in the
+ * layer's three blobs: (0) mean, (1) variance, and (2) moving average factor.
+ *
+ * Note that the original paper also included a per-channel learned bias and
+ * scaling factor. To implement this in Caffe, define a `ScaleLayer` configured
+ * with `bias_term: true` after each `BatchNormLayer` to handle both the bias
+ * and scaling factor.
+ *
+ * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network
+ *     Training by Reducing Internal Covariate Shift." arXiv preprint
+ *     arXiv:1502.03167 (2015).
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class BatchNormLayer : public Layer<Dtype> {
+ public:
+  explicit BatchNormLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "BatchNorm"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<Dtype> mean_, variance_, temp_, x_norm_;
+  bool use_global_stats_;
+  Dtype moving_average_fraction_;
+  int channels_;
+  Dtype eps_;
+
+  // extra temporary variables are used to carry out sums/broadcasting
+  // using BLAS
+  Blob<Dtype> batch_sum_multiplier_;
+  Blob<Dtype> num_by_chans_;
+  Blob<Dtype> spatial_sum_multiplier_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_BATCHNORM_LAYER_HPP_
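
As a concrete reference for the description above, a minimal sketch of the per-channel normalization, with statistics taken over N, H, and W for each channel. The learned scale/bias is left to a following ScaleLayer, and the running-average bookkeeping is omitted:

```cpp
#include <cmath>
#include <vector>

// Normalize x (layout N x C x HW) in place: zero mean, unit variance
// per channel. Illustrative sketch only.
void batch_norm(std::vector<float>& x, int N, int C, int HW,
                float eps = 1e-5f) {
  for (int c = 0; c < C; ++c) {
    const int count = N * HW;
    double mean = 0.0, var = 0.0;
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < HW; ++s)
        mean += x[(n * C + c) * HW + s];
    mean /= count;
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < HW; ++s) {
        const double d = x[(n * C + c) * HW + s] - mean;
        var += d * d;
      }
    var /= count;
    const double inv_std = 1.0 / std::sqrt(var + eps);
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < HW; ++s) {
        float& v = x[(n * C + c) * HW + s];
        v = static_cast<float>((v - mean) * inv_std);
      }
  }
}
```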

+ 83 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/batch_reindex_layer.hpp

@@ -0,0 +1,83 @@
+#ifndef CAFFE_BATCHREINDEX_LAYER_HPP_
+#define CAFFE_BATCHREINDEX_LAYER_HPP_
+
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Index into the input blob along its first axis.
+ *
+ * This layer can be used to select, reorder, and even replicate examples in a
+ * batch.  The second blob is cast to int and treated as an index into the
+ * first axis of the first blob.
+ */
+template <typename Dtype>
+class BatchReindexLayer : public Layer<Dtype> {
+ public:
+  explicit BatchReindexLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "BatchReindex"; }
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times ...) @f$
+   *      the inputs @f$ x_1 @f$
+   *   -# @f$ (M) @f$
+   *      the inputs @f$ x_2 @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (M \times ...) @f$:
+   *      the reindexed array @f$
+   *        y = x_1[x_2]
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the reordered input.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient
+   *        with respect to the outputs
+   *   -# @f$ (M \times ...) @f$:
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to concatenated outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2):
+   *   - @f$ \frac{\partial E}{\partial y} @f$ is de-indexed (summing where
+   *     required) back to the input x_1
+   *   - This layer cannot backprop to x_2, i.e. propagate_down[1] must be
+   *     false.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+ private:
+  struct pair_sort_first {
+    bool operator()(const std::pair<int, int> &left,
+                    const std::pair<int, int> &right) {
+      return left.first < right.first;
+    }
+  };
+  void check_batch_reindex(int initial_num, int final_num,
+                           const Dtype* ridx_data);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_BATCHREINDEX_LAYER_HPP_
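
The forward operation y = x1[x2] amounts to a gather along the first axis; a small sketch (illustrative only):

```cpp
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

// Gather rows of x1 (n rows of `inner` elements each) at positions idx.
std::vector<float> batch_reindex(const std::vector<float>& x1,
                                 int n, int inner,
                                 const std::vector<int>& idx) {
  std::vector<float> y(idx.size() * inner);
  for (std::size_t m = 0; m < idx.size(); ++m) {
    assert(idx[m] >= 0 && idx[m] < n);  // check_batch_reindex analogue
    for (int i = 0; i < inner; ++i)
      y[m * inner + i] = x1[idx[m] * inner + i];
  }
  return y;
}

int main() {
  // Two examples of length 2; select example 1 twice, then example 0.
  std::vector<float> x1 = {1, 2, 3, 4};
  std::vector<float> y = batch_reindex(x1, 2, 2, {1, 1, 0});
  for (float v : y) std::cout << v << ' ';  // prints "3 4 3 4 1 2"
  std::cout << '\n';
}
```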

+ 54 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/bias_layer.hpp

@@ -0,0 +1,54 @@
+#ifndef CAFFE_BIAS_LAYER_HPP_
+#define CAFFE_BIAS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Computes a sum of two input Blobs, with the shape of the latter Blob
+ *        "broadcast" to match the shape of the former. Equivalent to tiling
+ *        the latter Blob, then computing the elementwise sum.
+ *
+ * The second input may be omitted, in which case it's learned as a parameter
+ * of the layer. Note: in case bias and scaling are desired, both operations can
+ * be handled by `ScaleLayer` configured with `bias_term: true`.
+ */
+template <typename Dtype>
+class BiasLayer : public Layer<Dtype> {
+ public:
+  explicit BiasLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Bias"; }
+  virtual inline int MinBottomBlobs() const { return 1; }
+  virtual inline int MaxBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+ private:
+  Blob<Dtype> bias_multiplier_;
+  int outer_dim_, bias_dim_, inner_dim_, dim_;
+};
+
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_BIAS_LAYER_HPP_
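
A sketch of the broadcast addition described above, using the same outer/bias/inner decomposition as the layer's outer_dim_, bias_dim_, and inner_dim_ members (illustrative only):

```cpp
#include <vector>

// Add a length-`bias_dim` bias across an (outer x bias_dim x inner) blob,
// equivalent to tiling the bias and summing elementwise.
void add_bias(std::vector<float>& x, const std::vector<float>& bias,
              int outer, int bias_dim, int inner) {
  for (int o = 0; o < outer; ++o)
    for (int c = 0; c < bias_dim; ++c)
      for (int i = 0; i < inner; ++i)
        x[(o * bias_dim + c) * inner + i] += bias[c];
}
```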

+ 70 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/bnll_layer.hpp

@@ -0,0 +1,70 @@
+#ifndef CAFFE_BNLL_LAYER_HPP_
+#define CAFFE_BNLL_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes @f$ y = x + \log(1 + \exp(-x)) @f$ if @f$ x > 0 @f$;
+ *        @f$ y = \log(1 + \exp(x)) @f$ otherwise.
+ *
+ * @param bottom input Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the inputs @f$ x @f$
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the computed outputs @f$
+ *      y = \left\{
+ *         \begin{array}{ll}
+ *            x + \log(1 + \exp(-x)) & \mbox{if } x > 0 \\
+ *            \log(1 + \exp(x)) & \mbox{otherwise}
+ *         \end{array} \right.
+ *      @f$
+ */
+template <typename Dtype>
+class BNLLLayer : public NeuronLayer<Dtype> {
+ public:
+  explicit BNLLLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "BNLL"; }
+
+ protected:
+  /// @copydoc BNLLLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the BNLL inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x}
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_BNLL_LAYER_HPP_
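
The two branches in the formula above are the standard numerically stable evaluation of softplus, log(1 + exp(x)): for large positive x, exp(x) overflows, while the algebraically equal x + log(1 + exp(-x)) stays finite. A minimal sketch:

```cpp
#include <cmath>
#include <cstdio>

// Numerically stable softplus, matching the BNLL forward definition above.
double bnll(double x) {
  return x > 0.0 ? x + std::log1p(std::exp(-x))  // exp(-x) cannot overflow
                 : std::log1p(std::exp(x));      // exp(x) <= 1 here
}

int main() {
  // Naive log(1 + exp(800.0)) evaluates to inf in double precision;
  // the stable form returns the correct value.
  std::printf("%g %g %g\n", bnll(-800.0), bnll(0.0), bnll(800.0));
  // prints approximately: 0 0.693147 800
}
```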

+ 75 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/clip_layer.hpp

@@ -0,0 +1,75 @@
+#ifndef CAFFE_CLIP_LAYER_HPP_
+#define CAFFE_CLIP_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Clip: @f$ y = \max(min, \min(max, x)) @f$.
+ */
+template <typename Dtype>
+class ClipLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides ClipParameter clip_param,
+   *     with ClipLayer options:
+   *   - min
+   *   - max
+   */
+  explicit ClipLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "Clip"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = \max(min, \min(max, x))
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the clipped inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} = \left\{
+   *        \begin{array}{lr}
+   *            0 & \mathrm{if} \; x < min \vee x > max \\
+   *            \frac{\partial E}{\partial y} & \mathrm{if} \; x \ge min \wedge x \le max
+   *        \end{array} \right.
+   *      @f$
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_CLIP_LAYER_HPP_
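
For reference, the forward rule and the gradient rule documented above fit in a few lines (a sketch, not the Caffe kernels):

```cpp
#include <algorithm>

// y = max(min, min(max, x))
float clip_forward(float x, float lo, float hi) {
  return std::max(lo, std::min(hi, x));
}

// dE/dx = dE/dy inside [lo, hi]; 0 where the input was clipped.
float clip_backward(float x, float dy, float lo, float hi) {
  return (x >= lo && x <= hi) ? dy : 0.0f;
}
```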

+ 87 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/concat_layer.hpp

@@ -0,0 +1,87 @@
+#ifndef CAFFE_CONCAT_LAYER_HPP_
+#define CAFFE_CONCAT_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Takes at least two Blob%s and concatenates them along either the num
+ *        or channel dimension, outputting the result.
+ */
+template <typename Dtype>
+class ConcatLayer : public Layer<Dtype> {
+ public:
+  explicit ConcatLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Concat"; }
+  virtual inline int MinBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 2+)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x_1 @f$
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x_2 @f$
+   *   -# ...
+   *   - K @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x_K @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
+   *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
+   *      the concatenated output @f$
+   *        y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}]
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the concatenate inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *        respect to the outputs
+   *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
+   *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to concatenated outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length K), into which the top gradient
+   *        @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the
+   *        inputs @f$
+   *        \left[ \begin{array}{cccc}
+   *          \frac{\partial E}{\partial x_1} &
+   *          \frac{\partial E}{\partial x_2} &
+   *          ... &
+   *          \frac{\partial E}{\partial x_K}
+   *        \end{array} \right] =
+   *        \frac{\partial E}{\partial y}
+   *        @f$
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int count_;
+  int num_concats_;
+  int concat_input_size_;
+  int concat_axis_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_CONCAT_LAYER_HPP_
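
The bookkeeping ConcatLayer performs reduces to offset copies; a sketch for the axis == 1 (channel) case with two inputs (illustrative only):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Concatenate a (N x Ca x HW) and b (N x Cb x HW) along the channel axis.
std::vector<float> concat_channels(const std::vector<float>& a, int Ca,
                                   const std::vector<float>& b, int Cb,
                                   int N, int HW) {
  std::vector<float> y(static_cast<std::size_t>(N) * (Ca + Cb) * HW);
  for (int n = 0; n < N; ++n) {
    // For each example n: a's channels first, then b's channels.
    std::copy(a.begin() + n * Ca * HW, a.begin() + (n + 1) * Ca * HW,
              y.begin() + n * (Ca + Cb) * HW);
    std::copy(b.begin() + n * Cb * HW, b.begin() + (n + 1) * Cb * HW,
              y.begin() + n * (Ca + Cb) * HW + Ca * HW);
  }
  return y;
}
```

The backward pass runs the same copies in the opposite direction, splitting the top gradient back into per-input slices.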

+ 101 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/contrastive_loss_layer.hpp

@@ -0,0 +1,101 @@
+#ifndef CAFFE_CONTRASTIVE_LOSS_LAYER_HPP_
+#define CAFFE_CONTRASTIVE_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes the contrastive loss @f$
+ *          E = \frac{1}{2N} \sum\limits_{n=1}^N \left(y\right) d^2 +
+ *              \left(1-y\right) \max \left(margin-d, 0\right)^2
+ *          @f$ where @f$
+ *          d = \left| \left| a_n - b_n \right| \right|_2 @f$. This can be
+ *          used to train siamese networks.
+ *
+ * @param bottom input Blob vector (length 3)
+ *   -# @f$ (N \times C \times 1 \times 1) @f$
+ *      the features @f$ a \in [-\infty, +\infty]@f$
+ *   -# @f$ (N \times C \times 1 \times 1) @f$
+ *      the features @f$ b \in [-\infty, +\infty]@f$
+ *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+ *      the binary similarity @f$ y \in \{0, 1\} @f$ (1 for similar pairs)
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+ *      the computed contrastive loss: @f$ E =
+ *          \frac{1}{2N} \sum\limits_{n=1}^N \left(y\right) d^2 +
+ *          \left(1-y\right) \max \left(margin-d, 0\right)^2
+ *          @f$ where @f$
+ *          d = \left| \left| a_n - b_n \right| \right|_2 @f$.
+ * This can be used to train siamese networks.
+ */
+template <typename Dtype>
+class ContrastiveLossLayer : public LossLayer<Dtype> {
+ public:
+  explicit ContrastiveLossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param), diff_() {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline int ExactNumBottomBlobs() const { return 3; }
+  virtual inline const char* type() const { return "ContrastiveLoss"; }
+  /**
+   * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate
+   * to the first two inputs.
+   */
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    return bottom_index != 2;
+  }
+
+ protected:
+  /// @copydoc ContrastiveLossLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the Contrastive error gradient w.r.t. the inputs.
+   *
+   * Computes the gradients with respect to the two input vectors (bottom[0] and
+   * bottom[1]), but not the similarity label (bottom[2]).
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+   *      as @f$ \lambda @f$ is the coefficient of this layer's output
+   *      @f$\ell_i@f$ in the overall Net loss
+   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+   *      (*Assuming that this top Blob is not used as a bottom (input) by any
+   *      other layer of the Net.)
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times 1 \times 1) @f$
+   *      the features @f$a@f$; Backward fills their diff with
+   *      gradients if propagate_down[0]
+   *   -# @f$ (N \times C \times 1 \times 1) @f$
+   *      the features @f$b@f$; Backward fills their diff with gradients if
+   *      propagate_down[1]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<Dtype> diff_;  // cached for backward pass
+  Blob<Dtype> dist_sq_;  // cached for backward pass
+  Blob<Dtype> diff_sq_;  // tmp storage for gpu forward pass
+  Blob<Dtype> summer_vec_;  // tmp storage for gpu forward pass
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_CONTRASTIVE_LOSS_LAYER_HPP_
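
A worked sketch of the loss above over a batch of feature pairs (y = 1 for similar pairs, 0 for dissimilar; illustrative only):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// E = (1/2N) * sum_n [ y_n * d^2 + (1 - y_n) * max(margin - d, 0)^2 ],
// where d = ||a_n - b_n||_2.
double contrastive_loss(const std::vector<std::vector<double> >& a,
                        const std::vector<std::vector<double> >& b,
                        const std::vector<int>& y, double margin) {
  double loss = 0.0;
  const std::size_t N = y.size();
  for (std::size_t n = 0; n < N; ++n) {
    double d2 = 0.0;
    for (std::size_t c = 0; c < a[n].size(); ++c) {
      const double diff = a[n][c] - b[n][c];
      d2 += diff * diff;
    }
    const double d = std::sqrt(d2);
    loss += y[n] ? d2                                       // pull similar pairs together
                 : std::pow(std::max(margin - d, 0.0), 2);  // push dissimilar apart
  }
  return loss / (2.0 * N);
}
```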

+ 84 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/conv_layer.hpp

@@ -0,0 +1,84 @@
+#ifndef CAFFE_CONV_LAYER_HPP_
+#define CAFFE_CONV_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/base_conv_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Convolves the input image with a bank of learned filters,
+ *        and (optionally) adds biases.
+ *
+ *   Caffe convolves by reduction to matrix multiplication. This achieves
+ *   high-throughput and generality of input and filter dimensions but comes at
+ *   the cost of memory for matrices. This makes use of efficiency in BLAS.
+ *
+ *   The input is "im2col" transformed into a K' x (H x W) data matrix for
+ *   multiplication with the N' x K' filter matrix to yield an
+ *   N' x (H x W) output matrix that is then "col2im" restored. K' is the
+ *   input channels * kernel height * kernel width dimension of the unrolled
+ *   inputs, so that the im2col matrix has a column for each input region to
+ *   be filtered. col2im restores the output spatial structure by rolling the
+ *   N' output-channel rows of the output matrix back into N' x H x W.
+ */
+template <typename Dtype>
+class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
+ public:
+  /**
+   * @param param provides ConvolutionParameter convolution_param,
+   *    with ConvolutionLayer options:
+   *  - num_output. The number of filters.
+   *  - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
+   *  kernel_size for square filters or kernel_h and kernel_w for rectangular
+   *  filters.
+   *  - stride / stride_h / stride_w (\b optional, default 1). The filter
+   *  stride, given by stride_size for equal dimensions or stride_h and stride_w
+   *  for different strides. By default the convolution is dense with stride 1.
+   *  - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
+   *  convolution, given by pad for equal dimensions or pad_h and pad_w for
+   *  different padding. Input padding is handled implicitly rather than by
+   *  actually padding the input.
+   *  - dilation (\b optional, default 1). The filter
+   *  dilation, given by dilation for equal dimensions or per-dimension
+   *  values otherwise. By default the convolution has dilation 1.
+   *  - group (\b optional, default 1). The number of filter groups. Group
+   *  convolution is a method for reducing parameterization by selectively
+   *  connecting input and output channels. The input and output channel dimensions must be divisible
+   *  by the number of groups. For group @f$ \geq 1 @f$, the
+   *  convolutional filters' input and output channels are separated s.t. each
+   *  group takes 1 / group of the input channels and makes 1 / group of the
+   *  output channels. Concretely 4 input channels, 8 output channels, and
+   *  2 groups separate input channels 1-2 and output channels 1-4 into the
+   *  first group and input channels 3-4 and output channels 5-8 into the second
+   *  group.
+   *  - bias_term (\b optional, default true). Whether to have a bias.
+   *  - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library
+   *    kernels + stream parallelism) engines.
+   */
+  explicit ConvolutionLayer(const LayerParameter& param)
+      : BaseConvolutionLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "Convolution"; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual inline bool reverse_dimensions() { return false; }
+  virtual void compute_output_shape();
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_CONV_LAYER_HPP_
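
For reference, the output spatial extent that compute_output_shape() must produce per dimension follows the usual convolution arithmetic; a sketch covering the pad/stride/dilation options listed above:

```cpp
// Output spatial extent of a dilated convolution, per dimension;
// the deconvolution case inverts this formula.
int conv_out_dim(int in, int kernel, int pad, int stride, int dilation) {
  const int k_eff = dilation * (kernel - 1) + 1;  // effective kernel extent
  return (in + 2 * pad - k_eff) / stride + 1;
}
// e.g. conv_out_dim(224, 3, 1, 1, 1) == 224  (3x3 "same" convolution)
//      conv_out_dim(224, 3, 1, 2, 1) == 112  (stride-2 downsampling)
```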

+ 78 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/crop_layer.hpp

@@ -0,0 +1,78 @@
+#ifndef CAFFE_CROP_LAYER_HPP_
+#define CAFFE_CROP_LAYER_HPP_
+
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Takes a Blob and crops it to the shape specified by the second input
+ *  Blob, across all dimensions after the specified axis.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+
+template <typename Dtype>
+class CropLayer : public Layer<Dtype> {
+ public:
+  explicit CropLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Crop"; }
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<int> offsets;
+  Blob<int> src_strides_;
+  Blob<int> dest_strides_;
+
+ private:
+  // Recursive copy function.
+  void crop_copy(const vector<Blob<Dtype>*>& bottom,
+               const vector<Blob<Dtype>*>& top,
+               const int* offsets,
+               vector<int> indices,
+               int cur_dim,
+               const Dtype* src_data,
+               Dtype* dest_data,
+               bool is_forward);
+
+  // Recursive copy function: this is similar to crop_copy() but loops over all
+  // but the last two dimensions to allow for ND cropping while still relying on
+  // a CUDA kernel for the innermost two dimensions for performance reasons.  An
+  // alternative implementation could rely on the kernel more by passing
+  // offsets, but this is problematic because of its variable length.
+  // Since in the standard (N,C,H,W) case N and C are usually not cropped, a
+  // speedup could be achieved by not looping the application of the
+  // copy_kernel around these dimensions.
+  void crop_copy_gpu(const vector<Blob<Dtype>*>& bottom,
+                const vector<Blob<Dtype>*>& top,
+                const vector<int>& offsets,
+                vector<int> indices,
+                int cur_dim,
+                const Dtype* src_data,
+                Dtype* dest_data,
+                bool is_forward);
+};
+}  // namespace caffe
+
+#endif  // CAFFE_CROP_LAYER_HPP_

+ 72 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_conv_layer.hpp

@@ -0,0 +1,72 @@
+#ifndef CAFFE_CUDNN_CONV_LAYER_HPP_
+#define CAFFE_CUDNN_CONV_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/conv_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/*
+ * @brief cuDNN implementation of ConvolutionLayer.
+ *        Fallback to ConvolutionLayer for CPU mode.
+ *
+ * cuDNN accelerates convolution through forward kernels for filtering and bias
+ * plus backward kernels for the gradient w.r.t. the filters, biases, and
+ * inputs. Caffe + cuDNN further speeds up the computation through forward
+ * parallelism across groups and backward parallelism across gradients.
+ *
+ * The CUDNN engine does not have memory overhead for matrix buffers. For many
+ * input and filter regimes the CUDNN engine is faster than the CAFFE engine,
+ * but for fully-convolutional models and large inputs the CAFFE engine can be
+ * faster as long as it fits in memory.
+*/
+template <typename Dtype>
+class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
+ public:
+  explicit CuDNNConvolutionLayer(const LayerParameter& param)
+      : ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNConvolutionLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t* handle_;
+  cudaStream_t*  stream_;
+
+  // algorithms for forward and backwards convolutions
+  cudnnConvolutionFwdAlgo_t *fwd_algo_;
+  cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_;
+  cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_;
+
+  vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
+  cudnnTensorDescriptor_t    bias_desc_;
+  cudnnFilterDescriptor_t      filter_desc_;
+  vector<cudnnConvolutionDescriptor_t> conv_descs_;
+  int bottom_offset_, top_offset_, bias_offset_;
+
+  size_t *workspace_fwd_sizes_;
+  size_t *workspace_bwd_data_sizes_;
+  size_t *workspace_bwd_filter_sizes_;
+  size_t workspaceSizeInBytes;  // size of underlying storage
+  void *workspaceData;  // underlying storage
+  void **workspace;  // aliases into workspaceData
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_CONV_LAYER_HPP_

+ 68 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_deconv_layer.hpp

@@ -0,0 +1,68 @@
+#ifndef CAFFE_CUDNN_DECONV_LAYER_HPP_
+#define CAFFE_CUDNN_DECONV_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/deconv_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/*
+ * @brief cuDNN implementation of DeConvolutionLayer.
+ *        Fallback to DeConvolutionLayer for CPU mode.
+ *
+ * cuDNN accelerates deconvolution through forward kernels for filtering and
+ * bias plus backward kernels for the gradient w.r.t. the filters, biases, and
+ * inputs. Caffe + cuDNN further speeds up the computation through forward
+ * parallelism across groups and backward parallelism across gradients.
+*/
+template <typename Dtype>
+class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
+ public:
+  explicit CuDNNDeconvolutionLayer(const LayerParameter& param)
+    : DeconvolutionLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+                          const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+                       const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNDeconvolutionLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+                           const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+                            const vector<bool>& propagate_down,
+                            const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t* handle_;
+  cudaStream_t*  stream_;
+
+  // algorithms for forward and backwards convolutions
+  cudnnConvolutionFwdAlgo_t *fwd_algo_;
+  cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_;
+  cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_;
+
+  vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
+  cudnnTensorDescriptor_t bias_desc_;
+  cudnnFilterDescriptor_t filter_desc_;
+  vector<cudnnConvolutionDescriptor_t> conv_descs_;
+  int bottom_offset_, top_offset_, bias_offset_;
+
+  size_t *workspace_fwd_sizes_;
+  size_t *workspace_bwd_data_sizes_;
+  size_t *workspace_bwd_filter_sizes_;
+  size_t workspaceSizeInBytes;  // size of underlying storage
+  void *workspaceData;  // underlying storage
+  void **workspace;  // aliases into workspaceData
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_DECONV_LAYER_HPP_

+ 49 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_lcn_layer.hpp

@@ -0,0 +1,49 @@
+#ifndef CAFFE_CUDNN_LCN_LAYER_HPP_
+#define CAFFE_CUDNN_LCN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/lrn_layer.hpp"
+#include "caffe/layers/power_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+template <typename Dtype>
+class CuDNNLCNLayer : public LRNLayer<Dtype> {
+ public:
+  explicit CuDNNLCNLayer(const LayerParameter& param)
+      : LRNLayer<Dtype>(param), handles_setup_(false), tempDataSize(0),
+        tempData1(NULL), tempData2(NULL) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNLCNLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnLRNDescriptor_t norm_desc_;
+  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
+
+  int size_, pre_pad_;
+  Dtype alpha_, beta_, k_;
+
+  size_t tempDataSize;
+  void *tempData1, *tempData2;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_LCN_LAYER_HPP_

+ 44 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_lrn_layer.hpp

@@ -0,0 +1,44 @@
+#ifndef CAFFE_CUDNN_LRN_LAYER_HPP_
+#define CAFFE_CUDNN_LRN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/lrn_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+template <typename Dtype>
+class CuDNNLRNLayer : public LRNLayer<Dtype> {
+ public:
+  explicit CuDNNLRNLayer(const LayerParameter& param)
+      : LRNLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNLRNLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnLRNDescriptor_t norm_desc_;
+  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
+
+  int size_;
+  Dtype alpha_, beta_, k_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_LRN_LAYER_HPP_

+ 49 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_pooling_layer.hpp

@@ -0,0 +1,49 @@
+#ifndef CAFFE_CUDNN_POOLING_LAYER_HPP_
+#define CAFFE_CUDNN_POOLING_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/pooling_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/*
+ * @brief cuDNN implementation of PoolingLayer.
+ *        Fallback to PoolingLayer for CPU mode.
+*/
+template <typename Dtype>
+class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
+ public:
+  explicit CuDNNPoolingLayer(const LayerParameter& param)
+      : PoolingLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNPoolingLayer();
+  // Currently, cuDNN does not support the extra top blob.
+  virtual inline int MinTopBlobs() const { return -1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
+  cudnnPoolingDescriptor_t  pooling_desc_;
+  cudnnPoolingMode_t        mode_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_POOLING_LAYER_HPP_

+ 46 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_relu_layer.hpp

@@ -0,0 +1,46 @@
+#ifndef CAFFE_CUDNN_RELU_LAYER_HPP_
+#define CAFFE_CUDNN_RELU_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/relu_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/**
+ * @brief CuDNN acceleration of ReLULayer.
+ */
+template <typename Dtype>
+class CuDNNReLULayer : public ReLULayer<Dtype> {
+ public:
+  explicit CuDNNReLULayer(const LayerParameter& param)
+      : ReLULayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNReLULayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
+  cudnnActivationDescriptor_t activ_desc_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_RELU_LAYER_HPP_

+ 46 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_sigmoid_layer.hpp

@@ -0,0 +1,46 @@
+#ifndef CAFFE_CUDNN_SIGMOID_LAYER_HPP_
+#define CAFFE_CUDNN_SIGMOID_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/sigmoid_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/**
+ * @brief CuDNN acceleration of SigmoidLayer.
+ */
+template <typename Dtype>
+class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
+ public:
+  explicit CuDNNSigmoidLayer(const LayerParameter& param)
+      : SigmoidLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNSigmoidLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
+  cudnnActivationDescriptor_t activ_desc_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_SIGMOID_LAYER_HPP_

+ 45 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_softmax_layer.hpp

@@ -0,0 +1,45 @@
+#ifndef CAFFE_CUDNN_SOFTMAX_LAYER_HPP_
+#define CAFFE_CUDNN_SOFTMAX_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/softmax_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/**
+ * @brief cuDNN implementation of SoftmaxLayer.
+ *        Fallback to SoftmaxLayer for CPU mode.
+ */
+template <typename Dtype>
+class CuDNNSoftmaxLayer : public SoftmaxLayer<Dtype> {
+ public:
+  explicit CuDNNSoftmaxLayer(const LayerParameter& param)
+      : SoftmaxLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNSoftmaxLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_SOFTMAX_LAYER_HPP_

+ 46 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_tanh_layer.hpp

@@ -0,0 +1,46 @@
+#ifndef CAFFE_CUDNN_TANH_LAYER_HPP_
+#define CAFFE_CUDNN_TANH_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/tanh_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/**
+ * @brief CuDNN acceleration of TanHLayer.
+ */
+template <typename Dtype>
+class CuDNNTanHLayer : public TanHLayer<Dtype> {
+ public:
+  explicit CuDNNTanHLayer(const LayerParameter& param)
+      : TanHLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNTanHLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
+  cudnnActivationDescriptor_t activ_desc_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_TANH_LAYER_HPP_

+ 40 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/data_layer.hpp

@@ -0,0 +1,40 @@
+#ifndef CAFFE_DATA_LAYER_HPP_
+#define CAFFE_DATA_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/data_transformer.hpp"
+#include "caffe/internal_thread.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layers/base_data_layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/db.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+class DataLayer : public BasePrefetchingDataLayer<Dtype> {
+ public:
+  explicit DataLayer(const LayerParameter& param);
+  virtual ~DataLayer();
+  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual inline const char* type() const { return "Data"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 2; }
+
+ protected:
+  void Next();
+  bool Skip();
+  virtual void load_batch(Batch<Dtype>* batch);
+
+  shared_ptr<db::DB> db_;
+  shared_ptr<db::Cursor> cursor_;
+  uint64_t offset_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DATA_LAYER_HPP_

+ 51 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/deconv_layer.hpp

@@ -0,0 +1,51 @@
+#ifndef CAFFE_DECONV_LAYER_HPP_
+#define CAFFE_DECONV_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/base_conv_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Convolve the input with a bank of learned filters, and (optionally)
+ *        add biases, treating filters and convolution parameters in the
+ *        opposite sense as ConvolutionLayer.
+ *
+ *   ConvolutionLayer computes each output value by dotting an input window with
+ *   a filter; DeconvolutionLayer multiplies each input value by a filter
+ *   elementwise, and sums over the resulting output windows. In other words,
+ *   DeconvolutionLayer is ConvolutionLayer with the forward and backward passes
+ *   reversed. DeconvolutionLayer reuses ConvolutionParameter for its
+ *   parameters, but they take the opposite sense as in ConvolutionLayer (so
+ *   padding is removed from the output rather than added to the input, and
+ *   stride results in upsampling rather than downsampling).
+ */
+template <typename Dtype>
+class DeconvolutionLayer : public BaseConvolutionLayer<Dtype> {
+ public:
+  explicit DeconvolutionLayer(const LayerParameter& param)
+      : BaseConvolutionLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "Deconvolution"; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual inline bool reverse_dimensions() { return true; }
+  virtual void compute_output_shape();
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DECONV_LAYER_HPP_
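
Because DeconvolutionLayer runs the convolution forward and backward passes in the opposite sense, its output extent per dimension is the convolution formula solved for the input; a sketch:

```cpp
// Deconvolution output extent: the conv formula inverted, so stride now
// upsamples and padding is removed from the output.
int deconv_out_dim(int in, int kernel, int pad, int stride, int dilation) {
  const int k_eff = dilation * (kernel - 1) + 1;
  return stride * (in - 1) + k_eff - 2 * pad;
}
// e.g. deconv_out_dim(112, 2, 0, 2, 1) == 224  (2x upsampling)
```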

+ 80 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/dropout_layer.hpp

@@ -0,0 +1,80 @@
+#ifndef CAFFE_DROPOUT_LAYER_HPP_
+#define CAFFE_DROPOUT_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief During training only, sets a random portion of @f$x@f$ to 0, adjusting
+ *        the rest of the vector magnitude accordingly.
+ *
+ * @param bottom input Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the inputs @f$ x @f$
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the computed outputs @f$ y @f$: randomly masked and rescaled at
+ *      training time, and simply @f$ x @f$ at test time (see below)
+ */
+template <typename Dtype>
+class DropoutLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides DropoutParameter dropout_param,
+   *     with DropoutLayer options:
+   *   - dropout_ratio (\b optional, default 0.5).
+   *     Sets the probability @f$ p @f$ that any given unit is dropped.
+   */
+  explicit DropoutLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Dropout"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs. At training time, we have @f$
+   *      y_{\mbox{train}} = \left\{
+   *         \begin{array}{ll}
+   *            \frac{x}{1 - p} & \mbox{if } u > p \\
+   *            0 & \mbox{otherwise}
+   *         \end{array} \right.
+   *      @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each
+   *      input at each iteration. At test time, we simply have
+   *      @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// the randomly generated values; divided by UINT_MAX they give @f$u\sim U(0,1)@f$
+  Blob<unsigned int> rand_vec_;
+  /// the probability @f$ p @f$ of dropping any input
+  Dtype threshold_;
+  /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$
+  Dtype scale_;
+  unsigned int uint_thres_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DROPOUT_LAYER_HPP_
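
The train-time formula documented above is "inverted" dropout: survivors are rescaled by 1/(1 - p) so that the expected output equals the input, and test time reduces to the identity. A minimal sketch of that forward pass (an illustrative helper, not the committed CPU/GPU kernels):

    #include <random>
    #include <vector>

    // Inverted dropout: each unit survives with probability 1 - p and is
    // scaled by 1 / (1 - p); at test time the layer is the identity.
    std::vector<float> dropout_forward(const std::vector<float>& x, float p,
                                       bool train, std::mt19937& rng) {
      std::vector<float> y(x.size());
      std::bernoulli_distribution keep(1.0 - p);
      const float scale = 1.0f / (1.0f - p);
      for (size_t i = 0; i < x.size(); ++i)
        y[i] = train ? (keep(rng) ? x[i] * scale : 0.0f) : x[i];
      return y;
    }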

+ 47 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/dummy_data_layer.hpp

@@ -0,0 +1,47 @@
+#ifndef CAFFE_DUMMY_DATA_LAYER_HPP_
+#define CAFFE_DUMMY_DATA_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Provides data to the Net generated by a Filler.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class DummyDataLayer : public Layer<Dtype> {
+ public:
+  explicit DummyDataLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  // Data layers have no bottoms, so reshaping is trivial.
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  virtual inline const char* type() const { return "DummyData"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int MinTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+
+  vector<shared_ptr<Filler<Dtype> > > fillers_;
+  vector<bool> refill_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DUMMY_DATA_LAYER_HPP_

+ 51 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/eltwise_layer.hpp

@@ -0,0 +1,51 @@
+#ifndef CAFFE_ELTWISE_LAYER_HPP_
+#define CAFFE_ELTWISE_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Compute elementwise operations, such as product and sum,
+ *        along multiple input Blobs.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class EltwiseLayer : public Layer<Dtype> {
+ public:
+  explicit EltwiseLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Eltwise"; }
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  EltwiseParameter_EltwiseOp op_;
+  vector<Dtype> coeffs_;
+  Blob<int> max_idx_;
+
+  bool stable_prod_grad_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_ELTWISE_LAYER_HPP_
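
Of the supported operations, coefficient-weighted SUM is the most common configuration (coeffs_ defaults to all ones). A small sketch of what that reduction computes across K equally-shaped inputs; the function name is illustrative:

    #include <cassert>
    #include <vector>

    // Coefficient-weighted elementwise SUM over K equally-sized inputs;
    // with all coefficients equal to 1 this is a plain elementwise sum.
    std::vector<float> eltwise_sum(const std::vector<std::vector<float> >& bottoms,
                                   const std::vector<float>& coeffs) {
      assert(!bottoms.empty() && bottoms.size() == coeffs.size());
      std::vector<float> top(bottoms[0].size(), 0.0f);
      for (size_t k = 0; k < bottoms.size(); ++k)
        for (size_t i = 0; i < top.size(); ++i)
          top[i] += coeffs[k] * bottoms[k][i];
      return top;
    }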

+ 86 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/elu_layer.hpp

@@ -0,0 +1,86 @@
+#ifndef CAFFE_ELU_LAYER_HPP_
+#define CAFFE_ELU_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Exponential Linear Unit non-linearity @f$
+ *        y = \left\{
+ *        \begin{array}{lr}
+ *            x                  & \mathrm{if} \; x > 0 \\
+ *            \alpha (\exp(x)-1) & \mathrm{if} \; x \le 0
+ *        \end{array} \right.
+ *      @f$.  
+ */
+template <typename Dtype>
+class ELULayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides ELUParameter elu_param,
+   *     with ELULayer options:
+   *   - alpha (\b optional, default 1).
+   *     the value @f$ \alpha @f$ that controls saturation for negative inputs.
+   */
+  explicit ELULayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "ELU"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = \left\{
+   *        \begin{array}{lr}
+   *            x                  & \mathrm{if} \; x > 0 \\
+   *            \alpha (\exp(x)-1) & \mathrm{if} \; x \le 0
+   *        \end{array} \right.
+   *      @f$.  
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the ELU inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} = \left\{
+   *        \begin{array}{lr}
+   *            1           & \mathrm{if} \; x > 0 \\
+   *            y + \alpha  & \mathrm{if} \; x \le 0
+   *        \end{array} \right.
+   *      @f$ if propagate_down[0].
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_ELU_LAYER_HPP_
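
A minimal sketch of the two formulas documented above, forward and input gradient; note that for x <= 0 the derivative alpha * exp(x) can be rewritten as y + alpha, which is exactly the form the Backward doc uses:

    #include <cmath>

    // ELU forward: x for positive inputs, alpha * (exp(x) - 1) otherwise.
    float elu(float x, float alpha) {
      return x > 0 ? x : alpha * (std::exp(x) - 1.0f);
    }

    // dy/dx: 1 for x > 0; alpha * exp(x) == y + alpha for x <= 0.
    float elu_grad(float x, float alpha) {
      return x > 0 ? 1.0f : elu(x, alpha) + alpha;
    }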

+ 52 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/embed_layer.hpp

@@ -0,0 +1,52 @@
+#ifndef CAFFE_EMBED_LAYER_HPP_
+#define CAFFE_EMBED_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief A layer for learning "embeddings" of one-hot vector input.
+ *        Equivalent to an InnerProductLayer with one-hot vectors as input, but
+ *        for efficiency the input is the "hot" index of each column itself.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class EmbedLayer : public Layer<Dtype> {
+ public:
+  explicit EmbedLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Embed"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int M_;
+  int K_;
+  int N_;
+  bool bias_term_;
+  Blob<Dtype> bias_multiplier_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_EMBED_LAYER_HPP_
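
The efficiency trick described in the brief -- feeding the "hot" index instead of the full one-hot vector -- makes the forward pass a row lookup into the K x N weight matrix. A sketch under that assumption (row-major weights; names are illustrative):

    #include <vector>

    // Forward pass of an embedding lookup: with row-major K x N weights,
    // the output for input index `hot` is simply row `hot` of the weights.
    std::vector<float> embed_forward(const std::vector<float>& weights,
                                     int N, int hot) {
      return std::vector<float>(weights.begin() + hot * N,
                                weights.begin() + (hot + 1) * N);
    }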

+ 107 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/euclidean_loss_layer.hpp

@@ -0,0 +1,107 @@
+#ifndef CAFFE_EUCLIDEAN_LOSS_LAYER_HPP_
+#define CAFFE_EUCLIDEAN_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes the Euclidean (L2) loss @f$
+ *          E = \frac{1}{2N} \sum\limits_{n=1}^N \left| \left| \hat{y}_n - y_n
+ *        \right| \right|_2^2 @f$ for real-valued regression tasks.
+ *
+ * @param bottom input Blob vector (length 2)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the predictions @f$ \hat{y} \in [-\infty, +\infty]@f$
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the targets @f$ y \in [-\infty, +\infty]@f$
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+ *      the computed Euclidean loss: @f$ E =
+ *          \frac{1}{2N} \sum\limits_{n=1}^N \left| \left| \hat{y}_n - y_n
+ *        \right| \right|_2^2 @f$
+ *
+ * This can be used for least-squares regression tasks.  An InnerProductLayer
+ * input to a EuclideanLossLayer exactly formulates a linear least squares
+ * regression problem. With non-zero weight decay the problem becomes one of
+ * ridge regression -- see src/caffe/test/test_gradient_based_solver.cpp for a concrete
+ * example wherein we check that the gradients computed for a Net with exactly
+ * this structure match hand-computed gradient formulas for ridge regression.
+ *
+ * (Note: Caffe, and SGD in general, is certainly \b not the best way to solve
+ * linear least squares problems! We use it only as an instructive example.)
+ */
+template <typename Dtype>
+class EuclideanLossLayer : public LossLayer<Dtype> {
+ public:
+  explicit EuclideanLossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param), diff_() {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "EuclideanLoss"; }
+  /**
+   * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate
+   * to both inputs -- override to return true and always allow force_backward.
+   */
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    return true;
+  }
+
+ protected:
+  /// @copydoc EuclideanLossLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the Euclidean error gradient w.r.t. the inputs.
+   *
+   * Unlike other children of LossLayer, EuclideanLossLayer \b can compute
+ * gradients with respect to the label inputs bottom[1] (but will only do so
+ * if propagate_down[1] is set -- e.g. because the labels are produced by
+ * learnable parameters -- or if force_backward is set). In fact, this layer
+ * is "commutative" -- the
+   * result is the same regardless of the order of the two bottoms.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+   *      as @f$ \lambda @f$ is the coefficient of this layer's output
+   *      @f$\ell_i@f$ in the overall Net loss
+   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+   *      (*Assuming that this top Blob is not used as a bottom (input) by any
+   *      other layer of the Net.)
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the predictions @f$\hat{y}@f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial \hat{y}} =
+   *            \frac{1}{N} \sum\limits_{n=1}^N (\hat{y}_n - y_n)
+   *      @f$ if propagate_down[0]
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the targets @f$y@f$; Backward fills their diff with gradients
+   *      @f$ \frac{\partial E}{\partial y} =
+   *          \frac{1}{N} \sum\limits_{n=1}^N (y_n - \hat{y}_n)
+   *      @f$ if propagate_down[1]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<Dtype> diff_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_EUCLIDEAN_LOSS_LAYER_HPP_
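
A standalone sketch of the documented loss, E = 1/(2N) * sum_n ||yhat_n - y_n||^2, where pred and target are flattened N*C*H*W buffers; the names are illustrative:

    #include <vector>

    // E = 1/(2N) * sum over all elements of (pred - target)^2; the matching
    // per-element gradient w.r.t. the predictions is (pred - target) / N.
    float euclidean_loss(const std::vector<float>& pred,
                         const std::vector<float>& target, int N) {
      float sum = 0.0f;
      for (size_t i = 0; i < pred.size(); ++i) {
        const float d = pred[i] - target[i];
        sum += d * d;
      }
      return sum / (2.0f * N);
    }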

+ 80 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/exp_layer.hpp

@@ -0,0 +1,80 @@
+#ifndef CAFFE_EXP_LAYER_HPP_
+#define CAFFE_EXP_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes @f$ y = \gamma ^ {\alpha x + \beta} @f$,
+ *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
+ *        and base @f$ \gamma @f$.
+ */
+template <typename Dtype>
+class ExpLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides ExpParameter exp_param,
+   *     with ExpLayer options:
+   *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+   *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+   *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
+   *         the base @f$ \gamma @f$
+   */
+  explicit ExpLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Exp"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = \gamma ^ {\alpha x + \beta}
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the exp inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} =
+   *            \frac{\partial E}{\partial y} y \alpha \log_e(\gamma)
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Dtype inner_scale_, outer_scale_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_EXP_LAYER_HPP_
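
The inner_scale_ / outer_scale_ members suggest the layer evaluates gamma^(alpha*x + beta) as outer * exp(inner * x), with inner = alpha * ln(gamma) and outer = gamma^beta; that factorization is an assumption from the member names, sketched below:

    #include <cmath>

    // gamma^(alpha*x + beta) factored as outer * exp(inner * x), with
    // inner = alpha * ln(gamma) and outer = gamma^beta (assumed caching
    // scheme for the inner_scale_ / outer_scale_ members).
    float exp_forward(float x, float alpha, float beta, float gamma) {
      const float inner = alpha * std::log(gamma);
      const float outer = std::pow(gamma, beta);
      return outer * std::exp(inner * x);
    }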

+ 77 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/filter_layer.hpp

@@ -0,0 +1,77 @@
+#ifndef CAFFE_FILTER_LAYER_HPP_
+#define CAFFE_FILTER_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Takes two or more Blobs, interprets the last Blob as a selector,
+ *  and filters the remaining Blobs according to the selector data (0 means
+ * the corresponding item is filtered out; non-zero means the corresponding
+ * item is kept).
+ */
+template <typename Dtype>
+class FilterLayer : public Layer<Dtype> {
+ public:
+  explicit FilterLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Filter"; }
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MinTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 2+)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs to be filtered @f$ x_1 @f$
+   *   -# ...
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs to be filtered @f$ x_K @f$
+   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+   *      the selector blob
+   * @param top output Blob vector (length 1+)
+ *   -# @f$ (S \times C \times H \times W) @f$
+ *        the filtered output @f$ x_1 @f$
+ *   -# ...
+ *   -# @f$ (S \times C \times H \times W) @f$
+ *        the filtered output @f$ x_K @f$,
+ *      where S is the number of items that haven't been filtered out
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the forwarded inputs.
+   *
+   * @param top output Blob vector (length 1+), providing the error gradient with
+   *        respect to the outputs
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2+), into which the top error
+   *        gradient is copied
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool first_reshape_;
+  vector<int> indices_to_forward_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_FILTER_LAYER_HPP_

+ 61 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/flatten_layer.hpp

@@ -0,0 +1,61 @@
+#ifndef CAFFE_FLATTEN_LAYER_HPP_
+#define CAFFE_FLATTEN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Reshapes the input Blob into flat vectors.
+ *
+ * Note: because this layer does not change the input values -- merely the
+ * dimensions -- it can simply copy the input. The copy happens "virtually"
+ * (thus taking effectively 0 real time) by setting, in Forward, the data
+ * pointer of the top Blob to that of the bottom Blob (see Blob::ShareData),
+ * and in Backward, the diff pointer of the bottom Blob to that of the top Blob
+ * (see Blob::ShareDiff).
+ */
+template <typename Dtype>
+class FlattenLayer : public Layer<Dtype> {
+ public:
+  explicit FlattenLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Flatten"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times CHW \times 1 \times 1) @f$
+   *      the outputs -- i.e., the (virtually) copied, flattened inputs
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the flattened inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *        respect to the outputs
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1), into which the top error
+   *        gradient is (virtually) copied
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_FLATTEN_LAYER_HPP_

+ 64 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/hdf5_data_layer.hpp

@@ -0,0 +1,64 @@
+#ifndef CAFFE_HDF5_DATA_LAYER_HPP_
+#define CAFFE_HDF5_DATA_LAYER_HPP_
+
+#include "hdf5.h"
+
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/base_data_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Provides data to the Net from HDF5 files.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class HDF5DataLayer : public Layer<Dtype> {
+ public:
+  explicit HDF5DataLayer(const LayerParameter& param)
+      : Layer<Dtype>(param), offset_() {}
+  virtual ~HDF5DataLayer();
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  // Data layers have no bottoms, so reshaping is trivial.
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  virtual inline const char* type() const { return "HDF5Data"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int MinTopBlobs() const { return 1; }
+
+ protected:
+  void Next();
+  bool Skip();
+
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+  virtual void LoadHDF5FileData(const char* filename);
+
+  std::vector<std::string> hdf_filenames_;
+  unsigned int num_files_;
+  unsigned int current_file_;
+  hsize_t current_row_;
+  std::vector<shared_ptr<Blob<Dtype> > > hdf_blobs_;
+  std::vector<unsigned int> data_permutation_;
+  std::vector<unsigned int> file_permutation_;
+  uint64_t offset_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_HDF5_DATA_LAYER_HPP_

+ 62 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/hdf5_output_layer.hpp

@@ -0,0 +1,62 @@
+#ifndef CAFFE_HDF5_OUTPUT_LAYER_HPP_
+#define CAFFE_HDF5_OUTPUT_LAYER_HPP_
+
+#include "hdf5.h"
+
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+#define HDF5_DATA_DATASET_NAME "data"
+#define HDF5_DATA_LABEL_NAME "label"
+
+/**
+ * @brief Write blobs to disk as HDF5 files.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class HDF5OutputLayer : public Layer<Dtype> {
+ public:
+  explicit HDF5OutputLayer(const LayerParameter& param)
+      : Layer<Dtype>(param), file_opened_(false) {}
+  virtual ~HDF5OutputLayer();
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  // Data layers have no bottoms, so reshaping is trivial.
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  virtual inline const char* type() const { return "HDF5Output"; }
+  // TODO: no limit on the number of blobs
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 0; }
+
+  inline std::string file_name() const { return file_name_; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void SaveBlobs();
+
+  bool file_opened_;
+  std::string file_name_;
+  hid_t file_id_;
+  Blob<Dtype> data_blob_;
+  Blob<Dtype> label_blob_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_HDF5_OUTPUT_LAYER_HPP_

+ 104 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/hinge_loss_layer.hpp

@@ -0,0 +1,104 @@
+#ifndef CAFFE_HINGE_LOSS_LAYER_HPP_
+#define CAFFE_HINGE_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes the hinge loss for a one-of-many classification task.
+ *
+ * @param bottom input Blob vector (length 2)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the predictions @f$ t @f$, a Blob with values in
+ *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
+ *      the @f$ K = CHW @f$ classes. In an SVM, @f$ t @f$ is the result of
+ *      taking the inner product @f$ X^T W @f$ of the D-dimensional features
+ *      @f$ X \in \mathcal{R}^{D \times N} @f$ and the learned hyperplane
+ *      parameters @f$ W \in \mathcal{R}^{D \times K} @f$, so a Net with just
+ *      an InnerProductLayer (with num_output = @f$ K @f$) providing predictions to a
+ *      HingeLossLayer and no other learnable parameters or losses is
+ *      equivalent to an SVM.
+ *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+ *      the labels @f$ l @f$, an integer-valued Blob with values
+ *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+ *      indicating the correct class label among the @f$ K @f$ classes
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+ *      the computed hinge loss: @f$ E =
+ *        \frac{1}{N} \sum\limits_{n=1}^N \sum\limits_{k=1}^K
+ *        [\max(0, 1 - \delta\{l_n = k\} t_{nk})] ^ p
+ *      @f$, for the @f$ L^p @f$ norm
+ *      (defaults to @f$ p = 1 @f$, the L1 norm; L2 norm, as in L2-SVM,
+ *      is also available), and @f$
+ *      \delta\{\mathrm{condition}\} = \left\{
+ *         \begin{array}{lr}
+ *            1 & \mbox{if condition} \\
+ *           -1 & \mbox{otherwise}
+ *         \end{array} \right.
+ *      @f$
+ *
+ * In an SVM, @f$ t \in \mathcal{R}^{N \times K} @f$ is the result of taking
+ * the inner product @f$ X^T W @f$ of the features
+ * @f$ X \in \mathcal{R}^{D \times N} @f$
+ * and the learned hyperplane parameters
+ * @f$ W \in \mathcal{R}^{D \times K} @f$. So, a Net with just an
+ * InnerProductLayer (with num_output = @f$ K @f$) providing predictions to a
+ * HingeLossLayer is equivalent to an SVM (assuming it has no other learned
+ * parameters outside the InnerProductLayer and no other losses outside the
+ * HingeLossLayer).
+ */
+template <typename Dtype>
+class HingeLossLayer : public LossLayer<Dtype> {
+ public:
+  explicit HingeLossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "HingeLoss"; }
+
+ protected:
+  /// @copydoc HingeLossLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the hinge loss error gradient w.r.t. the predictions.
+   *
+   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+   * if propagate_down[1] is set.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+   *      as @f$ \lambda @f$ is the coefficient of this layer's output
+   *      @f$\ell_i@f$ in the overall Net loss
+   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+   *      (*Assuming that this top Blob is not used as a bottom (input) by any
+   *      other layer of the Net.)
+   * @param propagate_down see Layer::Backward.
+   *      propagate_down[1] must be false as we can't compute gradients with
+   *      respect to the labels.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the predictions @f$t@f$; Backward computes diff
+   *      @f$ \frac{\partial E}{\partial t} @f$
+   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+   *      the labels -- ignored as we can't compute their error gradients
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_HINGE_LOSS_LAYER_HPP_
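
A sketch of the documented L1 hinge loss for a single sample with scores t[0..K-1] and true label l; with the delta convention above, the correct class contributes max(0, 1 - t_l) and every other class max(0, 1 + t_k):

    #include <algorithm>
    #include <vector>

    // L1 hinge loss for one sample: the true class contributes
    // max(0, 1 - t[label]); every other class contributes max(0, 1 + t[k]).
    float hinge_loss(const std::vector<float>& t, int label) {
      float loss = 0.0f;
      for (size_t k = 0; k < t.size(); ++k) {
        const float delta = (static_cast<int>(k) == label) ? 1.0f : -1.0f;
        loss += std::max(0.0f, 1.0f - delta * t[k]);
      }
      return loss;
    }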

+ 65 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/im2col_layer.hpp

@@ -0,0 +1,65 @@
+#ifndef CAFFE_IM2COL_LAYER_HPP_
+#define CAFFE_IM2COL_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief A helper for image operations that rearranges image regions into
+ *        column vectors.  Used by ConvolutionLayer to perform convolution
+ *        by matrix multiplication.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class Im2colLayer : public Layer<Dtype> {
+ public:
+  explicit Im2colLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Im2col"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief The spatial dimensions of a filter kernel.
+  Blob<int> kernel_shape_;
+  /// @brief The spatial dimensions of the stride.
+  Blob<int> stride_;
+  /// @brief The spatial dimensions of the padding.
+  Blob<int> pad_;
+  /// @brief The spatial dimensions of the dilation.
+  Blob<int> dilation_;
+
+  int num_spatial_axes_;
+  int bottom_dim_;
+  int top_dim_;
+
+  int channel_axis_;
+  int num_;
+  int channels_;
+
+  bool force_nd_im2col_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_IM2COL_LAYER_HPP_
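
The column buffer has channels * kernel_h * kernel_w rows and one column per output position; the number of output positions per axis follows the standard convolution shape rule, sketched below (the deconvolution rule shown earlier is its inverse):

    // Output spatial extent of a convolution sweep along one axis; im2col
    // produces one column for each such position.
    int conv_out(int in, int kernel, int stride, int pad, int dilation = 1) {
      int kernel_extent = dilation * (kernel - 1) + 1;
      return (in + 2 * pad - kernel_extent) / stride + 1;
    }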

+ 47 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/image_data_layer.hpp

@@ -0,0 +1,47 @@
+#ifndef CAFFE_IMAGE_DATA_LAYER_HPP_
+#define CAFFE_IMAGE_DATA_LAYER_HPP_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/data_transformer.hpp"
+#include "caffe/internal_thread.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layers/base_data_layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Provides data to the Net from image files.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class ImageDataLayer : public BasePrefetchingDataLayer<Dtype> {
+ public:
+  explicit ImageDataLayer(const LayerParameter& param)
+      : BasePrefetchingDataLayer<Dtype>(param) {}
+  virtual ~ImageDataLayer();
+  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "ImageData"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int ExactNumTopBlobs() const { return 2; }
+
+ protected:
+  shared_ptr<Caffe::RNG> prefetch_rng_;
+  virtual void ShuffleImages();
+  virtual void load_batch(Batch<Dtype>* batch);
+
+  vector<std::pair<std::string, int> > lines_;
+  int lines_id_;
+};
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_IMAGE_DATA_LAYER_HPP_

+ 146 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/infogain_loss_layer.hpp

@@ -0,0 +1,146 @@
+#ifndef CAFFE_INFOGAIN_LOSS_LAYER_HPP_
+#define CAFFE_INFOGAIN_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+#include "caffe/layers/softmax_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief A generalization of SoftmaxWithLossLayer that takes an
+ *        "information gain" (infogain) matrix specifying the "value" of all label
+ *        pairs.
+ *
+ * Equivalent to the SoftmaxWithLossLayer if the infogain matrix is the
+ * identity.
+ *
+ * @param bottom input Blob vector (length 2-3)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the predictions @f$ x @f$, a Blob with values in
+ *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
+ *      the @f$ K = CHW @f$ classes. This layer maps these scores to a
+ *      probability distribution over classes using the softmax function
+ *      @f$ \hat{p}_{nk} = \exp(x_{nk}) /
+ *      \left[\sum_{k'} \exp(x_{nk'})\right] @f$ (see SoftmaxLayer).
+ *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+ *      the labels @f$ l @f$, an integer-valued Blob with values
+ *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+ *      indicating the correct class label among the @f$ K @f$ classes
+ *   -# @f$ (1 \times 1 \times K \times K) @f$
+ *      (\b optional) the infogain matrix @f$ H @f$.  This must be provided as
+ *      the third bottom blob input if not provided as the infogain_mat in the
+ *      InfogainLossParameter. If @f$ H = I @f$, this layer is equivalent to the
+ *      SoftmaxWithLossLayer.
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+ *      the computed infogain multinomial logistic loss: @f$ E =
+ *        \frac{-1}{N} \sum\limits_{n=1}^N H_{l_n} \log(\hat{p}_n) =
+ *        \frac{-1}{N} \sum\limits_{n=1}^N \sum\limits_{k=1}^{K} H_{l_n,k}
+ *        \log(\hat{p}_{n,k})
+ *      @f$, where @f$ H_{l_n} @f$ denotes row @f$l_n@f$ of @f$H@f$.
+ */
+template <typename Dtype>
+class InfogainLossLayer : public LossLayer<Dtype> {
+ public:
+  explicit InfogainLossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param), infogain_() {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should
+  // be the infogain matrix.  (Otherwise the infogain matrix is loaded from a
+  // file specified by LayerParameter.)
+  virtual inline int ExactNumBottomBlobs() const { return -1; }
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MaxBottomBlobs() const { return 3; }
+
+  // InfogainLossLayer computes softmax prob internally.
+  // optional second "top" outputs the softmax prob
+  virtual inline int ExactNumTopBlobs() const { return -1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 2; }
+
+  virtual inline const char* type() const { return "InfogainLoss"; }
+
+ protected:
+  /// @copydoc InfogainLossLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the infogain loss error gradient w.r.t. the predictions.
+   *
+   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+   * if propagate_down[1] is set. (The same applies to the infogain matrix, if
+   * provided as bottom[2] rather than in the layer_param.)
+   *
+   * @param top output Blob vector (length 1), providing the error gradient
+   *      with respect to the outputs
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+   *      as @f$ \lambda @f$ is the coefficient of this layer's output
+   *      @f$\ell_i@f$ in the overall Net loss
+   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+   *      (*Assuming that this top Blob is not used as a bottom (input) by any
+   *      other layer of the Net.)
+   * @param propagate_down see Layer::Backward.
+   *      propagate_down[1] must be false as we can't compute gradients with
+   *      respect to the labels (similarly for propagate_down[2] and the
+   *      infogain matrix, if provided as bottom[2])
+   * @param bottom input Blob vector (length 2-3)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the predictions @f$ x @f$; Backward computes diff
+   *      @f$ \frac{\partial E}{\partial x} @f$
+   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+   *      the labels -- ignored as we can't compute their error gradients
+   *   -# @f$ (1 \times 1 \times K \times K) @f$
+   *      (\b optional) the information gain matrix -- ignored as its error
+   *      gradient computation is not implemented.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// Read the normalization mode parameter and compute the normalizer based
+  /// on the blob size.  If normalization_mode is VALID, the count of valid
+  /// outputs will be read from valid_count, unless it is -1 in which case
+  /// all outputs are assumed to be valid.
+  virtual Dtype get_normalizer(
+      LossParameter_NormalizationMode normalization_mode, int valid_count);
+  /// fill sum_rows_H_ according to matrix H
+  virtual void sum_rows_of_H(const Blob<Dtype>* H);
+
+  /// The internal SoftmaxLayer used to map predictions to a distribution.
+  shared_ptr<Layer<Dtype> > softmax_layer_;
+  /// prob stores the output probability predictions from the SoftmaxLayer.
+  Blob<Dtype> prob_;
+  /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
+  vector<Blob<Dtype>*> softmax_bottom_vec_;
+  /// top vector holder used in call to the underlying SoftmaxLayer::Forward
+  vector<Blob<Dtype>*> softmax_top_vec_;
+
+  Blob<Dtype> infogain_;
+  Blob<Dtype> sum_rows_H_;  // cache the row sums of H.
+
+  /// Whether to ignore instances with a certain label.
+  bool has_ignore_label_;
+  /// The label indicating that an instance should be ignored.
+  int ignore_label_;
+  /// How to normalize the output loss.
+  LossParameter_NormalizationMode normalization_;
+
+  int infogain_axis_, outer_num_, inner_num_, num_labels_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_INFOGAIN_LOSS_LAYER_HPP_
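
A sketch of the per-sample loss documented above, -sum_k H[l_n][k] * log(p_hat[n,k]), with probabilities clamped away from zero in the spirit of kLOG_THRESHOLD from loss_layer.hpp; with H = I this collapses to -log(p_hat[n, l_n]), the usual multinomial logistic loss:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // -sum_k H[label][k] * log(p[k]) for one sample, given softmax
    // probabilities p and an infogain matrix H; H = I recovers the plain
    // multinomial logistic loss -log(p[label]).
    float infogain_loss(const std::vector<std::vector<float> >& H,
                        const std::vector<float>& prob, int label) {
      float loss = 0.0f;
      for (size_t k = 0; k < prob.size(); ++k)
        loss -= H[label][k] * std::log(std::max(prob[k], 1e-20f));
      return loss;
    }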

+ 52 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/inner_product_layer.hpp

@@ -0,0 +1,52 @@
+#ifndef CAFFE_INNER_PRODUCT_LAYER_HPP_
+#define CAFFE_INNER_PRODUCT_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Also known as a "fully-connected" layer, computes an inner product
+ *        with a set of learned weights, and (optionally) adds biases.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class InnerProductLayer : public Layer<Dtype> {
+ public:
+  explicit InnerProductLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "InnerProduct"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int M_;
+  int K_;
+  int N_;
+  bool bias_term_;
+  Blob<Dtype> bias_multiplier_;
+  bool transpose_;  ///< if true, assume transposed weights
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_INNER_PRODUCT_LAYER_HPP_

+ 42 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/input_layer.hpp

@@ -0,0 +1,42 @@
+#ifndef CAFFE_INPUT_LAYER_HPP_
+#define CAFFE_INPUT_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Provides data to the Net by assigning tops directly.
+ *
+ * This data layer is a container that merely holds the data assigned to it;
+ * forward, backward, and reshape are all no-ops.
+ */
+template <typename Dtype>
+class InputLayer : public Layer<Dtype> {
+ public:
+  explicit InputLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  // Data layers have no bottoms, so reshaping is trivial.
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  virtual inline const char* type() const { return "Input"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int MinTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_INPUT_LAYER_HPP_

+ 82 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/log_layer.hpp

@@ -0,0 +1,82 @@
+#ifndef CAFFE_LOG_LAYER_HPP_
+#define CAFFE_LOG_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes @f$ y = log_{\gamma}(\alpha x + \beta) @f$,
+ *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
+ *        and base @f$ \gamma @f$.
+ */
+template <typename Dtype>
+class LogLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides LogParameter log_param,
+   *     with LogLayer options:
+   *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+   *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+   *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
+   *         the base @f$ \gamma @f$
+   */
+  explicit LogLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Log"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = log_{\gamma}(\alpha x + \beta)
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the log inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} =
+   *            \frac{\partial E}{\partial y}
+   *            \frac{\alpha}{(\alpha x + \beta) \log_e(\gamma)}
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Dtype base_scale_;
+  Dtype input_scale_, input_shift_;
+  Dtype backward_num_scale_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_LOG_LAYER_HPP_
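
Numerically, y = log_gamma(alpha*x + beta) = ln(alpha*x + beta) / ln(gamma); the base_scale_ member above presumably caches 1/ln(gamma) (an assumption from the name). A one-line sketch:

    #include <cmath>

    // log_gamma(alpha*x + beta) via the change-of-base identity.
    float log_forward(float x, float alpha, float beta, float gamma) {
      return std::log(alpha * x + beta) / std::log(gamma);
    }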

+ 53 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/loss_layer.hpp

@@ -0,0 +1,53 @@
+#ifndef CAFFE_LOSS_LAYER_HPP_
+#define CAFFE_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+const float kLOG_THRESHOLD = 1e-20;
+
+/**
+ * @brief An interface for Layer%s that take two Blob%s as input -- usually
+ *        (1) predictions and (2) ground-truth labels -- and output a
+ *        singleton Blob representing the loss.
+ *
+ * LossLayers are typically only capable of backpropagating to their first input
+ * -- the predictions.
+ */
+template <typename Dtype>
+class LossLayer : public Layer<Dtype> {
+ public:
+  explicit LossLayer(const LayerParameter& param)
+     : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(
+      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(
+      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+
+  /**
+   * @brief For convenience and backwards compatibility, instruct the Net to
+   *        automatically allocate a single top Blob for LossLayers, into which
+   *        they output their singleton loss, (even if the user didn't specify
+   *        one in the prototxt, etc.).
+   */
+  virtual inline bool AutoTopBlobs() const { return true; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+  /**
+   * We usually cannot backpropagate to the labels; ignore force_backward for
+   * these inputs.
+   */
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    return bottom_index != 1;
+  }
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_LOSS_LAYER_HPP_

+ 94 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/lrn_layer.hpp

@@ -0,0 +1,94 @@
+#ifndef CAFFE_LRN_LAYER_HPP_
+#define CAFFE_LRN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/eltwise_layer.hpp"
+#include "caffe/layers/pooling_layer.hpp"
+#include "caffe/layers/power_layer.hpp"
+#include "caffe/layers/split_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Normalize the input in a local region across or within feature maps.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class LRNLayer : public Layer<Dtype> {
+ public:
+  explicit LRNLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "LRN"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  virtual void CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void WithinChannelBackward(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int size_;
+  int pre_pad_;
+  Dtype alpha_;
+  Dtype beta_;
+  Dtype k_;
+  int num_;
+  int channels_;
+  int height_;
+  int width_;
+
+  // Fields used for normalization ACROSS_CHANNELS
+  // scale_ stores the intermediate summing results
+  Blob<Dtype> scale_;
+
+  // Fields used for normalization WITHIN_CHANNEL
+  shared_ptr<SplitLayer<Dtype> > split_layer_;
+  vector<Blob<Dtype>*> split_top_vec_;
+  shared_ptr<PowerLayer<Dtype> > square_layer_;
+  Blob<Dtype> square_input_;
+  Blob<Dtype> square_output_;
+  vector<Blob<Dtype>*> square_bottom_vec_;
+  vector<Blob<Dtype>*> square_top_vec_;
+  shared_ptr<PoolingLayer<Dtype> > pool_layer_;
+  Blob<Dtype> pool_output_;
+  vector<Blob<Dtype>*> pool_top_vec_;
+  shared_ptr<PowerLayer<Dtype> > power_layer_;
+  Blob<Dtype> power_output_;
+  vector<Blob<Dtype>*> power_top_vec_;
+  shared_ptr<EltwiseLayer<Dtype> > product_layer_;
+  Blob<Dtype> product_input_;
+  vector<Blob<Dtype>*> product_bottom_vec_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_LRN_LAYER_HPP_
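
For the ACROSS_CHANNELS mode, the response at channel c is divided by (k + (alpha/n) * sum of squares over an n-channel window)^beta. A sketch at a single spatial position; the names and the edge-clamping convention are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // ACROSS_CHANNELS local response normalization at one spatial position:
    // y_c = x_c / (k + (alpha / n) * sum_{j in window(c)} x_j^2)^beta.
    std::vector<float> lrn_across_channels(const std::vector<float>& x, int n,
                                           float k, float alpha, float beta) {
      const int C = static_cast<int>(x.size());
      std::vector<float> y(C);
      for (int c = 0; c < C; ++c) {
        float sum = 0.0f;
        for (int j = std::max(0, c - n / 2); j <= std::min(C - 1, c + n / 2); ++j)
          sum += x[j] * x[j];
        y[c] = x[c] / std::pow(k + alpha / n * sum, beta);
      }
      return y;
    }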

+ 154 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/lstm_layer.hpp

@@ -0,0 +1,154 @@
+#ifndef CAFFE_LSTM_LAYER_HPP_
+#define CAFFE_LSTM_LAYER_HPP_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layers/recurrent_layer.hpp"
+#include "caffe/net.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+template <typename Dtype> class RecurrentLayer;
+
+/**
+ * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM)
+ *        [1] style recurrent neural network (RNN). Implemented by unrolling
+ *        the LSTM computation through time.
+ *
+ * The specific architecture used in this implementation is as described in
+ * "Learning to Execute" [2], reproduced below:
+ *     i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ]
+ *     f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ]
+ *     o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ]
+ *     g_t :=    \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ]
+ *     c_t := (f_t .* c_{t-1}) + (i_t .* g_t)
+ *     h_t := o_t .* \tanh[c_t]
+ * In the implementation, the i, f, o, and g computations are performed as a
+ * single inner product.
+ *
+ * Notably, this implementation lacks the "diagonal" gates, as used in the
+ * LSTM architectures described by Alex Graves [3] and others.
+ *
+ * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory."
+ *     Neural Computation 9, no. 8 (1997): 1735-1780.
+ *
+ * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute."
+ *     arXiv preprint arXiv:1410.4615 (2014).
+ *
+ * [3] Graves, Alex. "Generating sequences with recurrent neural networks."
+ *     arXiv preprint arXiv:1308.0850 (2013).
+ */
+template <typename Dtype>
+class LSTMLayer : public RecurrentLayer<Dtype> {
+ public:
+  explicit LSTMLayer(const LayerParameter& param)
+      : RecurrentLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "LSTM"; }
+
+ protected:
+  virtual void FillUnrolledNet(NetParameter* net_param) const;
+  virtual void RecurrentInputBlobNames(vector<string>* names) const;
+  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
+  virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const;
+  virtual void OutputBlobNames(vector<string>* names) const;
+};
+
+/**
+ * @brief A helper for LSTMLayer: computes a single timestep of the
+ *        non-linearity of the LSTM, producing the updated cell and hidden
+ *        states.
+ */
+template <typename Dtype>
+class LSTMUnitLayer : public Layer<Dtype> {
+ public:
+  explicit LSTMUnitLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "LSTMUnit"; }
+  virtual inline int ExactNumBottomBlobs() const { return 3; }
+  virtual inline int ExactNumTopBlobs() const { return 2; }
+
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    // Can't propagate to sequence continuation indicators.
+    return bottom_index != 2;
+  }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 3)
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the previous timestep cell state @f$ c_{t-1} @f$
+   *   -# @f$ (1 \times N \times 4D) @f$
+   *      the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$
+   *   -# @f$ (1 \times N) @f$
+   *      the sequence continuation indicators  @f$ \delta_t @f$
+   * @param top output Blob vector (length 2)
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the updated cell state @f$ c_t @f$, computed as:
+   *          i_t := \sigmoid[i_t']
+   *          f_t := \sigmoid[f_t']
+   *          o_t := \sigmoid[o_t']
+   *          g_t := \tanh[g_t']
+   *          c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the updated hidden state @f$ h_t @f$, computed as:
+   *          h_t := o_t .* \tanh[c_t]
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the LSTMUnit inputs.
+   *
+   * @param top output Blob vector (length 2), providing the error gradient with
+   *        respect to the outputs
+   *   -# @f$ (1 \times N \times D) @f$:
+   *      containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$
+   *      with respect to the updated cell state @f$ c_t @f$
+   *   -# @f$ (1 \times N \times D) @f$:
+   *      containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$
+ *      with respect to the updated hidden state @f$ h_t @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 3), into which the error gradients
+   *        with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate
+ *        inputs are computed.  Computation of the error gradients w.r.t.
+   *        the sequence indicators is not implemented.
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the error gradient w.r.t. the previous timestep cell state
+   *      @f$ c_{t-1} @f$
+   *   -# @f$ (1 \times N \times 4D) @f$
+   *      the error gradient w.r.t. the "gate inputs"
+   *      @f$ [
+   *          \frac{\partial E}{\partial i_t}
+   *          \frac{\partial E}{\partial f_t}
+   *          \frac{\partial E}{\partial o_t}
+   *          \frac{\partial E}{\partial g_t}
+   *          ] @f$
+   *   -# @f$ (1 \times N) @f$
+   *      the gradient w.r.t. the sequence continuation indicators
+   *      @f$ \delta_t @f$ is currently not computed.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief The hidden and output dimension.
+  int hidden_dim_;
+  Blob<Dtype> X_acts_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_LSTM_LAYER_HPP_
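
Reviewer note: a minimal standalone sketch of one LSTMUnit timestep, transcribing the equations in the doc comment above (illustrative only, not part of this diff; lstm_unit_step is a made-up name):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    static double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

    // One LSTMUnit timestep for a single stream with hidden dimension D.
    // gates holds the 4*D pre-activations [i', f', o', g']; cont is delta_t.
    void lstm_unit_step(const std::vector<double>& c_prev,
                        const std::vector<double>& gates, double cont,
                        std::vector<double>* c_t, std::vector<double>* h_t) {
      const size_t D = c_prev.size();
      for (size_t d = 0; d < D; ++d) {
        const double i = sigmoid(gates[0 * D + d]);
        const double f = sigmoid(gates[1 * D + d]);
        const double o = sigmoid(gates[2 * D + d]);
        const double g = std::tanh(gates[3 * D + d]);
        (*c_t)[d] = cont * f * c_prev[d] + i * g;  // c_t := cont*(f .* c_{t-1}) + i .* g
        (*h_t)[d] = o * std::tanh((*c_t)[d]);      // h_t := o .* tanh(c_t)
      }
    }

    int main() {
      std::vector<double> c_prev = {0.1, -0.2}, gates(8, 0.5), c(2), h(2);
      lstm_unit_step(c_prev, gates, /*cont=*/1.0, &c, &h);
      std::printf("c_t=(%.4f, %.4f) h_t=(%.4f, %.4f)\n", c[0], c[1], h[0], h[1]);
      return 0;
    }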

+ 63 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/memory_data_layer.hpp

@@ -0,0 +1,63 @@
+#ifndef CAFFE_MEMORY_DATA_LAYER_HPP_
+#define CAFFE_MEMORY_DATA_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/base_data_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Provides data to the Net from memory.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class MemoryDataLayer : public BaseDataLayer<Dtype> {
+ public:
+  explicit MemoryDataLayer(const LayerParameter& param)
+      : BaseDataLayer<Dtype>(param), has_new_data_(false) {}
+  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "MemoryData"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int ExactNumTopBlobs() const { return 2; }
+
+  virtual void AddDatumVector(const vector<Datum>& datum_vector);
+#ifdef USE_OPENCV
+  virtual void AddMatVector(const vector<cv::Mat>& mat_vector,
+      const vector<int>& labels);
+#endif  // USE_OPENCV
+
+  // Reset should accept const pointers, but can't, because the memory
+  //  will be given to Blob, which is mutable
+  void Reset(Dtype* data, Dtype* label, int n);
+  void set_batch_size(int new_size);
+
+  int batch_size() { return batch_size_; }
+  int channels() { return channels_; }
+  int height() { return height_; }
+  int width() { return width_; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  int batch_size_, channels_, height_, width_, size_;
+  Dtype* data_;
+  Dtype* labels_;
+  int n_;
+  size_t pos_;
+  Blob<Dtype> added_data_;
+  Blob<Dtype> added_label_;
+  bool has_new_data_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_MEMORY_DATA_LAYER_HPP_
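
Reviewer note: a hedged usage sketch for this layer's Reset() path (assumes a Caffe build where Net::Forward() and this Net constructor exist; the prototxt name and the position of the MemoryData layer are illustrative):

    #include <vector>
    #include <boost/shared_ptr.hpp>
    #include "caffe/caffe.hpp"
    #include "caffe/layers/memory_data_layer.hpp"

    int main() {
      // "deploy.prototxt" is a placeholder; its first layer must be MemoryData.
      caffe::Net<float> net("deploy.prototxt", caffe::TEST);
      boost::shared_ptr<caffe::MemoryDataLayer<float> > md =
          boost::dynamic_pointer_cast<caffe::MemoryDataLayer<float> >(
              net.layers()[0]);
      std::vector<float> data(md->batch_size() * md->channels() *
                              md->height() * md->width());
      std::vector<float> labels(md->batch_size(), 0.0f);
      // Reset keeps raw pointers (see the comment in the header), so the
      // buffers must outlive Forward(); n must be a multiple of the batch size.
      md->Reset(&data[0], &labels[0], md->batch_size());
      net.Forward();
      return 0;
    }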

+ 92 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/multinomial_logistic_loss_layer.hpp

@@ -0,0 +1,92 @@
+#ifndef CAFFE_MULTINOMIAL_LOGISTIC_LOSS_LAYER_HPP_
+#define CAFFE_MULTINOMIAL_LOGISTIC_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes the multinomial logistic loss for a one-of-many
+ *        classification task, directly taking a predicted probability
+ *        distribution as input.
+ *
+ * When predictions are not already a probability distribution, you should
+ * instead use the SoftmaxWithLossLayer, which maps predictions to a
+ * distribution using the SoftmaxLayer, before computing the multinomial
+ * logistic loss. The SoftmaxWithLossLayer should be preferred over separate
+ * SoftmaxLayer + MultinomialLogisticLossLayer
+ * as its gradient computation is more numerically stable.
+ *
+ * @param bottom input Blob vector (length 2)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the predictions @f$ \hat{p} @f$, a Blob with values in
+ *      @f$ [0, 1] @f$ indicating the predicted probability of each of the
+ *      @f$ K = CHW @f$ classes.  Each prediction vector @f$ \hat{p}_n @f$
+ *      should sum to 1 as in a probability distribution: @f$
+ *      \forall n \sum\limits_{k=1}^K \hat{p}_{nk} = 1 @f$.
+ *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+ *      the labels @f$ l @f$, an integer-valued Blob with values
+ *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+ *      indicating the correct class label among the @f$ K @f$ classes
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+ *      the computed multinomial logistic loss: @f$ E =
+ *        \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n})
+ *      @f$
+ */
+template <typename Dtype>
+class MultinomialLogisticLossLayer : public LossLayer<Dtype> {
+ public:
+  explicit MultinomialLogisticLossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "MultinomialLogisticLoss"; }
+
+ protected:
+  /// @copydoc MultinomialLogisticLossLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the multinomial logistic loss error gradient w.r.t. the
+   *        predictions.
+   *
+   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+   * if propagate_down[1] is set.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+   *      as @f$ \lambda @f$ is the coefficient of this layer's output
+   *      @f$\ell_i@f$ in the overall Net loss
+   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+   *      (*Assuming that this top Blob is not used as a bottom (input) by any
+   *      other layer of the Net.)
+   * @param propagate_down see Layer::Backward.
+   *      propagate_down[1] must be false as we can't compute gradients with
+   *      respect to the labels.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the predictions @f$ \hat{p} @f$; Backward computes diff
+   *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
+   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+   *      the labels -- ignored as we can't compute their error gradients
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_MULTINOMIAL_LOGISTIC_LOSS_LAYER_HPP_
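
Reviewer note: the loss formula documented above, E = -(1/N) * sum_n log(p_hat[n][l_n]), as a standalone sketch; the epsilon clamp stands in for the log-threshold guard the CPU implementation applies (its exact constant is an assumption here):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    double multinomial_logistic_loss(
        const std::vector<std::vector<double> >& p_hat,
        const std::vector<int>& labels, double eps = 1e-12) {
      double loss = 0.0;
      for (size_t n = 0; n < p_hat.size(); ++n) {
        // Clamp the predicted probability away from zero before taking log.
        loss -= std::log(std::max(p_hat[n][labels[n]], eps));
      }
      return loss / p_hat.size();
    }

    int main() {
      std::vector<std::vector<double> > p = {{0.7, 0.2, 0.1}, {0.1, 0.8, 0.1}};
      std::vector<int> l = {0, 1};
      std::printf("loss = %f\n", multinomial_logistic_loss(p, l));
      return 0;
    }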

+ 48 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/mvn_layer.hpp

@@ -0,0 +1,48 @@
+#ifndef CAFFE_MVN_LAYER_HPP_
+#define CAFFE_MVN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Normalizes the input to have 0-mean and/or unit (1) variance.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class MVNLayer : public Layer<Dtype> {
+ public:
+  explicit MVNLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "MVN"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<Dtype> mean_, variance_, temp_;
+
+  /// sum_multiplier is used to carry out sum using BLAS
+  Blob<Dtype> sum_multiplier_;
+  Dtype eps_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_MVN_LAYER_HPP_
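
Reviewer note: a standalone sketch of the per-sample normalization MVN performs (eps stands in for the layer's variance epsilon; its default value and exact placement are assumptions):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Normalize one sample to zero mean and (optionally) unit variance.
    void mvn(std::vector<double>* x, bool normalize_variance = true,
             double eps = 1e-9) {
      double mean = 0.0, var = 0.0;
      for (size_t i = 0; i < x->size(); ++i) mean += (*x)[i];
      mean /= x->size();
      for (size_t i = 0; i < x->size(); ++i) {
        const double d = (*x)[i] - mean;
        var += d * d;
      }
      var /= x->size();
      const double denom = normalize_variance ? std::sqrt(var + eps) : 1.0;
      for (size_t i = 0; i < x->size(); ++i) (*x)[i] = ((*x)[i] - mean) / denom;
    }

    int main() {
      std::vector<double> x = {1.0, 2.0, 3.0, 4.0};
      mvn(&x);
      std::printf("x[0]=%.4f x[3]=%.4f\n", x[0], x[3]);
      return 0;
    }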

+ 32 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/neuron_layer.hpp

@@ -0,0 +1,32 @@
+#ifndef CAFFE_NEURON_LAYER_HPP_
+#define CAFFE_NEURON_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief An interface for layers that take one blob as input (@f$ x @f$)
+ *        and produce one equally-sized blob as output (@f$ y @f$), where
+ *        each element of the output depends only on the corresponding input
+ *        element.
+ */
+template <typename Dtype>
+class NeuronLayer : public Layer<Dtype> {
+ public:
+  explicit NeuronLayer(const LayerParameter& param)
+     : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_NEURON_LAYER_HPP_

+ 45 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/parameter_layer.hpp

@@ -0,0 +1,45 @@
+#ifndef CAFFE_PARAMETER_LAYER_HPP_
+#define CAFFE_PARAMETER_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+class ParameterLayer : public Layer<Dtype> {
+ public:
+  explicit ParameterLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    if (this->blobs_.size() > 0) {
+      LOG(INFO) << "Skipping parameter initialization";
+    } else {
+      this->blobs_.resize(1);
+      this->blobs_[0].reset(new Blob<Dtype>());
+      this->blobs_[0]->Reshape(this->layer_param_.parameter_param().shape());
+    }
+    top[0]->Reshape(this->layer_param_.parameter_param().shape());
+  }
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) { }
+  virtual inline const char* type() const { return "Parameter"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    top[0]->ShareData(*(this->blobs_[0]));
+    top[0]->ShareDiff(*(this->blobs_[0]));
+  }
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom)
+  { }
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_PARAMETER_LAYER_HPP_

+ 61 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/pooling_layer.hpp

@@ -0,0 +1,61 @@
+#ifndef CAFFE_POOLING_LAYER_HPP_
+#define CAFFE_POOLING_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Pools the input image by taking the max, average, etc. within regions.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class PoolingLayer : public Layer<Dtype> {
+ public:
+  explicit PoolingLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Pooling"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  // MAX POOL layers can output an extra top blob for the mask;
+  // others can only output the pooled inputs.
+  virtual inline int MaxTopBlobs() const {
+    return (this->layer_param_.pooling_param().pool() ==
+            PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+  }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int kernel_h_, kernel_w_;
+  int stride_h_, stride_w_;
+  int pad_h_, pad_w_;
+  int channels_;
+  int height_, width_;
+  int pooled_height_, pooled_width_;
+  bool global_pooling_;
+  PoolingParameter_RoundMode round_mode_;
+  Blob<Dtype> rand_idx_;
+  Blob<int> max_idx_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_POOLING_LAYER_HPP_
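
Reviewer note: the pooled output geometry implied by the kernel_/stride_/pad_ members above follows the standard formula; a standalone sketch (whether round_mode_ defaults to ceil or floor is left to the proto, so it is a parameter here):

    #include <cmath>
    #include <cstdio>

    // pooled = round((in + 2*pad - kernel) / stride) + 1, with ceil or floor
    // selected by the layer's round_mode_.
    int pooled_dim(int in, int pad, int kernel, int stride, bool use_ceil) {
      const double d = static_cast<double>(in + 2 * pad - kernel) / stride;
      return (use_ceil ? static_cast<int>(std::ceil(d))
                       : static_cast<int>(std::floor(d))) + 1;
    }

    int main() {
      // A 7x7 input, 3x3 kernel, stride 2, no padding -> 3x3 output either way.
      std::printf("pooled = %d\n", pooled_dim(7, 0, 3, 2, true));
      return 0;
    }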

+ 89 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/power_layer.hpp

@@ -0,0 +1,89 @@
+#ifndef CAFFE_POWER_LAYER_HPP_
+#define CAFFE_POWER_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes @f$ y = (\alpha x + \beta) ^ \gamma @f$,
+ *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
+ *        and power @f$ \gamma @f$.
+ */
+template <typename Dtype>
+class PowerLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides PowerParameter power_param,
+   *     with PowerLayer options:
+   *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+   *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+   *   - power (\b optional, default 1) the power @f$ \gamma @f$
+   */
+  explicit PowerLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Power"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = (\alpha x + \beta) ^ \gamma
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the power inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} =
+   *            \frac{\partial E}{\partial y}
+   *            \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} =
+   *            \frac{\partial E}{\partial y}
+   *            \frac{\alpha \gamma y}{\alpha x + \beta}
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief @f$ \gamma @f$ from layer_param_.power_param()
+  Dtype power_;
+  /// @brief @f$ \alpha @f$ from layer_param_.power_param()
+  Dtype scale_;
+  /// @brief @f$ \beta @f$ from layer_param_.power_param()
+  Dtype shift_;
+  /// @brief Result of @f$ \alpha \gamma @f$
+  Dtype diff_scale_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_POWER_LAYER_HPP_
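
Reviewer note: the forward and backward formulas documented above, checked as a tiny standalone program (values are arbitrary):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double alpha = 2.0, beta = 1.0, gamma = 3.0;  // scale, shift, power
      const double x = 0.5, dE_dy = 1.0;                  // input, top diff
      // y = (alpha*x + beta)^gamma
      const double y = std::pow(alpha * x + beta, gamma);
      // dE/dx = dE/dy * alpha*gamma*(alpha*x + beta)^(gamma-1)
      const double dE_dx =
          dE_dy * alpha * gamma * std::pow(alpha * x + beta, gamma - 1);
      std::printf("y = %f, dE/dx = %f\n", y, dE_dx);  // y = 8, dE/dx = 24
      return 0;
    }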

+ 101 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/prelu_layer.hpp

@@ -0,0 +1,101 @@
+#ifndef CAFFE_PRELU_LAYER_HPP_
+#define CAFFE_PRELU_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Parameterized Rectified Linear Unit non-linearity @f$
+ *        y_i = \max(0, x_i) + a_i \min(0, x_i)
+ *        @f$. The differences from ReLULayer are 1) negative slopes are
+ *        learnable though backprop and 2) negative slopes can vary across
+ *        channels. The number of axes of input blob should be greater than or
+ *        equal to 2. The 1st axis (0-based) is seen as channels.
+ */
+template <typename Dtype>
+class PReLULayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides PReLUParameter prelu_param,
+   *     with PReLULayer options:
+   *   - filler (\b optional, FillerParameter,
+   *     default {'type': constant 'value':0.25}).
+   *   - channel_shared (\b optional, default false). If true, the negative
+   *     slopes are shared across channels.
+   */
+  explicit PReLULayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "PReLU"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times ...) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times ...) @f$
+   *      the computed outputs for each channel @f$i@f$ @f$
+   *        y_i = \max(0, x_i) + a_i \min(0, x_i)
+   *      @f$.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the PReLU inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times ...) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times ...) @f$
+   *      the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their
+   *      diff with gradients @f$
+   *        \frac{\partial E}{\partial x_i} = \left\{
+   *        \begin{array}{lr}
+   *            a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
+   *            \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i > 0
+   *        \end{array} \right.
+   *      @f$.
+   *      If param_propagate_down_[0] is true, it fills the diff with gradients
+   *      @f$
+   *        \frac{\partial E}{\partial a_i} = \left\{
+   *        \begin{array}{lr}
+   *            \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
+   *            0 & \mathrm{if} \; x_i > 0
+   *        \end{array} \right.
+   *      @f$.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool channel_shared_;
+  Blob<Dtype> multiplier_;  // dot multiplier for backward computation of params
+  Blob<Dtype> backward_buff_;  // temporary buffer for backward computation
+  Blob<Dtype> bottom_memory_;  // memory for in-place computation
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_PRELU_LAYER_HPP_
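
Reviewer note: the per-channel PReLU rule above, sketched standalone (the [channel][spatial] layout and shapes are illustrative; Caffe's actual blobs are flat arrays):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // y_i = max(0, x_i) + a_i * min(0, x_i), one learnable slope per channel.
    void prelu_forward(const std::vector<std::vector<double> >& x,
                       const std::vector<double>& a,
                       std::vector<std::vector<double> >* y) {
      for (size_t c = 0; c < x.size(); ++c)
        for (size_t i = 0; i < x[c].size(); ++i)
          (*y)[c][i] = std::max(0.0, x[c][i]) + a[c] * std::min(0.0, x[c][i]);
    }

    int main() {
      std::vector<std::vector<double> > x = {{-1.0, 2.0}}, y = {{0.0, 0.0}};
      std::vector<double> a = {0.25};
      prelu_forward(x, a, &y);
      std::printf("y = (%f, %f)\n", y[0][0], y[0][1]);  // (-0.25, 2.0)
      return 0;
    }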

+ 55 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/python_layer.hpp

@@ -0,0 +1,55 @@
+#ifndef CAFFE_PYTHON_LAYER_HPP_
+#define CAFFE_PYTHON_LAYER_HPP_
+
+#include <boost/python.hpp>
+#include <vector>
+
+#include "caffe/layer.hpp"
+
+namespace bp = boost::python;
+
+namespace caffe {
+
+template <typename Dtype>
+class PythonLayer : public Layer<Dtype> {
+ public:
+  PythonLayer(PyObject* self, const LayerParameter& param)
+      : Layer<Dtype>(param), self_(bp::handle<>(bp::borrowed(self))) { }
+
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    // Disallow PythonLayer in MultiGPU training stage, due to GIL issues
+    // Details: https://github.com/BVLC/caffe/issues/2936
+    if (this->phase_ == TRAIN && Caffe::solver_count() > 1
+        && !Caffe::multiprocess()) {
+      LOG(FATAL) << "PythonLayer does not support CLI Multi-GPU, use train.py";
+    }
+    self_.attr("param_str") = bp::str(
+        this->layer_param_.python_param().param_str());
+    self_.attr("phase") = static_cast<int>(this->phase_);
+    self_.attr("setup")(bottom, top);
+  }
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    self_.attr("reshape")(bottom, top);
+  }
+
+  virtual inline const char* type() const { return "Python"; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    self_.attr("forward")(bottom, top);
+  }
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    self_.attr("backward")(top, propagate_down, bottom);
+  }
+
+ private:
+  bp::object self_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_PYTHON_LAYER_HPP_

+ 187 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/recurrent_layer.hpp

@@ -0,0 +1,187 @@
+#ifndef CAFFE_RECURRENT_LAYER_HPP_
+#define CAFFE_RECURRENT_LAYER_HPP_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/net.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/format.hpp"
+
+namespace caffe {
+
+template <typename Dtype> class RecurrentLayer;
+
+/**
+ * @brief An abstract class for implementing recurrent behavior inside of an
+ *        unrolled network.  This Layer type cannot be instantiated -- instead,
+ *        you should use one of its implementations which defines the recurrent
+ *        architecture, such as RNNLayer or LSTMLayer.
+ */
+template <typename Dtype>
+class RecurrentLayer : public Layer<Dtype> {
+ public:
+  explicit RecurrentLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reset();
+
+  virtual inline const char* type() const { return "Recurrent"; }
+  virtual inline int MinBottomBlobs() const {
+    int min_bottoms = 2;
+    if (this->layer_param_.recurrent_param().expose_hidden()) {
+      vector<string> inputs;
+      this->RecurrentInputBlobNames(&inputs);
+      min_bottoms += inputs.size();
+    }
+    return min_bottoms;
+  }
+  virtual inline int MaxBottomBlobs() const { return MinBottomBlobs() + 1; }
+  virtual inline int ExactNumTopBlobs() const {
+    int num_tops = 1;
+    if (this->layer_param_.recurrent_param().expose_hidden()) {
+      vector<string> outputs;
+      this->RecurrentOutputBlobNames(&outputs);
+      num_tops += outputs.size();
+    }
+    return num_tops;
+  }
+
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    // Can't propagate to sequence continuation indicators.
+    return bottom_index != 1;
+  }
+
+ protected:
+  /**
+   * @brief Fills net_param with the recurrent network architecture.  Subclasses
+   *        should define this -- see RNNLayer and LSTMLayer for examples.
+   */
+  virtual void FillUnrolledNet(NetParameter* net_param) const = 0;
+
+  /**
+   * @brief Fills names with the names of the 0th timestep recurrent input
+   *        Blob&s.  Subclasses should define this -- see RNNLayer and LSTMLayer
+   *        for examples.
+   */
+  virtual void RecurrentInputBlobNames(vector<string>* names) const = 0;
+
+  /**
+   * @brief Fills shapes with the shapes of the recurrent input Blob&s.
+   *        Subclasses should define this -- see RNNLayer and LSTMLayer
+   *        for examples.
+   */
+  virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const = 0;
+
+  /**
+   * @brief Fills names with the names of the Tth timestep recurrent output
+   *        Blob&s.  Subclasses should define this -- see RNNLayer and LSTMLayer
+   *        for examples.
+   */
+  virtual void RecurrentOutputBlobNames(vector<string>* names) const = 0;
+
+  /**
+   * @brief Fills names with the names of the output blobs, concatenated across
+   *        all timesteps.  Should return a name for each top Blob.
+   *        Subclasses should define this -- see RNNLayer and LSTMLayer for
+   *        examples.
+   */
+  virtual void OutputBlobNames(vector<string>* names) const = 0;
+
+  /**
+   * @param bottom input Blob vector (length 2-3)
+   *
+   *   -# @f$ (T \times N \times ...) @f$
+   *      the time-varying input @f$ x @f$.  After the first two axes, whose
+   *      dimensions must correspond to the number of timesteps @f$ T @f$ and
+   *      the number of independent streams @f$ N @f$, respectively, its
+   *      dimensions may be arbitrary.  Note that the ordering of dimensions --
+   *      @f$ (T \times N \times ...) @f$, rather than
+   *      @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$
+   *      independent input streams must be "interleaved".
+   *
+   *   -# @f$ (T \times N) @f$
+   *      the sequence continuation indicators @f$ \delta @f$.
+   *      These inputs should be binary (0 or 1) indicators, where
+   *      @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream
+   *      @f$ n @f$ is the beginning of a new sequence, and hence the previous
+   *      hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$
+   *      and has no effect on the cell's output at timestep @f$ t @f$, and
+   *      a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of
+   *      stream @f$ n @f$ is a continuation from the previous timestep
+   *      @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the
+   *      updated hidden state and output.
+   *
+   *   -# @f$ (N \times ...) @f$ (optional)
+   *      the static (non-time-varying) input @f$ x_{static} @f$.
+   *      After the first axis, whose dimension must be the number of
+   *      independent streams, its dimensions may be arbitrary.
+   *      This is mathematically equivalent to using a time-varying input of
+   *      @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input
+   *      across the @f$ T @f$ timesteps and concatenating with the time-varying
+   *      input.  Note that if this input is used, all timesteps in a single
+   *      batch within a particular one of the @f$ N @f$ streams must share the
+   *      same static input, even if the sequence continuation indicators
+   *      suggest that different sequences are ending and beginning within a
+   *      single batch.  This may require padding and/or truncation for uniform
+   *      length.
+   *
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (T \times N \times D) @f$
+   *      the time-varying output @f$ y @f$, where @f$ D @f$ is
+   *      <code>recurrent_param.num_output()</code>.
+   *      Refer to documentation for particular RecurrentLayer implementations
+   *      (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief A Net to implement the Recurrent functionality.
+  shared_ptr<Net<Dtype> > unrolled_net_;
+
+  /// @brief The number of independent streams to process simultaneously.
+  int N_;
+
+  /**
+   * @brief The number of timesteps in the layer's input, and the number of
+   *        timesteps over which to backpropagate through time.
+   */
+  int T_;
+
+  /// @brief Whether the layer has a "static" input copied across all timesteps.
+  bool static_input_;
+
+  /**
+   * @brief The last layer to run in the network. (Any later layers are losses
+   *        added to force the recurrent net to do backprop.)
+   */
+  int last_layer_index_;
+
+  /**
+   * @brief Whether the layer's hidden state at the first and last timesteps
+   *        are layer inputs and outputs, respectively.
+   */
+  bool expose_hidden_;
+
+  vector<Blob<Dtype>* > recur_input_blobs_;
+  vector<Blob<Dtype>* > recur_output_blobs_;
+  vector<Blob<Dtype>* > output_blobs_;
+  Blob<Dtype>* x_input_blob_;
+  Blob<Dtype>* x_static_input_blob_;
+  Blob<Dtype>* cont_input_blob_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_RECURRENT_LAYER_HPP_
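
Reviewer note: a standalone sketch of assembling the sequence continuation indicators delta described above (the T x N layout matches the doc comment; everything else is illustrative):

    #include <cstdio>
    #include <vector>

    // delta[t][n] = 0 where a new sequence starts in stream n, 1 elsewhere.
    std::vector<std::vector<int> > make_cont(
        const std::vector<std::vector<bool> >& seq_start) {
      const size_t T = seq_start.size(), N = seq_start[0].size();
      std::vector<std::vector<int> > cont(T, std::vector<int>(N, 1));
      for (size_t t = 0; t < T; ++t)
        for (size_t n = 0; n < N; ++n)
          if (seq_start[t][n]) cont[t][n] = 0;
      return cont;
    }

    int main() {
      // Two streams over three timesteps; stream 1 restarts at t = 2.
      std::vector<std::vector<bool> > start = {
          {true, true}, {false, false}, {false, true}};
      std::vector<std::vector<int> > cont = make_cont(start);
      std::printf("cont[2] = (%d, %d)\n", cont[2][0], cont[2][1]);  // (1, 0)
      return 0;
    }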

+ 59 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/reduction_layer.hpp

@@ -0,0 +1,59 @@
+#ifndef CAFFE_REDUCTION_LAYER_HPP_
+#define CAFFE_REDUCTION_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Compute "reductions" -- operations that return a scalar output Blob
+ *        for an input Blob of arbitrary size, such as the sum, absolute sum,
+ *        and sum of squares.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class ReductionLayer : public Layer<Dtype> {
+ public:
+  explicit ReductionLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Reduction"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief the reduction operation performed by the layer
+  ReductionParameter_ReductionOp op_;
+  /// @brief a scalar coefficient applied to all outputs
+  Dtype coeff_;
+  /// @brief the index of the first input axis to reduce
+  int axis_;
+  /// @brief the number of reductions performed
+  int num_;
+  /// @brief the input size of each reduction
+  int dim_;
+  /// @brief a helper Blob used for summation (op_ == SUM)
+  Blob<Dtype> sum_multiplier_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_REDUCTION_LAYER_HPP_
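
Reviewer note: a standalone sketch of the reductions this layer describes, each scaled by coeff_ (the op list SUM, ASUM, SUMSQ, MEAN is taken from ReductionParameter in caffe.proto; treating that enum as given):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    enum class Op { SUM, ASUM, SUMSQ, MEAN };

    double reduce(const std::vector<double>& x, Op op, double coeff = 1.0) {
      double r = 0.0;
      for (size_t i = 0; i < x.size(); ++i) {
        switch (op) {
          case Op::SUM:   r += x[i];                break;
          case Op::ASUM:  r += std::fabs(x[i]);     break;
          case Op::SUMSQ: r += x[i] * x[i];         break;
          case Op::MEAN:  r += x[i];                break;
        }
      }
      if (op == Op::MEAN) r /= x.size();
      return coeff * r;  // the layer applies its scalar coefficient last
    }

    int main() {
      std::vector<double> x = {1.0, -2.0, 3.0};
      std::printf("sum=%f asum=%f sumsq=%f mean=%f\n",
                  reduce(x, Op::SUM), reduce(x, Op::ASUM),
                  reduce(x, Op::SUMSQ), reduce(x, Op::MEAN));
      return 0;
    }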

Some files were not shown because too many files changed in this diff