
add perception_lidar_* into api.

yuchuli committed 4 months ago
commit 35a612c68a
100 changed files with 13047 additions and 0 deletions
  1. src/api/commonfile/PyModuleCommModule.py (+91 -0)
  2. src/api/commonfile/libcaffe.so (+1 -0)
  3. src/api/commonfile/libcaffe.so.1.0.0 (BIN)
  4. src/api/commonfile/libmodulecomm.so (BIN)
  5. src/api/commonfile/libxmlparam.so (BIN)
  6. src/api/commonfile/modulecomm.h (+80 -0)
  7. src/api/commonfile/modulecommpython.so (BIN)
  8. src/api/commonfile/xmlparam.h (+47 -0)
  9. src/api/perception_lidar_PointPillars_MultiHead/Tracker/Ctracker.cpp (+294 -0)
  10. src/api/perception_lidar_PointPillars_MultiHead/Tracker/Ctracker.h (+292 -0)
  11. src/api/perception_lidar_PointPillars_MultiHead/Tracker/HungarianAlg.cpp (+723 -0)
  12. src/api/perception_lidar_PointPillars_MultiHead/Tracker/HungarianAlg.h (+39 -0)
  13. src/api/perception_lidar_PointPillars_MultiHead/Tracker/Kalman.cpp (+873 -0)
  14. src/api/perception_lidar_PointPillars_MultiHead/Tracker/Kalman.h (+120 -0)
  15. src/api/perception_lidar_PointPillars_MultiHead/Tracker/ShortPathCalculator.h (+65 -0)
  16. src/api/perception_lidar_PointPillars_MultiHead/Tracker/Tracking.hpp (+159 -0)
  17. src/api/perception_lidar_PointPillars_MultiHead/Tracker/defines.h (+147 -0)
  18. src/api/perception_lidar_PointPillars_MultiHead/Tracker/track.cpp (+492 -0)
  19. src/api/perception_lidar_PointPillars_MultiHead/Tracker/track.h (+303 -0)
  20. src/api/perception_lidar_PointPillars_MultiHead/cfgs/cbgs_pp_multihead.yaml (+246 -0)
  21. src/api/perception_lidar_PointPillars_MultiHead/common.h (+141 -0)
  22. src/api/perception_lidar_PointPillars_MultiHead/main.cpp (+377 -0)
  23. src/api/perception_lidar_PointPillars_MultiHead/nms.cu (+422 -0)
  24. src/api/perception_lidar_PointPillars_MultiHead/nms.h (+64 -0)
  25. src/api/perception_lidar_PointPillars_MultiHead/object.proto (+51 -0)
  26. src/api/perception_lidar_PointPillars_MultiHead/objectarray.proto (+11 -0)
  27. src/api/perception_lidar_PointPillars_MultiHead/perception_lidar_PointPillars_MultiHead.pro (+182 -0)
  28. src/api/perception_lidar_PointPillars_MultiHead/pointpillars.cc (+507 -0)
  29. src/api/perception_lidar_PointPillars_MultiHead/pointpillars.h (+287 -0)
  30. src/api/perception_lidar_PointPillars_MultiHead/postprocess.cu (+383 -0)
  31. src/api/perception_lidar_PointPillars_MultiHead/postprocess.h (+125 -0)
  32. src/api/perception_lidar_PointPillars_MultiHead/preprocess.cu (+410 -0)
  33. src/api/perception_lidar_PointPillars_MultiHead/preprocess.h (+138 -0)
  34. src/api/perception_lidar_PointPillars_MultiHead/scatter.cu (+73 -0)
  35. src/api/perception_lidar_PointPillars_MultiHead/scatter.h (+77 -0)
  36. src/api/perception_lidar_cnn_segmentation/caffe/blob.hpp (+282 -0)
  37. src/api/perception_lidar_cnn_segmentation/caffe/caffe.hpp (+22 -0)
  38. src/api/perception_lidar_cnn_segmentation/caffe/common.hpp (+193 -0)
  39. src/api/perception_lidar_cnn_segmentation/caffe/data_transformer.hpp (+154 -0)
  40. src/api/perception_lidar_cnn_segmentation/caffe/filler.hpp (+301 -0)
  41. src/api/perception_lidar_cnn_segmentation/caffe/internal_thread.hpp (+53 -0)
  42. src/api/perception_lidar_cnn_segmentation/caffe/layer.hpp (+477 -0)
  43. src/api/perception_lidar_cnn_segmentation/caffe/layer_factory.hpp (+141 -0)
  44. src/api/perception_lidar_cnn_segmentation/caffe/layers/absval_layer.hpp (+68 -0)
  45. src/api/perception_lidar_cnn_segmentation/caffe/layers/accuracy_layer.hpp (+99 -0)
  46. src/api/perception_lidar_cnn_segmentation/caffe/layers/argmax_layer.hpp (+77 -0)
  47. src/api/perception_lidar_cnn_segmentation/caffe/layers/base_conv_layer.hpp (+174 -0)
  48. src/api/perception_lidar_cnn_segmentation/caffe/layers/base_data_layer.hpp (+82 -0)
  49. src/api/perception_lidar_cnn_segmentation/caffe/layers/batch_norm_layer.hpp (+78 -0)
  50. src/api/perception_lidar_cnn_segmentation/caffe/layers/batch_reindex_layer.hpp (+83 -0)
  51. src/api/perception_lidar_cnn_segmentation/caffe/layers/bias_layer.hpp (+54 -0)
  52. src/api/perception_lidar_cnn_segmentation/caffe/layers/bnll_layer.hpp (+70 -0)
  53. src/api/perception_lidar_cnn_segmentation/caffe/layers/clip_layer.hpp (+75 -0)
  54. src/api/perception_lidar_cnn_segmentation/caffe/layers/concat_layer.hpp (+87 -0)
  55. src/api/perception_lidar_cnn_segmentation/caffe/layers/contrastive_loss_layer.hpp (+101 -0)
  56. src/api/perception_lidar_cnn_segmentation/caffe/layers/conv_layer.hpp (+84 -0)
  57. src/api/perception_lidar_cnn_segmentation/caffe/layers/crop_layer.hpp (+78 -0)
  58. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_conv_layer.hpp (+72 -0)
  59. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_deconv_layer.hpp (+68 -0)
  60. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_lcn_layer.hpp (+49 -0)
  61. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_lrn_layer.hpp (+44 -0)
  62. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_pooling_layer.hpp (+49 -0)
  63. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_relu_layer.hpp (+46 -0)
  64. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_sigmoid_layer.hpp (+46 -0)
  65. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_softmax_layer.hpp (+45 -0)
  66. src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_tanh_layer.hpp (+46 -0)
  67. src/api/perception_lidar_cnn_segmentation/caffe/layers/data_layer.hpp (+40 -0)
  68. src/api/perception_lidar_cnn_segmentation/caffe/layers/deconv_layer.hpp (+51 -0)
  69. src/api/perception_lidar_cnn_segmentation/caffe/layers/dropout_layer.hpp (+80 -0)
  70. src/api/perception_lidar_cnn_segmentation/caffe/layers/dummy_data_layer.hpp (+47 -0)
  71. src/api/perception_lidar_cnn_segmentation/caffe/layers/eltwise_layer.hpp (+51 -0)
  72. src/api/perception_lidar_cnn_segmentation/caffe/layers/elu_layer.hpp (+86 -0)
  73. src/api/perception_lidar_cnn_segmentation/caffe/layers/embed_layer.hpp (+52 -0)
  74. src/api/perception_lidar_cnn_segmentation/caffe/layers/euclidean_loss_layer.hpp (+107 -0)
  75. src/api/perception_lidar_cnn_segmentation/caffe/layers/exp_layer.hpp (+80 -0)
  76. src/api/perception_lidar_cnn_segmentation/caffe/layers/filter_layer.hpp (+77 -0)
  77. src/api/perception_lidar_cnn_segmentation/caffe/layers/flatten_layer.hpp (+61 -0)
  78. src/api/perception_lidar_cnn_segmentation/caffe/layers/hdf5_data_layer.hpp (+64 -0)
  79. src/api/perception_lidar_cnn_segmentation/caffe/layers/hdf5_output_layer.hpp (+62 -0)
  80. src/api/perception_lidar_cnn_segmentation/caffe/layers/hinge_loss_layer.hpp (+104 -0)
  81. src/api/perception_lidar_cnn_segmentation/caffe/layers/im2col_layer.hpp (+65 -0)
  82. src/api/perception_lidar_cnn_segmentation/caffe/layers/image_data_layer.hpp (+47 -0)
  83. src/api/perception_lidar_cnn_segmentation/caffe/layers/infogain_loss_layer.hpp (+146 -0)
  84. src/api/perception_lidar_cnn_segmentation/caffe/layers/inner_product_layer.hpp (+52 -0)
  85. src/api/perception_lidar_cnn_segmentation/caffe/layers/input_layer.hpp (+42 -0)
  86. src/api/perception_lidar_cnn_segmentation/caffe/layers/log_layer.hpp (+82 -0)
  87. src/api/perception_lidar_cnn_segmentation/caffe/layers/loss_layer.hpp (+53 -0)
  88. src/api/perception_lidar_cnn_segmentation/caffe/layers/lrn_layer.hpp (+94 -0)
  89. src/api/perception_lidar_cnn_segmentation/caffe/layers/lstm_layer.hpp (+154 -0)
  90. src/api/perception_lidar_cnn_segmentation/caffe/layers/memory_data_layer.hpp (+63 -0)
  91. src/api/perception_lidar_cnn_segmentation/caffe/layers/multinomial_logistic_loss_layer.hpp (+92 -0)
  92. src/api/perception_lidar_cnn_segmentation/caffe/layers/mvn_layer.hpp (+48 -0)
  93. src/api/perception_lidar_cnn_segmentation/caffe/layers/neuron_layer.hpp (+32 -0)
  94. src/api/perception_lidar_cnn_segmentation/caffe/layers/parameter_layer.hpp (+45 -0)
  95. src/api/perception_lidar_cnn_segmentation/caffe/layers/pooling_layer.hpp (+61 -0)
  96. src/api/perception_lidar_cnn_segmentation/caffe/layers/power_layer.hpp (+89 -0)
  97. src/api/perception_lidar_cnn_segmentation/caffe/layers/prelu_layer.hpp (+101 -0)
  98. src/api/perception_lidar_cnn_segmentation/caffe/layers/python_layer.hpp (+55 -0)
  99. src/api/perception_lidar_cnn_segmentation/caffe/layers/recurrent_layer.hpp (+187 -0)
  100. src/api/perception_lidar_cnn_segmentation/caffe/layers/reduction_layer.hpp (+59 -0)

+ 91 - 0
src/api/commonfile/PyModuleCommModule.py

@@ -0,0 +1,91 @@
+
+import threading  
+import time  
+
+import modulecommpython
+import numpy as np  
+
+
+modulelock = threading.Lock()  
+nThread = 0
+  
+class PyModuleComm:  
+    def __init__(self,strname):  
+        # Initialization code...
+        print("name: ",strname)
+        self.strmemname = strname
+        self.mbRegister = False
+        global nThread
+        nThread = nThread+1
+        self.mnMode = 0
+        print("nThread = ",nThread)
+        self.obj = modulecommpython.get_ca_object()
+        pass  
+
+    def RegisterRecv(self,call):
+        if self.mbRegister:
+            print(" Have register, can't register other.")
+            return
+        print("Register: ",self.strmemname)
+        self.mpcall = call
+        self.mbRegister = True
+        self.mbRun = True
+        self.mpthread = threading.Thread(target=self.threadrecvdata, args=(self.strmemname,))  
+        self.mpthread.start()
+        print("complete create thread.")
+        self.mnMode = 1
+        self.obj.RegisterRecv(self.strmemname)
+
+    def RegiseterSend(self,nSize,nPacCount):
+        if self.mbRegister:
+            print(" Have register, can't register other.")
+            return
+        print("Register: ",self.strmemname)
+        self.mnsize = nSize
+        self.mnPacCount = nPacCount
+        self.mbRegister = True
+        self.mnMode = 2
+        self.obj.RegisterSend(self.strmemname,nSize,nPacCount)
+    
+    def SendData(self,arr,nsendsize):      
+    #    nrealsize = np.zeros(1, dtype=np.int32)  
+        nrtn = self.obj.SendData(arr,nsendsize)
+  
+    def threadrecvdata(self, arg):  
+        # This function runs on the receive thread.
+ #       print(f"thread started, argument is {arg}")
+        nBuffSize = int(1000)
+        arr = np.zeros(nBuffSize,dtype=np.int8)
+        recvtime = np.zeros(1,dtype=np.int64)
+        nrealsize = np.zeros(1,dtype=np.int32)
+        while self.mbRun:
+            nrtn = self.obj.RecvData(arr,nBuffSize,nrealsize,recvtime)
+            if nrtn > 0:
+                self.mpcall(arr,nrtn,recvtime)
+            else:
+                pass
+            if nrtn < 0:
+                nBuffSize = int(nrealsize[0] * 2)
+                arr = np.zeros(nBuffSize,dtype=np.int8)
+            else:
+                time.sleep(0.001)
+            
+        print("threadrecvdata complete.")
+
+    def stop_thread(self):
+        self.mbRun = False
+        self.mpthread.join()
+  
+    def start_thread(self, arg):
+        # Create the thread object; target is the function to run on the thread.
+        self.mbRun = True
+        self.mpthread = threading.Thread(target=self.threadrecvdata, args=(arg,))
+
+        # Start the thread.
+        self.mpthread.start()
+
+        # Other code can go here; the main thread keeps running.
+        print("main thread continues...")
+
+        # Call join() if the caller needs to wait for the thread to finish.
+        # self.mpthread.join()

+ 1 - 0
src/api/commonfile/libcaffe.so

@@ -0,0 +1 @@
+libcaffe.so.1.0.0

BIN
src/api/commonfile/libcaffe.so.1.0.0


BIN
src/api/commonfile/libmodulecomm.so


BIN
src/api/commonfile/libxmlparam.so


+ 80 - 0
src/api/commonfile/modulecomm.h

@@ -0,0 +1,80 @@
+#ifndef MODULECOMM_H
+#define MODULECOMM_H
+
+
+#include <QtCore/qglobal.h>
+#include <QDateTime>
+
+#include <functional>
+
+
+
+
+#if defined(MODULECOMM_LIBRARY)
+#  define MODULECOMMSHARED_EXPORT Q_DECL_EXPORT
+#else
+#  define MODULECOMMSHARED_EXPORT Q_DECL_IMPORT
+#endif
+
+
+
+
+
+
+//#include <iostream>
+//#include <thread>
+
+//using namespace std::placeholders;
+
+#ifndef IV_MODULE_FUN
+
+typedef std::function<void(const char * ,const unsigned int , const unsigned int , QDateTime * ,const char *)> ModuleFun;
+typedef void (* SMCallBack)(const char * strdata,const unsigned int nSize,const unsigned int index,const QDateTime * dt,const char * strmemname);
+#define IV_MODULE_FUN
+#endif
+
+namespace iv {
+namespace modulecomm {
+
+enum ModuleComm_TYPE
+{
+    ModuleComm_SHAREMEM = 0,
+    ModuleComm_INTERIOR = 1,
+    ModuleComm_FASTRTPS = 2,
+    ModuleComm_FASTRTPS_TCP = 3,
+    ModuleComm_UNDEFINE = 4
+};
+
+void *  RegisterSend(const char * strcommname);
+
+
+void *  RegisterSend(const char * strcommname,const unsigned int nBufSize,const unsigned int nMsgBufCount
+                                            ,ModuleComm_TYPE xmctype,const unsigned short nport);
+void *  RegisterRecv(const char * strcommname,SMCallBack pCall,
+                                            ModuleComm_TYPE xmctype ,const char * strip,const unsigned short );
+void *  RegisterRecvPlus(const char * strcommname,ModuleFun xFun,
+                                                ModuleComm_TYPE xmctype,const char * strip,const unsigned short );
+
+
+//void * MODULECOMMSHARED_EXPORT RegisterSend(const char * strcommname,const unsigned int nBufSize,const unsigned int nMsgBufCount
+//                                            ,ModuleComm_TYPE xmctype = ModuleComm_UNDEFINE,const unsigned short nport = 5100);
+//void * MODULECOMMSHARED_EXPORT RegisterRecv(const char * strcommname,SMCallBack pCall,
+//                                            ModuleComm_TYPE xmctype = ModuleComm_UNDEFINE,const char * strip = 0,const unsigned short = 5100);
+//void * MODULECOMMSHARED_EXPORT RegisterRecvPlus(const char * strcommname,ModuleFun xFun,
+//                                                ModuleComm_TYPE xmctype = ModuleComm_UNDEFINE,const char * strip = 0,const unsigned short = 5100);
+
+void MODULECOMMSHARED_EXPORT ModuleSendMsg(void * pHandle,const char * strdata,const unsigned int nDataLen);
+void MODULECOMMSHARED_EXPORT Unregister(void * pHandle);
+void MODULECOMMSHARED_EXPORT PauseComm(void * pHandle);
+void MODULECOMMSHARED_EXPORT ContintuComm(void * pHandle);
+
+void *  RegisterSend(const char * strcommname,const unsigned int nBufSize,const unsigned int nMsgBufCount);
+void *  RegisterRecv(const char * strcommname,SMCallBack pCall);
+void *  RegisterRecvPlus(const char * strcommname,ModuleFun xFun);
+
+
+}
+
+}
+
+#endif // MODULECOMM_H
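
A sender-side sketch of the same API; the channel name, buffer size, and queue depth below are placeholder values:

    #include <cstring>
    #include "modulecomm.h"

    int main()
    {
        // Register a send channel: name, per-message buffer size, message queue depth.
        void * h = iv::modulecomm::RegisterSend("lidar_pc", 1000000, 10);

        char payload[100];
        std::memset(payload, 0, sizeof(payload));

        // Publish one packet to every receiver registered on "lidar_pc".
        iv::modulecomm::ModuleSendMsg(h, payload, sizeof(payload));

        iv::modulecomm::Unregister(h);
        return 0;
    }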

BIN
src/api/commonfile/modulecommpython.so


+ 47 - 0
src/api/commonfile/xmlparam.h

@@ -0,0 +1,47 @@
+#ifndef XMLPARAM_H
+#define XMLPARAM_H
+
+
+#include <QtCore/qglobal.h>
+
+#include <string>
+
+#if defined(XMLPARAM_LIBRARY)
+#  define XMLPARAMSHARED_EXPORT Q_DECL_EXPORT
+#else
+#  define XMLPARAMSHARED_EXPORT Q_DECL_IMPORT
+#endif
+
+namespace iv {
+namespace xmlparam {
+
+class XMLPARAMSHARED_EXPORT Xmlparam
+{
+
+public:
+    Xmlparam(std::string filepath);
+    std::string GetParam(std::string paramname,std::string defaultvalue);
+    std::string GetParam(std::string paramname,const char *  strdefaultvalue);
+    int GetParam(std::string paramname, int ndefaultvalue);
+    double GetParam(std::string paramname, double dfdefvalue);
+    float GetParam(std::string paramname, float fdefvalue);
+    bool GetParam(std::string paramname, bool bdefValue);
+
+    void GetParam(std::string paramname,std::string & strvalue,std::string defaultvalue);
+    void GetParam(std::string paramname,std::string & strvalue,const char *  strdefaultvalue);
+    void GetParam(std::string paramname, int & nvalue, int ndefaultvalue);
+    void GetParam(std::string paramname, double & dfvalue,double dfdefvalue);
+    void GetParam(std::string paramname, float & fvalue, float fdefvalue);
+    void GetParam(std::string paramname, bool & bvalue,bool bdefvalue);
+
+    QString GetVersion();
+private:
+    int * mpx;
+
+};
+}
+
+}
+
+
+#endif // XMLPARAM_H
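
A usage sketch for this parameter reader; the file name and parameter names are made up for illustration. Each overload returns the stored value, or the supplied default when the parameter is missing from the file:

    #include <cstdio>
    #include "xmlparam.h"

    int main()
    {
        iv::xmlparam::Xmlparam xp("perception.xml");

        std::string msgname = xp.GetParam("lidar_msgname", "lidar_pc");
        int port            = xp.GetParam("port", 5100);
        double dt           = xp.GetParam("dt", 0.1);
        bool usetracker     = xp.GetParam("use_tracker", true);

        printf("%s %d %f %d\n", msgname.c_str(), port, dt, (int)usetracker);
        return 0;
    }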

+ 294 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/Ctracker.cpp

@@ -0,0 +1,294 @@
+#include "Ctracker.h"
+///
+/// \brief CTracker::CTracker
+/// Tracker. Manage tracks. Create, remove, update.
+/// \param settings
+///
+CTracker::CTracker(TrackerSettings& settings)
+{
+    m_settings = settings;
+}
+///
+/// \brief CTracker::CTracker
+/// Tracker. Manage tracks. Create, remove, update.
+/// \param settings
+///
+CTracker::CTracker(const TrackerSettings& settings)
+    :
+      m_settings(settings),
+      m_nextTrackID(0)
+{
+    ShortPathCalculator* spcalc = nullptr;
+    SPSettings spSettings = { settings.m_distThres, 12 };
+    switch (m_settings.m_matchType)
+    {
+    case tracking::MatchHungrian:
+        spcalc = new SPHungrian(spSettings);
+        break;
+        //    case tracking::MatchBipart:
+        //        spcalc = new SPBipart(spSettings);
+        //        break;
+    }
+    assert(spcalc != nullptr);
+    m_SPCalculator = std::unique_ptr<ShortPathCalculator>(spcalc);
+}
+
+///
+/// \brief CTracker::~CTracker
+///
+CTracker::~CTracker(void)
+{
+}
+///
+/// \brief CTracker::setSettings
+/// Replace the tracker settings and reset the track ID counter.
+/// \param settings
+///
+void CTracker::setSettings(TrackerSettings& settings)
+{
+    m_settings = settings;
+    m_nextTrackID = 0;
+    ShortPathCalculator* spcalc = nullptr;
+    SPSettings spSettings = { settings.m_distThres, 12 };
+    switch (m_settings.m_matchType)
+    {
+    case tracking::MatchHungrian:
+        spcalc = new SPHungrian(spSettings);
+        break;
+        //    case tracking::MatchBipart:
+        //        spcalc = new SPBipart(spSettings);
+        //        break;
+    }
+    assert(spcalc != nullptr);
+    m_SPCalculator = std::unique_ptr<ShortPathCalculator>(spcalc);
+}
+///
+/// \brief CTracker::Update
+/// \param regions
+/// \param currFrame
+/// \param fps
+///
+void CTracker::Update(
+        const regions_t& regions,
+        cv::UMat currFrame,
+        float fps
+        )
+{
+    UpdateTrackingState(regions, currFrame, fps);
+
+    currFrame.copyTo(m_prevFrame);
+}
+
+///
+/// \brief CTracker::UpdateTrackingState
+/// \param regions
+/// \param currFrame
+/// \param fps
+///
+void CTracker::UpdateTrackingState(
+        const regions_t& regions,
+        cv::UMat currFrame,
+        float fps
+        )
+{
+    const size_t N = m_tracks.size();	// Tracking objects
+    const size_t M = regions.size();	// Detections or regions
+
+    assignments_t assignment(N, -1); // Assignments: tracks -> regions
+
+    if (!m_tracks.empty())
+    {
+        // Distance matrix between all tracks and all regions
+        distMatrix_t costMatrix(N * M);
+        const track_t maxPossibleCost = 1e3;//static_cast<track_t>(currFrame.cols * currFrame.rows);
+        track_t maxCost = 0;
+        CreateDistaceMatrix(regions, costMatrix, maxPossibleCost, maxCost, currFrame);
+
+        // Solving assignment problem (shortest paths)
+        m_SPCalculator->Solve(costMatrix, N, M, assignment, maxCost);//row->col(trackid->regionid)
+
+        // clean assignment from pairs with large distance
+        for (size_t i = 0; i < assignment.size(); i++)
+        {
+            if (assignment[i] != -1)
+            {
+#ifdef DEBUG_SHOW
+                std::cout<<costMatrix[i + assignment[i] * N]<<", ";
+#endif
+                if (costMatrix[i + assignment[i] * N] > m_settings.m_distThres)
+                {
+                    assignment[i] = -1;
+                    m_tracks[i]->SkippedFrames()++;
+                }
+            }
+            else
+            {
+#ifdef DEBUG_SHOW
+                std::cout<<-1<<", ";
+#endif
+                // If the track has no assigned detection, then increment its skipped-frames counter.
+                m_tracks[i]->SkippedFrames()++;
+            }
+        }
+#ifdef DEBUG_SHOW
+                std::cout<<std::endl;
+#endif
+        // If a track has received no detections for a long time, remove it.
+        for (size_t i = 0; i < m_tracks.size();)
+        {
+            if (m_tracks[i]->SkippedFrames() > m_settings.m_maximumAllowedSkippedFrames ||
+                    m_tracks[i]->IsStaticTimeout(cvRound(fps * (m_settings.m_maxStaticTime - m_settings.m_minStaticTime))))
+            {
+                m_tracks.erase(m_tracks.begin() + i);
+                assignment.erase(assignment.begin() + i);
+            }
+            else
+            {
+                ++i;
+            }
+        }
+    }
+
+    // Search for unassigned detects and start new tracks for them.
+    for (size_t i = 0; i < regions.size(); ++i)
+    {
+        if (find(assignment.begin(), assignment.end(), i) == assignment.end())
+        {
+            m_tracks.push_back(std::make_unique<CTrack>(regions[i],
+                                                        m_settings.m_kalmanType,
+                                                        m_settings.m_dt,
+                                                        m_settings.m_accelNoiseMag,
+                                                        m_settings.m_useAcceleration,
+                                                        m_nextTrackID++%500,
+                                                        i,
+                                                        m_settings.m_filterGoal,
+                                                        m_settings.m_lostTrackType));
+        }
+    }
+
+    // Update Kalman Filters state
+    const ptrdiff_t stop_i = static_cast<ptrdiff_t>(assignment.size());
+#pragma omp parallel for
+    for (ptrdiff_t i = 0; i < stop_i; ++i)
+    {
+        // If the track has not yet been updated, its filter state is not correct.
+        if (assignment[i] != -1) // If we have assigned detect, then update using its coordinates,
+        {
+            m_tracks[i]->DetectedFrames()++;
+            m_tracks[i]->SkippedFrames() = 0;
+            m_tracks[i]->Update(
+                        regions[assignment[i]], true,
+                    m_settings.m_maxTraceLength,
+                    m_prevFrame, currFrame,
+                    m_settings.m_useAbandonedDetection ? cvRound(m_settings.m_minStaticTime * fps) : 0);
+        }
+        else				     // if not continue using predictions
+        {
+            m_tracks[i]->Update(CRegion(), false, m_settings.m_maxTraceLength, m_prevFrame, currFrame, 0);
+        }
+        m_tracks[i]->m_regionID = assignment[i];
+    }
+}
+
+///
+/// \brief CTracker::CreateDistaceMatrix
+/// \param regions
+/// \param costMatrix
+/// \param maxPossibleCost
+/// \param maxCost
+///
+void CTracker::CreateDistaceMatrix(const regions_t& regions, distMatrix_t& costMatrix, track_t maxPossibleCost, track_t& maxCost, cv::UMat currFrame)
+{
+    const size_t N = m_tracks.size();	// Tracking objects
+    maxCost = 0;
+
+    for (size_t i = 0; i < N; ++i)
+    {
+        const auto& track = m_tracks[i];
+
+        // Calc predicted area for track
+//        cv::Size_<track_t> minRadius;
+//        if (m_settings.m_minAreaRadiusPix < 0)
+//        {
+//            minRadius.width = m_settings.m_minAreaRadiusK * track->LastRegion().m_rect.size.width;
+//            minRadius.height = m_settings.m_minAreaRadiusK * track->LastRegion().m_rect.size.height;
+//        }
+//        else
+//        {
+//            minRadius.width = m_settings.m_minAreaRadiusPix;
+//            minRadius.height = m_settings.m_minAreaRadiusPix;
+//        }
+        //cv::RotatedRect predictedArea = track->CalcPredictionEllipse(minRadius);
+
+        // Calc distance between track and regions
+        for (size_t j = 0; j < regions.size(); ++j)
+        {
+            const auto& reg = regions[j];
+
+            auto dist = maxPossibleCost;
+            if (reg.m_type==-1?(m_settings.CheckType(m_tracks[i]->LastRegion().m_type_name, reg.m_type_name)):(m_settings.CheckType(m_tracks[i]->LastRegion().m_type, reg.m_type)))
+            {
+                dist = 0;
+                size_t ind = 0;
+                if (m_settings.m_distType[ind] > 0.0f && ind == tracking::DistCenters)
+                {
+#if 0
+                    track_t ellipseDist = track->IsInsideArea(reg.m_rrect.center, predictedArea);
+                    if (ellipseDist > 1)
+                        dist += m_settings.m_distType[ind];
+                    else
+                        dist += ellipseDist * m_settings.m_distType[ind];
+#else
+                    dist += m_settings.m_distType[ind] * track->CalcDistCenter(reg);
+#endif
+                }
+                ++ind;
+
+                if (m_settings.m_distType[ind] > 0.0f && ind == tracking::DistRects)
+                {
+#if 0
+                    track_t ellipseDist = track->IsInsideArea(reg.m_rect.center, predictedArea);
+                    if (ellipseDist < 1)
+                    {
+                        track_t dw = track->WidthDist(reg);
+                        track_t dh = track->HeightDist(reg);
+                        dist += m_settings.m_distType[ind] * (1 - (1 - ellipseDist) * (dw + dh) * 0.5f);
+                    }
+                    else
+                    {
+                        dist += m_settings.m_distType[ind];
+                    }
+                    //std::cout << "dist = " << dist << ", ed = " << ellipseDist << ", dw = " << dw << ", dh = " << dh << std::endl;
+#else
+                    dist += m_settings.m_distType[ind] * track->CalcDistRect(reg);
+#endif
+                }
+                ++ind;
+                if (m_settings.m_distType[ind] > 0.0f && ind == tracking::DistRect3Ds)
+                {
+#if 0
+                    track_t ellipseDist = track->IsInsideArea(reg.m_rect.center, predictedArea);
+                    if (ellipseDist < 1)
+                    {
+                        track_t dw = track->WidthDist(reg);
+                        track_t dh = track->HeightDist(reg);
+                        dist += m_settings.m_distType[ind] * (1 - (1 - ellipseDist) * (dw + dh) * 0.5f);
+                    }
+                    else
+                    {
+                        dist += m_settings.m_distType[ind];
+                    }
+                    //std::cout << "dist = " << dist << ", ed = " << ellipseDist << ", dw = " << dw << ", dh = " << dh << std::endl;
+#else
+                    dist += m_settings.m_distType[ind] * track->CalcDistRect3D(reg);
+#endif
+                }
+                ++ind;
+                assert(ind == tracking::DistsCount);
+            }
+            costMatrix[i + j * N] = dist;
+            if (dist > maxCost)
+                maxCost = dist;
+        }
+    }
+}
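
Note that CreateDistaceMatrix stores the cost of pairing track i with region j at costMatrix[i + j * N], i.e. column-major with N tracks per column; Solve and the threshold check above read it the same way. A toy illustration of that layout, assuming distMatrix_t is the std::vector<track_t> alias from defines.h:

    #include <cstddef>
    #include "Ctracker.h"

    // 2 tracks (rows) x 3 regions (columns), stored column by column.
    static distMatrix_t MakeExampleCostMatrix()
    {
        const size_t N = 2, M = 3;
        distMatrix_t cost(N * M);
        for (size_t i = 0; i < N; ++i)       // track index (row)
            for (size_t j = 0; j < M; ++j)   // region index (column)
                cost[i + j * N] = static_cast<track_t>(i * 10 + j);
        // Memory order: (0,0) (1,0) (0,1) (1,1) (0,2) (1,2)
        return cost;
    }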

+ 292 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/Ctracker.h

@@ -0,0 +1,292 @@
+#pragma once
+#include <iostream>
+#include <vector>
+#include <memory>
+#include <array>
+#include <deque>
+#include <numeric>
+#include <map>
+#include <set>
+
+#include "defines.h"
+#include "track.h"
+#include "ShortPathCalculator.h"
+
+// ----------------------------------------------------------------------
+
+///
+/// \brief The TrackerSettings struct
+///
+struct TrackerSettings
+{
+    tracking::KalmanType m_kalmanType = tracking::KalmanLinear;
+    tracking::FilterGoal m_filterGoal = tracking::FilterCenter;
+    tracking::LostTrackType m_lostTrackType = tracking::TrackNone;
+    tracking::MatchType m_matchType = tracking::MatchHungrian;
+
+	std::array<track_t, tracking::DistsCount> m_distType;
+
+    ///
+    /// \brief m_dt
+    /// Time step for Kalman
+    ///
+    track_t m_dt = 1.0f;
+
+    ///
+    /// \brief m_accelNoiseMag
+    /// Noise magnitude for Kalman
+    ///
+    track_t m_accelNoiseMag = 0.1f;
+
+	///
+	/// \brief m_useAcceleration
+	/// Constant velocity or constant acceleration motion model
+	///
+	bool m_useAcceleration = false;
+
+    ///
+    /// \brief m_distThres
+    /// Distance threshold for Assignment problem: from 0 to 1
+    ///
+    track_t m_distThres = 0.8f;
+
+    ///
+    /// \brief m_minAreaRadiusPix
+    /// Minimal area radius in pixels for objects centers
+    ///
+    track_t m_minAreaRadiusPix = 20.f;
+
+	///
+	/// \brief m_minAreaRadiusK
+	/// Minimal area radius as a ratio of the object size. Used if m_minAreaRadiusPix < 0
+	///
+	track_t m_minAreaRadiusK = 0.5f;
+
+    ///
+    /// \brief m_maximumAllowedSkippedFrames
+    /// If an object has no assignment for more than this many frames, it is removed
+    ///
+    size_t m_maximumAllowedSkippedFrames = 25;
+
+    ///
+    /// \brief m_maxTraceLength
+    /// The maximum trajectory length
+    ///
+    size_t m_maxTraceLength = 50;
+
+    ///
+    /// \brief m_useAbandonedDetection
+    /// Detection of abandoned objects
+    ///
+    bool m_useAbandonedDetection = false;
+
+    ///
+    /// \brief m_minStaticTime
+    /// After this time (in seconds) the object is considered abandoned
+    ///
+    int m_minStaticTime = 5;
+    ///
+    /// \brief m_maxStaticTime
+    /// After this time (in seconds) the abandoned object will be removed
+    ///
+    int m_maxStaticTime = 25;
+
+	///
+	/// \brief m_nearTypes
+	/// Object types that can be matched while tracking
+	///
+    std::map<int, std::set<int>> m_nearTypes;
+
+    std::map<std::string, std::set<std::string>> m_nearTypeNames;
+
+	///
+	TrackerSettings()
+	{
+        m_distType[tracking::DistCenters] = 0.5f;
+        m_distType[tracking::DistRects] = 0.5f;
+        m_distType[tracking::DistRect3Ds] = 0.f;
+
+		assert(CheckDistance());
+	}
+
+	///
+	bool CheckDistance() const
+	{
+		track_t sum = std::accumulate(m_distType.begin(), m_distType.end(), 0.0f);
+        track_t maxOne = std::max(1.0f, std::fabs(sum));
+		return std::fabs(sum - 1.0f) <= std::numeric_limits<track_t>::epsilon() * maxOne;
+	}
+
+	///
+	bool SetDistances(std::array<track_t, tracking::DistsCount> distType)
+	{
+		bool res = true;
+		auto oldDists = m_distType;
+		m_distType = distType;
+		if (!CheckDistance())
+		{
+			m_distType = oldDists;
+			res = false;
+		}
+		return res;
+	}
+
+	///
+	bool SetDistance(tracking::DistType distType)
+	{
+		std::fill(m_distType.begin(), m_distType.end(), 0.0f);
+		m_distType[distType] = 1.f;
+		return true;
+	}
+
+	///
+    void AddNearTypes(const int& type1, const int& type2, bool sym)
+	{
+        auto AddOne = [&](const int& type1, const int& type2)
+		{
+			auto it = m_nearTypes.find(type1);
+			if (it == std::end(m_nearTypes))
+			{
+                m_nearTypes[type1] = std::set<int>{ type2 };
+			}
+			else
+			{
+				it->second.insert(type2);
+			}
+		};
+		AddOne(type1, type2);
+		if (sym)
+        {
+			AddOne(type2, type1);
+        }
+	}
+
+	///
+    bool CheckType(const int& type1, const int& type2) const
+	{
+        bool res = type1==-1 || type2==-1 || (type1 == type2);
+		if (!res)
+		{
+			auto it = m_nearTypes.find(type1);
+			if (it != std::end(m_nearTypes))
+			{
+				res = it->second.find(type2) != std::end(it->second);
+			}
+		}
+		return res;
+	}
+
+    ///
+    void AddNearTypes(const std::string& type1, const std::string& type2, bool sym)
+    {
+        auto AddOne = [&](const std::string& type1, const std::string& type2)
+        {
+            auto it = m_nearTypeNames.find(type1);
+            if (it == std::end(m_nearTypeNames))
+            {
+                m_nearTypeNames[type1] = std::set<std::string>{ type2 };
+            }
+            else
+            {
+                it->second.insert(type2);
+            }
+        };
+        AddOne(type1, type2);
+        if (sym)
+        {
+            AddOne(type2, type1);
+        }
+    }
+
+    ///
+    bool CheckType(const std::string& type1, const std::string& type2) const
+    {
+        bool res = type1.empty() || type2.empty() || (type1 == type2);
+        if (!res)
+        {
+            auto it = m_nearTypeNames.find(type1);
+            if (it != std::end(m_nearTypeNames))
+            {
+                res = it->second.find(type2) != std::end(it->second);
+            }
+        }
+        return res;
+    }
+};
+
+///
+/// \brief The CTracker class
+///
+class CTracker
+{
+public:
+    CTracker(TrackerSettings& settings);
+    CTracker(const TrackerSettings& settings);
+    CTracker(const CTracker&) = delete;
+    CTracker(CTracker&&) = delete;
+	CTracker& operator=(const CTracker&) = delete;
+	CTracker& operator=(CTracker&&) = delete;
+	
+	~CTracker(void);
+    void setSettings(TrackerSettings& settings);
+    void Update(const regions_t& regions, cv::UMat currFrame, float fps);
+
+    ///
+    /// \brief CanGrayFrameToTrack
+    /// \return
+    ///
+    bool CanGrayFrameToTrack() const
+    {
+        bool needColor = (m_settings.m_lostTrackType == tracking::LostTrackType::TrackNone);
+        return !needColor;
+    }
+
+	///
+	/// \brief CanColorFrameToTrack
+	/// \return
+	///
+	bool CanColorFrameToTrack() const
+	{
+		return true;
+	}
+
+    ///
+    /// \brief GetTracksCount
+    /// \return
+    ///
+	size_t GetTracksCount() const
+	{
+		return m_tracks.size();
+	}
+    ///
+    /// \brief GetTracks
+    /// \return
+    ///
+	std::vector<TrackingObject> GetTracks() const
+	{
+		std::vector<TrackingObject> tracks;
+		if (!m_tracks.empty())
+		{
+			tracks.reserve(m_tracks.size());
+			for (const auto& track : m_tracks)
+			{
+                tracks.push_back(track->ConstructObject());
+			}
+		}
+		return tracks;
+	}
+
+private:
+    TrackerSettings m_settings;
+
+	tracks_t m_tracks;
+
+    size_t m_nextTrackID;
+
+    cv::UMat m_prevFrame;
+
+    std::unique_ptr<ShortPathCalculator> m_SPCalculator;
+
+    void CreateDistaceMatrix(const regions_t& regions, distMatrix_t& costMatrix, track_t maxPossibleCost, track_t& maxCost, cv::UMat currFrame);
+    void UpdateTrackingState(const regions_t& regions, cv::UMat currFrame, float fps);
+};
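
A minimal sketch of driving the tracker; the regions_t input and the frame are assumed to come from the detection pipeline, and the parameter values are placeholders:

    #include "Ctracker.h"

    void Demo(const std::vector<regions_t>& frames_regions, cv::UMat frame)
    {
        TrackerSettings s;
        s.SetDistance(tracking::DistCenters);    // match on center distance only
        s.m_distThres = 0.8f;                    // reject assignments above this cost
        s.m_maximumAllowedSkippedFrames = 25;    // prune tracks unseen this long

        // Use the const TrackerSettings overload: per Ctracker.cpp it also
        // builds the assignment solver (the non-const overload only copies settings).
        const TrackerSettings cs = s;
        CTracker tracker(cs);

        // The tracker is stateful: feed it one detection set per frame.
        for (const auto& regions : frames_regions)
        {
            tracker.Update(regions, frame, 30.f);
            for (const auto& obj : tracker.GetTracks())
            {
                (void)obj; // per-track id, trajectory, etc.
            }
        }
    }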

+ 723 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/HungarianAlg.cpp

@@ -0,0 +1,723 @@
+#include "HungarianAlg.h"
+#include <limits>
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+AssignmentProblemSolver::AssignmentProblemSolver()
+{
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+AssignmentProblemSolver::~AssignmentProblemSolver()
+{
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+track_t AssignmentProblemSolver::Solve(
+	const distMatrix_t& distMatrixIn,
+	size_t nOfRows,
+	size_t nOfColumns,
+	std::vector<int>& assignment,
+	TMethod Method
+	)
+{
+	assignment.resize(nOfRows, -1);
+
+	track_t cost = 0;
+
+	switch (Method)
+	{
+	case optimal:
+		assignmentoptimal(assignment, cost, distMatrixIn, nOfRows, nOfColumns);
+		break;
+
+	case many_forbidden_assignments:
+		assignmentsuboptimal1(assignment, cost, distMatrixIn, nOfRows, nOfColumns);
+		break;
+
+	case without_forbidden_assignments:
+		assignmentsuboptimal2(assignment, cost, distMatrixIn, nOfRows, nOfColumns);
+		break;
+	}
+
+	return cost;
+}
+// --------------------------------------------------------------------------
+// Computes the optimal assignment (minimum overall costs) using Munkres algorithm.
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::assignmentoptimal(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns)
+{
+	// Generate the distance matrix
+	// and check that its elements are non-negative :)
+
+	// Total elements number
+	size_t nOfElements = nOfRows * nOfColumns;
+	// Memory allocation
+	track_t* distMatrix = (track_t *)malloc(nOfElements * sizeof(track_t));
+
+    if (distMatrix == nullptr)
+    {
+        return;
+    }
+
+	// Pointer to last element
+	track_t* distMatrixEnd = distMatrix + nOfElements;
+
+	for (size_t row = 0; row < nOfElements; row++)
+	{
+        track_t value = distMatrixIn[row];
+		assert(value >= 0);
+		distMatrix[row] = value;
+	}
+
+	// Memory allocation
+	bool* coveredColumns = (bool*)calloc(nOfColumns, sizeof(bool));
+	bool* coveredRows = (bool*)calloc(nOfRows, sizeof(bool));
+	bool* starMatrix = (bool*)calloc(nOfElements, sizeof(bool));
+	bool* primeMatrix = (bool*)calloc(nOfElements, sizeof(bool));
+	bool* newStarMatrix = (bool*)calloc(nOfElements, sizeof(bool)); /* used in step4 */
+
+	/* preliminary steps */
+	if (nOfRows <= nOfColumns)
+	{
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			/* find the smallest element in the row */
+            track_t* distMatrixTemp = distMatrix + row;
+			track_t  minValue = *distMatrixTemp;
+			distMatrixTemp += nOfRows;
+			while (distMatrixTemp < distMatrixEnd)
+			{
+				track_t value = *distMatrixTemp;
+				if (value < minValue)
+				{
+					minValue = value;
+				}
+				distMatrixTemp += nOfRows;
+			}
+			/* subtract the smallest element from each element of the row */
+			distMatrixTemp = distMatrix + row;
+			while (distMatrixTemp < distMatrixEnd)
+			{
+				*distMatrixTemp -= minValue;
+				distMatrixTemp += nOfRows;
+			}
+		}
+		/* Steps 1 and 2a */
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			for (size_t col = 0; col < nOfColumns; col++)
+			{
+				if (distMatrix[row + nOfRows*col] == 0)
+				{
+					if (!coveredColumns[col])
+					{
+						starMatrix[row + nOfRows * col] = true;
+						coveredColumns[col] = true;
+						break;
+					}
+				}
+			}
+		}
+	}
+	else /* if(nOfRows > nOfColumns) */
+	{
+		for (size_t col = 0; col < nOfColumns; col++)
+		{
+			/* find the smallest element in the column */
+			track_t* distMatrixTemp = distMatrix + nOfRows*col;
+			track_t* columnEnd = distMatrixTemp + nOfRows;
+			track_t  minValue = *distMatrixTemp++;
+			while (distMatrixTemp < columnEnd)
+			{
+				track_t value = *distMatrixTemp++;
+				if (value < minValue)
+				{
+					minValue = value;
+				}
+			}
+			/* subtract the smallest element from each element of the column */
+			distMatrixTemp = distMatrix + nOfRows*col;
+			while (distMatrixTemp < columnEnd)
+			{
+				*distMatrixTemp++ -= minValue;
+			}
+		}
+		/* Steps 1 and 2a */
+		for (size_t col = 0; col < nOfColumns; col++)
+		{
+			for (size_t row = 0; row < nOfRows; row++)
+			{
+				if (distMatrix[row + nOfRows*col] == 0)
+				{
+					if (!coveredRows[row])
+					{
+						starMatrix[row + nOfRows*col] = true;
+						coveredColumns[col] = true;
+						coveredRows[row] = true;
+						break;
+					}
+				}
+			}
+		}
+
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			coveredRows[row] = false;
+		}
+	}
+	/* move to step 2b */
+	step2b(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, (nOfRows <= nOfColumns) ? nOfRows : nOfColumns);
+	/* compute cost and remove invalid assignments */
+	computeassignmentcost(assignment, cost, distMatrixIn, nOfRows);
+	/* free allocated memory */
+	free(distMatrix);
+	free(coveredColumns);
+	free(coveredRows);
+	free(starMatrix);
+	free(primeMatrix);
+	free(newStarMatrix);
+	return;
+}
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::buildassignmentvector(assignments_t& assignment, bool *starMatrix, size_t nOfRows, size_t nOfColumns)
+{
+    for (size_t row = 0; row < nOfRows; row++)
+	{
+        for (size_t col = 0; col < nOfColumns; col++)
+		{
+			if (starMatrix[row + nOfRows * col])
+			{
+				assignment[row] = static_cast<int>(col);
+				break;
+			}
+		}
+	}
+}
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::computeassignmentcost(const assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows)
+{
+	for (size_t row = 0; row < nOfRows; row++)
+	{
+		const int col = assignment[row];
+		if (col >= 0)
+		{
+			cost += distMatrixIn[row + nOfRows * col];
+		}
+	}
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::step2a(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim)
+{
+    bool *starMatrixTemp, *columnEnd;
+    /* cover every column containing a starred zero */
+    for (size_t col = 0; col < nOfColumns; col++)
+    {
+        starMatrixTemp = starMatrix + nOfRows * col;
+        columnEnd = starMatrixTemp + nOfRows;
+        while (starMatrixTemp < columnEnd)
+        {
+            if (*starMatrixTemp++)
+            {
+                coveredColumns[col] = true;
+                break;
+            }
+        }
+    }
+    /* move to step 3 */
+	step2b(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::step2b(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim)
+{
+	/* count covered columns */
+    size_t nOfCoveredColumns = 0;
+    for (size_t col = 0; col < nOfColumns; col++)
+	{
+		if (coveredColumns[col])
+		{
+			nOfCoveredColumns++;
+		}
+	}
+	if (nOfCoveredColumns == minDim)
+	{
+		/* algorithm finished */
+		buildassignmentvector(assignment, starMatrix, nOfRows, nOfColumns);
+	}
+	else
+	{
+		/* move to step 3 */
+		step3_5(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+	}
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::step3_5(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim)
+{
+	for (;;)
+	{
+		/* step 3 */
+		bool zerosFound = true;
+		while (zerosFound)
+		{
+			zerosFound = false;
+			for (size_t col = 0; col < nOfColumns; col++)
+			{
+				if (!coveredColumns[col])
+				{
+					for (size_t row = 0; row < nOfRows; row++)
+					{
+						if ((!coveredRows[row]) && (distMatrix[row + nOfRows*col] == 0))
+						{
+							/* prime zero */
+							primeMatrix[row + nOfRows*col] = true;
+							/* find starred zero in current row */
+							size_t starCol = 0;
+							for (; starCol < nOfColumns; starCol++)
+							{
+								if (starMatrix[row + nOfRows * starCol])
+								{
+									break;
+								}
+							}
+							if (starCol == nOfColumns) /* no starred zero found */
+							{
+								/* move to step 4 */
+								step4(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim, row, col);
+								return;
+							}
+							else
+							{
+								coveredRows[row] = true;
+								coveredColumns[starCol] = false;
+								zerosFound = true;
+								break;
+							}
+						}
+					}
+				}
+			}
+		}
+		/* step 5 */
+        track_t h = std::numeric_limits<track_t>::max();
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			if (!coveredRows[row])
+			{
+				for (size_t col = 0; col < nOfColumns; col++)
+				{
+					if (!coveredColumns[col])
+					{
+                        const track_t value = distMatrix[row + nOfRows*col];
+						if (value < h)
+						{
+							h = value;
+						}
+					}
+				}
+			}
+		}
+		/* add h to each covered row */
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			if (coveredRows[row])
+			{
+				for (size_t col = 0; col < nOfColumns; col++)
+				{
+					distMatrix[row + nOfRows*col] += h;
+				}
+			}
+		}
+		/* subtract h from each uncovered column */
+		for (size_t col = 0; col < nOfColumns; col++)
+		{
+			if (!coveredColumns[col])
+			{
+				for (size_t row = 0; row < nOfRows; row++)
+				{
+					distMatrix[row + nOfRows*col] -= h;
+				}
+			}
+		}
+	}
+}
+
+// --------------------------------------------------------------------------
+//
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::step4(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim, size_t row, size_t col)
+{
+	const size_t nOfElements = nOfRows * nOfColumns;
+	/* generate temporary copy of starMatrix */
+	for (size_t n = 0; n < nOfElements; n++)
+	{
+		newStarMatrix[n] = starMatrix[n];
+	}
+	/* star current zero */
+	newStarMatrix[row + nOfRows*col] = true;
+	/* find starred zero in current column */
+	size_t starCol = col;
+	size_t starRow = 0;
+	for (; starRow < nOfRows; starRow++)
+	{
+		if (starMatrix[starRow + nOfRows * starCol])
+		{
+			break;
+		}
+	}
+	while (starRow < nOfRows)
+	{
+		/* unstar the starred zero */
+		newStarMatrix[starRow + nOfRows*starCol] = false;
+		/* find primed zero in current row */
+		size_t primeRow = starRow;
+		size_t primeCol = 0;
+		for (; primeCol < nOfColumns; primeCol++)
+		{
+			if (primeMatrix[primeRow + nOfRows * primeCol])
+			{
+				break;
+			}
+		}
+		/* star the primed zero */
+		newStarMatrix[primeRow + nOfRows*primeCol] = true;
+		/* find starred zero in current column */
+		starCol = primeCol;
+		for (starRow = 0; starRow < nOfRows; starRow++)
+		{
+			if (starMatrix[starRow + nOfRows * starCol])
+			{
+				break;
+			}
+		}
+	}
+	/* use temporary copy as new starMatrix */
+	/* delete all primes, uncover all rows */
+    for (size_t n = 0; n < nOfElements; n++)
+	{
+		primeMatrix[n] = false;
+		starMatrix[n] = newStarMatrix[n];
+	}
+    for (size_t n = 0; n < nOfRows; n++)
+	{
+		coveredRows[n] = false;
+	}
+	/* move to step 2a */
+	step2a(assignment, distMatrix, starMatrix, newStarMatrix, primeMatrix, coveredColumns, coveredRows, nOfRows, nOfColumns, minDim);
+}
+
+// --------------------------------------------------------------------------
+// Computes a suboptimal solution. Good for cases without forbidden assignments.
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::assignmentsuboptimal2(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns)
+{
+	/* make working copy of distance Matrix */
+	const size_t nOfElements = nOfRows * nOfColumns;
+	track_t* distMatrix = (track_t*)malloc(nOfElements * sizeof(track_t));
+	for (size_t n = 0; n < nOfElements; n++)
+	{
+		distMatrix[n] = distMatrixIn[n];
+	}
+
+	/* repeatedly search for the minimum element and do the assignment */
+	for (;;)
+	{
+		/* find minimum distance observation-to-track pair */
+		track_t minValue = std::numeric_limits<track_t>::max();
+		size_t tmpRow = 0;
+		size_t tmpCol = 0;
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			for (size_t col = 0; col < nOfColumns; col++)
+			{
+				const track_t value = distMatrix[row + nOfRows*col];
+				if (value != std::numeric_limits<track_t>::max() && (value < minValue))
+				{
+					minValue = value;
+					tmpRow = row;
+					tmpCol = col;
+				}
+			}
+		}
+
+		if (minValue != std::numeric_limits<track_t>::max())
+		{
+			assignment[tmpRow] = static_cast<int>(tmpCol);
+			cost += minValue;
+			for (size_t n = 0; n < nOfRows; n++)
+			{
+				distMatrix[n + nOfRows*tmpCol] = std::numeric_limits<track_t>::max();
+			}
+			for (size_t n = 0; n < nOfColumns; n++)
+			{
+				distMatrix[tmpRow + nOfRows*n] = std::numeric_limits<track_t>::max();
+			}
+		}
+		else
+		{
+			break;
+		}
+	}
+
+	free(distMatrix);
+}
+// --------------------------------------------------------------------------
+// Computes a suboptimal solution. Good for cases with many forbidden assignments.
+// --------------------------------------------------------------------------
+void AssignmentProblemSolver::assignmentsuboptimal1(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns)
+{
+	/* make working copy of distance Matrix */
+	const size_t nOfElements = nOfRows * nOfColumns;
+	track_t* distMatrix = (track_t *)malloc(nOfElements * sizeof(track_t));
+    for (size_t n = 0; n < nOfElements; n++)
+	{
+		distMatrix[n] = distMatrixIn[n];
+	}
+
+	/* allocate memory */
+	int* nOfValidObservations = (int *)calloc(nOfRows, sizeof(int));
+	int* nOfValidTracks = (int *)calloc(nOfColumns, sizeof(int));
+
+	/* compute number of validations */
+	bool infiniteValueFound = false;
+	bool finiteValueFound = false;
+	for (size_t row = 0; row < nOfRows; row++)
+	{
+		for (size_t col = 0; col < nOfColumns; col++)
+		{
+			if (distMatrix[row + nOfRows*col] != std::numeric_limits<track_t>::max())
+			{
+				nOfValidTracks[col] += 1;
+				nOfValidObservations[row] += 1;
+				finiteValueFound = true;
+			}
+			else
+			{
+				infiniteValueFound = true;
+			}
+		}
+	}
+
+	if (infiniteValueFound)
+	{
+		if (!finiteValueFound)
+		{
+            /* free allocated memory */
+            free(nOfValidObservations);
+            free(nOfValidTracks);
+            free(distMatrix);
+
+			return;
+		}
+		bool repeatSteps = true;
+
+		while (repeatSteps)
+		{
+			repeatSteps = false;
+
+			/* step 1: reject assignments of multiply validated tracks to singly validated observations		 */
+			for (size_t col = 0; col < nOfColumns; col++)
+			{
+				bool singleValidationFound = false;
+				for (size_t row = 0; row < nOfRows; row++)
+				{
+					if (distMatrix[row + nOfRows * col] != std::numeric_limits<track_t>::max() && (nOfValidObservations[row] == 1))
+					{
+						singleValidationFound = true;
+						break;
+					}
+				}
+				if (singleValidationFound)
+				{
+					for (size_t nestedRow = 0; nestedRow < nOfRows; nestedRow++)
+						if ((nOfValidObservations[nestedRow] > 1) && distMatrix[nestedRow + nOfRows * col] != std::numeric_limits<track_t>::max())
+						{
+							distMatrix[nestedRow + nOfRows * col] = std::numeric_limits<track_t>::max();
+							nOfValidObservations[nestedRow] -= 1;
+							nOfValidTracks[col] -= 1;
+							repeatSteps = true;
+						}
+				}
+			}
+
+			/* step 2: reject assignments of multiply validated observations to singly validated tracks */
+			if (nOfColumns > 1)
+			{
+				for (size_t row = 0; row < nOfRows; row++)
+				{
+					bool singleValidationFound = false;
+                    for (size_t col = 0; col < nOfColumns; col++)
+					{
+						if (distMatrix[row + nOfRows*col] != std::numeric_limits<track_t>::max() && (nOfValidTracks[col] == 1))
+						{
+							singleValidationFound = true;
+							break;
+						}
+					}
+
+					if (singleValidationFound)
+					{
+						for (size_t col = 0; col < nOfColumns; col++)
+						{
+							if ((nOfValidTracks[col] > 1) && distMatrix[row + nOfRows*col] != std::numeric_limits<track_t>::max())
+							{
+								distMatrix[row + nOfRows*col] = std::numeric_limits<track_t>::max();
+								nOfValidObservations[row] -= 1;
+								nOfValidTracks[col] -= 1;
+								repeatSteps = true;
+							}
+						}
+					}
+				}
+			}
+		} /* while(repeatSteps) */
+
+		/* for each multiply validated track that validates only with singly validated  */
+		/* observations, choose the observation with minimum distance */
+        for (size_t row = 0; row < nOfRows; row++)
+		{
+			if (nOfValidObservations[row] > 1)
+			{
+				bool allSinglyValidated = true;
+				track_t minValue = std::numeric_limits<track_t>::max();
+				size_t tmpCol = 0;
+                for (size_t col = 0; col < nOfColumns; col++)
+				{
+					const track_t value = distMatrix[row + nOfRows*col];
+					if (value != std::numeric_limits<track_t>::max())
+					{
+						if (nOfValidTracks[col] > 1)
+						{
+							allSinglyValidated = false;
+							break;
+						}
+						else if ((nOfValidTracks[col] == 1) && (value < minValue))
+						{
+							tmpCol = col;
+							minValue = value;
+						}
+					}
+				}
+
+				if (allSinglyValidated)
+				{
+					assignment[row] = static_cast<int>(tmpCol);
+					cost += minValue;
+					for (size_t n = 0; n < nOfRows; n++)
+					{
+						distMatrix[n + nOfRows*tmpCol] = std::numeric_limits<track_t>::max();
+					}
+					for (size_t n = 0; n < nOfColumns; n++)
+					{
+						distMatrix[row + nOfRows*n] = std::numeric_limits<track_t>::max();
+					}
+				}
+			}
+		}
+
+		// for each multiply validated observation that validates only with singly validated  track, choose the track with minimum distance
+        for (size_t col = 0; col < nOfColumns; col++)
+		{
+			if (nOfValidTracks[col] > 1)
+			{
+				bool allSinglyValidated = true;
+				track_t minValue = std::numeric_limits<track_t>::max();
+				size_t tmpRow = 0;
+				for (size_t row = 0; row < nOfRows; row++)
+				{
+					const track_t value = distMatrix[row + nOfRows*col];
+					if (value != std::numeric_limits<track_t>::max())
+					{
+						if (nOfValidObservations[row] > 1)
+						{
+							allSinglyValidated = false;
+							break;
+						}
+						else if ((nOfValidObservations[row] == 1) && (value < minValue))
+						{
+							tmpRow = row;
+							minValue = value;
+						}
+					}
+				}
+
+				if (allSinglyValidated)
+				{
+					assignment[tmpRow] = static_cast<int>(col);
+					cost += minValue;
+					for (size_t n = 0; n < nOfRows; n++)
+					{
+						distMatrix[n + nOfRows*col] = std::numeric_limits<track_t>::max();
+					}
+					for (size_t n = 0; n < nOfColumns; n++)
+					{
+						distMatrix[tmpRow + nOfRows*n] = std::numeric_limits<track_t>::max();
+					}
+				}
+			}
+		}
+	} /* if(infiniteValueFound) */
+
+
+	/* now, repeatedly search for the minimum element and do the assignment */
+	for (;;)
+	{
+		/* find minimum distance observation-to-track pair */
+		track_t minValue = std::numeric_limits<track_t>::max();
+		size_t tmpRow = 0;
+		size_t tmpCol = 0;
+		for (size_t row = 0; row < nOfRows; row++)
+		{
+			for (size_t col = 0; col < nOfColumns; col++)
+			{
+				const track_t value = distMatrix[row + nOfRows*col];
+				if (value != std::numeric_limits<track_t>::max() && (value < minValue))
+				{
+					minValue = value;
+					tmpRow = row;
+					tmpCol = col;
+				}
+			}
+		}
+
+		if (minValue != std::numeric_limits<track_t>::max())
+		{
+			assignment[tmpRow] = static_cast<int>(tmpCol);
+			cost += minValue;
+			for (size_t n = 0; n < nOfRows; n++)
+			{
+				distMatrix[n + nOfRows*tmpCol] = std::numeric_limits<track_t>::max();
+			}
+			for (size_t n = 0; n < nOfColumns; n++)
+			{
+				distMatrix[tmpRow + nOfRows*n] = std::numeric_limits<track_t>::max();
+			}
+		}
+		else
+		{
+			break;
+		}
+	}
+
+	/* free allocated memory */
+	free(nOfValidObservations);
+	free(nOfValidTracks);
+	free(distMatrix);
+}

+ 39 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/HungarianAlg.h

@@ -0,0 +1,39 @@
+#include <vector>
+#include <iostream>
+#include <limits>
+#include <time.h>
+#include "defines.h"
+// http://community.topcoder.com/tc?module=Static&d1=tutorials&d2=hungarianAlgorithm
+
+///
+/// \brief The AssignmentProblemSolver class
+///
+class AssignmentProblemSolver
+{
+private:
+	// Computes the optimal assignment (minimum overall costs) using Munkres algorithm.
+	void assignmentoptimal(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns);
+	void buildassignmentvector(assignments_t& assignment, bool *starMatrix, size_t nOfRows, size_t nOfColumns);
+	void computeassignmentcost(const assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows);
+	void step2a(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim);
+	void step2b(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim);
+	void step3_5(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim);
+	void step4(assignments_t& assignment, track_t *distMatrix, bool *starMatrix, bool *newStarMatrix, bool *primeMatrix, bool *coveredColumns, bool *coveredRows, size_t nOfRows, size_t nOfColumns, size_t minDim, size_t row, size_t col);
+
+	// Computes a suboptimal solution. Good for cases with many forbidden assignments.
+	void assignmentsuboptimal1(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns);
+	// Computes a suboptimal solution. Good for cases with many forbidden assignments.
+	void assignmentsuboptimal2(assignments_t& assignment, track_t& cost, const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns);
+
+public:
+	enum TMethod
+	{
+		optimal,
+		many_forbidden_assignments,
+		without_forbidden_assignments
+	};
+
+	AssignmentProblemSolver();
+	~AssignmentProblemSolver();
+	track_t Solve(const distMatrix_t& distMatrixIn, size_t nOfRows, size_t nOfColumns, assignments_t& assignment, TMethod Method = optimal);
+};
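+
+// A minimal usage sketch (ExampleSolveAssignment is illustrative and not part
+// of the tracker). The cost matrix is stored column-major: element (row, col)
+// lives at index row + nOfRows * col, as assumed by Solve(); assignment[row]
+// receives the matched column index, or -1 if the row stays unassigned.
+inline track_t ExampleSolveAssignment()
+{
+	AssignmentProblemSolver solver;
+	const size_t nOfRows = 2;    // e.g. existing tracks
+	const size_t nOfColumns = 3; // e.g. new detections
+	distMatrix_t cost = { 1.f, 4.f,  2.f, 3.f,  5.f, 6.f }; // 3 columns of 2 rows
+	assignments_t assignment;
+	return solver.Solve(cost, nOfRows, nOfColumns, assignment, AssignmentProblemSolver::optimal);
+}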

+ 873 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/Kalman.cpp

@@ -0,0 +1,873 @@
+#include "Kalman.h"
+#include <iostream>
+#include <vector>
+
+//---------------------------------------------------------------------------
+TKalmanFilter::TKalmanFilter(
+        tracking::KalmanType type,
+	    bool useAcceleration,
+        track_t deltaTime, // time increment (lower values make the target more "massive")
+        track_t accelNoiseMag
+        )
+    :
+      m_type(type),
+      m_initialized(false),
+      m_deltaTime(deltaTime),
+      m_deltaTimeMin(deltaTime),
+      m_deltaTimeMax(2 * deltaTime),
+      m_lastDist(0),
+      m_accelNoiseMag(accelNoiseMag),
+	  m_useAcceleration(useAcceleration)
+{
+    m_deltaStep = (m_deltaTimeMax - m_deltaTimeMin) / m_deltaStepsCount;
+}
+
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinear(cv::Point3f xy0, cv::Point3f xyv0)
+{
+    // We don't know the acceleration, so we model it as process noise.
+    // But we can guess the range of acceleration values the tracked object can achieve.
+    // Process noise (standard deviation of acceleration, m/s^2)
+    // describes how much the target can accelerate.
+
+    // 4 state variables, 2 measurements
+    m_linearKalman.init(4, 2, 0, El_t);
+    // Transition cv::Matrix
+    m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(4, 4) <<
+                                        1, 0, m_deltaTime, 0,
+                                        0, 1, 0, m_deltaTime,
+                                        0, 0, 1, 0,
+                                        0, 0, 0, 1);
+
+    // init...
+    m_lastPointResult = xy0;
+    m_linearKalman.statePre.at<track_t>(0) = xy0.x;  // x
+    m_linearKalman.statePre.at<track_t>(1) = xy0.y;  // y
+    m_linearKalman.statePre.at<track_t>(2) = xyv0.x; // vx
+    m_linearKalman.statePre.at<track_t>(3) = xyv0.y; // vy
+
+    m_linearKalman.statePost.at<track_t>(0) = xy0.x;
+    m_linearKalman.statePost.at<track_t>(1) = xy0.y;
+    m_linearKalman.statePost.at<track_t>(2) = xyv0.x;
+    m_linearKalman.statePost.at<track_t>(3) = xyv0.y;
+
+    cv::setIdentity(m_linearKalman.measurementMatrix);
+
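+    // Process noise for a piecewise-constant-acceleration model: per axis
+    // Q = G * G^T * accelNoiseMag with G = [dt^2/2, dt]^T, which yields the
+    // dt^4/4 (position), dt^3/2 (cross) and dt^2 (velocity) terms below.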
+    m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(4, 4) <<
+                                       pow(m_deltaTime, 4.0) / 4.0, 0,                           pow(m_deltaTime, 3.0) / 2.0, 0,
+                                       0,                           pow(m_deltaTime, 4.0) / 4.0, 0,                           pow(m_deltaTime, 3.0) / 2.0,
+                                       pow(m_deltaTime, 3.0) / 2.0, 0,                           pow(m_deltaTime, 2.0),       0,
+                                       0,                           pow(m_deltaTime, 3.0) / 2.0, 0,                           pow(m_deltaTime, 2.0));
+
+
+    m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+    cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+    cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+    m_initialized = true;
+}
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinear(cv::Rect_<track_t> rect0, Point_t rectv0)
+{
+    // We don't know the acceleration, so we model it as process noise.
+    // But we can guess the range of acceleration values the tracked object can achieve.
+    // Process noise (standard deviation of acceleration, m/s^2)
+    // describes how much the target can accelerate.
+
+    // 8 state variables (x, y, width, height, vx, vy, vw, vh), 4 measurements (x, y, width, height)
+    m_linearKalman.init(8, 4, 0, El_t);
+    // Transition cv::Matrix
+    m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(8, 8) <<
+                                        1, 0, 0, 0, m_deltaTime, 0,           0,           0,
+                                        0, 1, 0, 0, 0,           m_deltaTime, 0,           0,
+                                        0, 0, 1, 0, 0,           0,           m_deltaTime, 0,
+                                        0, 0, 0, 1, 0,           0,           0,           m_deltaTime,
+                                        0, 0, 0, 0, 1,           0,           0,           0,
+                                        0, 0, 0, 0, 0,           1,           0,           0,
+                                        0, 0, 0, 0, 0,           0,           1,           0,
+                                        0, 0, 0, 0, 0,           0,           0,           1);
+
+    // init...
+    m_linearKalman.statePre.at<track_t>(0) = rect0.x;      // x
+    m_linearKalman.statePre.at<track_t>(1) = rect0.y;      // y
+    m_linearKalman.statePre.at<track_t>(2) = rect0.width;  // width
+    m_linearKalman.statePre.at<track_t>(3) = rect0.height; // height
+    m_linearKalman.statePre.at<track_t>(4) = rectv0.x;     // dx
+    m_linearKalman.statePre.at<track_t>(5) = rectv0.y;     // dy
+    m_linearKalman.statePre.at<track_t>(6) = 0;            // dw
+    m_linearKalman.statePre.at<track_t>(7) = 0;            // dh
+
+    m_linearKalman.statePost.at<track_t>(0) = rect0.x;
+    m_linearKalman.statePost.at<track_t>(1) = rect0.y;
+    m_linearKalman.statePost.at<track_t>(2) = rect0.width;
+    m_linearKalman.statePost.at<track_t>(3) = rect0.height;
+    m_linearKalman.statePost.at<track_t>(4) = rectv0.x;
+    m_linearKalman.statePost.at<track_t>(5) = rectv0.y;
+    m_linearKalman.statePost.at<track_t>(6) = 0;
+    m_linearKalman.statePost.at<track_t>(7) = 0;
+
+    cv::setIdentity(m_linearKalman.measurementMatrix);
+
+    track_t n1 = pow(m_deltaTime, 4.f) / 4.f;
+    track_t n2 = pow(m_deltaTime, 3.f) / 2.f;
+    track_t n3 = pow(m_deltaTime, 2.f);
+    m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(8, 8) <<
+                                       n1, 0,  0,  0,  n2, 0,  0,  0,
+                                       0,  n1, 0,  0,  0,  n2, 0,  0,
+                                       0,  0,  n1, 0,  0,  0,  n2, 0,
+                                       0,  0,  0,  n1, 0,  0,  0,  n2,
+                                       n2, 0,  0,  0,  n3, 0,  0,  0,
+                                       0,  n2, 0,  0,  0,  n3, 0,  0,
+                                       0,  0,  n2, 0,  0,  0,  n3, 0,
+                                       0,  0,  0,  n2, 0,  0,  0,  n3);
+
+    m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+    cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+    cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+    m_initialized = true;
+}
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinear3D(Rect3D rect0, cv::Point3f rectv0)
+{
+    // We don't know the acceleration, so we model it as process noise.
+    // But we can guess the range of acceleration values the tracked object can achieve.
+    // Process noise (standard deviation of acceleration, m/s^2)
+    // describes how much the target can accelerate.
+
+    // 14 state variables (x, y, z, width, height, length, yaw and their rates), 7 measurements (x, y, z, width, height, length, yaw)
+    m_linearKalman.init(14, 7, 0, El_t);
+    // Transition cv::Matrix
+    m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(14, 14) <<
+                                        1, 0, 0, 0, 0, 0, 0, m_deltaTime, 0,           0,           0,           0,           0,           0,
+                                        0, 1, 0, 0, 0, 0, 0, 0,           m_deltaTime, 0,           0,           0,           0,           0,
+                                        0, 0, 1, 0, 0, 0, 0, 0,           0,           m_deltaTime, 0,           0,           0,           0,
+                                        0, 0, 0, 1, 0, 0, 0, 0,           0,           0,           m_deltaTime, 0,           0,           0,
+                                        0, 0, 0, 0, 1, 0, 0, 0,           0,           0,           0,           m_deltaTime, 0,           0,
+                                        0, 0, 0, 0, 0, 1, 0, 0,           0,           0,           0,           0,           m_deltaTime, 0,
+                                        0, 0, 0, 0, 0, 0, 1, 0,           0,           0,           0,           0,           0,           m_deltaTime,
+                                        0, 0, 0, 0, 0, 0, 0, 1,           0,           0,           0,           0,           0,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           1,           0,           0,           0,           0,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           0,           1,           0,           0,           0,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           0,           0,           1,           0,           0,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           0,           0,           0,           1,           0,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           0,           0,           0,           0,           1,           0,
+                                        0, 0, 0, 0, 0, 0, 0, 0,           0,           0,           0,           0,           0,           1);
+
+    // init...
+    m_linearKalman.statePre.at<track_t>(0) = rect0.center.x;      // x
+    m_linearKalman.statePre.at<track_t>(1) = rect0.center.y;      // y
+    m_linearKalman.statePre.at<track_t>(2) = rect0.center.z;      // z
+    m_linearKalman.statePre.at<track_t>(3) = rect0.size.width;  // width
+    m_linearKalman.statePre.at<track_t>(4) = rect0.size.height; // height
+    m_linearKalman.statePre.at<track_t>(5) = rect0.size.length; // length
+    m_linearKalman.statePre.at<track_t>(6) = rect0.yaw; // yaw
+    m_linearKalman.statePre.at<track_t>(7) = rectv0.x;     // dx
+    m_linearKalman.statePre.at<track_t>(8) = rectv0.y;     // dy
+    m_linearKalman.statePre.at<track_t>(9) = 0;            // dz
+    m_linearKalman.statePre.at<track_t>(10) = 0;            // dw
+    m_linearKalman.statePre.at<track_t>(11) = 0;            // dh
+    m_linearKalman.statePre.at<track_t>(12) = 0;            // dl
+    m_linearKalman.statePre.at<track_t>(13) = 0;            // dyaw
+
+    m_linearKalman.statePost.at<track_t>(0) = rect0.center.x;
+    m_linearKalman.statePost.at<track_t>(1) = rect0.center.y;
+    m_linearKalman.statePost.at<track_t>(2) = rect0.center.z;
+    m_linearKalman.statePost.at<track_t>(3) = rect0.size.width;
+    m_linearKalman.statePost.at<track_t>(4) = rect0.size.height;
+    m_linearKalman.statePost.at<track_t>(5) = rect0.size.length;
+    m_linearKalman.statePost.at<track_t>(6) = rect0.yaw;
+    m_linearKalman.statePost.at<track_t>(7) = rectv0.x;
+    m_linearKalman.statePost.at<track_t>(8) = rectv0.y;
+    m_linearKalman.statePost.at<track_t>(9) = 0;
+    m_linearKalman.statePost.at<track_t>(10) = 0;
+    m_linearKalman.statePost.at<track_t>(11) = 0;
+    m_linearKalman.statePost.at<track_t>(12) = 0;
+    m_linearKalman.statePost.at<track_t>(13) = 0;
+
+    cv::setIdentity(m_linearKalman.measurementMatrix);
+
+    track_t n1 = pow(m_deltaTime, 4.f) / 4.f;
+    track_t n2 = pow(m_deltaTime, 3.f) / 2.f;
+    track_t n3 = pow(m_deltaTime, 2.f);
+    m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(14, 14) <<
+                                       n1, 0,  0,  0,  0,  0,  0,  n2, 0,  0,  0,  0,  0,  0,
+                                       0,  n1, 0,  0,  0,  0,  0,  0,  n2, 0,  0,  0,  0,  0,
+                                       0,  0,  n1, 0,  0,  0,  0,  0,  0,  n2, 0,  0,  0,  0,
+                                       0,  0,  0,  n1, 0,  0,  0,  0,  0,  0,  n2, 0,  0,  0,
+                                       0,  0,  0,  0,  n1, 0,  0,  0,  0,  0,  0,  n2, 0,  0,
+                                       0,  0,  0,  0,  0,  n1, 0,  0,  0,  0,  0,  0,  n2, 0,
+                                       0,  0,  0,  0,  0,  0,  n1, 0,  0,  0,  0,  0,  0,  n2,
+                                       n2, 0,  0,  0,  0,  0,  0,  n3, 0,  0,  0,  0,  0,  0,
+                                       0,  n2, 0,  0,  0,  0,  0,  0,  n3, 0,  0,  0,  0,  0,
+                                       0,  0,  n2, 0,  0,  0,  0,  0,  0,  n3, 0,  0,  0,  0,
+                                       0,  0,  0,  n2, 0,  0,  0,  0,  0,  0,  n3, 0,  0,  0,
+                                       0,  0,  0,  0,  n2, 0,  0,  0,  0,  0,  0,  n3, 0,  0,
+                                       0,  0,  0,  0,  0,  n2, 0,  0,  0,  0,  0,  0,  n3, 0,
+                                       0,  0,  0,  0,  0,  0,  n2, 0,  0,  0,  0,  0,  0,  n3);
+
+    m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+    cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+    cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+    m_initialized = true;
+}
+
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinearAcceleration(cv::Point3f xy0, cv::Point3f xyv0)
+{
+	// 6 state variables, 2 measurements
+	m_linearKalman.init(6, 2, 0, El_t);
+	// Transition cv::Matrix
+	const track_t dt = m_deltaTime;
+	const track_t dt2 = 0.5f * m_deltaTime * m_deltaTime;
+	m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(6, 6) <<
+		1, 0, dt, 0,  dt2, 0,
+		0, 1, 0,  dt, 0,   dt2,
+		0, 0, 1,  0,  dt,  0,
+		0, 0, 0,  1,  0,   dt,
+	    0, 0, 0,  0,  1,   0,
+	    0, 0, 0,  0,  0,   1);
+
+	// init...
+	m_lastPointResult = xy0;
+	m_linearKalman.statePre.at<track_t>(0) = xy0.x;  // x
+	m_linearKalman.statePre.at<track_t>(1) = xy0.y;  // y
+	m_linearKalman.statePre.at<track_t>(2) = xyv0.x; // vx
+	m_linearKalman.statePre.at<track_t>(3) = xyv0.y; // vy
+	m_linearKalman.statePre.at<track_t>(4) = 0;      // ax
+	m_linearKalman.statePre.at<track_t>(5) = 0;      // ay
+
+	m_linearKalman.statePost.at<track_t>(0) = xy0.x;
+	m_linearKalman.statePost.at<track_t>(1) = xy0.y;
+	m_linearKalman.statePost.at<track_t>(2) = xyv0.x;
+	m_linearKalman.statePost.at<track_t>(3) = xyv0.y;
+	m_linearKalman.statePost.at<track_t>(4) = 0;
+	m_linearKalman.statePost.at<track_t>(5) = 0;
+
+	cv::setIdentity(m_linearKalman.measurementMatrix);
+
+	track_t n1 = pow(m_deltaTime, 4.f) / 4.f;
+	track_t n2 = pow(m_deltaTime, 3.f) / 2.f;
+	track_t n3 = pow(m_deltaTime, 2.f);
+	m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(6, 6) <<
+		n1, 0, n2, 0, n2, 0,
+		0, n1, 0, n2, 0, n2,
+		n2, 0, n3, 0, n3, 0,
+		0, n2, 0, n3, 0, n3,
+		0, 0, n2, 0, n3, 0,
+		0, 0, 0, n2, 0, n3);
+
+	m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+	cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+	cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+	m_initialized = true;
+}
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinearAcceleration(cv::Rect_<track_t> rect0, Point_t rectv0)
+{
+    // 12 state variables (x, y, width, height, vx, vy, vw, vh, ax, ay, aw, ah), 4 measurements (x, y, width, height)
+    m_linearKalman.init(12, 4, 0, El_t);
+    // Transition cv::Matrix
+    const track_t dt = m_deltaTime;
+    const track_t dt2 = 0.5f * m_deltaTime * m_deltaTime;
+    m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(12, 12) <<
+        1, 0, 0, 0, dt, 0,  0,  0,  dt2, 0,   0,   0,
+        0, 1, 0, 0, 0,  dt, 0,  0,  0,   dt2, 0,   0,
+        0, 0, 1, 0, 0,  0,  dt, 0,  0,   0,   dt2, 0,
+        0, 0, 0, 1, 0,  0,  0,  dt, 0,   0,   0,   dt2,
+        0, 0, 0, 0, 1,  0,  0,  0,  dt,  0,   0,   0,
+        0, 0, 0, 0, 0,  1,  0,  0,  0,   dt,  0,   0,
+        0, 0, 0, 0, 0,  0,  1,  0,  0,   0,   dt,  0,
+        0, 0, 0, 0, 0,  0,  0,  1,  0,   0,   0,   dt,
+        0, 0, 0, 0, 0,  0,  0,  0,  1,   0,   0,   0,
+        0, 0, 0, 0, 0,  0,  0,  0,  0,   1,   0,   0,
+        0, 0, 0, 0, 0,  0,  0,  0,  0,   0,   1,   0,
+        0, 0, 0, 0, 0,  0,  0,  0,  0,   0,   0,   1);
+
+    // init...
+    m_linearKalman.statePre.at<track_t>(0) = rect0.x;      // x
+    m_linearKalman.statePre.at<track_t>(1) = rect0.y;      // y
+    m_linearKalman.statePre.at<track_t>(2) = rect0.width;  // width
+    m_linearKalman.statePre.at<track_t>(3) = rect0.height; // height
+    m_linearKalman.statePre.at<track_t>(4) = rectv0.x;     // dx
+    m_linearKalman.statePre.at<track_t>(5) = rectv0.y;     // dy
+    m_linearKalman.statePre.at<track_t>(6) = 0;            // dw
+    m_linearKalman.statePre.at<track_t>(7) = 0;            // dh
+    m_linearKalman.statePre.at<track_t>(8) = 0;            // ax
+    m_linearKalman.statePre.at<track_t>(9) = 0;            // ay
+    m_linearKalman.statePre.at<track_t>(10) = 0;           // aw
+    m_linearKalman.statePre.at<track_t>(11) = 0;           // ah
+
+    m_linearKalman.statePost.at<track_t>(0) = rect0.x;
+    m_linearKalman.statePost.at<track_t>(1) = rect0.y;
+    m_linearKalman.statePost.at<track_t>(2) = rect0.width;
+    m_linearKalman.statePost.at<track_t>(3) = rect0.height;
+    m_linearKalman.statePost.at<track_t>(4) = rectv0.x;
+    m_linearKalman.statePost.at<track_t>(5) = rectv0.y;
+    m_linearKalman.statePost.at<track_t>(6) = 0;
+    m_linearKalman.statePost.at<track_t>(7) = 0;
+    m_linearKalman.statePost.at<track_t>(8) = 0;
+    m_linearKalman.statePost.at<track_t>(9) = 0;
+    m_linearKalman.statePost.at<track_t>(10) = 0;
+    m_linearKalman.statePost.at<track_t>(11) = 0;
+
+    cv::setIdentity(m_linearKalman.measurementMatrix);
+
+    track_t n1 = pow(m_deltaTime, 4.f) / 4.f;
+    track_t n2 = pow(m_deltaTime, 3.f) / 2.f;
+    track_t n3 = pow(m_deltaTime, 2.f);
+    m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(12, 12) <<
+        n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,  0,  0,
+        0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,  0,
+        0,  0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,
+        0,  0,  0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2,
+        n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,  0,
+        0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,
+        0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,
+        0,  0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3,
+        n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,  0,
+        0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,
+        0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,
+        0,  0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3);
+
+    m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+    cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+    cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+    m_initialized = true;
+}
+//---------------------------------------------------------------------------
+void TKalmanFilter::CreateLinearAcceleration3D(Rect3D rect0, cv::Point3f rectv0)
+{
+	// 12 state variables (x, y, width, height, vx, vy, vw, vh, ax, ay, aw, ah), 4 measurements (x, y, width, height);
+	// note: this acceleration variant mirrors the 2D rect model and filters only the box footprint (z, length and yaw are not in the state)
+	m_linearKalman.init(12, 4, 0, El_t);
+	// Transition cv::Matrix
+	const track_t dt = m_deltaTime;
+	const track_t dt2 = 0.5f * m_deltaTime * m_deltaTime;
+	m_linearKalman.transitionMatrix = (cv::Mat_<track_t>(12, 12) <<
+		1, 0, 0, 0, dt, 0,  0,  0,  dt2, 0,   0,   0,
+		0, 1, 0, 0, 0,  dt, 0,  0,  0,   dt2, 0,   0,
+		0, 0, 1, 0, 0,  0,  dt, 0,  0,   0,   dt2, 0,
+		0, 0, 0, 1, 0,  0,  0,  dt, 0,   0,   0,   dt2,
+		0, 0, 0, 0, 1,  0,  0,  0,  dt,  0,   0,   0,
+		0, 0, 0, 0, 0,  1,  0,  0,  0,   dt,  0,   0,
+		0, 0, 0, 0, 0,  0,  1,  0,  0,   0,   dt,  0,
+		0, 0, 0, 0, 0,  0,  0,  1,  0,   0,   0,   dt,
+		0, 0, 0, 0, 0,  0,  0,  0,  1,   0,   0,   0,
+		0, 0, 0, 0, 0,  0,  0,  0,  0,   1,   0,   0,
+		0, 0, 0, 0, 0,  0,  0,  0,  0,   0,   1,   0,
+		0, 0, 0, 0, 0,  0,  0,  0,  0,   0,   0,   1);
+
+	// init...
+    m_linearKalman.statePre.at<track_t>(0) = rect0.center.x;      // x
+    m_linearKalman.statePre.at<track_t>(1) = rect0.center.y;      // y
+    m_linearKalman.statePre.at<track_t>(2) = rect0.size.width;  // width
+    m_linearKalman.statePre.at<track_t>(3) = rect0.size.height; // height
+	m_linearKalman.statePre.at<track_t>(4) = rectv0.x;     // dx
+	m_linearKalman.statePre.at<track_t>(5) = rectv0.y;     // dy
+	m_linearKalman.statePre.at<track_t>(6) = 0;            // dw
+	m_linearKalman.statePre.at<track_t>(7) = 0;            // dh
+	m_linearKalman.statePre.at<track_t>(8) = 0;            // ax
+	m_linearKalman.statePre.at<track_t>(9) = 0;            // ay
+	m_linearKalman.statePre.at<track_t>(10) = 0;           // aw
+	m_linearKalman.statePre.at<track_t>(11) = 0;           // ah
+
+    m_linearKalman.statePost.at<track_t>(0) = rect0.center.x;
+    m_linearKalman.statePost.at<track_t>(1) = rect0.center.y;
+    m_linearKalman.statePost.at<track_t>(2) = rect0.size.width;
+    m_linearKalman.statePost.at<track_t>(3) = rect0.size.height;
+	m_linearKalman.statePost.at<track_t>(4) = rectv0.x;
+	m_linearKalman.statePost.at<track_t>(5) = rectv0.y;
+	m_linearKalman.statePost.at<track_t>(6) = 0;
+	m_linearKalman.statePost.at<track_t>(7) = 0;
+	m_linearKalman.statePost.at<track_t>(8) = 0;
+	m_linearKalman.statePost.at<track_t>(9) = 0;
+	m_linearKalman.statePost.at<track_t>(10) = 0;
+	m_linearKalman.statePost.at<track_t>(11) = 0;
+
+	cv::setIdentity(m_linearKalman.measurementMatrix);
+
+	track_t n1 = pow(m_deltaTime, 4.f) / 4.f;
+	track_t n2 = pow(m_deltaTime, 3.f) / 2.f;
+	track_t n3 = pow(m_deltaTime, 2.f);
+	m_linearKalman.processNoiseCov = (cv::Mat_<track_t>(12, 12) <<
+		n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,  0,  0,
+		0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,  0,
+		0,  0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2, 0,
+		0,  0,  0,  n1, 0,  0,  0,  n2, 0,  0,  0,  n2,
+		n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,  0,
+		0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,
+		0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,
+		0,  0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3,
+		n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,  0,
+		0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,  0,
+		0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3, 0,
+		0,  0,  0,  n2, 0,  0,  0,  n3, 0,  0,  0,  n3);
+
+	m_linearKalman.processNoiseCov *= m_accelNoiseMag;
+
+	cv::setIdentity(m_linearKalman.measurementNoiseCov, cv::Scalar::all(0.1));
+
+	cv::setIdentity(m_linearKalman.errorCovPost, cv::Scalar::all(.1));
+
+	m_initialized = true;
+}
+
+//---------------------------------------------------------------------------
+cv::Point3f TKalmanFilter::GetPointPrediction()
+{
+    if (m_initialized)
+    {
+        cv::Mat prediction;
+
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+            prediction = m_linearKalman.predict();
+            break;
+        }
+
+        m_lastPointResult = cv::Point3f(prediction.at<track_t>(0), prediction.at<track_t>(1), prediction.at<track_t>(2));
+    }
+    else
+    {
+
+    }
+    return m_lastPointResult;
+}
+
+//---------------------------------------------------------------------------
+cv::Point3f TKalmanFilter::Update(cv::Point3f pt, bool dataCorrect)
+{
+    if (!m_initialized)
+    {
+        if (m_initialPoints.size() < MIN_INIT_VALS)
+        {
+            if (dataCorrect)
+            {
+                m_initialPoints.push_back(pt);
+                m_lastPointResult = pt;
+            }
+        }
+        if (m_initialPoints.size() == MIN_INIT_VALS)
+        {
+            track_t kx = 0;
+            track_t bx = 0;
+            track_t ky = 0;
+            track_t by = 0;
+            get_lin_regress_params(m_initialPoints, 0, MIN_INIT_VALS, kx, bx, ky, by);//predict p,v
+            cv::Point3f xy0(kx * (MIN_INIT_VALS - 1) + bx, ky * (MIN_INIT_VALS - 1) + by, m_lastPointResult.z);
+            cv::Point3f xyv0(kx, ky,0);
+
+            switch (m_type)
+            {
+            case tracking::KalmanLinear:
+                if (m_useAcceleration)
+                    CreateLinearAcceleration(xy0, xyv0);
+                else
+                    CreateLinear(xy0, xyv0);
+                break;
+            }
+            m_lastDist = 0;
+        }
+    }
+
+    if (m_initialized)
+    {
+        cv::Mat measurement(2, 1, Mat_t(1));
+        if (!dataCorrect)
+        {
+            measurement.at<track_t>(0) = m_lastPointResult.x;  //update using prediction
+            measurement.at<track_t>(1) = m_lastPointResult.y;
+        }
+        else
+        {
+            measurement.at<track_t>(0) = pt.x;  //update using measurements
+            measurement.at<track_t>(1) = pt.y;
+        }
+        // Correction
+        cv::Mat estimated;
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+        {
+            estimated = m_linearKalman.correct(measurement);
+
+            // Inertia correction
+			if (!m_useAcceleration)
+			{
+				track_t currDist = sqrtf(sqr(estimated.at<track_t>(0) - pt.x) + sqr(estimated.at<track_t>(1) - pt.y));
+				if (currDist > m_lastDist)
+				{
+					m_deltaTime = std::min(m_deltaTime + m_deltaStep, m_deltaTimeMax);
+				}
+				else
+				{
+					m_deltaTime = std::max(m_deltaTime - m_deltaStep, m_deltaTimeMin);
+				}
+				m_lastDist = currDist;
+
+				m_linearKalman.transitionMatrix.at<track_t>(0, 2) = m_deltaTime;
+				m_linearKalman.transitionMatrix.at<track_t>(1, 3) = m_deltaTime;
+			}
+            break;
+        }
+        }
+
+        m_lastPointResult.x = estimated.at<track_t>(0);   //update using measurements
+        m_lastPointResult.y = estimated.at<track_t>(1);
+    }
+    else
+    {
+        if (dataCorrect)
+        {
+            m_lastPointResult = pt;
+        }
+    }
+    return m_lastPointResult;
+}
+//---------------------------------------------------------------------------
+cv::Rect TKalmanFilter::GetRectPrediction()
+{
+    if (m_initialized)
+    {
+        cv::Mat prediction;
+
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+            prediction = m_linearKalman.predict();
+            break;
+        }
+
+        m_lastRectResult = cv::Rect_<track_t>(prediction.at<track_t>(0), prediction.at<track_t>(1), prediction.at<track_t>(2), prediction.at<track_t>(3));
+    }
+    else
+    {
+
+    }
+    return cv::Rect(static_cast<int>(m_lastRectResult.x), static_cast<int>(m_lastRectResult.y), static_cast<int>(m_lastRectResult.width), static_cast<int>(m_lastRectResult.height));
+}
+
+//---------------------------------------------------------------------------
+cv::Rect TKalmanFilter::Update(cv::Rect rect, bool dataCorrect)
+{
+    if (!m_initialized)
+    {
+        if (m_initialRects.size() < MIN_INIT_VALS)
+        {
+            if (dataCorrect)
+            {
+                m_initialRects.push_back(rect);
+                m_lastRectResult.x = static_cast<track_t>(rect.x);
+                m_lastRectResult.y = static_cast<track_t>(rect.y);
+                m_lastRectResult.width = static_cast<track_t>(rect.width);
+                m_lastRectResult.height = static_cast<track_t>(rect.height);
+            }
+        }
+        if (m_initialRects.size() == MIN_INIT_VALS)
+        {
+            std::vector<Point_t> initialPoints;
+            Point_t averageSize(0, 0);
+            for (const auto& r : m_initialRects)
+            {
+                initialPoints.emplace_back(static_cast<track_t>(r.x), static_cast<track_t>(r.y));
+                averageSize.x += r.width;
+                averageSize.y += r.height;
+            }
+            averageSize.x /= MIN_INIT_VALS;
+            averageSize.y /= MIN_INIT_VALS;
+
+            track_t kx = 0;
+            track_t bx = 0;
+            track_t ky = 0;
+            track_t by = 0;
+            get_lin_regress_params(initialPoints, 0, MIN_INIT_VALS, kx, bx, ky, by);
+            cv::Rect_<track_t> rect0(kx * (MIN_INIT_VALS - 1) + bx, ky * (MIN_INIT_VALS - 1) + by, averageSize.x, averageSize.y);
+            Point_t rectv0(kx, ky);
+
+            switch (m_type)
+            {
+            case tracking::KalmanLinear:
+                if (m_useAcceleration)
+                    CreateLinearAcceleration(rect0, rectv0);
+                else
+                    CreateLinear(rect0, rectv0);
+                break;
+            }
+        }
+    }
+
+    if (m_initialized)
+    {
+        cv::Mat measurement(4, 1, Mat_t(1));
+        if (!dataCorrect)
+        {
+            measurement.at<track_t>(0) = m_lastRectResult.x;  // update using prediction
+            measurement.at<track_t>(1) = m_lastRectResult.y;
+            measurement.at<track_t>(2) = m_lastRectResult.width;
+            measurement.at<track_t>(3) = m_lastRectResult.height;
+        }
+        else
+        {
+            measurement.at<track_t>(0) = static_cast<track_t>(rect.x);  // update using measurements
+            measurement.at<track_t>(1) = static_cast<track_t>(rect.y);
+            measurement.at<track_t>(2) = static_cast<track_t>(rect.width);
+            measurement.at<track_t>(3) = static_cast<track_t>(rect.height);
+        }
+        // Correction
+        cv::Mat estimated;
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+        {
+            estimated = m_linearKalman.correct(measurement);
+
+            m_lastRectResult.x = estimated.at<track_t>(0);   //update using measurements
+            m_lastRectResult.y = estimated.at<track_t>(1);
+            m_lastRectResult.width = estimated.at<track_t>(2);
+            m_lastRectResult.height = estimated.at<track_t>(3);
+
+            // Inertia correction
+            if (!m_useAcceleration)
+            {
+                track_t currDist = sqrtf(sqr(estimated.at<track_t>(0) - rect.x) + sqr(estimated.at<track_t>(1) - rect.y) + sqr(estimated.at<track_t>(2) - rect.width) + sqr(estimated.at<track_t>(3) - rect.height));
+                if (currDist > m_lastDist)
+                {
+                    m_deltaTime = std::min(m_deltaTime + m_deltaStep, m_deltaTimeMax);
+                }
+                else
+                {
+                    m_deltaTime = std::max(m_deltaTime - m_deltaStep, m_deltaTimeMin);
+                }
+                m_lastDist = currDist;
+
+                m_linearKalman.transitionMatrix.at<track_t>(0, 4) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(1, 5) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(2, 6) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(3, 7) = m_deltaTime;
+            }
+            break;
+        }
+        }
+    }
+    else
+    {
+        if (dataCorrect)
+        {
+            m_lastRectResult.x = static_cast<track_t>(rect.x);
+            m_lastRectResult.y = static_cast<track_t>(rect.y);
+            m_lastRectResult.width = static_cast<track_t>(rect.width);
+            m_lastRectResult.height = static_cast<track_t>(rect.height);
+        }
+    }
+    return cv::Rect(static_cast<int>(m_lastRectResult.x), static_cast<int>(m_lastRectResult.y), static_cast<int>(m_lastRectResult.width), static_cast<int>(m_lastRectResult.height));
+}
+//---------------------------------------------------------------------------
+Rect3D TKalmanFilter::GetRect3DPrediction()
+{
+    if (m_initialized)
+    {
+        cv::Mat prediction;
+
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+            prediction = m_linearKalman.predict();
+            break;
+        }
+
+        m_lastRect3DResult = Rect3D(cv::Point3f(prediction.at<track_t>(0), prediction.at<track_t>(1), prediction.at<track_t>(2)), Size3D(prediction.at<track_t>(3), prediction.at<track_t>(4), prediction.at<track_t>(5)), prediction.at<track_t>(6));
+    }
+    else
+    {
+
+    }
+    return m_lastRect3DResult;
+}
+
+//---------------------------------------------------------------------------
+Rect3D TKalmanFilter::Update(Rect3D rect, bool dataCorrect)
+{
+    if (!m_initialized)
+    {
+        if (m_initialRect3Ds.size() < MIN_INIT_VALS)
+        {
+            if (dataCorrect)
+            {
+                m_initialRect3Ds.push_back(rect);
+                m_lastRect3DResult.center.x = static_cast<track_t>(rect.center.x);
+                m_lastRect3DResult.center.y = static_cast<track_t>(rect.center.y);
+                m_lastRect3DResult.center.z = static_cast<track_t>(rect.center.z);
+                m_lastRect3DResult.size.width = static_cast<track_t>(rect.size.width);
+                m_lastRect3DResult.size.height = static_cast<track_t>(rect.size.height);
+                m_lastRect3DResult.size.length = static_cast<track_t>(rect.size.length);
+                m_lastRect3DResult.yaw = static_cast<track_t>(rect.yaw);
+            }
+        }
+        if (m_initialRect3Ds.size() == MIN_INIT_VALS)
+        {
+            std::vector<Point_t> initialPoints;
+            cv::Point3f averageSize(0, 0, 0);
+            float averageZ = 0;
+            float averageYaw = 0;
+            for (const auto& r : m_initialRect3Ds)
+            {
+                initialPoints.emplace_back(static_cast<track_t>(r.center.x), static_cast<track_t>(r.center.y));
+                averageZ += r.center.z;
+                averageSize.x += r.size.width;
+                averageSize.y += r.size.height;
+                averageSize.z += r.size.length;
+                averageYaw += r.yaw;
+            }
+            averageZ /= MIN_INIT_VALS;
+            averageSize.x /= MIN_INIT_VALS;
+            averageSize.y /= MIN_INIT_VALS;
+            averageSize.z /= MIN_INIT_VALS;
+            averageYaw /= MIN_INIT_VALS;
+
+            track_t kx = 0;
+            track_t bx = 0;
+            track_t ky = 0;
+            track_t by = 0;
+            get_lin_regress_params(initialPoints, 0, MIN_INIT_VALS, kx, bx, ky, by);
+            Rect3D rect0(cv::Point3f(kx * (MIN_INIT_VALS - 1) + bx, ky * (MIN_INIT_VALS - 1) + by, averageZ), Size3D(averageSize.x, averageSize.y,averageSize.z), averageYaw);
+            cv::Point3f rectv0(kx, ky, 0);
+
+            switch (m_type)
+            {
+            case tracking::KalmanLinear:
+                if (m_useAcceleration)
+                    CreateLinearAcceleration3D(rect0, rectv0);
+                else
+                    CreateLinear3D(rect0, rectv0);
+                break;
+            }
+        }
+    }
+
+    if (m_initialized)
+    {
+        cv::Mat measurement(7, 1, Mat_t(1));
+        if (!dataCorrect)
+        {
+            measurement.at<track_t>(0) = m_lastRect3DResult.center.x;  // update using prediction
+            measurement.at<track_t>(1) = m_lastRect3DResult.center.y;
+            measurement.at<track_t>(2) = m_lastRect3DResult.center.z;
+            measurement.at<track_t>(3) = m_lastRect3DResult.size.width;
+            measurement.at<track_t>(4) = m_lastRect3DResult.size.height;
+            measurement.at<track_t>(5) = m_lastRect3DResult.size.length;
+            measurement.at<track_t>(6) = m_lastRect3DResult.yaw;
+        }
+        else
+        {
+            measurement.at<track_t>(0) = static_cast<track_t>(rect.center.x);  // update using measurements
+            measurement.at<track_t>(1) = static_cast<track_t>(rect.center.y);
+            measurement.at<track_t>(2) = static_cast<track_t>(rect.center.z);
+            measurement.at<track_t>(3) = static_cast<track_t>(rect.size.width);
+            measurement.at<track_t>(4) = static_cast<track_t>(rect.size.height);
+            measurement.at<track_t>(5) = static_cast<track_t>(rect.size.length);
+            measurement.at<track_t>(6) = static_cast<track_t>(rect.yaw);
+        }
+        // Correction
+        cv::Mat estimated;
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+        {
+            estimated = m_linearKalman.correct(measurement);
+
+            m_lastRect3DResult.center.x = estimated.at<track_t>(0);   //update using measurements
+            m_lastRect3DResult.center.y = estimated.at<track_t>(1);
+            m_lastRect3DResult.center.z = estimated.at<track_t>(2);
+            m_lastRect3DResult.size.width = estimated.at<track_t>(3);
+            m_lastRect3DResult.size.height = estimated.at<track_t>(4);
+            m_lastRect3DResult.size.length = estimated.at<track_t>(5);
+            m_lastRect3DResult.yaw = estimated.at<track_t>(6);
+
+            // Inertia correction
+			if (!m_useAcceleration)
+			{
+                track_t currDist = sqrtf(sqr(estimated.at<track_t>(0) - rect.center.x) + sqr(estimated.at<track_t>(1) - rect.center.y)+ sqr(estimated.at<track_t>(2) - rect.center.z) + sqr(estimated.at<track_t>(3) - rect.size.width) + sqr(estimated.at<track_t>(4) - rect.size.height) + sqr(estimated.at<track_t>(5) - rect.size.length) + sqr(estimated.at<track_t>(6) - rect.yaw));
+				if (currDist > m_lastDist)
+				{
+					m_deltaTime = std::min(m_deltaTime + m_deltaStep, m_deltaTimeMax);
+				}
+				else
+				{
+					m_deltaTime = std::max(m_deltaTime - m_deltaStep, m_deltaTimeMin);
+				}
+				m_lastDist = currDist;
+
+                m_linearKalman.transitionMatrix.at<track_t>(0, 7) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(1, 8) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(2, 9) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(3, 10) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(4, 11) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(5, 12) = m_deltaTime;
+                m_linearKalman.transitionMatrix.at<track_t>(6, 13) = m_deltaTime;
+			}
+            break;
+        }
+        }
+    }
+    else
+    {
+        if (dataCorrect)
+        {
+            m_lastRect3DResult.center.x = static_cast<track_t>(rect.center.x);
+            m_lastRect3DResult.center.y = static_cast<track_t>(rect.center.y);
+            m_lastRect3DResult.center.z = static_cast<track_t>(rect.center.z);
+            m_lastRect3DResult.size.width = static_cast<track_t>(rect.size.width);
+            m_lastRect3DResult.size.height = static_cast<track_t>(rect.size.height);
+            m_lastRect3DResult.size.length = static_cast<track_t>(rect.size.length);
+            m_lastRect3DResult.yaw = static_cast<track_t>(rect.yaw);
+        }
+    }
+    return m_lastRect3DResult;
+}
+
+//---------------------------------------------------------------------------
+cv::Vec<track_t, 2> TKalmanFilter::GetVelocity() const
+{
+    cv::Vec<track_t, 2> res(0, 0);
+    if (m_initialized)
+    {
+        switch (m_type)
+        {
+        case tracking::KalmanLinear:
+        {
+            if (m_linearKalman.statePre.rows > 3)
+            {
+                // Velocity indices depend on the state layout: the 4-state point
+                // model stores (vx, vy) at 2..3, the 8-state rect model at 4..5,
+                // the 14-state Rect3D model at 7..8.
+                int indX = 2;
+                int indY = 3;
+                if (m_linearKalman.statePre.rows > 5)
+                {
+                    indX = 4;
+                    indY = 5;
+                    if (m_linearKalman.statePre.rows > 8)
+                    {
+                        indX = 7;
+                        indY = 8;
+                    }
+                }
+                res[0] = m_linearKalman.statePre.at<track_t>(indX);
+                res[1] = m_linearKalman.statePre.at<track_t>(indY);
+            }
+            break;
+        }
+
+        }
+    }
+    return res;
+}

+ 120 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/Kalman.h

@@ -0,0 +1,120 @@
+#pragma once
+#include "defines.h"
+#include <memory>
+#include <deque>
+
+#include <opencv2/opencv.hpp>
+
+///
+/// \brief The TKalmanFilter class
+/// http://www.morethantechnical.com/2011/06/17/simple-kalman-filter-for-tracking-using-opencv-2-2-w-code/
+///
+class TKalmanFilter
+{
+public:
+    TKalmanFilter(tracking::KalmanType type, bool useAcceleration, track_t deltaTime, track_t accelNoiseMag);
+    ~TKalmanFilter() = default;
+
+    cv::Point3f GetPointPrediction();
+    cv::Point3f Update(cv::Point3f pt, bool dataCorrect);
+
+    cv::Rect GetRectPrediction();
+    cv::Rect Update(cv::Rect rect, bool dataCorrect);
+
+    Rect3D GetRect3DPrediction();
+    Rect3D Update(Rect3D rect, bool dataCorrect);
+
+	cv::Vec<track_t, 2> GetVelocity() const;
+
+private:
+    tracking::KalmanType m_type = tracking::KalmanLinear;
+    cv::KalmanFilter m_linearKalman;
+    static const size_t MIN_INIT_VALS = 4;
+    std::deque<cv::Point3f> m_initialPoints;
+    cv::Point3f m_lastPointResult;
+
+    std::deque<cv::Rect> m_initialRects;
+    cv::Rect_<track_t> m_lastRectResult;
+    cv::Rect_<track_t> m_lastRect;
+
+    std::deque<Rect3D> m_initialRect3Ds;
+    Rect3D m_lastRect3DResult;
+    cv::Rect_<track_t> m_lastRect3D;
+
+    bool m_initialized = false;
+    track_t m_deltaTime = 0.2f;
+    track_t m_deltaTimeMin = 0.2f;
+    track_t m_deltaTimeMax = 2 * 0.2f;
+    track_t m_lastDist = 0;
+    track_t m_deltaStep = 0;
+    static const int m_deltaStepsCount = 20;
+    track_t m_accelNoiseMag = 0.5f;
+	bool m_useAcceleration = false; // If true, the constant-acceleration motion model x(t) = x0 + v0*t + a*t^2/2 is used
+
+	// Constant velocity model
+    void CreateLinear(cv::Point3f xy0, cv::Point3f xyv0);
+    void CreateLinear(cv::Rect_<track_t> rect0, Point_t rectv0);
+    void CreateLinear3D(Rect3D rect0, cv::Point3f rectv0);
+
+	// Constant acceleration model
+	// https://www.mathworks.com/help/driving/ug/linear-kalman-filters.html
+    void CreateLinearAcceleration(cv::Point3f xy0, cv::Point3f xyv0);
+    void CreateLinearAcceleration(cv::Rect_<track_t> rect0, Point_t rectv0);
+    void CreateLinearAcceleration3D(Rect3D rect0, cv::Point3f rectv0);
+};
+
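+// A minimal usage sketch, assuming one Rect3D detection per lidar frame. The
+// filter bootstraps from the first MIN_INIT_VALS correct updates (via linear
+// regression) and only then starts to predict:
+//
+//   TKalmanFilter kf(tracking::KalmanLinear, false, 0.2f, 0.5f);
+//   Rect3D det(cv::Point3f(1.f, 2.f, 0.f), Size3D(1.8f, 1.5f, 4.2f), 0.f);
+//   Rect3D corrected = kf.Update(det, true);  // echoes the input until initialized
+//   Rect3D predicted = kf.GetRect3DPrediction();
+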
+//---------------------------------------------------------------------------
+///
+/// \brief sqr
+/// \param val
+/// \return
+///
+template<class T> inline
+T sqr(T val)
+{
+    return val * val;
+}
+
+///
+/// \brief get_lin_regress_params
+/// \param in_data
+/// \param start_pos
+/// \param in_data_size
+/// \param kx
+/// \param bx
+/// \param ky
+/// \param by
+///
+template<typename T, typename CONT>
+void get_lin_regress_params(
+        const CONT& in_data,
+        size_t start_pos,
+        size_t in_data_size,
+        T& kx, T& bx, T& ky, T& by)
+{
+    T m1(0.), m2(0.);
+    T m3_x(0.), m4_x(0.);
+    T m3_y(0.), m4_y(0.);
+
+    const T el_count = static_cast<T>(in_data_size - start_pos);
+    for (size_t i = start_pos; i < in_data_size; ++i)
+    {
+        m1 += i;
+        m2 += sqr(i);
+
+        m3_x += in_data[i].x;
+        m4_x += i * in_data[i].x;
+
+        m3_y += in_data[i].y;
+        m4_y += i * in_data[i].y;
+    }
+    T det_1 = 1 / (el_count * m2 - sqr(m1));
+
+    m1 *= -1;
+
+    kx = det_1 * (m1 * m3_x + el_count * m4_x);
+    bx = det_1 * (m2 * m3_x + m1 * m4_x);
+
+    ky = det_1 * (m1 * m3_y + el_count * m4_y);
+    by = det_1 * (m2 * m3_y + m1 * m4_y);
+}
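+
+// A minimal usage sketch: fit x(t) = kx*t + bx and y(t) = ky*t + by to a few
+// points, then extrapolate one step ahead (this is how TKalmanFilter seeds its
+// state from the first MIN_INIT_VALS measurements); note the third argument is
+// the exclusive end index:
+//
+//   std::vector<Point_t> pts = { {0.f, 0.f}, {1.f, 2.f}, {2.f, 4.f}, {3.f, 6.f} };
+//   track_t kx = 0, bx = 0, ky = 0, by = 0;
+//   get_lin_regress_params(pts, 0, pts.size(), kx, bx, ky, by);
+//   Point_t next(kx * pts.size() + bx, ky * pts.size() + by); // ~(4, 8)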

+ 65 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/ShortPathCalculator.h

@@ -0,0 +1,65 @@
+#pragma once
+#include "defines.h"
+#include "HungarianAlg.h"
+
+///
+/// \brief The SPSettings struct
+///
+struct SPSettings
+{
+    track_t m_distThres = 0.8f;
+    size_t m_maxHistory = 10;
+};
+
+///
+/// \brief The ShortPathCalculator class
+///
+class ShortPathCalculator
+{
+public:
+    ShortPathCalculator(const SPSettings& settings)
+        : m_settings(settings)
+    {
+    }
+    virtual ~ShortPathCalculator()
+    {
+    }
+
+    virtual void Solve(const distMatrix_t& costMatrix, size_t N, size_t M, assignments_t& assignment, track_t maxCost) = 0;
+protected:
+    SPSettings m_settings;
+};
+
+///
+/// \brief The SPHungrian class
+///
+class SPHungrian : public ShortPathCalculator
+{
+public:
+    SPHungrian(const SPSettings& settings)
+        : ShortPathCalculator(settings)
+    {
+    }
+
+    void Solve(const distMatrix_t& costMatrix, size_t N, size_t M, assignments_t& assignment, track_t /*maxCost*/)
+    {
+        m_solver.Solve(costMatrix, N, M, assignment, AssignmentProblemSolver::optimal);
+    }
+
+private:
+    AssignmentProblemSolver m_solver;
+};
+
+///
+/// \brief The SPBipart class
+///
+//class SPBipart : public ShortPathCalculator
+//{
+//public:
+//    SPBipart(const SPSettings& settings)
+//        : ShortPathCalculator(settings)
+//    {
+//    }
+
+//    void Solve(const distMatrix_t& costMatrix, size_t N, size_t M, assignments_t& assignment, track_t maxCost);
+//};
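+
+// A minimal usage sketch: SPHungrian hides AssignmentProblemSolver behind the
+// ShortPathCalculator interface, so the matcher can be swapped without touching
+// the caller (costMatrix is column-major, as in HungarianAlg.h; N rows, M columns):
+//
+//   SPSettings settings;
+//   SPHungrian matcher(settings);
+//   assignments_t assignment;
+//   matcher.Solve(costMatrix, N, M, assignment, /*maxCost*/ 0.f);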

+ 159 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/Tracking.hpp

@@ -0,0 +1,159 @@
+#pragma once
+#include "objectarray.pb.h"
+#include "Ctracker.h"
+#include <iostream>
+#include <vector>
+#define FPS 8 
+#define MIN_DETECTEDFRAMES 5
+///
+/// \brief Tracker parameter setup
+///
+bool InitTracker(CTracker& tracker)
+{
+    TrackerSettings settings;
+    settings.SetDistance(tracking::DistRect3Ds); // Cost matrix: distance between the two box centers
+    settings.m_kalmanType = tracking::KalmanLinear; // Filter type: linear Kalman filter
+    settings.m_filterGoal = tracking::FilterRect3D; // Filtered state: Rect3D
+    settings.m_lostTrackType = tracking::TrackNone; // Lost targets are not tracked any further
+    settings.m_matchType = tracking::MatchHungrian; // Matching algorithm: Hungarian
+    settings.m_dt = 1.f; // Time step of the Kalman filter
+    settings.m_accelNoiseMag = 0.5f; // Process (acceleration) noise magnitude of the Kalman filter
+    settings.m_distThres = 30.f; // Distance threshold for the matching step
+    settings.m_minAreaRadiusPix = -1.f;//frame.rows / 20.f; // Minimum area radius of a target, in pixels
+    settings.m_maximumAllowedSkippedFrames = 3; // Maximum number of frames a track may go unmatched before it is removed
+    settings.m_maxTraceLength = 5; // Maximum trace length, i.e. how much trajectory history is kept
+
+    tracker.setSettings(settings);
+
+    return true;
+}
+
+///
+/// \brief Track the fused objects and update their states with the filtered best estimates
+///
+iv::lidar::objectarray Tracking(iv::lidar::objectarray& lidarobjvec, CTracker& tracker)
+{
+#ifdef DEBUG_SHOW
+    std::cout<<"-------------------------------------------------"<<std::endl;
+#endif
+    iv::lidar::objectarray trackedobjvec;
+    trackedobjvec.clear_obj();
+    trackedobjvec.set_timestamp(lidarobjvec.timestamp());
+    regions_t regions;
+    cv::Point3f pointXYZ;
+    for(int i = 0;i<lidarobjvec.obj_size();i++)
+    {
+        pointXYZ.x = lidarobjvec.obj(i).centroid().x();
+        pointXYZ.y = lidarobjvec.obj(i).centroid().y();
+        pointXYZ.z = lidarobjvec.obj(i).centroid().z();
+        Rect3D rect;
+        rect.center = pointXYZ;
+        rect.size.width = lidarobjvec.obj(i).dimensions().x();//w
+        rect.size.height = lidarobjvec.obj(i).dimensions().y();//l
+        rect.size.length = lidarobjvec.obj(i).dimensions().z();//h
+        rect.yaw = lidarobjvec.obj(i).tyaw();
+        CRegion region = CRegion(rect,lidarobjvec.obj(i).mntype(),lidarobjvec.obj(i).score());
+        regions.push_back(region);
+#ifdef DEBUG_SHOW
+        std::cout<<"old id:"<<i<<std::endl;
+        std::cout<<"old type:"<<lidarobjvec.obj(i).mntype()<<std::endl;
+        std::cout<<"old x,y,z,w,h,l,yaw:"<<rect.center.x<<","<<rect.center.y<<","<<rect.center.z<<"  "<<rect.size.width<<","<<rect.size.height<<","<<rect.size.length<<"  "<<rect.yaw<<std::endl;
+#endif
+
+    }
+    tracker.Update(regions, cv::UMat(), 30);
+    auto tracks = tracker.GetTracks();
+#ifdef DEBUG_SHOW
+    std::cout<<"detect size, tracker size:"<<regions.size()<<","<<tracks.size()<<std::endl;
+#endif
+    for (size_t i = 0; i < tracks.size(); i++)
+    {
+        const auto& track = tracks[i];
+        if(track.m_detectedFrames < MIN_DETECTEDFRAMES) continue;
+        int obj_id = track.m_regionID;
+        iv::lidar::lidarobject lidar_object;
+        if(obj_id != -1) // This detection matched an existing track: update the object state with the track's best estimate
+        {
+            lidar_object = lidarobjvec.obj(obj_id);
+            lidar_object.set_id(track.m_ID);
+
+            iv::lidar::PointXYZ centroid;
+            iv::lidar::PointXYZ *centerpoint;
+            centroid.set_x(track.m_rect.center.x);
+            centroid.set_y(track.m_rect.center.y);
+            centroid.set_z(track.m_rect.center.z);
+            centerpoint=lidar_object.mutable_centroid();
+            centerpoint->CopyFrom(centroid);
+/* not update */
+//            iv::lidar::Dimension dimension;
+//            iv::lidar::Dimension *obj_dimension;
+//            dimension.set_x(track.m_rect.size.width);
+//            dimension.set_y(track.m_rect.size.height);
+//            dimension.set_z(track.m_rect.size.length);
+//            obj_dimension=lidar_object.mutable_dimensions();
+//            obj_dimension->CopyFrom(dimension);
+
+//            lidar_object.set_tyaw(track.m_rect.yaw);
+
+            iv::lidar::VelXY vel_relative;
+            iv::lidar::VelXY *velrelative;
+            vel_relative.set_x(track.m_velocity[0]*FPS);
+            vel_relative.set_y(track.m_velocity[1]*FPS);
+            velrelative = lidar_object.mutable_vel_relative();
+            velrelative->CopyFrom(vel_relative);
+
+            iv::lidar::lidarobject *pe = trackedobjvec.add_obj();
+            pe->CopyFrom(lidar_object);
+        }else{ // No detection matched this track in the current frame: append the track's prediction to the output
+            lidar_object.set_id(track.m_ID);
+            lidar_object.set_mntype(track.m_region.m_type);
+            lidar_object.set_score(track.m_region.m_confidence);
+
+            iv::lidar::PointXYZ centroid;
+            iv::lidar::PointXYZ *centerpoint;
+            centroid.set_x(track.m_rect.center.x);
+            centroid.set_y(track.m_rect.center.y);
+            centroid.set_z(track.m_rect.center.z);
+            centerpoint=lidar_object.mutable_centroid();
+            centerpoint->CopyFrom(centroid);
+
+            iv::lidar::Dimension dimension;
+            iv::lidar::Dimension *obj_dimension;
+            dimension.set_x(track.m_region.m_rect.size.width);
+            dimension.set_y(track.m_region.m_rect.size.height);
+            dimension.set_z(track.m_region.m_rect.size.length);
+            obj_dimension=lidar_object.mutable_dimensions();
+            obj_dimension->CopyFrom(dimension);
+
+            lidar_object.set_tyaw(track.m_region.m_rect.yaw);
+
+            iv::lidar::VelXY vel_relative;
+            iv::lidar::VelXY *velrelative;
+            vel_relative.set_x(track.m_velocity[0]*FPS);
+            vel_relative.set_y(track.m_velocity[1]*FPS);
+            velrelative = lidar_object.mutable_vel_relative();
+            velrelative->CopyFrom(vel_relative);
+
+            iv::lidar::lidarobject *pe = trackedobjvec.add_obj();
+            pe->CopyFrom(lidar_object);
+        }
+#ifdef DEBUG_SHOW
+        std::cout<<"id:"<<lidar_object.id()<<"  "<<obj_id<<std::endl;
+        std::cout<<"type:"<<lidar_object.mntype()<<std::endl;
+        std::cout<<"update x,y,z,w,h,l,yaw,vx,vy:"<<lidar_object.centroid().x()<<","<<lidar_object.centroid().y()<<","<<lidar_object.centroid().z()<<"  "<<lidar_object.dimensions().x()<<","<<lidar_object.dimensions().y()<<","<<lidar_object.dimensions().z()<<"  "<<lidar_object.tyaw()<<"  "<<lidar_object.vel_relative().x()<<","<<lidar_object.vel_relative().y()<<std::endl;
+#endif
+    }
+#ifdef DEBUG_SHOW
+    std::cout<<"trackedobjvec size:"<<trackedobjvec.obj_size()<<std::endl;
+#endif
+    //    for (size_t i = 0; i < trackedobjvec.obj_size(); i++)
+    //    {
+    //        iv::lidar::lidarobject lidar_object;
+    //        lidar_object = trackedobjvec.obj(i);
+    //        std::cout<<"historical size:"<<lidar_object.point_historical_size()<<std::endl;
+    //    }
+    regions.clear();
+    tracks.clear();
+    return trackedobjvec;
+}
+
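+// A minimal usage sketch, assuming one detection array per lidar frame
+// (GetDetections and Publish are hypothetical stand-ins for this module's
+// input and output plumbing):
+//
+//   CTracker tracker;
+//   InitTracker(tracker);
+//   while (true)
+//   {
+//       iv::lidar::objectarray detections = GetDetections();
+//       iv::lidar::objectarray tracked = Tracking(detections, tracker);
+//       Publish(tracked);
+//   }
+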

+ 147 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/defines.h

@@ -0,0 +1,147 @@
+#pragma once
+
+#include <vector>
+#include <string>
+#include <map>
+#include <opencv2/opencv.hpp>
+
+
+// ---------------------------------------------------------------------------
+//
+// ---------------------------------------------------------------------------
+typedef float track_t;
+typedef cv::Point_<track_t> Point_t;
+#define El_t CV_32F
+#define Mat_t CV_32FC
+
+typedef std::vector<int> assignments_t;
+typedef std::vector<track_t> distMatrix_t;
+typedef struct Size3D{
+    Size3D()
+    {
+    }
+    Size3D(float w, float h, float l)
+        :
+          width(w),
+          height(h),
+          length(l)
+    {
+    }
+    float length;
+    float width;
+    float height;
+} Size3D;
+typedef struct Rect3D{
+    Rect3D()
+    {
+    }
+    Rect3D(cv::Point3f c, Size3D s, float y)
+        :
+          center(c),
+          size(s),
+          yaw(y)
+    {
+    }
+    cv::Point3f center;
+    Size3D size;
+    float yaw;
+} Rect3D;
+///
+/// \brief config_t
+///
+typedef std::multimap<std::string, std::string> config_t;
+
+///
+/// \brief The CRegion class
+///
+class CRegion
+{
+public:
+    CRegion()
+        : m_type(-1), m_type_name(""), m_confidence(-1)
+    {
+    }
+
+    CRegion(const Rect3D& rect, const int& type, float confidence)
+        : m_rect(rect), m_type(type), m_confidence(confidence)
+    {
+        RBRect();
+    }
+
+    CRegion(const Rect3D& rect, const std::string& type, float confidence)
+        : m_rect(rect), m_type_name(type), m_confidence(confidence)
+    {
+        RBRect();
+    }
+    Rect3D m_rect;
+    cv::RotatedRect m_rrect;
+    cv::Rect m_brect;
+    int m_type = -1;
+    std::string m_type_name = "";
+    float m_confidence = -1;
+private:
+    ///
+    /// \brief RBRect: derive the 2D rotated rect and bounding rect from the 3D box
+    /// \return
+    ///
+    cv::RotatedRect RBRect()
+    {
+        m_rrect = cv::RotatedRect(Point_t(m_rect.center.x,m_rect.center.y), cv::Size(m_rect.size.width,m_rect.size.height), 0);
+        m_brect = cv::Rect(m_rect.center.x-m_rect.size.width/2,m_rect.center.y-m_rect.size.height/2, m_rect.size.width,m_rect.size.height);
+        return m_rrect;
+    }
+};
+
+typedef std::vector<CRegion> regions_t;
+
+///
+///
+///
+namespace tracking
+{
+
+///
+/// \brief The DistType enum
+///
+enum DistType
+{
+    DistCenters,   // Euclidean distance between centers, pixels
+    DistRects,     // Euclidean distance between bounding rectangles, pixels
+    DistRect3Ds,   // Euclidean distance between 3D bounding boxes
+	DistsCount
+};
+
+///
+/// \brief The FilterGoal enum
+///
+enum FilterGoal
+{
+    FilterCenter, // x,y
+    FilterRect,   // x,y,w,h
+    FilterRect3D  // x,y,z,w,h,l,yaw
+};
+
+///
+/// \brief The KalmanType enum
+///
+enum KalmanType
+{
+    KalmanLinear
+};
+
+///
+/// \brief The MatchType enum
+///
+enum MatchType
+{
+    MatchHungrian
+};
+
+///
+/// \brief The LostTrackType enum
+///
+enum LostTrackType
+{
+    TrackNone
+};
+}
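+
+// A minimal usage sketch: wrap a detection into the structures above before
+// handing it to the tracker (the values are illustrative):
+//
+//   Rect3D box(cv::Point3f(10.f, 5.f, -0.5f), Size3D(1.8f, 1.5f, 4.2f), 0.1f);
+//   CRegion region(box, /*type*/ 0, /*confidence*/ 0.9f);
+//   regions_t regions{ region };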

+ 492 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/track.cpp

@@ -0,0 +1,492 @@
+#include "track.h"
+
+//#include "dat/dat_tracker.hpp"
+
+///
+/// \brief CTrack
+/// \param region
+/// \param kalmanType
+/// \param deltaTime
+/// \param accelNoiseMag
+/// \param useAcceleration
+/// \param trackID
+/// \param regionID
+/// \param filterGoal
+/// \param externalTrackerForLost
+///
+CTrack::CTrack(const CRegion& region,
+        tracking::KalmanType kalmanType,
+        track_t deltaTime,
+        track_t accelNoiseMag,
+        bool useAcceleration,
+        size_t trackID,
+        size_t regionID,
+        tracking::FilterGoal filterGoal,
+        tracking::LostTrackType externalTrackerForLost
+        )
+    :
+      m_trackID(trackID),
+      m_regionID(regionID),
+      m_skippedFrames(0),
+      m_lastRegion(region),
+      m_predictionPoint(region.m_rect.center),
+      m_predictionRect(region.m_rrect),
+      m_predictionRect3D(region.m_rect),
+      m_kalman(kalmanType, useAcceleration, deltaTime, accelNoiseMag),
+      m_filterGoal(filterGoal),
+      m_outOfTheFrame(false),
+      m_externalTrackerForLost(externalTrackerForLost)
+{
+    if (filterGoal == tracking::FilterRect)
+        m_kalman.Update(region.m_brect, true);
+    else if(filterGoal == tracking::FilterRect3D)
+        m_kalman.Update(region.m_rect, true);
+    else
+        m_kalman.Update(m_predictionPoint, true);
+
+    m_trace.push_back(m_predictionPoint, m_predictionPoint);
+}
+
+///
+/// \brief CTrack::CalcDistCenter
+/// \param reg
+/// \return
+///
+track_t CTrack::CalcDistCenter(const CRegion& reg) const
+{
+    cv::Point3f diff = m_predictionPoint - reg.m_rect.center;
+    return sqrtf(sqr(diff.x) + sqr(diff.y));
+}
+///
+/// \brief CTrack::CalcDistRect
+/// \param reg
+/// \return
+///
+track_t CTrack::CalcDistRect(const CRegion& reg) const
+{
+    std::array<track_t, 5> diff;
+    diff[0] = reg.m_rrect.center.x - m_lastRegion.m_rrect.center.x;
+    diff[1] = reg.m_rrect.center.y - m_lastRegion.m_rrect.center.y;
+    diff[2] = static_cast<track_t>(m_lastRegion.m_rrect.size.width - reg.m_rrect.size.width);
+    diff[3] = static_cast<track_t>(m_lastRegion.m_rrect.size.height - reg.m_rrect.size.height);
+    diff[4] = static_cast<track_t>(m_lastRegion.m_rrect.angle - reg.m_rrect.angle);
+
+    track_t dist = 0;
+    for (size_t i = 0; i < diff.size(); ++i)
+    {
+        dist += sqr(diff[i]);
+    }
+    return sqrtf(dist);
+}
+///
+/// \brief CTrack::CalcDistRect3D
+/// \param reg
+/// \return
+///
+track_t CTrack::CalcDistRect3D(const CRegion& reg) const
+{
+    std::array<track_t, 7> diff;
+    diff[0] = reg.m_rect.center.x - m_lastRegion.m_rect.center.x;
+    diff[1] = reg.m_rect.center.y - m_lastRegion.m_rect.center.y;
+    diff[2] = 0;//reg.m_rect.center.z - m_lastRegion.m_rect.center.z;
+    diff[3] = static_cast<track_t>(m_lastRegion.m_rect.size.width - reg.m_rect.size.width);
+    diff[4] = static_cast<track_t>(m_lastRegion.m_rect.size.height - reg.m_rect.size.height);
+    diff[5] = static_cast<track_t>(m_lastRegion.m_rect.size.length - reg.m_rect.size.length);
+    diff[6] = 0;//static_cast<track_t>(m_lastRegion.m_rect.yaw - reg.m_rect.yaw);
+
+    track_t dist = 0;
+    for (size_t i = 0; i < diff.size(); ++i)
+    {
+        dist += sqr(diff[i]);
+    }
+    return sqrtf(dist);
+}
+
+///
+/// \brief CTrack::Update
+/// \param region
+/// \param dataCorrect
+/// \param max_trace_length
+/// \param prevFrame
+/// \param currFrame
+/// \param trajLen
+///
+void CTrack::Update(
+        const CRegion& region,
+        bool dataCorrect,
+        size_t max_trace_length,
+        cv::UMat prevFrame,
+        cv::UMat currFrame,
+        int trajLen
+        )
+{
+    if (m_filterGoal == tracking::FilterRect) // Kalman filter for object coordinates and size
+        RectUpdate(region, dataCorrect, prevFrame, currFrame);
+    else if (m_filterGoal == tracking::FilterRect3D) // Kalman filter for 3D box: coordinates, size and yaw
+        Rect3DUpdate(region, dataCorrect, prevFrame, currFrame);
+    else // Kalman filter only for object center
+        PointUpdate(region.m_rect.center, region.m_rrect.size, dataCorrect, currFrame.size());
+
+    if (dataCorrect)
+    {
+        //std::cout << m_lastRegion.m_brect << " - " << region.m_brect << std::endl;
+
+        m_lastRegion = region;
+        m_trace.push_back(m_predictionPoint, region.m_rect.center);
+
+        CheckStatic(trajLen, currFrame, region);
+    }
+    else
+    {
+        m_trace.push_back(m_predictionPoint);
+    }
+
+    if (m_trace.size() > max_trace_length)
+        m_trace.pop_front(m_trace.size() - max_trace_length);
+}
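+// Usage sketch (values illustrative, not project defaults): a lidar-only
+// pipeline such as this one would typically pass empty frames and disable the
+// image-based static check, e.g.
+//   track->Update(region, true /*dataCorrect*/, 50 /*max_trace_length*/,
+//                 cv::UMat(), cv::UMat(), 0 /*trajLen: static check off*/);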
+
+///
+/// \brief CTrack::IsStatic
+/// \return
+///
+bool CTrack::IsStatic() const
+{
+    return m_isStatic;
+}
+
+///
+/// \brief CTrack::IsStaticTimeout
+/// \param framesTime
+/// \return
+///
+bool CTrack::IsStaticTimeout(int framesTime) const
+{
+    return (m_staticFrames > framesTime);
+}
+
+///
+/// \brief CTrack::IsOutOfTheFrame
+/// \return
+///
+bool CTrack::IsOutOfTheFrame() const
+{
+    return m_outOfTheFrame;
+}
+
+///
+//cv::RotatedRect CTrack::CalcPredictionEllipse(cv::Size_<track_t> minRadius) const
+//{
+//    // Move ellipse to velocity
+//    auto velocity = m_kalman.GetVelocity();
+//    Point_t d(3.f * velocity[0], 3.f * velocity[1]);
+
+//    cv::RotatedRect rrect(m_predictionPoint, cv::Size2f(std::max(minRadius.width, fabs(d.x)), std::max(minRadius.height, fabs(d.y))), 0);
+
+//    if (fabs(d.x) + fabs(d.y) > 4) // pix
+//    {
+//        if (fabs(d.x) > 0.0001f)
+//        {
+//            track_t l = std::min(rrect.size.width, rrect.size.height) / 3;
+
+//            track_t p2_l = sqrtf(sqr(d.x) + sqr(d.y));
+//            rrect.center.x = l * d.x / p2_l + m_predictionPoint.x;
+//            rrect.center.y = l * d.y / p2_l + m_predictionPoint.y;
+
+//            rrect.angle = atanf(d.y / d.x);
+//        }
+//        else
+//        {
+//            rrect.center.y += d.y / 3;
+//            rrect.angle = static_cast<float>(CV_PI / 2.);
+//        }
+//    }
+//    return rrect;
+//}
+
+///
+/// \brief CTrack::IsInsideArea
+///        If the result is <= 1, the point lies inside the prediction ellipse (built from prediction and velocity)
+/// \param pt
+/// \return
+///
+track_t CTrack::IsInsideArea(const Point_t& pt, const cv::RotatedRect& rrect) const
+{
+    Point_t pt_(pt.x - rrect.center.x, pt.y - rrect.center.y);
+    track_t r = sqrtf(sqr(pt_.x) + sqr(pt_.y));
+    track_t t = (r > 1) ? acosf(pt_.x / r) : 0;
+    track_t t_ = t - rrect.angle;
+    Point_t pt_rotated(r * cosf(t_), r * sinf(t_));
+
+    return sqr(pt_rotated.x) / sqr(rrect.size.width) + sqr(pt_rotated.y) / sqr(rrect.size.height);
+}
+
+///
+/// \brief CTrack::WidthDist
+/// \param reg
+/// \return
+///
+track_t CTrack::WidthDist(const CRegion& reg) const
+{
+    if (m_lastRegion.m_rect.size.width < reg.m_rect.size.width)
+        return m_lastRegion.m_rect.size.width / reg.m_rect.size.width;
+    else
+        return reg.m_rect.size.width / m_lastRegion.m_rect.size.width;
+}
+
+///
+/// \brief CTrack::HeightDist
+/// \param reg
+/// \return
+///
+track_t CTrack::HeightDist(const CRegion& reg) const
+{
+    if (m_lastRegion.m_rect.size.height < reg.m_rect.size.height)
+        return m_lastRegion.m_rect.size.height / reg.m_rect.size.height;
+    else
+        return reg.m_rect.size.height / m_lastRegion.m_rect.size.height;
+}
+
+///
+/// \brief CTrack::CheckStatic
+/// \param trajLen
+/// \return
+///
+bool CTrack::CheckStatic(int trajLen, cv::UMat currFrame, const CRegion& region)
+{
+    if (!trajLen || static_cast<int>(m_trace.size()) < trajLen)
+    {
+        m_isStatic = false;
+        m_staticFrames = 0;
+        m_staticFrame = cv::UMat();
+    }
+    else
+    {
+        track_t kx = 0;
+        track_t bx = 0;
+        track_t ky = 0;
+        track_t by = 0;
+        get_lin_regress_params(m_trace, m_trace.size() - trajLen, m_trace.size(), kx, bx, ky, by);
+        track_t speed = sqrt(sqr(kx * trajLen) + sqr(ky * trajLen));
+        const track_t speedThresh = 10;
+        if (speed < speedThresh)
+        {
+            if (!m_isStatic)
+            {
+                m_staticFrame = currFrame.clone();
+                m_staticRect = region.m_rect;
+            }
+
+            ++m_staticFrames;
+            m_isStatic = true;
+        }
+        else
+        {
+            m_isStatic = false;
+            m_staticFrames = 0;
+            m_staticFrame = cv::UMat();
+        }
+    }
+
+    return m_isStatic;
+}
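+// Note: get_lin_regress_params (declared in defines.h) presumably fits linear
+// models x(t) = kx*t + bx and y(t) = ky*t + by over the last trajLen trace
+// points, so the speed computed above approximates the displacement across
+// that window; a track below speedThresh is flagged as static.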
+
+///
+/// \brief GetLastRect
+/// \return
+///
+Rect3D CTrack::GetLastRect() const
+{
+    if (m_filterGoal == tracking::FilterRect)
+        return Rect3D(cv::Point3f(m_predictionRect.center.x,m_predictionRect.center.y,0),Size3D(m_predictionRect.boundingRect().width,m_predictionRect.boundingRect().height,0),0);
+    else if(m_filterGoal == tracking::FilterRect3D)
+        return m_predictionRect3D;
+    else
+        return Rect3D(cv::Point3f(m_predictionPoint.x, m_predictionPoint.y,0),Size3D(0,0,0),0);
+
+}
+///
+/// \brief CTrack::LastRegion
+/// \return
+///
+const CRegion& CTrack::LastRegion() const
+{
+    return m_lastRegion;
+}
+
+///
+/// \brief CTrack::ConstructObject
+/// \return
+///
+TrackingObject CTrack::ConstructObject() const
+{
+    return TrackingObject(GetLastRect(), m_lastRegion, m_trackID, m_regionID, m_trace, IsStatic(), IsOutOfTheFrame(), m_kalman.GetVelocity(), m_detectedFrames);
+}
+
+///
+/// \brief CTrack::SkippedFrames
+/// \return
+///
+size_t CTrack::SkippedFrames() const
+{
+    return m_skippedFrames;
+}
+
+///
+/// \brief CTrack::SkippedFrames
+/// \return
+///
+size_t& CTrack::SkippedFrames()
+{
+    return m_skippedFrames;
+}
+///
+/// \brief CTrack::DetectedFrames
+/// \return
+///
+size_t& CTrack::DetectedFrames()
+{
+    return m_detectedFrames;
+}
+///
+/// \brief RectUpdate
+/// \param region
+/// \param dataCorrect
+/// \param prevFrame
+/// \param currFrame
+///
+void CTrack::RectUpdate(
+        const CRegion& region,
+        bool dataCorrect,
+        cv::UMat prevFrame,
+        cv::UMat currFrame
+        )
+{
+    m_kalman.GetRectPrediction();
+
+    bool recalcPrediction = true;
+
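+    // Clamp shifts v so that the span [v, v + size) fits inside [0, hi) and
+    // returns old_v - new_v; a non-zero result therefore signals that the
+    // predicted rect crossed the frame border.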
+    auto Clamp = [](int& v, int& size, int hi) -> int
+    {
+        int res = 0;
+
+        if (size < 1)
+            size = 0;
+
+        if (v < 0)
+        {
+            res = v;
+            v = 0;
+            return res;
+        }
+        else if (v + size > hi - 1)
+        {
+            res = v;
+            v = hi - 1 - size;
+            if (v < 0)
+            {
+                size += v;
+                v = 0;
+            }
+            res -= v;
+            return res;
+        }
+        return res;
+    };
+
+    auto UpdateRRect = [&](cv::Rect prevRect, cv::Rect newRect)
+    {
+        m_predictionRect.center.x += newRect.x - prevRect.x;
+        m_predictionRect.center.y += newRect.y - prevRect.y;
+        m_predictionRect.size.width *= newRect.width / static_cast<float>(prevRect.width);
+        m_predictionRect.size.height *= newRect.height / static_cast<float>(prevRect.height);
+    };
+
+    switch (m_externalTrackerForLost)
+    {
+    case tracking::TrackNone:
+        break;
+    }
+
+    if (recalcPrediction)
+        UpdateRRect(m_predictionRect.boundingRect(), m_kalman.Update(region.m_brect, dataCorrect));
+
+    cv::Rect brect = m_predictionRect.boundingRect();
+    int dx = Clamp(brect.x, brect.width, currFrame.cols);
+    int dy = Clamp(brect.y, brect.height, currFrame.rows);
+
+    m_outOfTheFrame = (dx != 0) || (dy != 0) || (brect.width < 2) || (brect.height < 2);
+
+    m_predictionPoint = cv::Point3f(m_predictionRect.center.x,m_predictionRect.center.y,0);
+
+    //std::cout << "brect = " << brect << ", dx = " << dx << ", dy = " << dy << ", outOfTheFrame = " << m_outOfTheFrame << ", predictionPoint = " << m_predictionPoint << std::endl;
+}
+///
+/// \brief Rect3DUpdate
+/// \param region
+/// \param dataCorrect
+/// \param prevFrame
+/// \param currFrame
+///
+void CTrack::Rect3DUpdate(
+        const CRegion& region,
+        bool dataCorrect,
+        cv::UMat prevFrame,
+        cv::UMat currFrame
+        )
+{
+    m_kalman.GetRect3DPrediction();
+
+    switch (m_externalTrackerForLost)
+    {
+    case tracking::TrackNone:
+        break;
+    }
+
+    m_predictionRect3D = m_kalman.Update(region.m_rect, dataCorrect);
+
+    m_predictionPoint = m_predictionRect3D.center;
+
+    //std::cout << "brect = " << brect << ", dx = " << dx << ", dy = " << dy << ", outOfTheFrame = " << m_outOfTheFrame << ", predictionPoint = " << m_predictionPoint << std::endl;
+}
+
+///
+/// \brief PointUpdate
+/// \param pt
+/// \param dataCorrect
+///
+void CTrack::PointUpdate(
+        const cv::Point3f& pt,
+        const cv::Size& newObjSize,
+        bool dataCorrect,
+        const cv::Size& frameSize
+        )
+{
+    m_kalman.GetPointPrediction();
+
+    m_predictionPoint = m_kalman.Update(pt, dataCorrect);
+
+    if (dataCorrect)
+    {
+        const int a1 = 1;
+        const int a2 = 9;
+        m_predictionRect.size.width = (a1 * newObjSize.width + a2 * m_predictionRect.size.width) / (a1 + a2);
+        m_predictionRect.size.height = (a1 * newObjSize.height + a2 * m_predictionRect.size.height) / (a1 + a2);
+    }
+
+    auto Clamp = [](track_t& v, int hi) -> bool
+    {
+        if (v < 0)
+        {
+            v = 0;
+            return true;
+        }
+        else if (hi && v > hi - 1)
+        {
+            v = static_cast<track_t>(hi - 1);
+            return true;
+        }
+        return false;
+    };
+    auto p = m_predictionPoint;
+    m_outOfTheFrame = Clamp(p.x, frameSize.width) || Clamp(p.y, frameSize.height) || (m_predictionRect.size.width < 2) || (m_predictionRect.size.height < 2);
+
+    //std::cout << "predictionRect = " << m_predictionRect.boundingRect() << ", outOfTheFrame = " << m_outOfTheFrame << ", predictionPoint = " << m_predictionPoint << std::endl;
+}

+ 303 - 0
src/api/perception_lidar_PointPillars_MultiHead/Tracker/track.h

@@ -0,0 +1,303 @@
+#pragma once
+#include <iostream>
+#include <vector>
+#include <deque>
+#include <memory>
+#include <array>
+
+#ifdef USE_OCV_KCF
+#include <opencv2/tracking.hpp>
+#endif
+
+#include "defines.h"
+#include "Kalman.h"
+
+// --------------------------------------------------------------------------
+///
+/// \brief The TrajectoryPoint struct
+///
+struct TrajectoryPoint
+{
+    ///
+    /// \brief TrajectoryPoint
+    ///
+    TrajectoryPoint()
+        : m_hasRaw(false)
+    {
+    }
+
+    ///
+    /// \brief TrajectoryPoint
+    /// \param prediction
+    ///
+    TrajectoryPoint(const cv::Point3f& prediction)
+        :
+          m_hasRaw(false),
+          m_prediction(prediction)
+    {
+    }
+
+    ///
+    /// \brief TrajectoryPoint
+    /// \param prediction
+    /// \param raw
+    ///
+    TrajectoryPoint(const cv::Point3f& prediction, const cv::Point3f& raw)
+        :
+          m_hasRaw(true),
+          m_prediction(prediction),
+          m_raw(raw)
+    {
+    }
+
+    bool m_hasRaw = false;
+    cv::Point3f m_prediction;
+    cv::Point3f m_raw;
+};
+
+// --------------------------------------------------------------------------
+///
+/// \brief The Trace class
+///
+class Trace
+{
+public:
+    ///
+    /// \brief operator []
+    /// \param i
+    /// \return
+    ///
+    const cv::Point3f& operator[](size_t i) const
+    {
+        return m_trace[i].m_prediction;
+    }
+
+    ///
+    /// \brief operator []
+    /// \param i
+    /// \return
+    ///
+    cv::Point3f& operator[](size_t i)
+    {
+        return m_trace[i].m_prediction;
+    }
+
+    ///
+    /// \brief at
+    /// \param i
+    /// \return
+    ///
+    const TrajectoryPoint& at(size_t i) const
+    {
+        return m_trace[i];
+    }
+
+    ///
+    /// \brief size
+    /// \return
+    ///
+    size_t size() const
+    {
+        return m_trace.size();
+    }
+
+    ///
+    /// \brief push_back
+    /// \param prediction
+    ///
+    void push_back(const cv::Point3f& prediction)
+    {
+        m_trace.emplace_back(prediction);
+    }
+    void push_back(const cv::Point3f& prediction, const cv::Point3f& raw)
+    {
+        m_trace.emplace_back(prediction, raw);
+    }
+
+    ///
+    /// \brief pop_front
+    /// \param count
+    ///
+    void pop_front(size_t count)
+    {
+        if (count < size())
+            m_trace.erase(m_trace.begin(), m_trace.begin() + count);
+        else
+            m_trace.clear();
+    }
+
+    ///
+    /// \brief GetRawCount
+    /// \param lastPeriod
+    /// \return
+    ///
+    size_t GetRawCount(size_t lastPeriod) const
+    {
+        size_t res = 0;
+
+        size_t i = 0;
+        if (lastPeriod < m_trace.size())
+            i = m_trace.size() - lastPeriod;
+
+        for (; i < m_trace.size(); ++i)
+        {
+            if (m_trace[i].m_hasRaw)
+                ++res;
+        }
+
+        return res;
+    }
+
+private:
+    std::deque<TrajectoryPoint> m_trace;
+};
+
+// --------------------------------------------------------------------------
+///
+/// \brief The TrackingObject class
+///
+struct TrackingObject
+{
+    Rect3D m_rect;                     // Coordinates
+    CRegion m_region;                  // Detection region
+    Trace m_trace;                     // Trajectory
+    size_t m_ID = 0;                   // Object ID
+    size_t m_regionID = 0;             // Region ID
+    bool m_isStatic = false;           // Object is abandoned
+    bool m_outOfTheFrame = false;      // Is object out of frame
+    cv::Vec<track_t, 2> m_velocity;    // pixels/sec
+    size_t m_detectedFrames = 0;       // Detected frames count
+    ///
+    TrackingObject(const Rect3D& rect, const CRegion& region, size_t ID, size_t regionID, const Trace& trace,
+                   bool isStatic, bool outOfTheFrame, cv::Vec<track_t, 2> velocity, size_t detectedFrames)
+        :
+          m_rect(rect), m_region(region), m_ID(ID), m_regionID(regionID), m_isStatic(isStatic), m_outOfTheFrame(outOfTheFrame), m_velocity(velocity), m_detectedFrames(detectedFrames)
+    {
+        for (size_t i = 0; i < trace.size(); ++i)
+        {
+            auto tp = trace.at(i);
+            if (tp.m_hasRaw)
+                m_trace.push_back(tp.m_prediction, tp.m_raw);
+            else
+                m_trace.push_back(tp.m_prediction);
+        }
+    }
+    ///
+    bool IsRobust(int minTraceSize, float minRawRatio, cv::Size2f sizeRatio) const
+    {
+        bool res = m_trace.size() > static_cast<size_t>(minTraceSize);
+        res &= m_trace.GetRawCount(m_trace.size() - 1) / static_cast<float>(m_trace.size()) > minRawRatio;
+        if (sizeRatio.width + sizeRatio.height > 0)
+        {
+            float sr = m_rect.size.width / m_rect.size.height;
+            if (sizeRatio.width > 0)
+                res &= (sr > sizeRatio.width);
+
+            if (sizeRatio.height > 0)
+                res &= (sr < sizeRatio.height);
+        }
+        if (m_outOfTheFrame)
+            res = false;
+
+        return res;
+    }
+};
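+// Usage sketch (thresholds illustrative, not project defaults):
+//   for (const auto& track : tracks)
+//   {
+//       TrackingObject obj = track->ConstructObject();
+//       if (obj.IsRobust(4, 0.5f, cv::Size2f(0.f, 0.f))) // >= 5 trace points, > 50% raw detections
+//           consume(obj); // hypothetical downstream consumer
+//   }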
+
+// --------------------------------------------------------------------------
+///
+/// \brief The CTrack class
+///
+class CTrack
+{
+public:
+    CTrack(const CRegion& region,
+           tracking::KalmanType kalmanType,
+           track_t deltaTime,
+           track_t accelNoiseMag,
+           bool useAcceleration,
+           size_t trackID,
+           size_t regionID,
+           tracking::FilterGoal filterObjectGoal,
+           tracking::LostTrackType externalTrackerForLost);
+
+    ///
+    /// \brief CalcDistCenter
+    /// Euclidean distance between object centers on frames N and N+1
+    /// \param reg
+    /// \return
+    ///
+    track_t CalcDistCenter(const CRegion& reg) const;
+    ///
+    /// \brief CalcDistRect
+    /// Euclidean distance between rotated rects (center, size, angle) on frames N and N+1
+    /// \param reg
+    /// \return
+    ///
+    track_t CalcDistRect(const CRegion& reg) const;
+    ///
+    /// \brief CalcDistRect3D
+    /// Euclidean distance between 3D boxes (center, size) on frames N and N+1
+    /// \param reg
+    /// \return
+    ///
+    track_t CalcDistRect3D(const CRegion& reg) const;
+
+    //cv::RotatedRect CalcPredictionEllipse(cv::Size_<track_t> minRadius) const;
+    ///
+    /// \brief IsInsideArea
+    /// Tests whether a point lies inside the prediction area (prediction ellipse grown by object velocity)
+    /// \param pt
+    /// \param minVal
+    /// \return
+    ///
+    track_t IsInsideArea(const Point_t& pt, const cv::RotatedRect& rrect) const;
+    track_t WidthDist(const CRegion& reg) const;
+    track_t HeightDist(const CRegion& reg) const;
+
+    void Update(const CRegion& region, bool dataCorrect, size_t max_trace_length, cv::UMat prevFrame, cv::UMat currFrame, int trajLen);
+
+    bool IsStatic() const;
+    bool IsStaticTimeout(int framesTime) const;
+    bool IsOutOfTheFrame() const;
+
+    Rect3D GetLastRect() const;
+
+    const cv::Point3f& AveragePoint() const;
+    cv::Point3f& AveragePoint();
+    const CRegion& LastRegion() const;
+    size_t SkippedFrames() const;
+    size_t& SkippedFrames();
+    size_t& DetectedFrames();
+
+    TrackingObject ConstructObject() const;
+public:
+    size_t m_regionID = 0;
+private:
+    Trace m_trace;
+    size_t m_trackID = 0;
+    size_t m_skippedFrames = 0;
+    size_t m_detectedFrames = 0;
+    CRegion m_lastRegion;
+
+    cv::Point3f m_predictionPoint;
+    cv::RotatedRect m_predictionRect;
+    Rect3D m_predictionRect3D;
+    TKalmanFilter m_kalman;
+    tracking::FilterGoal m_filterGoal = tracking::FilterCenter;
+    bool m_outOfTheFrame = false;
+
+    tracking::LostTrackType m_externalTrackerForLost;
+
+    void RectUpdate(const CRegion& region, bool dataCorrect, cv::UMat prevFrame, cv::UMat currFrame);
+    void Rect3DUpdate(const CRegion& region, bool dataCorrect, cv::UMat prevFrame, cv::UMat currFrame);
+    void PointUpdate(const cv::Point3f& pt, const cv::Size& newObjSize, bool dataCorrect, const cv::Size& frameSize);
+
+    bool CheckStatic(int trajLen, cv::UMat currFrame, const CRegion& region);
+    bool m_isStatic = false;
+    int m_staticFrames = 0;
+    cv::UMat m_staticFrame;
+    Rect3D m_staticRect;
+};
+
+typedef std::vector<std::unique_ptr<CTrack>> tracks_t;
+

+ 246 - 0
src/api/perception_lidar_PointPillars_MultiHead/cfgs/cbgs_pp_multihead.yaml

@@ -0,0 +1,246 @@
+CLASS_NAMES: ['car','truck', 'construction_vehicle', 'bus', 'trailer',
+              'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']
+
+DATA_CONFIG:
+    _BASE_CONFIG_: cfgs/dataset_configs/nuscenes_dataset.yaml
+
+    POINT_CLOUD_RANGE: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+    DATA_PROCESSOR:
+        -   NAME: mask_points_and_boxes_outside_range
+            REMOVE_OUTSIDE_BOXES: True
+
+        -   NAME: shuffle_points
+            SHUFFLE_ENABLED: {
+                'train': True,
+                'test': True
+            }
+
+        -   NAME: transform_points_to_voxels
+            VOXEL_SIZE: [0.2, 0.2, 8.0]
+            MAX_POINTS_PER_VOXEL: 20
+            MAX_NUMBER_OF_VOXELS: {
+                'train': 30000,
+                'test': 30000
+            }
+
+MODEL:
+    NAME: PointPillar
+
+    VFE:
+        NAME: PillarVFE
+        WITH_DISTANCE: False
+        USE_ABSLOTE_XYZ: True
+        USE_NORM: True
+        NUM_FILTERS: [64]
+
+    MAP_TO_BEV:
+        NAME: PointPillarScatter
+        NUM_BEV_FEATURES: 64
+
+    BACKBONE_2D:
+        NAME: BaseBEVBackbone
+        LAYER_NUMS: [3, 5, 5]
+        LAYER_STRIDES: [2, 2, 2]
+        NUM_FILTERS: [64, 128, 256]
+        UPSAMPLE_STRIDES: [0.5, 1, 2]
+        NUM_UPSAMPLE_FILTERS: [128, 128, 128]
+
+    DENSE_HEAD:
+        NAME: AnchorHeadMulti
+        CLASS_AGNOSTIC: False
+
+        DIR_OFFSET: 0.78539
+        DIR_LIMIT_OFFSET: 0.0
+        NUM_DIR_BINS: 2
+
+        USE_MULTIHEAD: True
+        SEPARATE_MULTIHEAD: True
+        ANCHOR_GENERATOR_CONFIG: [
+            {
+                'class_name': car,
+                'anchor_sizes': [[4.63, 1.97, 1.74]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.95],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.45
+            },
+            {
+                'class_name': truck,
+                'anchor_sizes': [[6.93, 2.51, 2.84]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.6],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': construction_vehicle,
+                'anchor_sizes': [[6.37, 2.85, 3.19]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.225],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': bus,
+                'anchor_sizes': [[10.5, 2.94, 3.47]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.085],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': trailer,
+                'anchor_sizes': [[12.29, 2.90, 3.87]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [0.115],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': barrier,
+                'anchor_sizes': [[0.50, 2.53, 0.98]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.33],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.55,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': motorcycle,
+                'anchor_sizes': [[2.11, 0.77, 1.47]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.085],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.3
+            },
+            {
+                'class_name': bicycle,
+                'anchor_sizes': [[1.70, 0.60, 1.28]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.18],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.5,
+                'unmatched_threshold': 0.35
+            },
+            {
+                'class_name': pedestrian,
+                'anchor_sizes': [[0.73, 0.67, 1.77]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-0.935],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.4
+            },
+            {
+                'class_name': traffic_cone,
+                'anchor_sizes': [[0.41, 0.41, 1.07]],
+                'anchor_rotations': [0, 1.57],
+                'anchor_bottom_heights': [-1.285],
+                'align_center': False,
+                'feature_map_stride': 4,
+                'matched_threshold': 0.6,
+                'unmatched_threshold': 0.4
+            },
+        ]
+
+        SHARED_CONV_NUM_FILTER: 64
+
+        RPN_HEAD_CFGS: [
+            {
+                'HEAD_CLS_NAME': ['car'],
+            },
+            {
+                'HEAD_CLS_NAME': ['truck', 'construction_vehicle'],
+            },
+            {
+                'HEAD_CLS_NAME': ['bus', 'trailer'],
+            },
+            {
+                'HEAD_CLS_NAME': ['barrier'],
+            },
+            {
+                'HEAD_CLS_NAME': ['motorcycle', 'bicycle'],
+            },
+            {
+                'HEAD_CLS_NAME': ['pedestrian', 'traffic_cone'],
+            },
+        ]
+        SEPARATE_REG_CONFIG: 
+            NUM_MIDDLE_CONV: 1
+            NUM_MIDDLE_FILTER: 64
+            REG_LIST: ['reg:2', 'height:1', 'size:3', 'angle:2', 'velo:2']
+
+        TARGET_ASSIGNER_CONFIG:
+            NAME: AxisAlignedTargetAssigner
+            POS_FRACTION: -1.0
+            SAMPLE_SIZE: 512
+            NORM_BY_NUM_EXAMPLES: False
+            MATCH_HEIGHT: False
+            BOX_CODER: ResidualCoder
+            BOX_CODER_CONFIG: {
+                'code_size': 9,
+                'encode_angle_by_sincos': True
+            }
+
+
+        LOSS_CONFIG:
+            REG_LOSS_TYPE: WeightedL1Loss
+            LOSS_WEIGHTS: {
+                'pos_cls_weight': 1.0,
+                'neg_cls_weight': 2.0,
+                'cls_weight': 1.0,
+                'loc_weight': 0.25,
+                'dir_weight': 0.2,
+                'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
+            }
+
+    POST_PROCESSING:
+        RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
+        SCORE_THRESH: 0.1
+        OUTPUT_RAW_SCORE: False
+
+        EVAL_METRIC: kitti
+
+        NMS_CONFIG:
+            MULTI_CLASSES_NMS: True
+            NMS_TYPE: nms_gpu
+            NMS_THRESH: 0.2
+            NMS_PRE_MAXSIZE: 1000
+            NMS_POST_MAXSIZE: 83
+
+
+OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 20
+
+    OPTIMIZER: adam_onecycle
+    LR: 0.001
+    WEIGHT_DECAY: 0.01
+    MOMENTUM: 0.9
+
+    MOMS: [0.95, 0.85]
+    PCT_START: 0.4
+    DIV_FACTOR: 10
+    DECAY_STEP_LIST: [35, 45]
+    LR_DECAY: 0.1
+    LR_CLIP: 0.0000001
+
+    LR_WARMUP: False
+    WARMUP_EPOCH: 1
+
+    GRAD_NORM_CLIP: 10
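A short sketch of how a consumer could read a few of these fields with yaml-cpp; this is an assumption for illustration only, since the actual parsing lives in pointpillars.cc and may differ:

    #include <yaml-cpp/yaml.h>
    #include <iostream>
    #include <string>
    #include <vector>

    int main()
    {
        YAML::Node cfg = YAML::LoadFile("cfgs/cbgs_pp_multihead.yaml");
        auto classes = cfg["CLASS_NAMES"].as<std::vector<std::string>>();
        auto post = cfg["MODEL"]["POST_PROCESSING"];
        float score_thresh = post["SCORE_THRESH"].as<float>();            // 0.1 above
        float nms_thresh = post["NMS_CONFIG"]["NMS_THRESH"].as<float>();  // 0.2 above
        std::cout << classes.size() << " classes, score >= " << score_thresh
                  << ", NMS IoU " << nms_thresh << std::endl;
        return 0;
    }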

+ 141 - 0
src/api/perception_lidar_PointPillars_MultiHead/common.h

@@ -0,0 +1,141 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+#pragma once
+
+// headers in STL
+#include <stdio.h>
+#include <assert.h>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+// headers in CUDA
+#include "cuda_runtime_api.h"
+
+using namespace std;
+// using MACRO to allocate memory inside CUDA kernel
+#define NUM_3D_BOX_CORNERS_MACRO 8
+
+#define NUM_2D_BOX_CORNERS_MACRO 4
+
+// need to be changed when num_threads_ is changed
+#define NUM_THREADS_MACRO 64
+
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+#define GPU_CHECK(ans)                    \
+  {                                       \
+    GPUAssert((ans), __FILE__, __LINE__); \
+  }
+inline void GPUAssert(cudaError_t code, const char *file, int line,
+                      bool abort = true)
+{
+  if (code != cudaSuccess)
+  {
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
+            line);
+    if (abort)
+      exit(code);
+  }
+}
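+// Usage: wrap every CUDA runtime call so failures surface immediately
+// (buffer names below are illustrative), e.g.
+//   GPU_CHECK(cudaMalloc(&dev_points, num_points * 5 * sizeof(float)));
+//   GPU_CHECK(cudaMemcpy(dev_points, host_points, num_points * 5 * sizeof(float), cudaMemcpyHostToDevice));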
+
+template <typename T>
+void HOST_SAVE(T *array, int size, string filename, string root = "../test/result", string postfix = ".txt")
+{
+  string filepath = root + "/" + filename + postfix;
+  if (postfix == ".bin")
+  {
+    fstream file(filepath, ios::out | ios::binary);
+    file.write(reinterpret_cast<char *>(array), size * sizeof(T));
+    file.close();
+    std::cout << "|>>>|  Data has been written in " << filepath << "  |<<<|" << std::endl;
+    return;
+  }
+  else if (postfix == ".txt")
+  {
+    ofstream file(filepath, ios::out);
+    for (int i = 0; i < size; ++i)
+      file << array[i] << " ";
+    file.close();
+    std::cout << "|>>>|  Data has been written in " << filepath << "  |<<<|" << std::endl;
+    return;
+  }
+}
+
+template <typename T>
+void DEVICE_SAVE(T *array, int size, string filename, string root = "../test/result", string postfix = ".txt")
+{
+  T *temp_ = new T[size];
+  cudaMemcpy(temp_, array, size * sizeof(T), cudaMemcpyDeviceToHost);
+  HOST_SAVE<T>(temp_, size, filename, root, postfix);
+  delete[] temp_;
+}
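+// Debug usage sketch (buffer name and size illustrative): dump an intermediate
+// device tensor to ../test/result/ for offline inspection, e.g.
+//   DEVICE_SAVE<float>(dev_scattered_feature, 64 * 512 * 512, "scattered_feature");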
+
+
+// int TXTtoArrary( float* &points_array , string file_name , int num_feature = 4)
+// {
+//   ifstream InFile;
+//   InFile.open(file_name.data());
+//   assert(InFile.is_open());
+
+//   vector<float> temp_points;
+//   string c;
+
+//   while (!InFile.eof())
+//   {
+//       InFile >> c;
+
+//       temp_points.push_back(atof(c.c_str()));
+//   }
+//   points_array = new float[temp_points.size()];
+//   for (int i = 0 ; i < temp_points.size() ; ++i) {
+//     points_array[i] = temp_points[i];
+//   }
+
+//   InFile.close();  
+//   return temp_points.size() / num_feature;
+// };

+ 377 - 0
src/api/perception_lidar_PointPillars_MultiHead/main.cpp

@@ -0,0 +1,377 @@
+#include <QCoreApplication>
+#include <QDateTime>
+#include <iostream>
+#include "pointpillars.h"
+#include <pcl/point_cloud.h>
+#include <pcl/point_types.h>
+#include <pcl/io/io.h>
+#include <pcl/io/pcd_io.h>
+#include "xmlparam.h"
+#include "modulecomm.h"
+#include <thread>
+#include "objectarray.pb.h"
+#include "Tracking.hpp"
+
+std::thread * gpthread;
+PointPillars * pPillars = nullptr ;
+void * gpa;
+void * gpdetect;
+int gnothavedatatime = 0;
+const int kNumPointFeature = 5;
+const int kOutputNumBoxFeature = 7;
+std::string gstrinput;
+std::string gstroutput;
+
+TrackerSettings settings;
+CTracker tracker(settings);
+bool m_isTrackerInitialized = false;
+
+void PclToArray(
+        const pcl::PointCloud<pcl::PointXYZI>::Ptr& in_pcl_pc_ptr,
+        float* out_points_array, const float normalizing_factor) {
+    for (size_t i = 0; i < in_pcl_pc_ptr->size(); ++i) {
+        pcl::PointXYZI point = in_pcl_pc_ptr->at(i);
+        out_points_array[i * 4 + 0] = point.x;
+        out_points_array[i * 4 + 1] = point.y;
+        out_points_array[i * 4 + 2] = point.z;
+        out_points_array[i * 4 + 3] =
+                static_cast<float>(point.intensity / normalizing_factor);
+    }
+}
+
+void PclXYZITToArray(
+        const pcl::PointCloud<pcl::PointXYZI>::Ptr& in_pcl_pc_ptr,
+        float* out_points_array, const float normalizing_factor) {
+    for (size_t i = 0; i < in_pcl_pc_ptr->size(); ++i) {
+        pcl::PointXYZI point = in_pcl_pc_ptr->at(i);
+        out_points_array[i * 5 + 0] = point.x;
+        out_points_array[i * 5 + 1] = point.y;
+        out_points_array[i * 5 + 2] = point.z;
+        out_points_array[i * 5 + 3] =
+                static_cast<float>(point.intensity / normalizing_factor);
+        out_points_array[i * 5 + 4] = 0;
+    }
+}
+
+void GetLidarObj(std::vector<float> out_detections,std::vector<int> out_labels,
+                 std::vector<float> out_scores,iv::lidar::objectarray & lidarobjvec)
+{
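+    // out_detections is packed as 7 floats per box -- [x, y, z, dx(w), dy(l), dz(h), yaw] --
+    // matching kOutputNumBoxFeature; out_labels and out_scores are parallel per-box arrays.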
+    int i;
+    int obj_size = out_detections.size()/kOutputNumBoxFeature;
+    //    givlog->verbose("OBJ","object size is %d",obj_size);
+    for(i=0;i<obj_size;i++)
+    {
+        iv::lidar::lidarobject lidarobj;
+        if (out_scores.at(i) < 0.10) continue;
+
+        lidarobj.set_tyaw(out_detections.at(i*7+6));
+        iv::lidar::PointXYZ centroid;
+        iv::lidar::PointXYZ * _centroid;
+        centroid.set_x(out_detections.at(i*7));
+        centroid.set_y(out_detections.at(i*7+1));
+        centroid.set_z(out_detections.at(i*7+2));
+        _centroid = lidarobj.mutable_centroid();
+        _centroid->CopyFrom(centroid);
+
+        iv::lidar::PointXYZ min_point;
+        iv::lidar::PointXYZ * _min_point;
+        min_point.set_x(0);
+        min_point.set_y(0);
+        min_point.set_z(0);
+        _min_point = lidarobj.mutable_min_point();
+        _min_point->CopyFrom(min_point);
+
+        iv::lidar::PointXYZ max_point;
+        iv::lidar::PointXYZ * _max_point;
+        max_point.set_x(0);
+        max_point.set_y(0);
+        max_point.set_z(0);
+        _max_point = lidarobj.mutable_max_point();
+        _max_point->CopyFrom(max_point);
+
+        iv::lidar::PointXYZ position;
+        iv::lidar::PointXYZ * _position;
+        position.set_x(out_detections.at(i*7));
+        position.set_y(out_detections.at(i*7+1));
+        position.set_z(out_detections.at(i*7+2));
+        _position = lidarobj.mutable_position();
+        _position->CopyFrom(position);
+        lidarobj.set_mntype(out_labels.at(i));
+        // swap class labels 2 and 8
+        if(out_labels.at(i)==2){
+            lidarobj.set_mntype(8);
+        }else if(out_labels.at(i)==8){
+            lidarobj.set_mntype(2);
+        }
+        lidarobj.set_score(out_scores.at(i));
+        lidarobj.add_type_probs(out_scores.at(i));
+
+        iv::lidar::PointXYZI point_cloud;
+        iv::lidar::PointXYZI * _point_cloud;
+        point_cloud.set_x(out_detections.at(i*7));
+        point_cloud.set_y(out_detections.at(i*7+1));
+        point_cloud.set_z(out_detections.at(i*7+2));
+        point_cloud.set_i(0);
+
+        _point_cloud = lidarobj.add_cloud();
+        _point_cloud->CopyFrom(point_cloud);
+
+        iv::lidar::Dimension ld;
+        iv::lidar::Dimension * pld;
+        ld.set_x(out_detections.at(i*7+3));// w
+        ld.set_y(out_detections.at(i*7+4));// l
+        ld.set_z(out_detections.at(i*7+5));// h
+        pld = lidarobj.mutable_dimensions();
+        pld->CopyFrom(ld);
+
+        //        std::cout<<"x y z   :  "<<out_detections.at(i*7+3)<<"    "<< out_detections.at(i*7+4)<<"    "<<out_detections.at(i*7+5)<<std::endl;
+        iv::lidar::lidarobject * po = lidarobjvec.add_obj();
+        po->CopyFrom(lidarobj);
+    }
+
+}
+
+void DetectOnePCD(const pcl::PointCloud<pcl::PointXYZI>::Ptr &pc_ptr)
+{
+    std::shared_ptr<float> points_array_ptr(new float[pc_ptr->size() * kNumPointFeature], std::default_delete<float[]>()); // array deleter so delete[] is used
+    //   float* points_array = new float[pc_ptr->size() * kNumPointFeature];
+    PclXYZITToArray(pc_ptr, points_array_ptr.get(), 1.0);
+
+    int    in_num_points = pc_ptr->width;
+
+    std::vector<float> out_detections;
+    std::vector<int> out_labels;
+    std::vector<float> out_scores;
+
+    QTime xTime;
+
+    xTime.start();
+
+    cudaDeviceSynchronize();
+    pPillars->DoInference(points_array_ptr.get(), in_num_points, &out_detections, &out_labels , &out_scores);
+    cudaDeviceSynchronize();
+
+    //    givlog->verbose("obj size is %d", num_objects);
+    //    std::cout<<"obj size is "<<num_objects<<std::endl;
+
+    //    std::vector<iv::lidar::lidarobject> lidarobjvec;
+    iv::lidar::objectarray lidarobjvec;
+    GetLidarObj(out_detections,out_labels,out_scores,lidarobjvec);
+
+    double timex = pc_ptr->header.stamp;
+    timex = timex/1000.0;
+    lidarobjvec.set_timestamp(pc_ptr->header.stamp);
+    //---------------------------------------------  init tracker  -------------------------------------------------
+    if (!m_isTrackerInitialized)
+    {
+        m_isTrackerInitialized = InitTracker(tracker);
+        if (!m_isTrackerInitialized)
+        {
+            std::cerr << "Tracker initialization error!" << std::endl;
+        }
+    }
+    iv::lidar::objectarray trackedobjvec = Tracking(lidarobjvec, tracker);
+    //    std::<<"track    end"<<std::endl;
+
+    //    --------------------------------------------  end tracking  --------------------------------------------------
+    std::string out = trackedobjvec.SerializeAsString();
+    iv::modulecomm::ModuleSendMsg(gpdetect,out.data(),out.length());
+
+    //    givlog->verbose("lenth is %d",out.length());
+}
+
+void ListenPointCloud(const char *strdata,const unsigned int nSize,
+                      const unsigned int index,const QDateTime * dt,const char * strmemname)
+{
+    (void)index;(void)dt;(void)strmemname;
+    //    std::cout<<" is  ok  ------------  "<<std::endl;
+    if(nSize <=16)return;
+    unsigned int * pHeadSize = (unsigned int *)strdata;
+    if(*pHeadSize > nSize)
+    {
+        std::cout<<"ListenPointCloud: invalid packet, head size = "<<*pHeadSize<<" but data size is "<<nSize<<std::endl;
+        return;
+    }
+
+    gnothavedatatime = 0;
+    QTime xTime;
+    xTime.start();
+
+    pcl::PointCloud<pcl::PointXYZI>::Ptr point_cloud(
+                new pcl::PointCloud<pcl::PointXYZI>());
+    int nNameSize;
+    nNameSize = *pHeadSize - 4-4-8;
+    char * strName = new char[nNameSize+1];
+    strName[nNameSize] = 0;
+    std::shared_ptr<char> str_ptr(strName, std::default_delete<char[]>()); // array deleter so delete[] is used
+    memcpy(strName,(char *)((char *)strdata +4),nNameSize);
+    point_cloud->header.frame_id = strName;
+    memcpy(&point_cloud->header.seq,(char *)strdata+4+nNameSize,4);
+    memcpy(&point_cloud->header.stamp,(char *)strdata+4+nNameSize+4,8);
+    int nPCount = (nSize - *pHeadSize)/sizeof(pcl::PointXYZI);
+    int i;
+    pcl::PointXYZI * p;
+    p = (pcl::PointXYZI *)((char *)strdata + *pHeadSize);
+    for(i=0;i<nPCount;i++)
+    {
+        pcl::PointXYZI xp;
+        memcpy(&xp,p,sizeof(pcl::PointXYZI));
+        point_cloud->push_back(xp);
+        p++;
+    }
+
+    DetectOnePCD(point_cloud);
+    std::cout<<"time is "<<(QDateTime::currentMSecsSinceEpoch() % 1000)<<" "<<xTime.elapsed()<<std::endl;
+
+}
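+// For reference, the shared-memory packet layout ListenPointCloud expects,
+// as implied by the parsing above (a sender must build the buffer the same way):
+//   bytes [0, 4)               uint32 head size = 4 + name_len + 4 + 8
+//   bytes [4, 4 + name_len)    frame_id characters (no terminator)
+//   next 4 bytes               uint32 header.seq
+//   next 8 bytes               uint64 header.stamp
+//   bytes [head size, nSize)   densely packed pcl::PointXYZI records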
+
+bool gbstate = true;
+void statethread()
+{
+    int nstate = 0;
+    int nlaststate = 0;
+    while (gbstate)
+    {
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        if(gnothavedatatime < 100000) gnothavedatatime++;
+
+        if (gnothavedatatime  < 100){
+            nstate = 0;
+        }
+        if (gnothavedatatime > 1000)
+        {
+            nstate = 1;
+        }
+        if (gnothavedatatime > 6000)
+        {
+            nstate = 2;
+        }
+        if (nstate != nlaststate) {
+            switch (nstate) {
+            case 0:
+                break;
+            case 1:
+            {
+                static int64_t nLastLog = 0;
+                int64_t nnow = std::chrono::system_clock::now().time_since_epoch().count()/1e6;
+                if(abs(nnow - nLastLog) > 1e3)
+                {
+                    std::cout<<"no lidar point cloud received for more than 10 seconds."<<std::endl;
+                    nLastLog = nnow;
+                }
+            }
+                break;
+            case 2:
+            {
+                static int64_t nLastLog = 0;
+                int64_t nnow = std::chrono::system_clock::now().time_since_epoch().count()/1e6;
+                if(abs(nnow - nLastLog) > 1e3)
+                {
+                    std::cout<<"no lidar point cloud received for more than 60 seconds."<<std::endl;
+                    nLastLog = nnow;
+                }
+            }
+                break;
+            default:
+                break;
+            }
+        }
+    }
+}
+
+void exitfunc()
+{
+    gbstate = false;
+    gpthread->join();
+    std::cout<<" state thread closed."<<std::endl;
+    iv::modulecomm::Unregister(gpa);
+    iv::modulecomm::Unregister(gpdetect);
+    std::cout<<"exit func complete"<<std::endl;
+}
+#include <QFile>
+bool trtisexist(std::string strpfe,std::string strbackbone)
+{
+    QFile xFile;
+    xFile.setFileName(strpfe.data());
+    if(xFile.exists() == false)
+    {
+        return false;
+    }
+    xFile.setFileName(strbackbone.data());
+    if(xFile.exists() == false)
+    {
+        return false;
+    }
+    return true;
+}
+int main(int argc, char *argv[])
+{
+    QCoreApplication a(argc, argv);
+
+    //    RegisterIVBackTrace();
+    tracker.setSettings(settings);
+
+
+    char * strhome = getenv("HOME");
+    std::string pfe_file = strhome;
+    pfe_file += "/models/lidar/cbgs_pp_multihead_pfe.onnx";
+    std::string pfe_trt_file = pfe_file.substr(0, pfe_file.find_last_of('.')) + ".trt";
+
+    std::string backbone_file = strhome;
+    backbone_file += "/models/lidar/cbgs_pp_multihead_backbone.onnx";
+    std::string backbone_trt_file = backbone_file.substr(0, backbone_file.find_last_of('.')) + ".trt";
+
+    bool btrtexist = trtisexist(pfe_trt_file,backbone_trt_file);
+
+
+    QString strpath = QCoreApplication::applicationDirPath();
+    std::string pp_config = strpath.toStdString() ;
+    pp_config += "/cfgs/cbgs_pp_multihead.yaml";
+    if (argc < 2)
+        strpath = strpath + "/detection_lidar_pointpillar.xml";
+    else
+        strpath = argv[1];
+
+    std::cout<<pp_config<<std::endl;
+
+    iv::xmlparam::Xmlparam xparam(strpath.toStdString());
+    pfe_file = xparam.GetParam("pfe_file",pfe_file.data());
+    backbone_file = xparam.GetParam("backbone_file",backbone_file.data());
+    gstrinput = xparam.GetParam("input","lidar_pc");
+    gstroutput = xparam.GetParam("output","lidar_pointpillar");
+
+    if(btrtexist == false)
+    {
+
+        std::cout<<"use onnx model."<<std::endl;
+        pPillars = new PointPillars(
+                    0.4,
+                    0.2,
+                    true,
+                    pfe_file,
+                    backbone_file,
+                    pp_config
+                    );
+    }
+    else
+    {
+        std::cout<<"use trt model."<<std::endl;
+        pPillars = new PointPillars(
+                    0.1,
+                    0.2,
+                    false,
+                    pfe_trt_file,
+                    backbone_trt_file,
+                    pp_config
+                    );
+    }
+    std::cout<<"PointPillars Init OK."<<std::endl;
+
+    gpa = iv::modulecomm::RegisterRecv(gstrinput.data(),ListenPointCloud);
+    gpdetect = iv::modulecomm::RegisterSend(gstroutput.data(), 10000000,1);
+    gpthread = new std::thread(statethread);
+
+    return a.exec();
+}

+ 422 - 0
src/api/perception_lidar_PointPillars_MultiHead/nms.cu

@@ -0,0 +1,422 @@
+/*
+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+#include <stdio.h>
+#include "nms.h"
+#include "common.h"
+#define THREADS_PER_BLOCK 16
+#ifndef DIVUP  // already defined in common.h; guard avoids a redefinition warning
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+#endif
+
+// #define DEBUG
+const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
+const float EPS = 1e-8;
+struct Point {
+    float x, y;
+    __device__ Point() {}
+    __device__ Point(double _x, double _y){
+        x = _x, y = _y;
+    }
+
+    __device__ void set(float _x, float _y){
+        x = _x; y = _y;
+    }
+
+    __device__ Point operator +(const Point &b)const{
+        return Point(x + b.x, y + b.y);
+    }
+
+    __device__ Point operator -(const Point &b)const{
+        return Point(x - b.x, y - b.y);
+    }
+};
+
+__device__ inline float cross(const Point &a, const Point &b){
+    return a.x * b.y - a.y * b.x;
+}
+
+__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){
+    return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
+}
+
+__device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2){
+    int ret = min(p1.x,p2.x) <= max(q1.x,q2.x)  &&
+              min(q1.x,q2.x) <= max(p1.x,p2.x) &&
+              min(p1.y,p2.y) <= max(q1.y,q2.y) &&
+              min(q1.y,q2.y) <= max(p1.y,p2.y);
+    return ret;
+}
+
+__device__ inline int check_in_box2d(const float *box, const Point &p){
+    //params: (7) [x, y, z, dx, dy, dz, heading]
+    const float MARGIN = 1e-2;
+
+    float center_x = box[0], center_y = box[1];
+    float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);  // rotate the point in the opposite direction of box
+    float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
+    float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
+
+    return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN);
+}
+
+__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){
+    // fast exclusion
+    if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
+
+    // check cross standing
+    float s1 = cross(q0, p1, p0);
+    float s2 = cross(p1, q1, p0);
+    float s3 = cross(p0, q1, q0);
+    float s4 = cross(q1, p1, q0);
+
+    if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
+
+    // calculate intersection of two lines
+    float s5 = cross(q1, p1, p0);
+    if(fabs(s5 - s1) > EPS){
+        ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
+        ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
+
+    }
+    else{
+        float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
+        float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
+        float D = a0 * b1 - a1 * b0;
+
+        ans.x = (b0 * c1 - b1 * c0) / D;
+        ans.y = (a1 * c0 - a0 * c1) / D;
+    }
+
+    return 1;
+}
+
+__device__ inline void rotate_around_center(const Point &center, const float angle_cos, const float angle_sin, Point &p){
+    float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
+    float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
+    p.set(new_x, new_y);
+}
+
+__device__ inline int point_cmp(const Point &a, const Point &b, const Point &center){
+    return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x);
+}
+
+__device__ inline float box_overlap(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+
+    float a_angle = box_a[6], b_angle = box_b[6];
+    float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
+    float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
+    float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
+    float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
+    float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
+
+    Point center_a(box_a[0], box_a[1]);
+    Point center_b(box_b[0], box_b[1]);
+
+#ifdef DEBUG
+    printf("a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle,
+           b_x1, b_y1, b_x2, b_y2, b_angle);
+    printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y);
+#endif
+
+    Point box_a_corners[5];
+    box_a_corners[0].set(a_x1, a_y1);
+    box_a_corners[1].set(a_x2, a_y1);
+    box_a_corners[2].set(a_x2, a_y2);
+    box_a_corners[3].set(a_x1, a_y2);
+
+    Point box_b_corners[5];
+    box_b_corners[0].set(b_x1, b_y1);
+    box_b_corners[1].set(b_x2, b_y1);
+    box_b_corners[2].set(b_x2, b_y2);
+    box_b_corners[3].set(b_x1, b_y2);
+
+    // get oriented corners
+    float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
+    float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
+
+    for (int k = 0; k < 4; k++){
+#ifdef DEBUG
+        printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+        rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
+        rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
+#ifdef DEBUG
+        printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
+#endif
+    }
+
+    box_a_corners[4] = box_a_corners[0];
+    box_b_corners[4] = box_b_corners[0];
+
+    // get intersection of lines
+    Point cross_points[16];
+    Point poly_center;
+    int cnt = 0, flag = 0;
+
+    poly_center.set(0, 0);
+    for (int i = 0; i < 4; i++){
+        for (int j = 0; j < 4; j++){
+            flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]);
+            if (flag){
+                poly_center = poly_center + cross_points[cnt];
+                cnt++;
+#ifdef DEBUG
+                printf("Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, %.3f)->(%.3f, %.3f) \n",
+                    cross_points[cnt - 1].x, cross_points[cnt - 1].y,
+                    box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x, box_a_corners[i + 1].y,
+                    box_b_corners[i].x, box_b_corners[i].y, box_b_corners[i + 1].x, box_b_corners[i + 1].y);
+#endif
+            }
+        }
+    }
+
+    // check corners
+    for (int k = 0; k < 4; k++){
+        if (check_in_box2d(box_a, box_b_corners[k])){
+            poly_center = poly_center + box_b_corners[k];
+            cross_points[cnt] = box_b_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+        if (check_in_box2d(box_b, box_a_corners[k])){
+            poly_center = poly_center + box_a_corners[k];
+            cross_points[cnt] = box_a_corners[k];
+            cnt++;
+#ifdef DEBUG
+                printf("a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
+#endif
+        }
+    }
+
+    poly_center.x /= cnt;
+    poly_center.y /= cnt;
+
+    // sort the points of polygon
+    Point temp;
+    for (int j = 0; j < cnt - 1; j++){
+        for (int i = 0; i < cnt - j - 1; i++){
+            if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){
+                temp = cross_points[i];
+                cross_points[i] = cross_points[i + 1];
+                cross_points[i + 1] = temp;
+            }
+        }
+    }
+
+#ifdef DEBUG
+    printf("cnt=%d\n", cnt);
+    for (int i = 0; i < cnt; i++){
+        printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y);
+    }
+#endif
+
+    // get the overlap areas
+    float area = 0;
+    for (int k = 0; k < cnt - 1; k++){
+        area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]);
+    }
+
+    return fabs(area) / 2.0;
+}
+
+__device__ inline float iou_bev(const float *box_a, const float *box_b){
+    // params box_a: [x, y, z, dx, dy, dz, heading]
+    // params box_b: [x, y, z, dx, dy, dz, heading]
+    float sa = box_a[3] * box_a[4];
+    float sb = box_b[3] * box_b[4];
+    float s_overlap = box_overlap(box_a, box_b);
+    return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
+}
+
+__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+    float s_overlap = box_overlap(cur_box_a, cur_box_b);
+    ans_overlap[a_idx * num_b + b_idx] = s_overlap;
+}
+
+__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){
+    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
+    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
+    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
+    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
+
+    if (a_idx >= num_a || b_idx >= num_b){
+        return;
+    }
+
+    const float * cur_box_a = boxes_a + a_idx * 7;
+    const float * cur_box_b = boxes_b + b_idx * 7;
+    float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
+    ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
+}
+
+__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
+                           const float *boxes, unsigned long long *mask){
+    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_size) {
+        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+        const float *cur_box = boxes + cur_box_idx * 7;
+
+        int i = 0;
+        unsigned long long t = 0;
+        int start = 0;
+        if (row_start == col_start) {
+          start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
+                t |= 1ULL << i;
+            }
+        }
+        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+        mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+}
+
+
+__device__ inline float iou_normal(float const * const a, float const * const b) {
+    //params: a: [x, y, z, dx, dy, dz, heading]
+    //params: b: [x, y, z, dx, dy, dz, heading]
+
+    float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
+    float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
+    float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
+    float interS = width * height;
+    float Sa = a[3] * a[4];
+    float Sb = b[3] * b[4];
+    return interS / fmaxf(Sa + Sb - interS, EPS);
+}
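
For a quick sanity check of the axis-aligned IoU above: two 2 m x 2 m boxes whose centres sit 1 m apart in x overlap in a 1 x 2 strip, so IoU = 2 / (4 + 4 - 2) = 1/3. A host-side mirror of the device function (an illustrative test helper, not part of this commit):

// iou_normal_host mirrors the __device__ iou_normal for CPU testing.
#include <algorithm>
#include <cstdio>

static float iou_normal_host(const float* a, const float* b) {
    const float eps = 1e-8f;
    float left   = std::max(a[0] - a[3] / 2, b[0] - b[3] / 2);
    float right  = std::min(a[0] + a[3] / 2, b[0] + b[3] / 2);
    float top    = std::max(a[1] - a[4] / 2, b[1] - b[4] / 2);
    float bottom = std::min(a[1] + a[4] / 2, b[1] + b[4] / 2);
    float inter  = std::max(right - left, 0.f) * std::max(bottom - top, 0.f);
    return inter / std::max(a[3] * a[4] + b[3] * b[4] - inter, eps);
}

int main() {
    float a[7] = {0, 0, 0, 2, 2, 2, 0};
    float b[7] = {1, 0, 0, 2, 2, 2, 0};
    std::printf("IoU = %f\n", iou_normal_host(a, b));  // prints ~0.333333
    return 0;
}
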
+
+
+__global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh,
+                           const float *boxes, unsigned long long *mask){
+    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
+    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
+
+    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
+
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
+        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
+        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
+        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
+        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
+        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
+        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_size) {
+        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+        const float *cur_box = boxes + cur_box_idx * 7;
+
+        int i = 0;
+        unsigned long long t = 0;
+        int start = 0;
+        if (row_start == col_start) {
+          start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
+                t |= 1ULL << i;
+            }
+        }
+        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+        mask[cur_box_idx * col_blocks + col_start] = t;
+    }
+}
+
+
+NmsCuda::NmsCuda(const int num_threads, const int num_box_corners,
+    const float nms_overlap_threshold)
+: num_threads_(num_threads),
+  num_box_corners_(num_box_corners),
+  nms_overlap_threshold_(nms_overlap_threshold) {}
+
+void NmsCuda::DoNmsCuda(const int host_filter_count,
+    float *dev_sorted_box_for_nms, long *out_keep_inds,
+    int *out_num_to_keep) {
+
+    const int col_blocks = DIVUP(host_filter_count, num_threads_);
+    unsigned long long *dev_mask = NULL;
+    GPU_CHECK(cudaMalloc(&dev_mask, host_filter_count * col_blocks * sizeof(unsigned long long)));    
+
+    dim3 blocks(DIVUP(host_filter_count, num_threads_),
+                DIVUP(host_filter_count, num_threads_));
+    dim3 threads(num_threads_);
+
+    nms_kernel<<<blocks, threads>>>(host_filter_count, nms_overlap_threshold_, dev_sorted_box_for_nms, dev_mask);
+    // postprocess for nms output
+    std::vector<unsigned long long> host_mask(host_filter_count * col_blocks);
+    GPU_CHECK(cudaMemcpy(&host_mask[0], dev_mask,
+            sizeof(unsigned long long) * host_filter_count * col_blocks,
+            cudaMemcpyDeviceToHost));
+    std::vector<unsigned long long> remv(col_blocks);
+    memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+    for (int i = 0; i < host_filter_count; ++i) {
+        int nblock = i / num_threads_;
+        int inblock = i % num_threads_;
+
+        if (!(remv[nblock] & (1ULL << inblock))) {
+            out_keep_inds[(*out_num_to_keep)++] = i;
+            unsigned long long *p = &host_mask[0] + i * col_blocks;
+            for (int j = nblock; j < col_blocks; ++j) {
+                remv[j] |= p[j];
+            }
+        }
+    }
+    GPU_CHECK(cudaFree(dev_mask));
+}
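
DoNmsCuda implements the classic bitmask NMS: each kernel block compares one row tile of boxes against one column tile and sets bit i of mask[row][col_block] when the IoU exceeds the threshold; the host then walks the boxes in score order, keeping a box only if no earlier kept box has marked it. A minimal host-side usage sketch (illustrative only; assumes dev_boxes already holds `count` score-sorted boxes of 7 floats each on the GPU):

#include <vector>

void nms_example(float* dev_boxes, int count) {
    // 64 threads matches the kNumThreads value set in InitParams elsewhere in this commit.
    NmsCuda nms(/*num_threads=*/64, /*num_box_corners=*/8,
                /*nms_overlap_threshold=*/0.5f);
    std::vector<long> keep_inds(count, 0);
    int num_to_keep = 0;
    nms.DoNmsCuda(count, dev_boxes, keep_inds.data(), &num_to_keep);
    // keep_inds[0 .. num_to_keep-1] now indexes the surviving boxes.
}
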

+ 64 - 0
src/api/perception_lidar_PointPillars_MultiHead/nms.h

@@ -0,0 +1,64 @@
+
+/*
+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+class NmsCuda {
+ private:
+  const int num_threads_;
+  const int num_box_corners_;
+  const float nms_overlap_threshold_;
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads when launching cuda kernel
+   * @param[in] num_box_corners Number of corners for 2D box
+   * @param[in] nms_overlap_threshold IOU threshold for NMS
+   * @details Capital variables never change after compilation; non-capital
+   * variables can be changed through rosparam
+   */
+  NmsCuda(const int num_threads, const int num_box_corners,
+          const float nms_overlap_threshold);
+
+  /**
+   * @brief GPU Non-Maximum Suppression for network output
+   * @param[in] host_filter_count Number of filtered outputs
+   * @param[in] dev_sorted_box_for_nms Bounding box output sorted by score
+   * @param[out] out_keep_inds Indexes of the selected bounding boxes
+   * @param[out] out_num_to_keep Number of kept bounding boxes
+   * @details NMS runs on the GPU; box selection happens on the CPU afterwards
+   */
+  void DoNmsCuda(const int host_filter_count, float* dev_sorted_box_for_nms,
+                 long* out_keep_inds, int* out_num_to_keep);
+};

+ 51 - 0
src/api/perception_lidar_PointPillars_MultiHead/object.proto

@@ -0,0 +1,51 @@
+syntax = "proto2";
+
+package iv.lidar;
+
+message PointXYZI {
+  optional float x = 1 [default = nan];
+  optional float y = 2 [default = nan];
+  optional float z = 3 [default = nan];
+  optional float i = 4 [default = 0];
+};
+
+message PointXYZ {
+  optional float x = 1 [default = nan];
+  optional float y = 2 [default = nan];
+  optional float z = 3 [default = nan];
+};
+
+message Dimension {
+  optional float x = 1 [default = 0];
+  optional float y = 2 [default = 0];
+  optional float z = 3 [default = 0];
+};
+
+message VelXY {
+  optional float x = 1 [default = 0];
+  optional float y = 2 [default = 0];
+};
+
+message lidarobject
+{
+  optional PointXYZ min_point = 1;
+  optional PointXYZ max_point = 2;
+  optional PointXYZ centroid = 3;
+  optional Dimension dimensions = 4;
+  optional PointXYZ position = 5;
+  optional double angle = 6;
+  optional double velocity_linear_x = 7;
+  optional double acceleration_linear_y = 8;
+  optional double timestamp = 9;
+  optional double tyaw = 10;
+  required int32 mnType = 11;
+  optional int32 id = 12;
+  optional int32 behavior_state = 13;
+  optional float score = 14;
+  optional bool velocity_reliable = 15;
+  optional bool pose_reliable = 16;
+  repeated float type_probs = 17;
+  repeated PointXYZI cloud = 18;
+  optional string type_name = 19;
+  optional VelXY vel_relative = 20; // relative velocity
+};

+ 11 - 0
src/api/perception_lidar_PointPillars_MultiHead/objectarray.proto

@@ -0,0 +1,11 @@
+syntax = "proto2";
+
+package iv.lidar;
+
+import "object.proto";
+
+message objectarray
+{
+  repeated lidarobject obj = 1;
+  optional double timestamp = 2;
+};
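
A sketch of how the protoc-generated C++ API for these messages can be used (accessor names follow protoc's lower-casing of field names, e.g. mnType becomes set_mntype; the helper function is illustrative, not part of this commit):

#include "objectarray.pb.h"
#include <string>

std::string serialize_one_detection() {
    iv::lidar::objectarray arr;
    arr.set_timestamp(1234.5);
    iv::lidar::lidarobject* obj = arr.add_obj();
    obj->set_mntype(0);              // the only required field
    obj->set_score(0.91f);
    obj->mutable_centroid()->set_x(12.3f);
    obj->mutable_centroid()->set_y(-4.5f);
    obj->mutable_dimensions()->set_x(4.2f);
    std::string out;
    arr.SerializeToString(&out);     // wire format, ready for sharing
    return out;
}
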

+ 182 - 0
src/api/perception_lidar_PointPillars_MultiHead/perception_lidar_PointPillars_MultiHead.pro

@@ -0,0 +1,182 @@
+QT -= gui
+
+CONFIG += c++14 console
+CONFIG -= app_bundle
+
+QMAKE_CXXFLAGS += -std=gnu++17
+QMAKE_LFLAGS += -no-pie  -Wl,--no-as-needed
+
+
+system("protoc *.proto -I=./ --cpp_out=./")
+# The following define makes your compiler emit warnings if you use
+# any feature of Qt which as been marked deprecated (the exact warnings
+# depend on your compiler). Please consult the documentation of the
+# deprecated API in order to know how to port your code away from it.
+DEFINES += QT_DEPRECATED_WARNINGS
+#DEFINES += DEBUG_SHOW
+# You can also make your code fail to compile if you use deprecated APIs.
+# In order to do so, uncomment the following line.
+# You can also select to disable deprecated APIs only up to a certain version of Qt.
+#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
+
+SOURCES += main.cpp \
+    pointpillars.cc \
+    object.pb.cc \
+    objectarray.pb.cc \
+    Tracker/Ctracker.cpp \
+    Tracker/HungarianAlg.cpp \
+    Tracker/Kalman.cpp \
+    Tracker/track.cpp
+
+DISTFILES += \
+    nms.cu \
+    postprocess.cu \
+    preprocess.cu \
+    scatter.cu
+
+HEADERS += \
+    common.h \
+    nms.h \
+    pointpillars.h \
+    postprocess.h \
+    preprocess.h \
+    scatter.h \
+    object.pb.h \
+    objectarray.pb.h \
+    Tracker/Ctracker.h \
+    Tracker/defines.h \
+    Tracker/HungarianAlg.h \
+    Tracker/Kalman.h \
+    Tracker/ShortPathCalculator.h \
+    Tracker/track.h \
+    Tracker/Tracking.hpp
+
+INCLUDEPATH+=Tracker
+
+CUDA_SOURCES +=  \
+    nms.cu \
+    postprocess.cu \
+    preprocess.cu \
+    scatter.cu
+
+CUDA_SDK = "/usr/local/cuda/"   # cudaSDK路径
+
+CUDA_DIR = "/usr/local/cuda/"            # CUDA tookit路径
+
+SYSTEM_NAME = linux         # 自己系统环境 'Win32', 'x64', or 'Win64'
+
+SYSTEM_TYPE = 64           #操作系统位数 '32' or '64',
+
+CUDA_ARCH = sm_72         # cuda架构, for example 'compute_10', 'compute_11', 'sm_10'
+
+NVCC_OPTIONS = --use_fast_math --compiler-options "-fPIC"
+
+
+# include paths
+
+INCLUDEPATH += $$CUDA_DIR/include
+#INCLUDEPATH += /usr/local/cuda-10.0/targets/aarch64-linux/include/crt
+
+# library directories
+
+QMAKE_LIBDIR += $$CUDA_DIR/lib/
+
+CUDA_OBJECTS_DIR = ./
+
+# The following library conflicts with something in Cuda
+
+#QMAKE_LFLAGS_RELEASE = /NODEFAULTLIB:msvcrt.lib
+
+#QMAKE_LFLAGS_DEBUG   = /NODEFAULTLIB:msvcrtd.lib
+
+# Add the necessary libraries
+
+CUDA_LIBS =  cudart cufft
+
+# The following makes sure all path names (which often include spaces) are put between quotation marks
+
+CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')
+
+NVCC_LIBS = $$join(CUDA_LIBS,' -l','-l', '')
+
+#LIBS += $$join(CUDA_LIBS,'.so ', '', '.so')
+
+# Configuration of the Cuda compiler
+
+CONFIG(debug, debug|release) {
+
+    # Debug mode
+
+    cuda_d.input = CUDA_SOURCES
+
+    cuda_d.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
+
+    cuda_d.commands = $$CUDA_DIR/bin/nvcc -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+
+    cuda_d.dependency_type = TYPE_C
+
+    QMAKE_EXTRA_COMPILERS += cuda_d
+
+}
+
+else {
+
+    # Release mode
+
+    cuda.input = CUDA_SOURCES
+
+    cuda.output = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.o
+
+    cuda.commands = $$CUDA_DIR/bin/nvcc $$NVCC_OPTIONS $$CUDA_INC $$NVCC_LIBS --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH -O3 -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
+
+    cuda.dependency_type = TYPE_C
+
+    QMAKE_EXTRA_COMPILERS += cuda
+
+}
+
+
+LIBS += -L/usr/local/cuda-10.2/targets/aarch64-linux/lib
+LIBS += -L/usr/local/cuda-11.4/targets/aarch64-linux/lib
+
+LIBS += -lcudart -lcufft -lyaml-cpp
+
+#LIBS += -L/home/adc/soft/cudnn-10.2-linux-x64-v7.6.5.32/cuda/lib64 -lcudnn
+
+LIBS +=  -lnvinfer -lnvonnxparser -lnvcaffe_parser  # -lmyelin
+
+#LIBS += -L/home/nvidia/git/libtorch_gpu-1.6.0-linux-aarch64/lib -ltorch_cuda  -ltorch -lc10 -ltorch_cpu
+
+unix:INCLUDEPATH += /usr/include/eigen3
+unix:INCLUDEPATH += /usr/include/pcl-1.7
+unix:INCLUDEPATH += /usr/include/pcl-1.8
+unix:INCLUDEPATH += /usr/include/pcl-1.10
+unix:INCLUDEPATH += /usr/include/pcl-1.12
+unix:INCLUDEPATH += /usr/include/pcl-1.14
+
+
+LIBS += -lboost_system -lprotobuf
+
+LIBS += -L$$PWD -lmodulecomm -lxmlparam
+
+unix:LIBS +=  -lpcl_common\
+        -lpcl_features\
+        -lpcl_filters\
+        -lpcl_io\
+        -lpcl_io_ply\
+        -lpcl_kdtree\
+        -lpcl_keypoints\
+        -lpcl_octree\
+        -lpcl_outofcore\
+        -lpcl_people\
+        -lpcl_recognition\
+        -lpcl_registration\
+        -lpcl_sample_consensus\
+        -lpcl_search\
+        -lpcl_segmentation\
+        -lpcl_surface\
+        -lpcl_tracking\
+        -lpcl_visualization
+
+INCLUDEPATH += /usr/include/opencv4/
+LIBS += /usr/lib/aarch64-linux-gnu/libopencv*.so

+ 507 - 0
src/api/perception_lidar_PointPillars_MultiHead/pointpillars.cc

@@ -0,0 +1,507 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+
+#include "pointpillars.h"
+
+#include <chrono>
+#include <iostream>
+
+PointPillars::PointPillars(const float score_threshold,
+                           const float nms_overlap_threshold,
+                           const bool use_onnx,
+                           const std::string pfe_file,
+                           const std::string backbone_file,
+                           const std::string pp_config)
+    : score_threshold_(score_threshold),
+      nms_overlap_threshold_(nms_overlap_threshold),
+      use_onnx_(use_onnx),
+      pfe_file_(pfe_file),
+      backbone_file_(backbone_file),
+      pp_config_(pp_config)
+{
+    InitParams();
+    InitTRT(use_onnx_);
+    DeviceMemoryMalloc();
+
+    preprocess_points_cuda_ptr_.reset(new PreprocessPointsCuda(
+        kNumThreads,
+        kMaxNumPillars,
+        kMaxNumPointsPerPillar,
+        kNumPointFeature,
+        kNumIndsForScan,
+        kGridXSize,kGridYSize, kGridZSize,
+        kPillarXSize,kPillarYSize, kPillarZSize,
+        kMinXRange, kMinYRange, kMinZRange));
+
+    scatter_cuda_ptr_.reset(new ScatterCuda(kNumThreads, kGridXSize, kGridYSize));
+
+    const float float_min = std::numeric_limits<float>::lowest();
+    const float float_max = std::numeric_limits<float>::max();
+    postprocess_cuda_ptr_.reset(
+      new PostprocessCuda(kNumThreads,
+                          float_min, float_max, 
+                          kNumClass,kNumAnchorPerCls,
+                          kMultiheadLabelMapping,
+                          score_threshold_, 
+                          nms_overlap_threshold_,
+                          kNmsPreMaxsize, 
+                          kNmsPostMaxsize,
+                          kNumBoxCorners, 
+                          kNumInputBoxFeature,
+                          7));  /*kNumOutputBoxFeature*/
+    
+}
+
+PointPillars::~PointPillars() {
+    // for pillars 
+    GPU_CHECK(cudaFree(dev_num_points_per_pillar_));
+    GPU_CHECK(cudaFree(dev_x_coors_));
+    GPU_CHECK(cudaFree(dev_y_coors_));
+    GPU_CHECK(cudaFree(dev_pillar_point_feature_));
+    GPU_CHECK(cudaFree(dev_pillar_coors_));
+    // for sparse map
+    GPU_CHECK(cudaFree(dev_sparse_pillar_map_));    
+    GPU_CHECK(cudaFree(dev_cumsum_along_x_));
+    GPU_CHECK(cudaFree(dev_cumsum_along_y_));
+    // for pfe forward
+    GPU_CHECK(cudaFree(dev_pfe_gather_feature_));
+      
+    GPU_CHECK(cudaFree(pfe_buffers_[0]));
+    GPU_CHECK(cudaFree(pfe_buffers_[1]));
+
+    GPU_CHECK(cudaFree(rpn_buffers_[0]));
+    GPU_CHECK(cudaFree(rpn_buffers_[1]));
+    GPU_CHECK(cudaFree(rpn_buffers_[2]));
+    GPU_CHECK(cudaFree(rpn_buffers_[3]));
+    GPU_CHECK(cudaFree(rpn_buffers_[4]));
+    GPU_CHECK(cudaFree(rpn_buffers_[5]));
+    GPU_CHECK(cudaFree(rpn_buffers_[6]));
+    GPU_CHECK(cudaFree(rpn_buffers_[7]));
+    pfe_context_->destroy();
+    backbone_context_->destroy();
+    pfe_engine_->destroy();
+    backbone_engine_->destroy();
+    // for post process
+    GPU_CHECK(cudaFree(dev_scattered_feature_));
+    GPU_CHECK(cudaFree(dev_filtered_box_));
+    GPU_CHECK(cudaFree(dev_filtered_score_));
+    GPU_CHECK(cudaFree(dev_filtered_label_));
+    GPU_CHECK(cudaFree(dev_filtered_dir_));
+    GPU_CHECK(cudaFree(dev_box_for_nms_));
+    GPU_CHECK(cudaFree(dev_filter_count_));
+
+
+}
+
+void PointPillars::InitParams()
+{
+    YAML::Node params = YAML::LoadFile(pp_config_);
+    kPillarXSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][0].as<float>();
+    kPillarYSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][1].as<float>();
+    kPillarZSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][2].as<float>();
+    kMinXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][0].as<float>();
+    kMinYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][1].as<float>();
+    kMinZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][2].as<float>();
+    kMaxXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][3].as<float>();
+    kMaxYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][4].as<float>();
+    kMaxZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][5].as<float>();
+    kNumClass = params["CLASS_NAMES"].size();
+    kMaxNumPillars = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_NUMBER_OF_VOXELS"]["test"].as<int>();
+    kMaxNumPointsPerPillar = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_POINTS_PER_VOXEL"].as<int>();
+    kNumPointFeature = 5; // [x, y, z, i, 0]
+    kNumInputBoxFeature = 7;
+    kNumOutputBoxFeature = params["MODEL"]["DENSE_HEAD"]["TARGET_ASSIGNER_CONFIG"]["BOX_CODER_CONFIG"]["code_size"].as<int>();
+    kBatchSize = 1;
+    kNumIndsForScan = 1024;
+    kNumThreads = 64;
+    kNumBoxCorners = 8;
+    kAnchorStrides = 4;
+    kNmsPreMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_PRE_MAXSIZE"].as<int>();
+    kNmsPostMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_POST_MAXSIZE"].as<int>();
+    //params for initialize anchors
+    //Adapt to OpenPCDet
+    kAnchorNames = params["CLASS_NAMES"].as<std::vector<std::string>>();
+    for (int i = 0; i < kAnchorNames.size(); ++i)
+    {
+        kAnchorDxSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][0].as<float>());
+        kAnchorDySizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][1].as<float>());
+        kAnchorDzSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][2].as<float>());
+        kAnchorBottom.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_bottom_heights"][0].as<float>());
+    }
+    for (int idx_head = 0; idx_head < params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"].size(); ++idx_head)
+    {
+        int num_cls_per_head = params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"][idx_head]["HEAD_CLS_NAME"].size();
+        std::vector<int> value;
+        for (int i = 0; i < num_cls_per_head; ++i)
+        {
+            value.emplace_back(idx_head + i);
+        }
+        kMultiheadLabelMapping.emplace_back(value);
+    }
+
+    // Generate secondary parameters based on above.
+    kGridXSize = static_cast<int>((kMaxXRange - kMinXRange) / kPillarXSize); //512
+    kGridYSize = static_cast<int>((kMaxYRange - kMinYRange) / kPillarYSize); //512
+    kGridZSize = static_cast<int>((kMaxZRange - kMinZRange) / kPillarZSize); //1
+    kRpnInputSize = 64 * kGridYSize * kGridXSize;
+
+    kNumAnchorXinds = static_cast<int>(kGridXSize / kAnchorStrides); //Width
+    kNumAnchorYinds = static_cast<int>(kGridYSize / kAnchorStrides); //Height
+    kNumAnchor = kNumAnchorXinds * kNumAnchorYinds * 2 * kNumClass;  // H * W * Ro * N = 196608
+
+    kNumAnchorPerCls = kNumAnchorXinds * kNumAnchorYinds * 2; //H * W * Ro = 32768
+    kRpnBoxOutputSize = kNumAnchor * kNumOutputBoxFeature;
+    kRpnClsOutputSize = kNumAnchor * kNumClass;
+    kRpnDirOutputSize = kNumAnchor * 2;
+}
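
To make the derived sizes concrete (assuming the nuScenes-style values the inline comments imply: x/y range [-51.2, 51.2] m and 0.2 m pillars):

// Worked check of InitParams' derived quantities under assumed config values.
#include <cassert>

int main() {
    const float min_r = -51.2f, max_r = 51.2f, pillar = 0.2f;
    const int grid = static_cast<int>((max_r - min_r) / pillar);  // 512
    const int stride = 4;
    const int per_axis = grid / stride;                           // 128
    const int anchors_per_cls = per_axis * per_axis * 2;          // 2 rotations
    assert(grid == 512);
    assert(anchors_per_cls == 32768);   // matches the //32768 comment above
    return 0;
}
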
+
+void PointPillars::DeviceMemoryMalloc() {
+    // for pillars 
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_num_points_per_pillar_), kMaxNumPillars * sizeof(float))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_x_coors_), kMaxNumPillars * sizeof(int))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_y_coors_), kMaxNumPillars * sizeof(int))); // M
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_), kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float))); // [M , m , 4]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_coors_),  kMaxNumPillars * 4 * sizeof(float))); // [M , 4]
+    // for sparse map
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_sparse_pillar_map_), kNumIndsForScan * kNumIndsForScan * sizeof(int))); // [1024 , 1024]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_x_), kNumIndsForScan * kNumIndsForScan * sizeof(int))); // [1024 , 1024]
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_y_), kNumIndsForScan * kNumIndsForScan * sizeof(int)));// [1024 , 1024]
+
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pfe_gather_feature_),
+                        kMaxNumPillars * kMaxNumPointsPerPillar *
+                            kNumGatherPointFeature * sizeof(float)));
+    // for trt inference
+    // create GPU buffers and a stream
+
+    GPU_CHECK(
+        cudaMalloc(&pfe_buffers_[0], kMaxNumPillars * kMaxNumPointsPerPillar *
+                                        kNumGatherPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&pfe_buffers_[1], kMaxNumPillars * 64 * sizeof(float)));
+
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[0],  kRpnInputSize * sizeof(float)));
+
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[1],  kNumAnchorPerCls  * sizeof(float)));  //classes
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[2],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[3],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[4],  kNumAnchorPerCls  * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[5],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[6],  kNumAnchorPerCls  * 2 * 2 * sizeof(float)));
+    
+    GPU_CHECK(cudaMalloc(&rpn_buffers_[7],  kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature * sizeof(float))); //boxes
+
+    // for scatter kernel
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_scattered_feature_),
+                        kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
+    // for filter
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_box_),
+                        kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_score_),
+                        kNumAnchor * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_label_),
+                        kNumAnchor * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_dir_),
+                        kNumAnchor * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_box_for_nms_),
+                        kNumAnchor * kNumBoxCorners * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filter_count_), kNumClass * sizeof(int)));
+
+}
+
+void PointPillars::SetDeviceMemoryToZero() {
+
+    GPU_CHECK(cudaMemset(dev_num_points_per_pillar_, 0, kMaxNumPillars * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_x_coors_,               0, kMaxNumPillars * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_y_coors_,               0, kMaxNumPillars * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pillar_point_feature_,  0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_pillar_coors_,          0, kMaxNumPillars * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(dev_sparse_pillar_map_,     0, kNumIndsForScan * kNumIndsForScan * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pfe_gather_feature_,    0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
+    
+    // GPU_CHECK(cudaMemset(pfe_buffers_[0],       0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(pfe_buffers_[1],       0, kMaxNumPillars * 64 * sizeof(float)));
+
+    GPU_CHECK(cudaMemset(dev_scattered_feature_,    0, kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
+    
+    // GPU_CHECK(cudaMemset(rpn_buffers_[0],    0, kRpnInputSize * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[1],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[2],    0, kNumAnchorPerCls     * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[3],    0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[4],    0, kNumAnchorPerCls * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[5],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[6],    0, kNumAnchorPerCls     * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[7],    0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[8],    0, kNumAnchorPerCls * 4 * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[9],    0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
+    // GPU_CHECK(cudaMemset(rpn_buffers_[10],   0, kNumAnchorPerCls     * sizeof(float)));
+
+    GPU_CHECK(cudaMemset(dev_filtered_box_,     0, kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_filtered_score_,   0, kNumAnchor * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_filter_count_,     0, kNumClass * sizeof(int)));
+}
+
+void PointPillars::InitTRT(const bool use_onnx) {
+  if (use_onnx_) {
+    // create a TensorRT model from the onnx model and load it into an engine
+    OnnxToTRTModel(pfe_file_, &pfe_engine_);
+    // rfind: take the last '.', so dots in directory names don't truncate the path
+    SaveEngine(pfe_engine_, pfe_file_.substr(0, pfe_file_.rfind('.')) + ".trt");
+    OnnxToTRTModel(backbone_file_, &backbone_engine_);
+    SaveEngine(backbone_engine_, backbone_file_.substr(0, backbone_file_.rfind('.')) + ".trt");
+  }else {
+    EngineToTRTModel(pfe_file_, &pfe_engine_);
+    EngineToTRTModel(backbone_file_, &backbone_engine_);
+  }
+    if (pfe_engine_ == nullptr || backbone_engine_ == nullptr) {
+        std::cerr << "Failed to load ONNX file.";
+    }
+
+    // create execution context from the engine
+    pfe_context_ = pfe_engine_->createExecutionContext();
+    backbone_context_ = backbone_engine_->createExecutionContext();
+    if (pfe_context_ == nullptr || backbone_context_ == nullptr) {
+        std::cerr << "Failed to create TensorRT Execution Context.";
+    }
+  
+}
+
+void PointPillars::OnnxToTRTModel(
+    const std::string& model_file,  // name of the onnx model
+    nvinfer1::ICudaEngine** engine_ptr) {
+    int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
+
+    // create the builder
+    const auto explicit_batch =
+        static_cast<uint32_t>(kBatchSize) << static_cast<uint32_t>(
+            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(g_logger_);
+    nvinfer1::INetworkDefinition* network =
+        builder->createNetworkV2(explicit_batch);
+
+    // parse onnx model
+    auto parser = nvonnxparser::createParser(*network, g_logger_);
+    if (!parser->parseFromFile(model_file.c_str(), verbosity)) {
+        std::string msg("failed to parse onnx file");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    // Build the engine
+    builder->setMaxBatchSize(kBatchSize);
+ //   builder->setHalf2Mode(true);
+    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
+    config->setMaxWorkspaceSize(1 << 25);
+    nvinfer1::ICudaEngine* engine =
+        builder->buildEngineWithConfig(*network, *config);
+
+    *engine_ptr = engine;
+    parser->destroy();
+    network->destroy();
+    config->destroy();
+    builder->destroy();
+}
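
The commented-out setHalf2Mode call above is the legacy FP16 switch; on TensorRT 7+ the same effect comes from a builder-config flag. A hypothetical variant of the engine-build step (an assumption, not part of this commit):

#include "NvInfer.h"

nvinfer1::ICudaEngine* build_engine_fp16(nvinfer1::IBuilder* builder,
                                         nvinfer1::INetworkDefinition* network) {
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(1 << 25);
    if (builder->platformHasFastFp16()) {
        config->setFlag(nvinfer1::BuilderFlag::kFP16);  // permit FP16 kernels
    }
    nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    config->destroy();
    return engine;
}
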
+
+void PointPillars::SaveEngine(const nvinfer1::ICudaEngine* engine, const std::string& engine_filepath)
+{
+    // serialize the engine, then close everything down
+    nvinfer1::IHostMemory& trtModelStream = *(engine->serialize());
+    std::ofstream file;
+    file.open(engine_filepath, std::ios::binary | std::ios::out);
+    if(!file.is_open())
+    {
+        std::cout << "read create engine file" << engine_filepath <<" failed" << std::endl;
+        return;
+    }
+    file.write((const char*)trtModelStream.data(), std::streamsize(trtModelStream.size()));
+    file.close();
+}
+
+void PointPillars::EngineToTRTModel(
+    const std::string &engine_file ,     
+    nvinfer1::ICudaEngine** engine_ptr)  {
+    int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
+    std::stringstream gieModelStream; 
+    gieModelStream.seekg(0, gieModelStream.beg); 
+
+    std::ifstream cache(engine_file, std::ios::binary); 
+    gieModelStream << cache.rdbuf();
+    cache.close(); 
+    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(g_logger_); 
+
+    if (runtime == nullptr) {
+        std::string msg("failed to build runtime parser");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+    gieModelStream.seekg(0, std::ios::end);
+    const int modelSize = gieModelStream.tellg(); 
+
+    gieModelStream.seekg(0, std::ios::beg);
+    void* modelMem = malloc(modelSize); 
+    gieModelStream.read((char*)modelMem, modelSize);
+
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << "------------------------------------------------------------------"<< std::endl;
+    std::cout << ">>>>                                                          >>>>"<< std::endl;
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << "Input filename:   " << engine_file << std::endl;
+    std::cout << "                                                                  "<< std::endl;
+    std::cout << ">>>>                                                          >>>>"<< std::endl;
+    std::cout << "------------------------------------------------------------------"<< std::endl;
+    std::cout << "                                                                  "<< std::endl;
+
+    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, NULL); 
+    if (engine == nullptr) {
+        std::string msg("failed to build engine parser");
+        g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+    free(modelMem);  // deserializeCudaEngine copies the data, so the buffer can be released
+    *engine_ptr = engine;
+
+}
+
+void PointPillars::DoInference(const float* in_points_array,
+                                const int in_num_points,
+                                std::vector<float>* out_detections,
+                                std::vector<int>* out_labels,
+                                std::vector<float>* out_scores) 
+{
+    SetDeviceMemoryToZero();
+    cudaDeviceSynchronize();
+    // [STEP 1] : load pointcloud
+    float* dev_points;
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points),
+                        in_num_points * kNumPointFeature * sizeof(float))); // in_num_points , 5
+    GPU_CHECK(cudaMemset(dev_points, 0, in_num_points * kNumPointFeature * sizeof(float)));
+    GPU_CHECK(cudaMemcpy(dev_points, in_points_array,
+                        in_num_points * kNumPointFeature * sizeof(float),
+                        cudaMemcpyHostToDevice));
+    
+    // [STEP 2] : preprocess
+    host_pillar_count_[0] = 0;
+    auto preprocess_start = std::chrono::high_resolution_clock::now();
+    preprocess_points_cuda_ptr_->DoPreprocessPointsCuda(
+          dev_points, in_num_points, dev_x_coors_, dev_y_coors_,
+          dev_num_points_per_pillar_, dev_pillar_point_feature_, dev_pillar_coors_,
+          dev_sparse_pillar_map_, host_pillar_count_ ,
+          dev_pfe_gather_feature_ );
+    cudaDeviceSynchronize();
+    auto preprocess_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(dev_pfe_gather_feature_,  kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature  , "0_Model_pfe_input_gather_feature");
+
+    // [STEP 3] : pfe forward
+    cudaStream_t stream;
+    GPU_CHECK(cudaStreamCreate(&stream));
+    auto pfe_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[0], dev_pfe_gather_feature_,
+                            kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float), ///kNumGatherPointFeature
+                            cudaMemcpyDeviceToDevice, stream));
+    pfe_context_->enqueueV2(pfe_buffers_, stream, nullptr);
+    cudaDeviceSynchronize();
+    auto pfe_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(pfe_buffers_[1]),  kMaxNumPillars * 64 , "1_Model_pfe_output_buffers_[1]");
+
+    // [STEP 4] : scatter pillar feature
+    auto scatter_start = std::chrono::high_resolution_clock::now();
+    scatter_cuda_ptr_->DoScatterCuda(
+        host_pillar_count_[0], dev_x_coors_, dev_y_coors_,
+        reinterpret_cast<float*>(pfe_buffers_[1]), dev_scattered_feature_);
+    cudaDeviceSynchronize();
+    auto scatter_end = std::chrono::high_resolution_clock::now();   
+    // DEVICE_SAVE<float>(dev_scattered_feature_ ,  kRpnInputSize,"2_Model_backbone_input_dev_scattered_feature");
+
+    // [STEP 5] : backbone forward
+    auto backbone_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemcpyAsync(rpn_buffers_[0], dev_scattered_feature_,
+                            kBatchSize * kRpnInputSize * sizeof(float),
+                            cudaMemcpyDeviceToDevice, stream));
+    backbone_context_->enqueueV2(rpn_buffers_, stream, nullptr);
+    cudaDeviceSynchronize();
+    auto backbone_end = std::chrono::high_resolution_clock::now();
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[1]) ,  kNumAnchorPerCls    ,"3_rpn_buffers_[1]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[2]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[2]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[3]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[3]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[4]) ,  kNumAnchorPerCls    ,"3_rpn_buffers_[4]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[5]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[5]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[6]) ,  kNumAnchorPerCls * 4,"3_rpn_buffers_[6]");
+    // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[7]) ,  kNumAnchorPerCls * kNumClass * 9 ,"3_rpn_buffers_[7]");
+
+    // [STEP 6]: postprocess (multihead)
+    auto postprocess_start = std::chrono::high_resolution_clock::now();
+    GPU_CHECK(cudaMemset(dev_filter_count_, 0, kNumClass * sizeof(int)));
+    postprocess_cuda_ptr_->DoPostprocessCuda(
+        reinterpret_cast<float*>(rpn_buffers_[1]), // [cls]   kNumAnchorPerCls 
+        reinterpret_cast<float*>(rpn_buffers_[2]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[3]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[4]), // [cls]   kNumAnchorPerCls 
+        reinterpret_cast<float*>(rpn_buffers_[5]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[6]), // [cls]   kNumAnchorPerCls * 2 * 2
+        reinterpret_cast<float*>(rpn_buffers_[7]), // [boxes] kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature
+        dev_filtered_box_, dev_filtered_score_, dev_filter_count_,
+        *out_detections, *out_labels , *out_scores);
+    cudaDeviceSynchronize();
+    auto postprocess_end = std::chrono::high_resolution_clock::now();
+
+    // release the stream and the buffers
+//    std::chrono::duration<double> preprocess_cost = preprocess_end - preprocess_start;
+//    std::chrono::duration<double> pfe_cost = pfe_end - pfe_start;
+//    std::chrono::duration<double> scatter_cost = scatter_end - scatter_start;
+//    std::chrono::duration<double> backbone_cost = backbone_end - backbone_start;
+//    std::chrono::duration<double> postprocess_cost = postprocess_end - postprocess_start;
+
+//    std::chrono::duration<double> pointpillars_cost = postprocess_end - preprocess_start;
+//    std::cout << "------------------------------------" << std::endl;
+//    std::cout << setiosflags(ios::left)  << setw(14) << "Module" << setw(12)  << "Time"  << resetiosflags(ios::left) << std::endl;
+//    std::cout << "------------------------------------" << std::endl;
+//    std::string Modules[] = {"Preprocess" , "Pfe" , "Scatter" , "Backbone" , "Postprocess" , "Summary"};
+//    double Times[] = {preprocess_cost.count() , pfe_cost.count() , scatter_cost.count() , backbone_cost.count() , postprocess_cost.count() , pointpillars_cost.count()};
+
+//    for (int i =0 ; i < 6 ; ++i) {
+//        std::cout << setiosflags(ios::left) << setw(14) << Modules[i]  << setw(8)  << Times[i] * 1000 << " ms" << resetiosflags(ios::left) << std::endl;
+//    }
+//    std::cout << "------------------------------------" << std::endl;
+    cudaStreamDestroy(stream);
+
+}
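
For orientation, a minimal standalone driver for the class (main.cpp in this commit does the real wiring; the file paths below are placeholders):

#include "pointpillars.h"
#include <vector>

int main() {
    PointPillars pp(/*score_threshold=*/0.4f,
                    /*nms_overlap_threshold=*/0.5f,
                    /*use_onnx=*/false,
                    "pfe.trt", "backbone.trt",
                    "cbgs_pp_multihead.yaml");
    // One point per row: [x, y, z, intensity, 0] (kNumPointFeature == 5).
    std::vector<float> points = {10.f, 0.f, -1.f, 0.3f, 0.f};
    std::vector<float> detections;   // 7 floats per box
    std::vector<int>   labels;
    std::vector<float> scores;
    pp.DoInference(points.data(), static_cast<int>(points.size() / 5),
                   &detections, &labels, &scores);
    return 0;
}
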

+ 287 - 0
src/api/perception_lidar_PointPillars_MultiHead/pointpillars.h

@@ -0,0 +1,287 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+// headers in STL
+#include <algorithm>
+#include <cmath>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+// headers in TensorRT
+#include "NvInfer.h"
+#include "NvOnnxParser.h"
+
+// headers in local files
+// #include "params.h"
+#include "common.h"
+#include <yaml-cpp/yaml.h>
+#include "preprocess.h"
+#include "scatter.h"
+#include "postprocess.h"
+
+using namespace std;
+
+// Logger for TensorRT info/warning/errors
+class Logger : public nvinfer1::ILogger {
+ public:
+  explicit Logger(Severity severity = Severity::kWARNING)
+      : reportable_severity(severity) {}
+
+  void log(Severity severity, const char* msg)   noexcept {
+    // suppress messages with severity enum value greater than the reportable
+    if (severity > reportable_severity) return;
+
+    switch (severity) {
+      case Severity::kINTERNAL_ERROR:
+        std::cerr << "INTERNAL_ERROR: ";
+        break;
+      case Severity::kERROR:
+        std::cerr << "ERROR: ";
+        break;
+      case Severity::kWARNING:
+        std::cerr << "WARNING: ";
+        break;
+      case Severity::kINFO:
+        std::cerr << "INFO: ";
+        break;
+      default:
+        std::cerr << "UNKNOWN: ";
+        break;
+    }
+    std::cerr << msg << std::endl;
+  }
+
+  Severity reportable_severity;
+};
+
+
+
+class PointPillars {
+ private:
+    // initialize in initializer list
+    const float score_threshold_;
+    const float nms_overlap_threshold_;
+    const bool use_onnx_;
+    const std::string pfe_file_;
+    const std::string backbone_file_;
+    const std::string pp_config_;
+    // end initializer list
+    // voxel size
+    float kPillarXSize;
+    float kPillarYSize;
+    float kPillarZSize;
+    // point cloud range
+    float kMinXRange;
+    float kMinYRange;
+    float kMinZRange;
+    float kMaxXRange;
+    float kMaxYRange;
+    float kMaxZRange;
+    // hyper parameters
+    int kNumClass;
+    int kMaxNumPillars;
+    int kMaxNumPointsPerPillar;
+    int kNumPointFeature;
+    int kNumGatherPointFeature = 11;
+    int kGridXSize;
+    int kGridYSize;
+    int kGridZSize;
+    int kNumAnchorXinds;
+    int kNumAnchorYinds;
+    int kRpnInputSize;
+    int kNumAnchor;
+    int kNumInputBoxFeature;
+    int kNumOutputBoxFeature;
+    int kRpnBoxOutputSize;
+    int kRpnClsOutputSize;
+    int kRpnDirOutputSize;
+    int kBatchSize;
+    int kNumIndsForScan;
+    int kNumThreads;
+    // if you change kNumThreads, need to modify NUM_THREADS_MACRO in
+    // common.h
+    int kNumBoxCorners;
+    int kNmsPreMaxsize;
+    int kNmsPostMaxsize;
+    //params for initialize anchors
+    //Adapt to OpenPCDet
+    int kAnchorStrides;
+    std::vector<string> kAnchorNames;
+    std::vector<float> kAnchorDxSizes;
+    std::vector<float> kAnchorDySizes;
+    std::vector<float> kAnchorDzSizes;
+    std::vector<float> kAnchorBottom;
+    std::vector<std::vector<int>> kMultiheadLabelMapping;
+    int kNumAnchorPerCls;
+    int host_pillar_count_[1];
+
+    int* dev_x_coors_;
+    int* dev_y_coors_;
+    float* dev_num_points_per_pillar_;
+    int* dev_sparse_pillar_map_;
+    int* dev_cumsum_along_x_;
+    int* dev_cumsum_along_y_;
+
+    float* dev_pillar_point_feature_;
+    float* dev_pillar_coors_;
+    float* dev_points_mean_;
+
+    float* dev_pfe_gather_feature_;
+    void* pfe_buffers_[2];
+    //variable for doPostprocessCudaMultiHead
+    void* rpn_buffers_[8];
+    
+    std::vector<float*> rpn_box_output_; 
+    std::vector<float*> rpn_cls_output_;
+
+    float* dev_scattered_feature_;
+
+    float* dev_filtered_box_;
+    float* dev_filtered_score_;
+    int*   dev_filtered_label_;
+    int*   dev_filtered_dir_;
+    float* dev_box_for_nms_;
+    int*   dev_filter_count_;
+
+    std::unique_ptr<PreprocessPointsCuda> preprocess_points_cuda_ptr_;
+    std::unique_ptr<ScatterCuda> scatter_cuda_ptr_;
+    std::unique_ptr<PostprocessCuda> postprocess_cuda_ptr_;
+
+    Logger g_logger_;
+    nvinfer1::ICudaEngine* pfe_engine_;
+    nvinfer1::ICudaEngine* backbone_engine_;
+    nvinfer1::IExecutionContext* pfe_context_;
+    nvinfer1::IExecutionContext* backbone_context_;
+
+    /**
+     * @brief Memory allocation for device memory
+     * @details Called in the constructor
+     */
+    void DeviceMemoryMalloc();
+
+    /**
+     * @brief Memory set to 0 for device memory
+     * @details Called in the DoInference
+     */
+    void SetDeviceMemoryToZero();
+
+    /**
+     * @brief Initializing parameters from the pointpillars YAML config
+     * @details Called in the constructor
+     */
+    void InitParams();
+    /**
+     * @brief Initializing TensorRT instances
+     * @param[in] use_onnx if true, parse the ONNX file
+     * @details Called in the constructor
+     */
+    void InitTRT(const bool use_onnx);
+    void SaveEngine(const nvinfer1::ICudaEngine* engine, const std::string& engine_filepath);
+    /**
+     * @brief Convert ONNX to TensorRT model
+     * @param[in] model_file ONNX model file path
+     * @param[out] engine_ptr TensorRT model engine made out of ONNX model
+     * @details Load ONNX model, and convert it to TensorRT model
+     */
+    void OnnxToTRTModel(const std::string& model_file,
+                        nvinfer1::ICudaEngine** engine_ptr);
+
+    /**
+     * @brief Convert Engine to TensorRT model
+     * @param[in] model_file Engine(TensorRT) model file path
+     * @param[out] engine_ptr TensorRT model engine made 
+     * @details Load Engine model, and convert it to TensorRT model
+     */
+    void EngineToTRTModel(const std::string &engine_file ,     
+                        nvinfer1::ICudaEngine** engine_ptr) ;
+
+    /**
+     * @brief Preproces points
+     * @param[in] in_points_array Point cloud array
+     * @param[in] in_num_points Number of points
+     * @details Call CPU or GPU preprocess
+     */
+    void Preprocess(const float* in_points_array, const int in_num_points);
+
+    public:
+    /**
+     * @brief Constructor
+     * @param[in] score_threshold Score threshold for filtering output
+     * @param[in] nms_overlap_threshold IOU threshold for NMS
+     * @param[in] use_onnx if true, use the ONNX files; otherwise load serialized engine files
+     * @param[in] pfe_file Pillar Feature Extractor ONNX file path
+     * @param[in] rpn_file Region Proposal Network ONNX file path
+     * @details Variables could be changed through point_pillars_detection
+     */
+    PointPillars(const float score_threshold,
+                const float nms_overlap_threshold,
+                const bool use_onnx,
+                const std::string pfe_file,
+                const std::string rpn_file,
+                const std::string pp_config);
+    ~PointPillars();
+
+    /**
+     * @brief Call PointPillars for the inference
+     * @param[in] in_points_array Point cloud array
+     * @param[in] in_num_points Number of points
+     * @param[out] out_detections Network output bounding box
+     * @param[out] out_labels Network output object's label
+     * @details This is an interface for the algorithm
+     */
+    void DoInference(const float* in_points_array,
+                    const int in_num_points,
+                    std::vector<float>* out_detections,
+                    std::vector<int>* out_labels,
+                    std::vector<float>* out_scores);
+};
+

+ 383 - 0
src/api/perception_lidar_PointPillars_MultiHead/postprocess.cu

@@ -0,0 +1,383 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+// headers in CUDA
+#include <thrust/sort.h>
+
+// headers in local files
+#include "common.h"
+#include "postprocess.h"
+#include <stdio.h>
+
+
+// copy one decoded box (the first 7 of the 9 regressed values) into the filtered list
+__device__ void box_decode_warp(int head_offset, const float* box_pred, 
+    int tid , int num_anchors_per_head , int counter, float* filtered_box) 
+{
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 0] = box_pred[ head_offset + tid * 9 + 0];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 1] = box_pred[ head_offset + tid * 9 + 1];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 2] = box_pred[ head_offset + tid * 9 + 2];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 3] = box_pred[ head_offset + tid * 9 + 3];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 4] = box_pred[ head_offset + tid * 9 + 4];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 5] = box_pred[ head_offset + tid * 9 + 5];
+    filtered_box[blockIdx.z * num_anchors_per_head * 7  + counter * 7 + 6] = box_pred[ head_offset + tid * 9 + 6];
+}
+
+
+__global__ void sigmoid_filter_kernel(
+
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+
+    const float* box_pred_0,
+
+    const float* box_pred_1,
+    const float* box_pred_2,
+
+    const float* box_pred_3,
+    const float* box_pred_4,
+
+    const float* box_pred_5,
+
+    const float* box_pred_6,
+    const float* box_pred_7,
+
+    const float* box_pred_8,
+    const float* box_pred_9,
+
+    float* filtered_box, 
+    float* filtered_score, 
+    int* filter_count,
+
+    const float score_threshold) {   
+
+    // two-class heads (e.g. cls_pred_34) are laid out as (32768*2, 2):
+    // 2 * num_anchors_per_head anchors, two class scores per anchor
+
+    int num_anchors_per_head = gridDim.x * gridDim.y * blockDim.x;
+    // 16 * 4 * 512 = 32768
+    extern __shared__ float cls_score[];
+    cls_score[threadIdx.x + blockDim.x] = -1.0f;
+
+    int tid = blockIdx.x * gridDim.y * blockDim.x + blockIdx.y *  blockDim.x + threadIdx.x; 
+
+
+    if ( blockIdx.z == 0) cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_0[ tid ]));
+    if ( blockIdx.z == 1) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_12[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_12[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 2) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_12[ tid * 2 + 1]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_12[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 3) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_34[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_34[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 4) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_34[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_34[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 5) cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_5[ tid ]));
+
+    if ( blockIdx.z == 6) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_67[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_67[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 7) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_67[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_67[ (num_anchors_per_head + tid) * 2 + 1]));}
+
+    if ( blockIdx.z == 8) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_89[ tid * 2 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_89[ (num_anchors_per_head + tid) * 2]));}
+    if ( blockIdx.z == 9) {
+        cls_score[ threadIdx.x ] = 1 / (1 + expf(-cls_pred_89[ tid * 2 + 1 ]));
+        cls_score[ threadIdx.x + blockDim.x] = 1 / (1 + expf(-cls_pred_89[ (num_anchors_per_head + tid) * 2 + 1]));}
+    
+    __syncthreads();
+    
+    if( cls_score[ threadIdx.x ] > score_threshold) 
+    {
+        int counter = atomicAdd(&filter_count[blockIdx.z], 1);
+        if ( blockIdx.z == 0) {
+            box_decode_warp(0 ,box_pred_0 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 1) {
+            box_decode_warp(0 ,box_pred_1 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 2) {
+            box_decode_warp(0 ,box_pred_1 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 3) {
+            box_decode_warp(0 ,box_pred_3 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else 
+        if (blockIdx.z == 4) {
+            box_decode_warp(0 ,box_pred_3 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];            
+        }else
+        if ( blockIdx.z == 5) {
+            box_decode_warp(0 ,box_pred_5 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 6) {
+            box_decode_warp(0 ,box_pred_6 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 7) {
+            box_decode_warp(0 ,box_pred_6 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 8) {
+            box_decode_warp(0 ,box_pred_8 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }else
+        if ( blockIdx.z == 9) {
+            box_decode_warp(0 ,box_pred_8 , tid , num_anchors_per_head , counter , filtered_box);
+            filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x ];
+        }
+    }
+    __syncthreads();  
+    // Heads 0 and 5 predict a single class and never write the second half of
+    // cls_score, so they must be skipped here; for the other heads, store the
+    // second-anchor-set score that was actually tested against the threshold.
+    if( blockIdx.z != 0 && blockIdx.z != 5 && cls_score[ threadIdx.x + blockDim.x ] > score_threshold)  {
+            int counter = atomicAdd(&filter_count[blockIdx.z], 1);
+            // printf("counter : %d \n" , counter);
+            if (blockIdx.z == 1) {
+                box_decode_warp(0 ,box_pred_2 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 2) {
+                box_decode_warp(0 ,box_pred_2 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 3) {
+                box_decode_warp(0 ,box_pred_4 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 4) {
+                box_decode_warp(0 ,box_pred_4 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 6) {
+                box_decode_warp(0 ,box_pred_7 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 7) {
+                box_decode_warp(0 ,box_pred_7 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 8) {
+                box_decode_warp(0 ,box_pred_9 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }else 
+            if (blockIdx.z == 9) {
+                box_decode_warp(0 ,box_pred_9 , tid , num_anchors_per_head , counter , filtered_box);
+                filtered_score[blockIdx.z * num_anchors_per_head + counter] = cls_score[ threadIdx.x + blockDim.x ];
+            }
+    }
+}
+
+__global__ void sort_boxes_by_indexes_kernel(float* filtered_box, float* filtered_scores, int* indexes, int filter_count,
+    float* sorted_filtered_boxes, float* sorted_filtered_scores,
+    const int num_output_box_feature)
+{
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    if(tid < filter_count)  {
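+        // indexes[] was permuted by thrust::sort_by_key on the scores, so
+        // indexes[tid] is the original position of the tid-th highest score.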
+
+        int sort_index = indexes[tid];
+        sorted_filtered_boxes[tid * num_output_box_feature + 0] = filtered_box[sort_index * num_output_box_feature + 0];
+        sorted_filtered_boxes[tid * num_output_box_feature + 1] = filtered_box[sort_index * num_output_box_feature + 1];
+        sorted_filtered_boxes[tid * num_output_box_feature + 2] = filtered_box[sort_index * num_output_box_feature + 2];
+        sorted_filtered_boxes[tid * num_output_box_feature + 3] = filtered_box[sort_index * num_output_box_feature + 3];
+        sorted_filtered_boxes[tid * num_output_box_feature + 4] = filtered_box[sort_index * num_output_box_feature + 4];
+        sorted_filtered_boxes[tid * num_output_box_feature + 5] = filtered_box[sort_index * num_output_box_feature + 5];
+        sorted_filtered_boxes[tid * num_output_box_feature + 6] = filtered_box[sort_index * num_output_box_feature + 6];
+
+        // sorted_filtered_dir[tid] = filtered_dir[sort_index];
+        sorted_filtered_scores[tid] = filtered_scores[sort_index];
+    }
+}
+
+
+
+PostprocessCuda::PostprocessCuda(const int num_threads, const float float_min, const float float_max,
+    const int num_class,const int num_anchor_per_cls,
+    const std::vector<std::vector<int>> multihead_label_mapping,
+    const float score_threshold,  const float nms_overlap_threshold, 
+    const int nms_pre_maxsize, const int nms_post_maxsize,
+    const int num_box_corners, 
+    const int num_input_box_feature,
+    const int num_output_box_feature)
+: num_threads_(num_threads),
+  float_min_(float_min),
+  float_max_(float_max),
+  num_class_(num_class),
+  num_anchor_per_cls_(num_anchor_per_cls),
+  multihead_label_mapping_(multihead_label_mapping),
+  score_threshold_(score_threshold),
+  nms_overlap_threshold_(nms_overlap_threshold),
+  nms_pre_maxsize_(nms_pre_maxsize),
+  nms_post_maxsize_(nms_post_maxsize),
+  num_box_corners_(num_box_corners),
+  num_input_box_feature_(num_input_box_feature),
+  num_output_box_feature_(num_output_box_feature) {
+    nms_cuda_ptr_.reset(
+    new NmsCuda(num_threads_, num_box_corners_, nms_overlap_threshold_));
+
+}
+
+
+void PostprocessCuda::DoPostprocessCuda(
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+
+    const float* box_preds,
+   
+    float* dev_filtered_box, 
+    float* dev_filtered_score, 
+    int* dev_filter_count,
+    std::vector<float>& out_detection, std::vector<int>& out_label , std::vector<float>& out_score) {
+    // Before this step, the rpn box outputs have already been concatenated into box_preds.
+    // On the 128x128 feature map, the cls_pred shapes are (32768,1), (32768,1), (32768,1), (65536,2), (32768,1).
+    dim3 gridsize(16, 4 , 10);  // 16 * 4 * 512 = 32768 anchors per head; blockIdx.z indexes the 10 classes
+    sigmoid_filter_kernel<<< gridsize, 512 , 512 * 2 * sizeof(float)>>>(
+        cls_pred_0,
+        cls_pred_12, 
+        cls_pred_34, 
+        cls_pred_5, 
+        cls_pred_67, 
+        cls_pred_89,
+
+        &box_preds[0 * 32768 * 9],
+        &box_preds[1 * 32768 * 9],
+        &box_preds[2 * 32768 * 9],
+        &box_preds[3 * 32768 * 9],
+        &box_preds[4 * 32768 * 9],
+        &box_preds[5 * 32768 * 9],
+        &box_preds[6 * 32768 * 9],
+        &box_preds[7 * 32768 * 9],
+        &box_preds[8 * 32768 * 9],
+        &box_preds[9 * 32768 * 9],
+
+        dev_filtered_box, 
+        dev_filtered_score,  
+        dev_filter_count, 
+    
+        score_threshold_);
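+    // make sure the filter kernel has finished before the per-class counts are read back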
+    cudaDeviceSynchronize();
+    
+    std::vector<int> host_filter_count(num_class_, 0);  // avoid a variable-length array, which is non-standard C++
+    GPU_CHECK(cudaMemcpy(host_filter_count.data(), dev_filter_count, num_class_ * sizeof(int), cudaMemcpyDeviceToHost));
+    
+    for (int i = 0; i < num_class_; ++ i) {
+        if(host_filter_count[i] <= 0) continue;
+
+        int* dev_indexes;
+        float* dev_sorted_filtered_box;
+        float* dev_sorted_filtered_scores;
+        GPU_CHECK(cudaMalloc((void**)&dev_indexes, host_filter_count[i] * sizeof(int)));
+        GPU_CHECK(cudaMalloc((void**)&dev_sorted_filtered_box, host_filter_count[i] * num_output_box_feature_ * sizeof(float)));
+        GPU_CHECK(cudaMalloc((void**)&dev_sorted_filtered_scores, host_filter_count[i]*sizeof(float)));
+        // GPU_CHECK(cudaMalloc((void**)&dev_sorted_box_for_nms, NUM_BOX_CORNERS_*host_filter_count[i]*sizeof(float)));
+        thrust::sequence(thrust::device, dev_indexes, dev_indexes + host_filter_count[i]);
+        thrust::sort_by_key(thrust::device, 
+                            &dev_filtered_score[i * num_anchor_per_cls_], 
+                            &dev_filtered_score[i * num_anchor_per_cls_ + host_filter_count[i]],
+                            dev_indexes, 
+                            thrust::greater<float>());
+
+        const int num_blocks = DIVUP(host_filter_count[i], num_threads_);
+
+        sort_boxes_by_indexes_kernel<<<num_blocks, num_threads_>>>(
+            &dev_filtered_box[i * num_anchor_per_cls_ * num_output_box_feature_], 
+            &dev_filtered_score[i * num_anchor_per_cls_], 
+            dev_indexes, 
+            host_filter_count[i],
+            dev_sorted_filtered_box, 
+            dev_sorted_filtered_scores,
+            num_output_box_feature_);
+
+        int num_box_for_nms = min(nms_pre_maxsize_, host_filter_count[i]);
+        long* keep_inds = new long[num_box_for_nms];  // indexes of boxes kept after NMS
+        memset(keep_inds, 0, num_box_for_nms * sizeof(long));
+        int num_out = 0;
+        nms_cuda_ptr_->DoNmsCuda(num_box_for_nms, dev_sorted_filtered_box, keep_inds, &num_out);
+
+        num_out = min(num_out, nms_post_maxsize_);
+
+        float* host_filtered_box = new float[host_filter_count[i] * num_output_box_feature_]();
+        float* host_filtered_scores = new float[host_filter_count[i]]();
+
+
+        GPU_CHECK(cudaMemcpy(host_filtered_box, dev_sorted_filtered_box, host_filter_count[i] * num_output_box_feature_ * sizeof(float), cudaMemcpyDeviceToHost));
+        GPU_CHECK(cudaMemcpy(host_filtered_scores, dev_sorted_filtered_scores, host_filter_count[i] * sizeof(float), cudaMemcpyDeviceToHost));
+        for (int j = 0; j < num_out; ++j)  {
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 0]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 1]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 2]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 3]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 4]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 5]);
+            out_detection.emplace_back(host_filtered_box[keep_inds[j] * num_output_box_feature_ + 6]);
+            out_score.emplace_back(host_filtered_scores[keep_inds[j]]);
+            out_label.emplace_back(i);
+        }
+        delete[] keep_inds;
+        delete[] host_filtered_scores;
+        delete[] host_filtered_box;
+
+        GPU_CHECK(cudaFree(dev_indexes));
+        GPU_CHECK(cudaFree(dev_sorted_filtered_box));
+        GPU_CHECK(cudaFree(dev_sorted_filtered_scores));  // free the scores buffer as well to avoid a per-frame leak
+    }
+}
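
The two-part filter above is easier to follow off-device. Below is a minimal CPU sketch of how one two-class head's logits map onto its two anchor sets; the function and variable names are illustrative, not part of this commit:

    #include <cmath>
    #include <vector>

    // For a two-class head such as cls_pred_12, the logits of the two classes
    // are interleaved with stride 2, and each class covers two anchor sets of
    // num_anchors_per_head anchors each.
    std::vector<float> scores_for_class(const float* cls_pred_two_class,
                                        int num_anchors_per_head,
                                        int class_slot /* 0 or 1 */) {
      std::vector<float> scores(2 * num_anchors_per_head);
      for (int tid = 0; tid < num_anchors_per_head; ++tid) {
        // anchor set A: mirrors cls_score[threadIdx.x] in sigmoid_filter_kernel
        scores[tid] =
            1.0f / (1.0f + std::exp(-cls_pred_two_class[tid * 2 + class_slot]));
        // anchor set B: mirrors cls_score[threadIdx.x + blockDim.x]
        scores[num_anchors_per_head + tid] =
            1.0f / (1.0f + std::exp(
                -cls_pred_two_class[(num_anchors_per_head + tid) * 2 + class_slot]));
      }
      return scores;
    }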

+ 125 - 0
src/api/perception_lidar_PointPillars_MultiHead/postprocess.h

@@ -0,0 +1,125 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+#pragma once
+#include <memory>
+#include <vector>
+#include "nms.h"
+
+class PostprocessCuda {
+ private:
+    // initializer list
+
+    const int num_threads_;
+    const float float_min_;
+    const float float_max_;
+    const int num_class_;
+    const int num_anchor_per_cls_;
+    const float score_threshold_;
+    const float nms_overlap_threshold_;
+    const int nms_pre_maxsize_;
+    const int nms_post_maxsize_;
+    const int num_box_corners_;
+    const int num_input_box_feature_;
+    const int num_output_box_feature_;
+    const std::vector<std::vector<int>> multihead_label_mapping_;
+    // end initializer list
+
+    std::unique_ptr<NmsCuda> nms_cuda_ptr_;
+  public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads when launching cuda kernel
+   * @param[in] float_min The lowest float value
+   * @param[in] float_max The maximum float value
+   * @param[in] num_class Number of classes 
+   * @param[in] num_anchor_per_cls Number of anchors per class
+   * @param[in] multihead_label_mapping Mapping from each detection head to its class labels
+   * @param[in] score_threshold Score threshold for filtering output
+   * @param[in] nms_overlap_threshold IOU threshold for NMS
+   * @param[in] nms_pre_maxsize Maximum number of boxes fed into NMS
+   * @param[in] nms_post_maxsize Maximum number of boxes kept after NMS
+   * @param[in] num_box_corners Number of corners per box
+   * @param[in] num_input_box_feature Number of input box features
+   * @param[in] num_output_box_feature Number of output box features
+   * @details Capital variables never change after compile time; non-capital
+   * variables can be changed through rosparam
+   */
+  PostprocessCuda(const int num_threads, 
+                  const float float_min, const float float_max,
+                  const int num_class, const int num_anchor_per_cls, 
+                  const std::vector<std::vector<int>> multihead_label_mapping,
+                  const float score_threshold,  
+                  const float nms_overlap_threshold, 
+                  const int nms_pre_maxsize, 
+                  const int nms_post_maxsize,
+                  const int num_box_corners,
+                  const int num_input_box_feature, 
+                  const int num_output_box_feature);
+  ~PostprocessCuda(){}
+
+  /**
+   * @brief Postprocessing for the network output
+   * @param[in] cls_pred_0 ... cls_pred_89 Class predictions from the six detection heads
+   * @param[in] box_preds Box predictions from the network output
+   * @param[in] dev_filtered_box Filtered box predictions
+   * @param[in] dev_filtered_score Filtered score predictions
+   * @param[in] dev_filter_count The number of filtered outputs per class
+   * @param[out] out_detection Output bounding boxes
+   * @param[out] out_label Output labels of objects
+   * @param[out] out_score Output scores of objects
+   * @details dev_* represents device memory allocated variables
+   */
+  void DoPostprocessCuda(
+    float* cls_pred_0,
+    float* cls_pred_12,
+    float* cls_pred_34,
+    float* cls_pred_5,
+    float* cls_pred_67,
+    float* cls_pred_89,
+    
+    const float* box_preds,
+    float* dev_filtered_box, 
+    float* dev_filtered_score, 
+    int* dev_filter_count,
+    std::vector<float>& out_detection, std::vector<int>& out_label , std::vector<float>& out_score);
+};
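
A hypothetical construction of this class for reference. The 10-class and 32768-anchors-per-class figures follow the comments in postprocess.cu; the thresholds, NMS limits, and box-feature counts are placeholders, not values read from the shipped cbgs_pp_multihead.yaml:

    #include <vector>
    #include "postprocess.h"

    void BuildPostprocessExample() {
      // one label-mapping entry per detection head
      std::vector<std::vector<int>> label_mapping =
          {{0}, {1, 2}, {3, 4}, {5}, {6, 7}, {8, 9}};
      PostprocessCuda postprocess(
          /*num_threads=*/64,
          /*float_min=*/-3.402823466e38f, /*float_max=*/3.402823466e38f,
          /*num_class=*/10, /*num_anchor_per_cls=*/32768,
          label_mapping,
          /*score_threshold=*/0.1f, /*nms_overlap_threshold=*/0.2f,
          /*nms_pre_maxsize=*/1000, /*nms_post_maxsize=*/83,
          /*num_box_corners=*/4,
          /*num_input_box_feature=*/7, /*num_output_box_feature=*/7);
    }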

+ 410 - 0
src/api/perception_lidar_PointPillars_MultiHead/preprocess.cu

@@ -0,0 +1,410 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+// headers in STL
+#include <stdio.h>
+
+// headers in local files
+#include "common.h"
+#include "preprocess.h"
+
+__global__ void make_pillar_histo_kernel(
+    const float* dev_points, float* dev_pillar_point_feature_in_coors,
+    int* pillar_count_histo, const int num_points,
+    const int max_points_per_pillar, const int grid_x_size,
+    const int grid_y_size, const int grid_z_size, const float min_x_range,
+    const float min_y_range, const float min_z_range, const float pillar_x_size,
+    const float pillar_y_size, const float pillar_z_size,
+    const int num_point_feature) {
+  int th_i = blockIdx.x * blockDim.x +  threadIdx.x ;
+  if (th_i >= num_points) {
+    return;
+  }
+  int x_coor = floor((dev_points[th_i * num_point_feature + 0] - min_x_range) / pillar_x_size);
+  int y_coor = floor((dev_points[th_i * num_point_feature + 1] - min_y_range) / pillar_y_size);
+  int z_coor = floor((dev_points[th_i * num_point_feature + 2] - min_z_range) / pillar_z_size);
+
+  if (x_coor >= 0 && x_coor < grid_x_size && y_coor >= 0 &&
+      y_coor < grid_y_size && z_coor >= 0 && z_coor < grid_z_size) {
+    int count =
+        atomicAdd(&pillar_count_histo[y_coor * grid_x_size + x_coor], 1);
+    if (count < max_points_per_pillar) {
+      int ind =
+          y_coor * grid_x_size * max_points_per_pillar * num_point_feature +
+          x_coor * max_points_per_pillar * num_point_feature +
+          count * num_point_feature;
+ 
+      for (int i = 0; i < num_point_feature; ++i) {
+        dev_pillar_point_feature_in_coors[ind + i] =
+            dev_points[th_i * num_point_feature + i];
+      }
+    }
+  }
+}
+
+__global__ void make_pillar_index_kernel(
+    int* dev_pillar_count_histo, int* dev_counter, int* dev_pillar_count,
+    int* dev_x_coors, int* dev_y_coors, float* dev_num_points_per_pillar,
+    int* dev_sparse_pillar_map, const int max_pillars,
+    const int max_points_per_pillar, const int grid_x_size,
+    const int num_inds_for_scan) {
+  int x = blockIdx.x;
+  int y = threadIdx.x;
+  int num_points_at_this_pillar = dev_pillar_count_histo[y * grid_x_size + x];
+  if (num_points_at_this_pillar == 0) {
+    return;
+  }
+
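+  // dev_counter counts every non-empty grid cell; dev_pillar_count only the
+  // ones actually kept once max_pillars is reached.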
+  int count = atomicAdd(dev_counter, 1);
+  if (count < max_pillars) {
+    atomicAdd(dev_pillar_count, 1);
+    if (num_points_at_this_pillar >= max_points_per_pillar) {
+      dev_num_points_per_pillar[count] = max_points_per_pillar;
+    } else {
+      dev_num_points_per_pillar[count] = num_points_at_this_pillar;
+    }
+    dev_x_coors[count] = x;
+    dev_y_coors[count] = y;
+    dev_sparse_pillar_map[y * num_inds_for_scan + x] = 1;
+  }
+}
+
+__global__ void make_pillar_feature_kernel(
+    float* dev_pillar_point_feature_in_coors, float* dev_pillar_point_feature,
+    float* dev_pillar_coors, int* dev_x_coors, int* dev_y_coors,
+    float* dev_num_points_per_pillar, const int max_points,
+    const int num_point_feature, const int grid_x_size) {
+  int ith_pillar = blockIdx.x;
+  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+  int ith_point = threadIdx.x;
+  if (ith_point >= num_points_at_this_pillar) {
+    return;
+  }
+  int x_ind = dev_x_coors[ith_pillar];
+  int y_ind = dev_y_coors[ith_pillar];
+  int pillar_ind = ith_pillar * max_points * num_point_feature +
+                   ith_point * num_point_feature;
+  int coors_ind = y_ind * grid_x_size * max_points * num_point_feature +
+                  x_ind * max_points * num_point_feature +
+                  ith_point * num_point_feature;
+  #pragma unroll 
+  for (int i = 0; i < num_point_feature; ++i) {
+    dev_pillar_point_feature[pillar_ind + i] =
+        dev_pillar_point_feature_in_coors[coors_ind + i];
+  }
+
+  float coor_x = static_cast<float>(x_ind);
+  float coor_y = static_cast<float>(y_ind);
+  dev_pillar_coors[ith_pillar * 4 + 0] = 0;  // batch idx
+  dev_pillar_coors[ith_pillar * 4 + 1] = 0;  // z
+  dev_pillar_coors[ith_pillar * 4 + 2] = coor_y;
+  dev_pillar_coors[ith_pillar * 4 + 3] = coor_x;
+}
+
+
+
+__global__ void pillar_mean_kernel(
+  float* dev_points_mean, 
+  const int num_point_feature,
+  const float* dev_pillar_point_feature, 
+  const float* dev_num_points_per_pillar, 
+  int max_pillars , 
+  int max_points_per_pillar) {
+
+    extern __shared__ float temp[];
+    int ith_pillar = blockIdx.x; 
+    int ith_point  = threadIdx.x;
+    int axis = threadIdx.y;
+  
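+    // pad the reduction width up to the next power of two (32 or 64 slots)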
+    int reduce_size = max_points_per_pillar > 32 ? 64 : 32;
+    temp[threadIdx.x * 3 + axis] =  dev_pillar_point_feature[ith_pillar * max_points_per_pillar * num_point_feature + ith_point * num_point_feature + axis];  
+    if (threadIdx.x < reduce_size - max_points_per_pillar) {
+        temp[(threadIdx.x + max_points_per_pillar) * 3 + axis] = 0.0f; // zero-pad the unused reduction slots
+    }
+    __syncthreads();
+    int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+    // No early return here: every thread must reach the __syncthreads() in the
+    // reduction loop, and the slots past num_points_at_this_pillar hold zeros,
+    // so they do not change the sum.
+    for (unsigned int d = reduce_size >> 1; d > 0; d >>= 1) {
+        if (ith_point < d) {
+            temp[ith_point * 3 + axis] += temp[(ith_point + d) * 3 + axis];
+        }
+        __syncthreads();
+    }
+
+    if (ith_point == 0) {
+        dev_points_mean[ith_pillar * 3 + axis] = temp[axis] / num_points_at_this_pillar;
+    }
+}
+
+
+__device__ void warpReduce(volatile float* sdata , int ith_point , int axis) {
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 8) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 4) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 2) * blockDim.y + axis];
+    sdata[ith_point * blockDim.y + axis] += sdata[(ith_point + 1) * blockDim.y + axis];
+}
+
+
+__global__ void make_pillar_mean_kernel(
+  float* dev_points_mean, 
+  const int num_point_feature,
+  const float* dev_pillar_point_feature, 
+  const float* dev_num_points_per_pillar, 
+  int max_pillars , 
+  int max_points_per_pillar) {
+    extern __shared__ float temp[];
+    unsigned int ith_pillar = blockIdx.x;  // { 0 , 1, 2, ... , 10000+}
+    unsigned int ith_point  = threadIdx.x; // { 0 , 1, 2, ...,9}
+    unsigned int axis = threadIdx.y; 
+    unsigned int idx_pre  = ith_pillar * max_points_per_pillar * num_point_feature
+                     + ith_point  * num_point_feature;
+    unsigned int idx_post = ith_pillar * max_points_per_pillar * num_point_feature
+                     + (ith_point + blockDim.x)  * num_point_feature;
+
+    temp[ith_point * blockDim.y + axis] = 0.0;
+    unsigned int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+    // if (ith_point < num_points_at_this_pillar / 2) {
+      temp[ith_point * blockDim.y + axis] = dev_pillar_point_feature[idx_pre  + axis] 
+                                          + dev_pillar_point_feature[idx_post + axis];
+    // }
+    __syncthreads();
+
+    // do reduction in shared mem
+    // Sequential addressing. This solves the bank conflicts as
+    // the threads now access shared memory with a stride of one
+    // 32-bit word (unsigned int) now, which does not cause bank 
+    // conflicts
+    warpReduce(temp , ith_point , axis);
+
+    // write the result for this block to global memory
+    if (ith_point == 0)
+        dev_points_mean[ith_pillar * blockDim.y + axis] = temp[ith_point * blockDim.y + axis] / num_points_at_this_pillar;
+}
+
+
+__global__ void gather_point_feature_kernel(
+  const int max_num_pillars_,const int max_num_points_per_pillar,const int num_point_feature,
+  const float min_x_range, const float min_y_range, const float min_z_range, 
+  const float pillar_x_size,  const float pillar_y_size, const float pillar_z_size,
+  const float* dev_pillar_point_feature, const float* dev_num_points_per_pillar, 
+  const float* dev_pillar_coors,
+  float* dev_points_mean, 
+  float* dev_pfe_gather_feature_){
+
+  int ith_pillar = blockIdx.x; 
+  int ith_point = threadIdx.x;
+  // int kNumPointFeature = 5;
+  int num_gather_feature = 11;
+  int num_points_at_this_pillar = dev_num_points_per_pillar[ith_pillar];
+
+  if (ith_point >= num_points_at_this_pillar){
+        return;
+    }
+
+
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 0] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0]; 
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 1]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 2]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 3]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 3];
+
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 4]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 4];
+  
+    // dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 4]  =  0.0f;
+    //   f_cluster = voxel_features[:, :, :3] - points_mean
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 5]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0] - dev_points_mean[ith_pillar * 3 + 0 ];
+
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 6] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1] - dev_points_mean[ith_pillar * 3 + 1 ];
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 7]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2] - dev_points_mean[ith_pillar * 3 + 2 ];
+
+    // f_center[:, :, 0] = voxel_features[:, :, 0] - (coords[:, 3].to(voxel_features.dtype).unsqueeze(1) * self.voxel_x + self.x_offset)
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 8]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 0] - (dev_pillar_coors[ith_pillar * 4 + 3] * pillar_x_size + (pillar_x_size/2 + min_x_range));
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 9]  
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 1] - (dev_pillar_coors[ith_pillar * 4 + 2] * pillar_y_size + (pillar_y_size/2 + min_y_range));
+  
+    dev_pfe_gather_feature_[ith_pillar * max_num_points_per_pillar * num_gather_feature + ith_point * num_gather_feature + 10] 
+    =  dev_pillar_point_feature[ith_pillar * max_num_points_per_pillar * num_point_feature + ith_point * num_point_feature + 2] - (dev_pillar_coors[ith_pillar * 4 + 1] * pillar_z_size + (pillar_z_size/2 + min_z_range));
+
+}
+
+
+
+
+PreprocessPointsCuda::PreprocessPointsCuda(
+    // parameter order must match the declaration in preprocess.h
+    const int num_threads, const int num_point_feature,
+    const int num_inds_for_scan, const int max_num_pillars,
+    const int max_points_per_pillar, const int grid_x_size,
+    const int grid_y_size, const int grid_z_size, const float pillar_x_size,
+    const float pillar_y_size, const float pillar_z_size,
+    const float min_x_range, const float min_y_range, const float min_z_range)
+    : num_threads_(num_threads),
+      max_num_pillars_(max_num_pillars),
+      max_num_points_per_pillar_(max_points_per_pillar),
+      num_point_feature_(num_point_feature),
+      num_inds_for_scan_(num_inds_for_scan),
+      grid_x_size_(grid_x_size),
+      grid_y_size_(grid_y_size),
+      grid_z_size_(grid_z_size),
+      pillar_x_size_(pillar_x_size),
+      pillar_y_size_(pillar_y_size),
+      pillar_z_size_(pillar_z_size),
+      min_x_range_(min_x_range),
+      min_y_range_(min_y_range),
+      min_z_range_(min_z_range) {
+    
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_in_coors_),
+        grid_y_size_ * grid_x_size_ * max_num_points_per_pillar_ *  num_point_feature_ * sizeof(float)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_count_histo_),
+        grid_y_size_ * grid_x_size_ * sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_counter_), sizeof(int)));
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_count_), sizeof(int)));    
+    GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points_mean_), max_num_pillars_ * 3 *sizeof(float)));  
+    }
+
+PreprocessPointsCuda::~PreprocessPointsCuda() {
+    GPU_CHECK(cudaFree(dev_pillar_point_feature_in_coors_));
+    GPU_CHECK(cudaFree(dev_pillar_count_histo_));
+    GPU_CHECK(cudaFree(dev_counter_));
+    GPU_CHECK(cudaFree(dev_pillar_count_));
+    GPU_CHECK(cudaFree(dev_points_mean_));
+  }
+
+
+void PreprocessPointsCuda::DoPreprocessPointsCuda(
+    const float* dev_points, const int in_num_points, 
+    int* dev_x_coors,int* dev_y_coors, 
+    float* dev_num_points_per_pillar,
+    float* dev_pillar_point_feature, float* dev_pillar_coors,
+    int* dev_sparse_pillar_map, int* host_pillar_count , float* dev_pfe_gather_feature) {
+    // initialize parameters
+    GPU_CHECK(cudaMemset(dev_pillar_point_feature_in_coors_, 0 , grid_y_size_ * grid_x_size_ * max_num_points_per_pillar_ *  num_point_feature_ * sizeof(float)));
+    GPU_CHECK(cudaMemset(dev_pillar_count_histo_, 0 , grid_y_size_ * grid_x_size_ * sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_counter_, 0, sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_pillar_count_, 0, sizeof(int)));
+    GPU_CHECK(cudaMemset(dev_points_mean_, 0,  max_num_pillars_ * 3 * sizeof(float)));
+    int num_block = DIVUP(in_num_points , num_threads_);
+    make_pillar_histo_kernel<<<num_block , num_threads_>>>(
+        dev_points, dev_pillar_point_feature_in_coors_, dev_pillar_count_histo_,
+        in_num_points, max_num_points_per_pillar_, grid_x_size_, grid_y_size_,
+        grid_z_size_, min_x_range_, min_y_range_, min_z_range_, pillar_x_size_,
+        pillar_y_size_, pillar_z_size_, num_point_feature_);
+    
+    make_pillar_index_kernel<<<grid_x_size_, grid_y_size_>>>(
+        dev_pillar_count_histo_, dev_counter_, dev_pillar_count_, dev_x_coors,
+        dev_y_coors, dev_num_points_per_pillar, dev_sparse_pillar_map,
+        max_num_pillars_, max_num_points_per_pillar_, grid_x_size_,
+        num_inds_for_scan_);  
+
+    GPU_CHECK(cudaMemcpy(host_pillar_count, dev_pillar_count_, 1 * sizeof(int),
+        cudaMemcpyDeviceToHost));
+    make_pillar_feature_kernel<<<host_pillar_count[0],max_num_points_per_pillar_>>>(
+        dev_pillar_point_feature_in_coors_, dev_pillar_point_feature,
+        dev_pillar_coors, dev_x_coors, dev_y_coors, dev_num_points_per_pillar,
+        max_num_points_per_pillar_, num_point_feature_, grid_x_size_);
+    
+
+    dim3 mean_block(max_num_points_per_pillar_,3); //(32,3)
+
+    pillar_mean_kernel<<<host_pillar_count[0],mean_block,64 * 3 *sizeof(float)>>>(
+      dev_points_mean_  ,num_point_feature_, dev_pillar_point_feature, dev_num_points_per_pillar, 
+        max_num_pillars_ , max_num_points_per_pillar_);
+
+    // dim3 mean_block(10,3); // Unrolling the Last Warp
+    // make_pillar_mean_kernel<<<host_pillar_count[0], mean_block , 32 * 3 *sizeof(float)>>>(
+    //       dev_points_mean_  ,num_point_feature_, dev_pillar_point_feature, dev_num_points_per_pillar, 
+    //       max_num_pillars_ , max_num_points_per_pillar_);
+
+    gather_point_feature_kernel<<<max_num_pillars_, max_num_points_per_pillar_>>>(
+      max_num_pillars_,max_num_points_per_pillar_,num_point_feature_,
+      min_x_range_, min_y_range_, min_z_range_,
+      pillar_x_size_, pillar_y_size_, pillar_z_size_, 
+      dev_pillar_point_feature, dev_num_points_per_pillar, dev_pillar_coors,
+      dev_points_mean_,
+      dev_pfe_gather_feature);
+
+    // DEVICE_SAVE<float>(dev_pillar_point_feature , \
+    //     max_num_pillars_ * max_num_points_per_pillar_ * num_point_feature_ , "dev_pillar_point_feature");
+    // DEVICE_SAVE<float>(dev_num_points_per_pillar , \
+    //   max_num_pillars_ , "dev_num_points_per_pillar");
+    // DEVICE_SAVE<float>(dev_pfe_gather_feature , \
+    //   max_num_pillars_ * 11, "dev_pfe_gather_feature");
+}
+
+
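
The 11-dimensional gather layout above, restated as a CPU reference for a single point (names illustrative): raw features, offsets from the pillar mean (f_cluster), and offsets from the pillar center (f_center):

    struct GatheredPoint { float f[11]; };

    // p = {x, y, z, intensity, dt}; mean = per-pillar mean of x, y, z;
    // (coor_x, coor_y) = the pillar's grid coordinates.
    GatheredPoint GatherOnePoint(const float p[5], const float mean[3],
                                 float coor_x, float coor_y,
                                 float pillar_x, float pillar_y, float pillar_z,
                                 float min_x, float min_y, float min_z) {
      GatheredPoint g;
      for (int k = 0; k < 5; ++k) g.f[k] = p[k];                 // raw point features
      for (int k = 0; k < 3; ++k) g.f[5 + k] = p[k] - mean[k];   // f_cluster
      g.f[8]  = p[0] - (coor_x * pillar_x + (pillar_x / 2 + min_x));  // f_center x
      g.f[9]  = p[1] - (coor_y * pillar_y + (pillar_y / 2 + min_y));  // f_center y
      g.f[10] = p[2] - (0.0f * pillar_z + (pillar_z / 2 + min_z));    // z pillar coord is always 0
      return g;
    }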

+ 138 - 0
src/api/perception_lidar_PointPillars_MultiHead/preprocess.h

@@ -0,0 +1,138 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file preprocess_points_cuda.h
+ * @brief GPU version of preprocess points
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+
+class PreprocessPointsCuda {
+ private:
+    // initializer list
+    const int num_threads_;
+    const int max_num_pillars_;
+    const int max_num_points_per_pillar_;
+    const int num_point_feature_;
+    const int num_inds_for_scan_;
+    const int grid_x_size_;
+    const int grid_y_size_;
+    const int grid_z_size_;
+    const float pillar_x_size_;
+    const float pillar_y_size_;
+    const float pillar_z_size_;
+    const float min_x_range_;
+    const float min_y_range_;
+    const float min_z_range_;
+    // end initializer list
+
+    float* dev_pillar_point_feature_in_coors_;
+    int* dev_pillar_count_histo_;
+
+    int* dev_counter_;
+    int* dev_pillar_count_;
+    float* dev_points_mean_;
+
+
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads Number of threads when launching cuda kernel
+   * @param[in] num_point_feature Number of features in a point
+   * @param[in] num_inds_for_scan Number of indexes for scan (cumsum)
+   * 
+   * @param[in] max_num_pillars Maximum number of pillars
+   * @param[in] max_points_per_pillar Maximum number of points per pillar
+   * 
+   * @param[in] grid_x_size Number of pillars in x-coordinate
+   * @param[in] grid_y_size Number of pillars in y-coordinate
+   * @param[in] grid_z_size Number of pillars in z-coordinate
+   * 
+   * @param[in] pillar_x_size Size of x-dimension for a pillar
+   * @param[in] pillar_y_size Size of y-dimension for a pillar
+   * @param[in] pillar_z_size Size of z-dimension for a pillar
+   * 
+   * @param[in] min_x_range Minimum x value for point cloud
+   * @param[in] min_y_range Minimum y value for point cloud
+   * @param[in] min_z_range Minimum z value for point cloud
+   * @details Capital variables never change after compile time
+   */
+  PreprocessPointsCuda(const int num_threads, const int num_point_feature, const int num_inds_for_scan,
+                       const int max_num_pillars, const int max_points_per_pillar,  
+                       const int grid_x_size, const int grid_y_size, const int grid_z_size,  // grid size
+                       const float pillar_x_size, const float pillar_y_size, const float pillar_z_size, //voxel size
+                       const float min_x_range, const float min_y_range, const float min_z_range); // point cloud range
+  ~PreprocessPointsCuda();
+
+  /**
+   * @brief CUDA preprocessing for input point cloud
+   * @param[in] dev_points Point cloud array
+   * @param[in] in_num_points The number of points
+   * @param[mid] dev_x_coors X-coordinate indexes for corresponding pillars
+   * @param[mid] dev_y_coors Y-coordinate indexes for corresponding pillars
+   * @param[mid] dev_num_points_per_pillar
+   *   Number of points in corresponding pillars
+   * @param[mid] pillar_point_feature
+   *   Values of point feature in each pillar
+   * @param[mid] pillar_coors Array for coors of pillars
+   * @param[mid] dev_points_mean Array for calculate the point center
+   * @param[out] dev_sparse_pillar_map
+   *   Grid map representation for pillar-occupancy
+   * @param[out] host_pillar_count
+   *   The number of valid pillars for an input point cloud
+   * @param[out] dev_pfe_gather_feature
+   *   11 dimensions feature for pfe input channel
+   * @details Convert point cloud to pillar representation
+   */
+  void DoPreprocessPointsCuda(const float* dev_points,
+                              const int in_num_points,
+                              int* dev_x_coors,
+                              int* dev_y_coors,
+                              float* dev_num_points_per_pillar,
+                              float* dev_pillar_point_feature,
+                              float* dev_pillar_coors,
+                              int* dev_sparse_pillar_map,
+                              int* host_pillar_count,
+                              float* dev_pfe_gather_feature);
+};
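
A hypothetical construction matching the declaration above; the numeric values are illustrative nuScenes-style settings and are not guaranteed to match cbgs_pp_multihead.yaml:

    #include "preprocess.h"

    void BuildPreprocessExample() {
      PreprocessPointsCuda preprocess(
          /*num_threads=*/64, /*num_point_feature=*/5, /*num_inds_for_scan=*/1024,
          /*max_num_pillars=*/30000, /*max_points_per_pillar=*/20,
          /*grid_x_size=*/512, /*grid_y_size=*/512, /*grid_z_size=*/1,
          /*pillar_x_size=*/0.2f, /*pillar_y_size=*/0.2f, /*pillar_z_size=*/8.0f,
          /*min_x_range=*/-51.2f, /*min_y_range=*/-51.2f, /*min_z_range=*/-5.0f);
    }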

+ 73 - 0
src/api/perception_lidar_PointPillars_MultiHead/scatter.cu

@@ -0,0 +1,73 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+
+//headers in local files
+#include "scatter.h"
+
+__global__ void scatter_kernel(int *x_coors, int *y_coors, float *pfe_output,
+                               float *scattered_feature, const int grid_x_size,
+                               const int grid_y_size) {
+  int i_pillar = blockIdx.x;
+  int i_feature = threadIdx.x;
+  int x_ind = x_coors[i_pillar];
+  int y_ind = y_coors[i_pillar];
+  // 64 is the PFE output channel count; the kernel assumes it is launched
+  // with 64 threads per block, one thread per channel of one pillar.
+  float feature = pfe_output[i_pillar * 64 + i_feature];
+  scattered_feature[i_feature * grid_y_size * grid_x_size +
+                    y_ind * grid_x_size + x_ind] = feature;
+}
+
+ScatterCuda::ScatterCuda(const int num_threads, const int grid_x_size,
+                         const int grid_y_size)
+    : num_threads_(num_threads),
+      grid_x_size_(grid_x_size),
+      grid_y_size_(grid_y_size) {}
+
+void ScatterCuda::DoScatterCuda(const int pillar_count, int *x_coors,
+                                int *y_coors, float *pfe_output,
+                                float *scattered_feature) {
+  scatter_kernel<<<pillar_count, num_threads_>>>(x_coors, y_coors, pfe_output,
+                                                 scattered_feature,
+                                                 grid_x_size_, grid_y_size_);
+}
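
For reference, the scatter step restated on the CPU; this mirrors scatter_kernel above, including its hard-coded 64-channel stride:

    // Each pillar's 64-channel PFE output is written into a dense
    // (64, grid_y, grid_x) canvas at the pillar's (y, x) cell.
    void ScatterCpu(int pillar_count, const int* x_coors, const int* y_coors,
                    const float* pfe_output, float* scattered_feature,
                    int grid_x_size, int grid_y_size) {
      for (int p = 0; p < pillar_count; ++p) {
        for (int c = 0; c < 64; ++c) {
          scattered_feature[c * grid_y_size * grid_x_size +
                            y_coors[p] * grid_x_size + x_coors[p]] =
              pfe_output[p * 64 + c];
        }
      }
    }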

+ 77 - 0
src/api/perception_lidar_PointPillars_MultiHead/scatter.h

@@ -0,0 +1,77 @@
+/******************************************************************************
+ * Copyright 2020 The Apollo Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *****************************************************************************/
+
+/*
+ * Copyright 2018-2019 Autoware Foundation. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Kosuke Murakami
+ * @date 2019/02/26
+ */
+
+/**
+* @author Yan haixu
+* Contact: just github.com/hova88
+* @date 2021/04/30
+*/
+
+
+#pragma once
+
+class ScatterCuda {
+ private:
+  const int num_threads_;
+  const int grid_x_size_;
+  const int grid_y_size_;
+
+ public:
+  /**
+   * @brief Constructor
+   * @param[in] num_threads The number of threads to launch cuda kernel
+   * @param[in] grid_x_size Number of pillars in x-coordinate
+   * @param[in] grid_y_size Number of pillars in y-coordinate
+   * @details Capital variables never change after compile time
+   */
+  ScatterCuda(const int num_threads, const int grid_x_size,
+              const int grid_y_size);
+
+  /**
+   * @brief Call scatter cuda kernel
+   * @param[in] pillar_count The valid number of pillars
+   * @param[in] x_coors X-coordinate indexes for corresponding pillars
+   * @param[in] y_coors Y-coordinate indexes for corresponding pillars
+   * @param[in] pfe_output Output from Pillar Feature Extractor
+   * @param[out] scattered_feature Gridmap representation for pillars' feature
+   * @details Allocate pillars in the gridmap based on index (coordinate)
+   * information
+   */
+  void DoScatterCuda(const int pillar_count, int* x_coors, int* y_coors,
+                     float* pfe_output, float* scattered_feature);
+};
+

+ 282 - 0
src/api/perception_lidar_cnn_segmentation/caffe/blob.hpp

@@ -0,0 +1,282 @@
+#ifndef CAFFE_BLOB_HPP_
+#define CAFFE_BLOB_HPP_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/syncedmem.hpp"
+
+const int kMaxBlobAxes = 32;
+
+namespace caffe {
+
+/**
+ * @brief A wrapper around SyncedMemory holders serving as the basic
+ *        computational unit through which Layer%s, Net%s, and Solver%s
+ *        interact.
+ *
+ * TODO(dox): more thorough description.
+ */
+template <typename Dtype>
+class Blob {
+ public:
+  Blob()
+       : data_(), diff_(), count_(0), capacity_(0) {}
+
+  /// @brief Deprecated; use <code>Blob(const vector<int>& shape)</code>.
+  explicit Blob(const int num, const int channels, const int height,
+      const int width);
+  explicit Blob(const vector<int>& shape);
+
+  /// @brief Deprecated; use <code>Reshape(const vector<int>& shape)</code>.
+  void Reshape(const int num, const int channels, const int height,
+      const int width);
+  /**
+   * @brief Change the dimensions of the blob, allocating new memory if
+   *        necessary.
+   *
+   * This function can be called both to create an initial allocation
+   * of memory, and to adjust the dimensions of a top blob during Layer::Reshape
+   * or Layer::Forward. When changing the size of blob, memory will only be
+   * reallocated if sufficient memory does not already exist, and excess memory
+   * will never be freed.
+   *
+   * Note that reshaping an input blob and immediately calling Net::Backward is
+   * an error; either Net::Forward or Net::Reshape need to be called to
+   * propagate the new input shape to higher layers.
+   */
+  void Reshape(const vector<int>& shape);
+  void Reshape(const BlobShape& shape);
+  void ReshapeLike(const Blob& other);
+  inline string shape_string() const {
+    ostringstream stream;
+    for (int i = 0; i < shape_.size(); ++i) {
+      stream << shape_[i] << " ";
+    }
+    stream << "(" << count_ << ")";
+    return stream.str();
+  }
+  inline const vector<int>& shape() const { return shape_; }
+  /**
+   * @brief Returns the dimension of the index-th axis (or the negative index-th
+   *        axis from the end, if index is negative).
+   *
+   * @param index the axis index, which may be negative as it will be
+   *        "canonicalized" using CanonicalAxisIndex.
+   *        Dies on out of range index.
+   */
+  inline int shape(int index) const {
+    return shape_[CanonicalAxisIndex(index)];
+  }
+  inline int num_axes() const { return shape_.size(); }
+  inline int count() const { return count_; }
+
+  /**
+   * @brief Compute the volume of a slice; i.e., the product of dimensions
+   *        among a range of axes.
+   *
+   * @param start_axis The first axis to include in the slice.
+   *
+   * @param end_axis The first axis to exclude from the slice.
+   */
+  inline int count(int start_axis, int end_axis) const {
+    CHECK_LE(start_axis, end_axis);
+    CHECK_GE(start_axis, 0);
+    CHECK_GE(end_axis, 0);
+    CHECK_LE(start_axis, num_axes());
+    CHECK_LE(end_axis, num_axes());
+    int count = 1;
+    for (int i = start_axis; i < end_axis; ++i) {
+      count *= shape(i);
+    }
+    return count;
+  }
+  /**
+   * @brief Compute the volume of a slice spanning from a particular first
+   *        axis to the final axis.
+   *
+   * @param start_axis The first axis to include in the slice.
+   */
+  inline int count(int start_axis) const {
+    return count(start_axis, num_axes());
+  }
+
+  /**
+   * @brief Returns the 'canonical' version of a (usually) user-specified axis,
+   *        allowing for negative indexing (e.g., -1 for the last axis).
+   *
+   * @param axis_index the axis index.
+   *        If 0 <= index < num_axes(), return index.
+   *        If -num_axes <= index <= -1, return (num_axes() - (-index)),
+   *        e.g., the last axis index (num_axes() - 1) if index == -1,
+   *        the second to last if index == -2, etc.
+   *        Dies on out of range index.
+   */
+  inline int CanonicalAxisIndex(int axis_index) const {
+    CHECK_GE(axis_index, -num_axes())
+        << "axis " << axis_index << " out of range for " << num_axes()
+        << "-D Blob with shape " << shape_string();
+    CHECK_LT(axis_index, num_axes())
+        << "axis " << axis_index << " out of range for " << num_axes()
+        << "-D Blob with shape " << shape_string();
+    if (axis_index < 0) {
+      return axis_index + num_axes();
+    }
+    return axis_index;
+  }
+
+  /// @brief Deprecated legacy shape accessor num: use shape(0) instead.
+  inline int num() const { return LegacyShape(0); }
+  /// @brief Deprecated legacy shape accessor channels: use shape(1) instead.
+  inline int channels() const { return LegacyShape(1); }
+  /// @brief Deprecated legacy shape accessor height: use shape(2) instead.
+  inline int height() const { return LegacyShape(2); }
+  /// @brief Deprecated legacy shape accessor width: use shape(3) instead.
+  inline int width() const { return LegacyShape(3); }
+  inline int LegacyShape(int index) const {
+    CHECK_LE(num_axes(), 4)
+        << "Cannot use legacy accessors on Blobs with > 4 axes.";
+    CHECK_LT(index, 4);
+    CHECK_GE(index, -4);
+    if (index >= num_axes() || index < -num_axes()) {
+      // Axis is out of range, but still in [0, 3] (or [-4, -1] for reverse
+      // indexing) -- this special case simulates the one-padding used to fill
+      // extraneous axes of legacy blobs.
+      return 1;
+    }
+    return shape(index);
+  }
+
+  inline int offset(const int n, const int c = 0, const int h = 0,
+      const int w = 0) const {
+    CHECK_GE(n, 0);
+    CHECK_LE(n, num());
+    CHECK_GE(channels(), 0);
+    CHECK_LE(c, channels());
+    CHECK_GE(height(), 0);
+    CHECK_LE(h, height());
+    CHECK_GE(width(), 0);
+    CHECK_LE(w, width());
+    return ((n * channels() + c) * height() + h) * width() + w;
+  }
+
+  inline int offset(const vector<int>& indices) const {
+    CHECK_LE(indices.size(), num_axes());
+    int offset = 0;
+    for (int i = 0; i < num_axes(); ++i) {
+      offset *= shape(i);
+      if (indices.size() > i) {
+        CHECK_GE(indices[i], 0);
+        CHECK_LT(indices[i], shape(i));
+        offset += indices[i];
+      }
+    }
+    return offset;
+  }
+  /**
+   * @brief Copy from a source Blob.
+   *
+   * @param source the Blob to copy from
+   * @param copy_diff if false, copy the data; if true, copy the diff
+   * @param reshape if false, require this Blob to be pre-shaped to the shape
+   *        of other (and die otherwise); if true, Reshape this Blob to other's
+   *        shape if necessary
+   */
+  void CopyFrom(const Blob<Dtype>& source, bool copy_diff = false,
+      bool reshape = false);
+
+  inline Dtype data_at(const int n, const int c, const int h,
+      const int w) const {
+    return cpu_data()[offset(n, c, h, w)];
+  }
+
+  inline Dtype diff_at(const int n, const int c, const int h,
+      const int w) const {
+    return cpu_diff()[offset(n, c, h, w)];
+  }
+
+  inline Dtype data_at(const vector<int>& index) const {
+    return cpu_data()[offset(index)];
+  }
+
+  inline Dtype diff_at(const vector<int>& index) const {
+    return cpu_diff()[offset(index)];
+  }
+
+  inline const shared_ptr<SyncedMemory>& data() const {
+    CHECK(data_);
+    return data_;
+  }
+
+  inline const shared_ptr<SyncedMemory>& diff() const {
+    CHECK(diff_);
+    return diff_;
+  }
+
+  const Dtype* cpu_data() const;
+  void set_cpu_data(Dtype* data);
+  const int* gpu_shape() const;
+  const Dtype* gpu_data() const;
+  void set_gpu_data(Dtype* data);
+  const Dtype* cpu_diff() const;
+  const Dtype* gpu_diff() const;
+  Dtype* mutable_cpu_data();
+  Dtype* mutable_gpu_data();
+  Dtype* mutable_cpu_diff();
+  Dtype* mutable_gpu_diff();
+  void Update();
+  void FromProto(const BlobProto& proto, bool reshape = true);
+  void ToProto(BlobProto* proto, bool write_diff = false) const;
+
+  /// @brief Compute the sum of absolute values (L1 norm) of the data.
+  Dtype asum_data() const;
+  /// @brief Compute the sum of absolute values (L1 norm) of the diff.
+  Dtype asum_diff() const;
+  /// @brief Compute the sum of squares (L2 norm squared) of the data.
+  Dtype sumsq_data() const;
+  /// @brief Compute the sum of squares (L2 norm squared) of the diff.
+  Dtype sumsq_diff() const;
+
+  /// @brief Scale the blob data by a constant factor.
+  void scale_data(Dtype scale_factor);
+  /// @brief Scale the blob diff by a constant factor.
+  void scale_diff(Dtype scale_factor);
+
+  /**
+   * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the
+   *        data_ of Blob other -- useful in Layer%s which simply perform a copy
+   *        in their Forward pass.
+   *
+   * This deallocates the SyncedMemory holding this Blob's data_, as
+   * shared_ptr calls its destructor when reset with the "=" operator.
+   */
+  void ShareData(const Blob& other);
+  /**
+   * @brief Set the diff_ shared_ptr to point to the SyncedMemory holding the
+   *        diff_ of Blob other -- useful in Layer%s which simply perform a copy
+   *        in their Forward pass.
+   *
+   * This deallocates the SyncedMemory holding this Blob's diff_, as
+   * shared_ptr calls its destructor when reset with the "=" operator.
+   */
+  void ShareDiff(const Blob& other);
+
+  bool ShapeEquals(const BlobProto& other);
+
+ protected:
+  shared_ptr<SyncedMemory> data_;
+  shared_ptr<SyncedMemory> diff_;
+  shared_ptr<SyncedMemory> shape_data_;
+  vector<int> shape_;
+  int count_;
+  int capacity_;
+
+  DISABLE_COPY_AND_ASSIGN(Blob);
+};  // class Blob
+
+}  // namespace caffe
+
+#endif  // CAFFE_BLOB_HPP_
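
The offset() accessors above use row-major (N, C, H, W) indexing; a small standalone check (illustrative only, not part of Caffe):

    #include <cassert>

    // Row-major (N, C, H, W) offset, as computed by Blob<Dtype>::offset(n, c, h, w).
    int NchwOffset(int n, int c, int h, int w, int C, int H, int W) {
      return ((n * C + c) * H + h) * W + w;
    }

    void OffsetCheck() {
      // For a blob shaped (2, 3, 4, 5), element (1, 2, 3, 4) is the last of
      // the 120 elements, i.e. index 119.
      assert(NchwOffset(1, 2, 3, 4, /*C=*/3, /*H=*/4, /*W=*/5) == 119);
    }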

+ 22 - 0
src/api/perception_lidar_cnn_segmentation/caffe/caffe.hpp

@@ -0,0 +1,22 @@
+// caffe.hpp is the header file that you need to include in your code. It wraps
+// all the internal caffe header files into one for simpler inclusion.
+
+#ifndef CAFFE_CAFFE_HPP_
+#define CAFFE_CAFFE_HPP_
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layer_factory.hpp"
+#include "caffe/net.hpp"
+#include "caffe/parallel.hpp"
+//#include "caffe/proto/caffe.pb.h"
+#include "caffe.pb.h"
+#include "caffe/solver.hpp"
+#include "caffe/solver_factory.hpp"
+#include "caffe/util/benchmark.hpp"
+#include "caffe/util/io.hpp"
+#include "caffe/util/upgrade_proto.hpp"
+
+#endif  // CAFFE_CAFFE_HPP_

+ 193 - 0
src/api/perception_lidar_cnn_segmentation/caffe/common.hpp

@@ -0,0 +1,193 @@
+#ifndef CAFFE_COMMON_HPP_
+#define CAFFE_COMMON_HPP_
+
+#include <boost/shared_ptr.hpp>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include <climits>
+#include <cmath>
+#include <fstream>  // NOLINT(readability/streams)
+#include <iostream>  // NOLINT(readability/streams)
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>  // pair
+#include <vector>
+
+#include "caffe/util/device_alternate.hpp"
+
+// Convert macro to string
+#define STRINGIFY(m) #m
+#define AS_STRING(m) STRINGIFY(m)
+
+// gflags 2.1 issue: namespace google was changed to gflags without warning.
+// Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version
+// 2.1. If yes, we will add a temporary solution to redirect the namespace.
+// TODO(Yangqing): Once gflags solves the problem in a more elegant way, let's
+// remove the following hack.
+#ifndef GFLAGS_GFLAGS_H_
+namespace gflags = google;
+#endif  // GFLAGS_GFLAGS_H_
+
+// Disable the copy and assignment operator for a class.
+#define DISABLE_COPY_AND_ASSIGN(classname) \
+private:\
+  classname(const classname&);\
+  classname& operator=(const classname&)
+
+// Instantiate a class with float and double specifications.
+#define INSTANTIATE_CLASS(classname) \
+  char gInstantiationGuard##classname; \
+  template class classname<float>; \
+  template class classname<double>
+
+#define INSTANTIATE_LAYER_GPU_FORWARD(classname) \
+  template void classname<float>::Forward_gpu( \
+      const std::vector<Blob<float>*>& bottom, \
+      const std::vector<Blob<float>*>& top); \
+  template void classname<double>::Forward_gpu( \
+      const std::vector<Blob<double>*>& bottom, \
+      const std::vector<Blob<double>*>& top);
+
+#define INSTANTIATE_LAYER_GPU_BACKWARD(classname) \
+  template void classname<float>::Backward_gpu( \
+      const std::vector<Blob<float>*>& top, \
+      const std::vector<bool>& propagate_down, \
+      const std::vector<Blob<float>*>& bottom); \
+  template void classname<double>::Backward_gpu( \
+      const std::vector<Blob<double>*>& top, \
+      const std::vector<bool>& propagate_down, \
+      const std::vector<Blob<double>*>& bottom)
+
+#define INSTANTIATE_LAYER_GPU_FUNCS(classname) \
+  INSTANTIATE_LAYER_GPU_FORWARD(classname); \
+  INSTANTIATE_LAYER_GPU_BACKWARD(classname)
+
+// A simple macro to mark code that is not implemented, so that a fatal log
+// is emitted if it is ever executed.
+#define NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented Yet"
+
+// See PR #1236
+namespace cv { class Mat; }
+
+namespace caffe {
+
+// We will use the boost shared_ptr instead of the new C++11 one mainly
+// because cuda does not (at least for now) work well with C++11 features.
+using boost::shared_ptr;
+
+// Common functions and classes from std that caffe often uses.
+using std::fstream;
+using std::ios;
+using std::isnan;
+using std::isinf;
+using std::iterator;
+using std::make_pair;
+using std::map;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+// A global initialization function that you should call in your main function.
+// Currently it initializes google flags and google logging.
+void GlobalInit(int* pargc, char*** pargv);
+
+// A singleton class to hold common caffe stuff, such as the handler that
+// caffe is going to use for cublas, curand, etc.
+class Caffe {
+ public:
+  ~Caffe();
+
+  // Thread local context for Caffe. Moved to common.cpp instead of
+  // including boost/thread.hpp to avoid boost/NVCC issues (#1009, #1010)
+  // on OSX. Also fails on Linux with CUDA 7.0.18.
+  static Caffe& Get();
+
+  enum Brew { CPU, GPU };
+
+  // This random number generator facade hides boost and CUDA rng
+  // implementation from one another (for cross-platform compatibility).
+  class RNG {
+   public:
+    RNG();
+    explicit RNG(unsigned int seed);
+    explicit RNG(const RNG&);
+    RNG& operator=(const RNG&);
+    void* generator();
+   private:
+    class Generator;
+    shared_ptr<Generator> generator_;
+  };
+
+  // Getters for boost rng, curand, and cublas handles
+  inline static RNG& rng_stream() {
+    if (!Get().random_generator_) {
+      Get().random_generator_.reset(new RNG());
+    }
+    return *(Get().random_generator_);
+  }
+#ifndef CPU_ONLY
+  inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; }
+  inline static curandGenerator_t curand_generator() {
+    return Get().curand_generator_;
+  }
+#endif
+
+  // Returns the mode: running on CPU or GPU.
+  inline static Brew mode() { return Get().mode_; }
+  // The setters for the variables
+  // Sets the mode. It is recommended that you don't change the mode halfway
+  // through the program, since that may cause pinned memory allocated earlier
+  // to be freed in a non-pinned way, which may cause problems - I haven't
+  // verified it personally, but better to note it here in the header file.
+  inline static void set_mode(Brew mode) { Get().mode_ = mode; }
+  // Sets the random seed of both boost and curand
+  static void set_random_seed(const unsigned int seed);
+  // Sets the device. Since we have cublas and curand stuff, set device also
+  // requires us to reset those values.
+  static void SetDevice(const int device_id);
+  // Prints the current GPU status.
+  static void DeviceQuery();
+  // Check if the specified device is available
+  static bool CheckDevice(const int device_id);
+  // Search from start_id to the highest possible device ordinal,
+  // return the ordinal of the first available device.
+  static int FindDevice(const int start_id = 0);
+  // Parallel training
+  inline static int solver_count() { return Get().solver_count_; }
+  inline static void set_solver_count(int val) { Get().solver_count_ = val; }
+  inline static int solver_rank() { return Get().solver_rank_; }
+  inline static void set_solver_rank(int val) { Get().solver_rank_ = val; }
+  inline static bool multiprocess() { return Get().multiprocess_; }
+  inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; }
+  inline static bool root_solver() { return Get().solver_rank_ == 0; }
+
+ protected:
+#ifndef CPU_ONLY
+  cublasHandle_t cublas_handle_;
+  curandGenerator_t curand_generator_;
+#endif
+  shared_ptr<RNG> random_generator_;
+
+  Brew mode_;
+
+  // Parallel training
+  int solver_count_;
+  int solver_rank_;
+  bool multiprocess_;
+
+ private:
+  // The private constructor to avoid duplicate instantiation.
+  Caffe();
+
+  DISABLE_COPY_AND_ASSIGN(Caffe);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_COMMON_HPP_
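
A short sketch of the singleton accessors declared above, assuming a CUDA-enabled build; device id 0 and the seed are arbitrary choices.

    #include "caffe/common.hpp"

    void select_device() {
      if (caffe::Caffe::CheckDevice(0)) {
        caffe::Caffe::SetDevice(0);   // also resets cublas/curand handles
        caffe::Caffe::set_mode(caffe::Caffe::GPU);
        caffe::Caffe::DeviceQuery();  // logs the current GPU status
      } else {
        caffe::Caffe::set_mode(caffe::Caffe::CPU);
      }
      caffe::Caffe::set_random_seed(1701);  // seeds both boost and curand
    }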

+ 154 - 0
src/api/perception_lidar_cnn_segmentation/caffe/data_transformer.hpp

@@ -0,0 +1,154 @@
+#ifndef CAFFE_DATA_TRANSFORMER_HPP
+#define CAFFE_DATA_TRANSFORMER_HPP
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Applies common transformations to the input data, such as
+ * scaling, mirroring, subtracting the image mean...
+ */
+template <typename Dtype>
+class DataTransformer {
+ public:
+  explicit DataTransformer(const TransformationParameter& param, Phase phase);
+  virtual ~DataTransformer() {}
+
+  /**
+   * @brief Initialize the Random number generations if needed by the
+   *    transformation.
+   */
+  void InitRand();
+
+  /**
+   * @brief Applies the transformation defined in the data layer's
+   * transform_param block to the data.
+   *
+   * @param datum
+   *    Datum containing the data to be transformed.
+   * @param transformed_blob
+   *    This is the destination blob. It can be part of the top blob's data if
+   *    set_cpu_data() is used. See data_layer.cpp for an example.
+   */
+  void Transform(const Datum& datum, Blob<Dtype>* transformed_blob);
+
+  /**
+   * @brief Applies the transformation defined in the data layer's
+   * transform_param block to a vector of Datum.
+   *
+   * @param datum_vector
+   *    A vector of Datum containing the data to be transformed.
+   * @param transformed_blob
+   *    This is the destination blob. It can be part of the top blob's data if
+   *    set_cpu_data() is used. See memory_layer.cpp for an example.
+   */
+  void Transform(const vector<Datum> & datum_vector,
+                Blob<Dtype>* transformed_blob);
+
+#ifdef USE_OPENCV
+  /**
+   * @brief Applies the transformation defined in the data layer's
+   * transform_param block to a vector of Mat.
+   *
+   * @param mat_vector
+   *    A vector of Mat containing the data to be transformed.
+   * @param transformed_blob
+   *    This is the destination blob. It can be part of the top blob's data if
+   *    set_cpu_data() is used. See memory_layer.cpp for an example.
+   */
+  void Transform(const vector<cv::Mat> & mat_vector,
+                Blob<Dtype>* transformed_blob);
+
+  /**
+   * @brief Applies the transformation defined in the data layer's
+   * transform_param block to a cv::Mat
+   *
+   * @param cv_img
+   *    cv::Mat containing the data to be transformed.
+   * @param transformed_blob
+   *    This is the destination blob. It can be part of the top blob's data if
+   *    set_cpu_data() is used. See image_data_layer.cpp for an example.
+   */
+  void Transform(const cv::Mat& cv_img, Blob<Dtype>* transformed_blob);
+#endif  // USE_OPENCV
+
+  /**
+   * @brief Applies the same transformation defined in the data layer's
+   * transform_param block to all the num images in an input_blob.
+   *
+   * @param input_blob
+   *    A Blob containing the data to be transformed. It applies the same
+   *    transformation to all the num images in the blob.
+   * @param transformed_blob
+   *    This is the destination blob; it will contain as many images as the
+   *    input blob. It can be part of the top blob's data.
+   */
+  void Transform(Blob<Dtype>* input_blob, Blob<Dtype>* transformed_blob);
+
+  /**
+   * @brief Infers the shape the transformed_blob will have when
+   *    the transformation is applied to the data.
+   *
+   * @param datum
+   *    Datum containing the data to be transformed.
+   */
+  vector<int> InferBlobShape(const Datum& datum);
+  /**
+   * @brief Infers the shape the transformed_blob will have when
+   *    the transformation is applied to the data.
+   *    It uses the first element to infer the shape of the blob.
+   *
+   * @param datum_vector
+   *    A vector of Datum containing the data to be transformed.
+   */
+  vector<int> InferBlobShape(const vector<Datum> & datum_vector);
+  /**
+   * @brief Infers the shape the transformed_blob will have when
+   *    the transformation is applied to the data.
+   *    It uses the first element to infer the shape of the blob.
+   *
+   * @param mat_vector
+   *    A vector of Mat containing the data to be transformed.
+   */
+#ifdef USE_OPENCV
+  vector<int> InferBlobShape(const vector<cv::Mat> & mat_vector);
+  /**
+   * @brief Infers the shape the transformed_blob will have when
+   *    the transformation is applied to the data.
+   *
+   * @param cv_img
+   *    cv::Mat containing the data to be transformed.
+   */
+  vector<int> InferBlobShape(const cv::Mat& cv_img);
+#endif  // USE_OPENCV
+
+ protected:
+   /**
+   * @brief Generates a random integer from Uniform({0, 1, ..., n-1}).
+   *
+   * @param n
+   *    The upper bound (exclusive) of the random number.
+   * @return
+   *    A uniformly random integer value from {0, 1, ..., n-1}.
+   */
+  virtual int Rand(int n);
+
+  void Transform(const Datum& datum, Dtype* transformed_data);
+  // Transformation parameters
+  TransformationParameter param_;
+
+
+  shared_ptr<Caffe::RNG> rng_;
+  Phase phase_;
+  Blob<Dtype> data_mean_;
+  vector<Dtype> mean_values_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DATA_TRANSFORMER_HPP_
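
A hedged sketch of driving the transformer declared above. The mirror and crop_size fields are standard TransformationParameter fields from caffe.proto; the Datum is assumed to be populated already.

    #include "caffe/blob.hpp"
    #include "caffe/data_transformer.hpp"

    void transform_one(const caffe::Datum& datum) {
      caffe::TransformationParameter param;
      param.set_mirror(true);    // random horizontal flips in TRAIN phase
      param.set_crop_size(224);  // random crops in TRAIN phase

      caffe::DataTransformer<float> transformer(param, caffe::TRAIN);
      transformer.InitRand();    // mirror/crop consume the RNG

      // Shape the destination blob from the datum, then transform into it.
      caffe::Blob<float> transformed(transformer.InferBlobShape(datum));
      transformer.Transform(datum, &transformed);
    }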

+ 301 - 0
src/api/perception_lidar_cnn_segmentation/caffe/filler.hpp

@@ -0,0 +1,301 @@
+// Fillers are random number generators that fill a blob using the specified
+// algorithm. The expectation is that they are only going to be used during
+// initialization time and will not involve any GPUs.
+
+#ifndef CAFFE_FILLER_HPP
+#define CAFFE_FILLER_HPP
+
+#include <string>
+
+#include "caffe/blob.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/syncedmem.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+/// @brief Fills a Blob with constant or randomly-generated data.
+template <typename Dtype>
+class Filler {
+ public:
+  explicit Filler(const FillerParameter& param) : filler_param_(param) {}
+  virtual ~Filler() {}
+  virtual void Fill(Blob<Dtype>* blob) = 0;
+ protected:
+  FillerParameter filler_param_;
+};  // class Filler
+
+
+/// @brief Fills a Blob with constant values @f$ x = c @f$, where @f$ c @f$ is
+/// the filler's value parameter (default 0).
+template <typename Dtype>
+class ConstantFiller : public Filler<Dtype> {
+ public:
+  explicit ConstantFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    Dtype* data = blob->mutable_cpu_data();
+    const int count = blob->count();
+    const Dtype value = this->filler_param_.value();
+    CHECK(count);
+    for (int i = 0; i < count; ++i) {
+      data[i] = value;
+    }
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/// @brief Fills a Blob with uniformly distributed values @f$ x\sim U(a, b) @f$.
+template <typename Dtype>
+class UniformFiller : public Filler<Dtype> {
+ public:
+  explicit UniformFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    CHECK(blob->count());
+    caffe_rng_uniform<Dtype>(blob->count(), Dtype(this->filler_param_.min()),
+        Dtype(this->filler_param_.max()), blob->mutable_cpu_data());
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/// @brief Fills a Blob with Gaussian-distributed values @f$ x \sim N(\mu, \sigma^2) @f$.
+template <typename Dtype>
+class GaussianFiller : public Filler<Dtype> {
+ public:
+  explicit GaussianFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    Dtype* data = blob->mutable_cpu_data();
+    CHECK(blob->count());
+    caffe_rng_gaussian<Dtype>(blob->count(), Dtype(this->filler_param_.mean()),
+        Dtype(this->filler_param_.std()), blob->mutable_cpu_data());
+    int sparse = this->filler_param_.sparse();
+    CHECK_GE(sparse, -1);
+    if (sparse >= 0) {
+      // Sparse initialization is implemented for "weight" blobs; i.e. matrices.
+      // These have num == channels == 1; width is number of inputs; height is
+      // number of outputs.  The 'sparse' variable specifies the mean number
+      // of non-zero input weights for a given output.
+      CHECK_GE(blob->num_axes(), 1);
+      const int num_outputs = blob->shape(0);
+      Dtype non_zero_probability = Dtype(sparse) / Dtype(num_outputs);
+      rand_vec_.reset(new SyncedMemory(blob->count() * sizeof(int)));
+      int* mask = reinterpret_cast<int*>(rand_vec_->mutable_cpu_data());
+      caffe_rng_bernoulli(blob->count(), non_zero_probability, mask);
+      for (int i = 0; i < blob->count(); ++i) {
+        data[i] *= mask[i];
+      }
+    }
+  }
+
+ protected:
+  shared_ptr<SyncedMemory> rand_vec_;
+};
+
+/** @brief Fills a Blob with values @f$ x \in [0, 1] @f$
+ *         such that @f$ \forall i \sum_j x_{ij} = 1 @f$.
+ */
+template <typename Dtype>
+class PositiveUnitballFiller : public Filler<Dtype> {
+ public:
+  explicit PositiveUnitballFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    Dtype* data = blob->mutable_cpu_data();
+    DCHECK(blob->count());
+    caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());
+    // We expect the filler to not be called very frequently, so we will
+    // just use a simple implementation
+    int dim = blob->count() / blob->shape(0);
+    CHECK(dim);
+    for (int i = 0; i < blob->shape(0); ++i) {
+      Dtype sum = 0;
+      for (int j = 0; j < dim; ++j) {
+        sum += data[i * dim + j];
+      }
+      for (int j = 0; j < dim; ++j) {
+        data[i * dim + j] /= sum;
+      }
+    }
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/**
+ * @brief Fills a Blob with values @f$ x \sim U(-a, +a) @f$ where @f$ a @f$ is
+ *        set inversely proportional to number of incoming nodes, outgoing
+ *        nodes, or their average.
+ *
+ * A Filler based on the paper [Bengio and Glorot 2010]: Understanding
+ * the difficulty of training deep feedforward neural networks.
+ *
+ * It fills the incoming matrix by randomly sampling uniform data from [-scale,
+ * scale] where scale = sqrt(3 / n) where n is the fan_in, fan_out, or their
+ * average, depending on the variance_norm option. You should make sure the
+ * input blob has shape (num, a, b, c) where a * b * c = fan_in and num * b * c
+ * = fan_out. Note that this is currently not the case for inner product layers.
+ *
+ * TODO(dox): make notation in above comment consistent with rest & use LaTeX.
+ */
+template <typename Dtype>
+class XavierFiller : public Filler<Dtype> {
+ public:
+  explicit XavierFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    CHECK(blob->count());
+    int fan_in = blob->count() / blob->shape(0);
+    // Compatibility with ND blobs
+    int fan_out = blob->num_axes() > 1 ?
+                  blob->count() / blob->shape(1) :
+                  blob->count();
+    Dtype n = fan_in;  // default to fan_in
+    if (this->filler_param_.variance_norm() ==
+        FillerParameter_VarianceNorm_AVERAGE) {
+      n = (fan_in + fan_out) / Dtype(2);
+    } else if (this->filler_param_.variance_norm() ==
+        FillerParameter_VarianceNorm_FAN_OUT) {
+      n = fan_out;
+    }
+    Dtype scale = sqrt(Dtype(3) / n);
+    caffe_rng_uniform<Dtype>(blob->count(), -scale, scale,
+        blob->mutable_cpu_data());
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/**
+ * @brief Fills a Blob with values @f$ x \sim N(0, \sigma^2) @f$ where
+ *        @f$ \sigma^2 @f$ is set inversely proportional to number of incoming
+ *        nodes, outgoing nodes, or their average.
+ *
+ * A Filler based on the paper [He, Zhang, Ren and Sun 2015]: Specifically
+ * accounts for ReLU nonlinearities.
+ *
+ * Aside: for another perspective on the scaling factor, see the derivation of
+ * [Saxe, McClelland, and Ganguli 2013 (v3)].
+ *
+ * It fills the incoming matrix by randomly sampling Gaussian data with std =
+ * sqrt(2 / n) where n is the fan_in, fan_out, or their average, depending on
+ * the variance_norm option. You should make sure the input blob has shape (num,
+ * a, b, c) where a * b * c = fan_in and num * b * c = fan_out. Note that this
+ * is currently not the case for inner product layers.
+ */
+template <typename Dtype>
+class MSRAFiller : public Filler<Dtype> {
+ public:
+  explicit MSRAFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    CHECK(blob->count());
+    int fan_in = blob->count() / blob->shape(0);
+    // Compatibility with ND blobs
+    int fan_out = blob->num_axes() > 1 ?
+                  blob->count() / blob->shape(1) :
+                  blob->count();
+    Dtype n = fan_in;  // default to fan_in
+    if (this->filler_param_.variance_norm() ==
+        FillerParameter_VarianceNorm_AVERAGE) {
+      n = (fan_in + fan_out) / Dtype(2);
+    } else if (this->filler_param_.variance_norm() ==
+        FillerParameter_VarianceNorm_FAN_OUT) {
+      n = fan_out;
+    }
+    Dtype std = sqrt(Dtype(2) / n);
+    caffe_rng_gaussian<Dtype>(blob->count(), Dtype(0), std,
+        blob->mutable_cpu_data());
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/*!
+@brief Fills a Blob with coefficients for bilinear interpolation.
+
+A common use case is with the DeconvolutionLayer acting as upsampling.
+You can upsample a feature map with shape (B, C, H, W) by any integer factor
+using the following proto.
+\code
+layer {
+  name: "upsample", type: "Deconvolution"
+  bottom: "{{bottom_name}}" top: "{{top_name}}"
+  convolution_param {
+    kernel_size: {{2 * factor - factor % 2}} stride: {{factor}}
+    num_output: {{C}} group: {{C}}
+    pad: {{ceil((factor - 1) / 2.)}}
+    weight_filler: { type: "bilinear" } bias_term: false
+  }
+  param { lr_mult: 0 decay_mult: 0 }
+}
+\endcode
+Please use this by replacing `{{}}` with your values. By specifying
+`num_output: {{C}} group: {{C}}`, it behaves as
+channel-wise convolution. The filter shape of this deconvolution layer will be
+(C, 1, K, K) where K is `kernel_size`, and this filler will set a (K, K)
+interpolation kernel for every channel of the filter identically. The resulting
+shape of the top feature map will be (B, C, factor * H, factor * W).
+Note that the learning rate and the
+weight decay are set to 0 in order to keep coefficient values of bilinear
+interpolation unchanged during training. If you apply this to an image, this
+operation is equivalent to the following call in Python with scikit-image.
+\code{.py}
+out = skimage.transform.rescale(img, factor, mode='constant', cval=0)
+\endcode
+ */
+template <typename Dtype>
+class BilinearFiller : public Filler<Dtype> {
+ public:
+  explicit BilinearFiller(const FillerParameter& param)
+      : Filler<Dtype>(param) {}
+  virtual void Fill(Blob<Dtype>* blob) {
+    CHECK_EQ(blob->num_axes(), 4) << "Blob must be 4 dim.";
+    CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";
+    Dtype* data = blob->mutable_cpu_data();
+    int f = ceil(blob->width() / 2.);
+    Dtype c = (blob->width() - 1) / (2. * f);
+    for (int i = 0; i < blob->count(); ++i) {
+      Dtype x = i % blob->width();
+      Dtype y = (i / blob->width()) % blob->height();
+      data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
+    }
+    CHECK_EQ(this->filler_param_.sparse(), -1)
+         << "Sparsity not supported by this Filler.";
+  }
+};
+
+/**
+ * @brief Get a specific filler from the specification given in FillerParameter.
+ *
+ * Ideally this would be replaced by a factory pattern, but we will leave it
+ * this way for now.
+ */
+template <typename Dtype>
+Filler<Dtype>* GetFiller(const FillerParameter& param) {
+  const std::string& type = param.type();
+  if (type == "constant") {
+    return new ConstantFiller<Dtype>(param);
+  } else if (type == "gaussian") {
+    return new GaussianFiller<Dtype>(param);
+  } else if (type == "positive_unitball") {
+    return new PositiveUnitballFiller<Dtype>(param);
+  } else if (type == "uniform") {
+    return new UniformFiller<Dtype>(param);
+  } else if (type == "xavier") {
+    return new XavierFiller<Dtype>(param);
+  } else if (type == "msra") {
+    return new MSRAFiller<Dtype>(param);
+  } else if (type == "bilinear") {
+    return new BilinearFiller<Dtype>(param);
+  } else {
+    CHECK(false) << "Unknown filler name: " << param.type();
+  }
+  return (Filler<Dtype>*)(NULL);
+}
+
+}  // namespace caffe
+
+#endif  // CAFFE_FILLER_HPP_
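
GetFiller returns a raw owning pointer, so callers usually wrap it immediately. A small sketch; "xavier" is one of the type strings dispatched above.

    #include <boost/shared_ptr.hpp>
    #include "caffe/filler.hpp"

    void fill_weights(caffe::Blob<float>* weights) {
      caffe::FillerParameter param;
      param.set_type("xavier");  // also: constant, uniform, gaussian, msra, ...

      boost::shared_ptr<caffe::Filler<float> > filler(
          caffe::GetFiller<float>(param));
      filler->Fill(weights);     // writes into weights->mutable_cpu_data()
    }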

+ 53 - 0
src/api/perception_lidar_cnn_segmentation/caffe/internal_thread.hpp

@@ -0,0 +1,53 @@
+#ifndef CAFFE_INTERNAL_THREAD_HPP_
+#define CAFFE_INTERNAL_THREAD_HPP_
+
+#include "caffe/common.hpp"
+
+/**
+ Forward declare boost::thread instead of including boost/thread.hpp
+ to avoid boost/NVCC issues (#1009, #1010) on OSX.
+ */
+namespace boost { class thread; }
+
+namespace caffe {
+
+/**
+ * Virtual class encapsulating boost::thread, for use as a base class.
+ * A child class acquires the ability to run a single internal thread
+ * by reimplementing the virtual function InternalThreadEntry.
+ */
+class InternalThread {
+ public:
+  InternalThread() : thread_() {}
+  virtual ~InternalThread();
+
+  /**
+   * Caffe's thread local state will be initialized using the current
+   * thread values, e.g. device id, solver index etc. The random seed
+   * is initialized using caffe_rng_rand.
+   */
+  void StartInternalThread();
+
+  /** Will not return until the internal thread has exited. */
+  void StopInternalThread();
+
+  bool is_started() const;
+
+ protected:
+  /* Implement this method in your subclass
+      with the code you want your thread to run. */
+  virtual void InternalThreadEntry() {}
+
+  /* Should be tested when running loops to exit when requested. */
+  bool must_stop();
+
+ private:
+  void entry(int device, Caffe::Brew mode, int rand_seed,
+      int solver_count, int solver_rank, bool multiprocess);
+
+  shared_ptr<boost::thread> thread_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_INTERNAL_THREAD_HPP_
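
A hedged sketch of a subclass (PrefetchThread is a hypothetical name). The worker loop polls must_stop() so that StopInternalThread() can interrupt and join cleanly.

    #include "caffe/internal_thread.hpp"

    class PrefetchThread : public caffe::InternalThread {
     protected:
      // Runs on the internal thread after StartInternalThread().
      virtual void InternalThreadEntry() {
        while (!must_stop()) {
          // ... produce one batch and hand it off ...
        }
      }
    };

    // Usage:
    //   PrefetchThread t;
    //   t.StartInternalThread();
    //   ...
    //   t.StopInternalThread();  // blocks until the thread exits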

+ 477 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layer.hpp

@@ -0,0 +1,477 @@
+#ifndef CAFFE_LAYER_H_
+#define CAFFE_LAYER_H_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer_factory.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/math_functions.hpp"
+
+/**
+ Forward declare boost::thread instead of including boost/thread.hpp
+ to avoid boost/NVCC issues (#1009, #1010) on OSX.
+ */
+namespace boost { class mutex; }
+
+namespace caffe {
+
+/**
+ * @brief An interface for the units of computation which can be composed into a
+ *        Net.
+ *
+ * Layer%s must implement a Forward function, in which they take their input
+ * (bottom) Blob%s (if any) and compute their output Blob%s (if any).
+ * They may also implement a Backward function, in which they compute the error
+ * gradients with respect to their input Blob%s, given the error gradients
+ * with respect to their output Blob%s.
+ */
+template <typename Dtype>
+class Layer {
+ public:
+  /**
+   * You should not implement your own constructor. Any set up code should go
+   * to SetUp(), where the dimensions of the bottom blobs are provided to the
+   * layer.
+   */
+  explicit Layer(const LayerParameter& param)
+    : layer_param_(param) {
+      // Set phase and copy blobs (if there are any).
+      phase_ = param.phase();
+      if (layer_param_.blobs_size() > 0) {
+        blobs_.resize(layer_param_.blobs_size());
+        for (int i = 0; i < layer_param_.blobs_size(); ++i) {
+          blobs_[i].reset(new Blob<Dtype>());
+          blobs_[i]->FromProto(layer_param_.blobs(i));
+        }
+      }
+    }
+  virtual ~Layer() {}
+
+  /**
+   * @brief Implements common layer setup functionality.
+   *
+   * @param bottom the preshaped input blobs
+   * @param top
+   *     the allocated but unshaped output blobs, to be shaped by Reshape
+   *
+   * Checks that the number of bottom and top blobs is correct.
+   * Calls LayerSetUp to do special layer setup for individual layer types,
+   * followed by Reshape to set up sizes of top blobs and internal buffers.
+   * Sets up the loss weight multiplier blobs for any non-zero loss weights.
+   * This method may not be overridden.
+   */
+  void SetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    CheckBlobCounts(bottom, top);
+    LayerSetUp(bottom, top);
+    Reshape(bottom, top);
+    SetLossWeights(top);
+  }
+
+  /**
+   * @brief Does layer-specific setup: your layer should implement this function
+   *        as well as Reshape.
+   *
+   * @param bottom
+   *     the preshaped input blobs, whose data fields store the input data for
+   *     this layer
+   * @param top
+   *     the allocated but unshaped output blobs
+   *
+   * This method should do one-time layer specific setup. This includes reading
+   * and processing relevant parameters from the <code>layer_param_</code>.
+   * Setting up the shapes of top blobs and internal buffers should be done in
+   * <code>Reshape</code>, which will be called before the forward pass to
+   * adjust the top blob sizes.
+   */
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  /**
+   * @brief Adjust the shapes of top blobs and internal buffers to accommodate
+   *        the shapes of the bottom blobs.
+   *
+   * @param bottom the input blobs, with the requested input shapes
+   * @param top the top blobs, which should be reshaped as needed
+   *
+   * This method should reshape top blobs as needed according to the shapes
+   * of the bottom (input) blobs, as well as reshaping any internal buffers
+   * and making any other necessary adjustments so that the layer can
+   * accommodate the bottom blobs.
+   */
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) = 0;
+
+  /**
+   * @brief Given the bottom blobs, compute the top blobs and the loss.
+   *
+   * @param bottom
+   *     the input blobs, whose data fields store the input data for this layer
+   * @param top
+   *     the preshaped output blobs, whose data fields will store this layer's
+   *     outputs
+   * \return The total loss from the layer.
+   *
+   * The Forward wrapper calls the relevant device wrapper function
+   * (Forward_cpu or Forward_gpu) to compute the top blob values given the
+   * bottom blobs.  If the layer has any non-zero loss_weights, the wrapper
+   * then computes and returns the loss.
+   *
+   * Your layer should implement Forward_cpu and (optionally) Forward_gpu.
+   */
+  inline Dtype Forward(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Given the top blob error gradients, compute the bottom blob error
+   *        gradients.
+   *
+   * @param top
+   *     the output blobs, whose diff fields store the gradient of the error
+   *     with respect to themselves
+   * @param propagate_down
+   *     a vector with equal length to bottom, with each index indicating
+   *     whether to propagate the error gradients down to the bottom blob at
+   *     the corresponding index
+   * @param bottom
+   *     the input blobs, whose diff fields will store the gradient of the error
+   *     with respect to themselves after Backward is run
+   *
+   * The Backward wrapper calls the relevant device wrapper function
+   * (Backward_cpu or Backward_gpu) to compute the bottom blob diffs given the
+   * top blob diffs.
+   *
+   * Your layer should implement Backward_cpu and (optionally) Backward_gpu.
+   */
+  inline void Backward(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down,
+      const vector<Blob<Dtype>*>& bottom);
+
+  /**
+   * @brief Returns the vector of learnable parameter blobs.
+   */
+  vector<shared_ptr<Blob<Dtype> > >& blobs() {
+    return blobs_;
+  }
+
+  /**
+   * @brief Returns the layer parameter.
+   */
+  const LayerParameter& layer_param() const { return layer_param_; }
+
+  /**
+   * @brief Writes the layer parameter to a protocol buffer
+   */
+  virtual void ToProto(LayerParameter* param, bool write_diff = false);
+
+  /**
+   * @brief Returns the scalar loss associated with a top blob at a given index.
+   */
+  inline Dtype loss(const int top_index) const {
+    return (loss_.size() > top_index) ? loss_[top_index] : Dtype(0);
+  }
+
+  /**
+   * @brief Sets the loss associated with a top blob at a given index.
+   */
+  inline void set_loss(const int top_index, const Dtype value) {
+    if (loss_.size() <= top_index) {
+      loss_.resize(top_index + 1, Dtype(0));
+    }
+    loss_[top_index] = value;
+  }
+
+  /**
+   * @brief Returns the layer type.
+   */
+  virtual inline const char* type() const { return ""; }
+
+  /**
+   * @brief Returns the exact number of bottom blobs required by the layer,
+   *        or -1 if no exact number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some exact number of bottom blobs.
+   */
+  virtual inline int ExactNumBottomBlobs() const { return -1; }
+  /**
+   * @brief Returns the minimum number of bottom blobs required by the layer,
+   *        or -1 if no minimum number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some minimum number of bottom blobs.
+   */
+  virtual inline int MinBottomBlobs() const { return -1; }
+  /**
+   * @brief Returns the maximum number of bottom blobs required by the layer,
+   *        or -1 if no maximum number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some maximum number of bottom blobs.
+   */
+  virtual inline int MaxBottomBlobs() const { return -1; }
+  /**
+   * @brief Returns the exact number of top blobs required by the layer,
+   *        or -1 if no exact number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some exact number of top blobs.
+   */
+  virtual inline int ExactNumTopBlobs() const { return -1; }
+  /**
+   * @brief Returns the minimum number of top blobs required by the layer,
+   *        or -1 if no minimum number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some minimum number of top blobs.
+   */
+  virtual inline int MinTopBlobs() const { return -1; }
+  /**
+   * @brief Returns the maximum number of top blobs required by the layer,
+   *        or -1 if no maximum number is required.
+   *
+   * This method should be overridden to return a non-negative value if your
+   * layer expects some maximum number of top blobs.
+   */
+  virtual inline int MaxTopBlobs() const { return -1; }
+  /**
+   * @brief Returns true if the layer requires an equal number of bottom and
+   *        top blobs.
+   *
+   * This method should be overridden to return true if your layer expects an
+   * equal number of bottom and top blobs.
+   */
+  virtual inline bool EqualNumBottomTopBlobs() const { return false; }
+
+  /**
+   * @brief Return whether "anonymous" top blobs are created automatically
+   *        by the layer.
+   *
+   * If this method returns true, Net::Init will create enough "anonymous" top
+   * blobs to fulfill the requirement specified by ExactNumTopBlobs() or
+   * MinTopBlobs().
+   */
+  virtual inline bool AutoTopBlobs() const { return false; }
+
+  /**
+   * @brief Return whether to allow force_backward for a given bottom blob
+   *        index.
+   *
+   * If AllowForceBackward(i) == false, we will ignore the force_backward
+   * setting and backpropagate to blob i only if it needs gradient information
+   * (as is done when force_backward == false).
+   */
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    return true;
+  }
+
+  /**
+   * @brief Specifies whether the layer should compute gradients w.r.t. a
+   *        parameter at a particular index given by param_id.
+   *
+   * You can safely ignore false values and always compute gradients
+   * for all parameters, but possibly with wasteful computation.
+   */
+  inline bool param_propagate_down(const int param_id) {
+    return (param_propagate_down_.size() > param_id) ?
+        param_propagate_down_[param_id] : false;
+  }
+  /**
+   * @brief Sets whether the layer should compute gradients w.r.t. a
+   *        parameter at a particular index given by param_id.
+   */
+  inline void set_param_propagate_down(const int param_id, const bool value) {
+    if (param_propagate_down_.size() <= param_id) {
+      param_propagate_down_.resize(param_id + 1, true);
+    }
+    param_propagate_down_[param_id] = value;
+  }
+
+
+ protected:
+  /** The protobuf that stores the layer parameters */
+  LayerParameter layer_param_;
+  /** The phase: TRAIN or TEST */
+  Phase phase_;
+  /** The vector that stores the learnable parameters as a set of blobs. */
+  vector<shared_ptr<Blob<Dtype> > > blobs_;
+  /** Vector indicating whether to compute the diff of each param blob. */
+  vector<bool> param_propagate_down_;
+
+  /** The vector that indicates whether each top blob has a non-zero weight in
+   *  the objective function. */
+  vector<Dtype> loss_;
+
+  /** @brief Using the CPU device, compute the layer output. */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) = 0;
+  /**
+   * @brief Using the GPU device, compute the layer output.
+   *        Fall back to Forward_cpu() if unavailable.
+   */
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    // LOG(WARNING) << "Using CPU code as backup.";
+    return Forward_cpu(bottom, top);
+  }
+
+  /**
+   * @brief Using the CPU device, compute the gradients for any parameters and
+   *        for the bottom blobs if propagate_down is true.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down,
+      const vector<Blob<Dtype>*>& bottom) = 0;
+  /**
+   * @brief Using the GPU device, compute the gradients for any parameters and
+   *        for the bottom blobs if propagate_down is true.
+   *        Fall back to Backward_cpu() if unavailable.
+   */
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down,
+      const vector<Blob<Dtype>*>& bottom) {
+    // LOG(WARNING) << "Using CPU code as backup.";
+    Backward_cpu(top, propagate_down, bottom);
+  }
+
+  /**
+   * Called by the parent Layer's SetUp to check that the number of bottom
+   * and top Blobs provided as input match the expected numbers specified by
+   * the {ExactNum,Min,Max}{Bottom,Top}Blobs() functions.
+   */
+  virtual void CheckBlobCounts(const vector<Blob<Dtype>*>& bottom,
+                               const vector<Blob<Dtype>*>& top) {
+    if (ExactNumBottomBlobs() >= 0) {
+      CHECK_EQ(ExactNumBottomBlobs(), bottom.size())
+          << type() << " Layer takes " << ExactNumBottomBlobs()
+          << " bottom blob(s) as input.";
+    }
+    if (MinBottomBlobs() >= 0) {
+      CHECK_LE(MinBottomBlobs(), bottom.size())
+          << type() << " Layer takes at least " << MinBottomBlobs()
+          << " bottom blob(s) as input.";
+    }
+    if (MaxBottomBlobs() >= 0) {
+      CHECK_GE(MaxBottomBlobs(), bottom.size())
+          << type() << " Layer takes at most " << MaxBottomBlobs()
+          << " bottom blob(s) as input.";
+    }
+    if (ExactNumTopBlobs() >= 0) {
+      CHECK_EQ(ExactNumTopBlobs(), top.size())
+          << type() << " Layer produces " << ExactNumTopBlobs()
+          << " top blob(s) as output.";
+    }
+    if (MinTopBlobs() >= 0) {
+      CHECK_LE(MinTopBlobs(), top.size())
+          << type() << " Layer produces at least " << MinTopBlobs()
+          << " top blob(s) as output.";
+    }
+    if (MaxTopBlobs() >= 0) {
+      CHECK_GE(MaxTopBlobs(), top.size())
+          << type() << " Layer produces at most " << MaxTopBlobs()
+          << " top blob(s) as output.";
+    }
+    if (EqualNumBottomTopBlobs()) {
+      CHECK_EQ(bottom.size(), top.size())
+          << type() << " Layer produces one top blob as output for each "
+          << "bottom blob input.";
+    }
+  }
+
+  /**
+   * Called by SetUp to initialize the weights associated with any top blobs in
+   * the loss function. Store non-zero loss weights in the diff blob.
+   */
+  inline void SetLossWeights(const vector<Blob<Dtype>*>& top) {
+    const int num_loss_weights = layer_param_.loss_weight_size();
+    if (num_loss_weights) {
+      CHECK_EQ(top.size(), num_loss_weights) << "loss_weight must be "
+          "unspecified or specified once per top blob.";
+      for (int top_id = 0; top_id < top.size(); ++top_id) {
+        const Dtype loss_weight = layer_param_.loss_weight(top_id);
+        if (loss_weight == Dtype(0)) { continue; }
+        this->set_loss(top_id, loss_weight);
+        const int count = top[top_id]->count();
+        Dtype* loss_multiplier = top[top_id]->mutable_cpu_diff();
+        caffe_set(count, loss_weight, loss_multiplier);
+      }
+    }
+  }
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(Layer);
+};  // class Layer
+
+// Forward and backward wrappers. You should implement the cpu and
+// gpu specific implementations instead, and should not change these
+// functions.
+template <typename Dtype>
+inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  Dtype loss = 0;
+  Reshape(bottom, top);
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    Forward_cpu(bottom, top);
+    for (int top_id = 0; top_id < top.size(); ++top_id) {
+      if (!this->loss(top_id)) { continue; }
+      const int count = top[top_id]->count();
+      const Dtype* data = top[top_id]->cpu_data();
+      const Dtype* loss_weights = top[top_id]->cpu_diff();
+      loss += caffe_cpu_dot(count, data, loss_weights);
+    }
+    break;
+  case Caffe::GPU:
+    Forward_gpu(bottom, top);
+#ifndef CPU_ONLY
+    for (int top_id = 0; top_id < top.size(); ++top_id) {
+      if (!this->loss(top_id)) { continue; }
+      const int count = top[top_id]->count();
+      const Dtype* data = top[top_id]->gpu_data();
+      const Dtype* loss_weights = top[top_id]->gpu_diff();
+      Dtype blob_loss = 0;
+      caffe_gpu_dot(count, data, loss_weights, &blob_loss);
+      loss += blob_loss;
+    }
+#endif
+    break;
+  default:
+    LOG(FATAL) << "Unknown caffe mode.";
+  }
+  return loss;
+}
+
+template <typename Dtype>
+inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    Backward_cpu(top, propagate_down, bottom);
+    break;
+  case Caffe::GPU:
+    Backward_gpu(top, propagate_down, bottom);
+    break;
+  default:
+    LOG(FATAL) << "Unknown caffe mode.";
+  }
+}
+
+// Serialize LayerParameter to protocol buffer
+template <typename Dtype>
+void Layer<Dtype>::ToProto(LayerParameter* param, bool write_diff) {
+  param->Clear();
+  param->CopyFrom(layer_param_);
+  param->clear_blobs();
+  for (int i = 0; i < blobs_.size(); ++i) {
+    blobs_[i]->ToProto(param->add_blobs(), write_diff);
+  }
+}
+
+}  // namespace caffe
+
+#endif  // CAFFE_LAYER_H_
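
To make the contract above concrete, here is a hedged sketch of a minimal element-wise layer (ScaleByTwoLayer is hypothetical). Only Reshape, Forward_cpu and Backward_cpu must be implemented; the GPU variants fall back to the CPU ones by default.

    #include <vector>
    #include "caffe/layer.hpp"

    namespace caffe {

    template <typename Dtype>
    class ScaleByTwoLayer : public Layer<Dtype> {
     public:
      explicit ScaleByTwoLayer(const LayerParameter& param)
          : Layer<Dtype>(param) {}
      virtual inline const char* type() const { return "ScaleByTwo"; }
      virtual inline int ExactNumBottomBlobs() const { return 1; }
      virtual inline int ExactNumTopBlobs() const { return 1; }

      virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top) {
        top[0]->ReshapeLike(*bottom[0]);  // element-wise: same shape as input
      }

     protected:
      // y = 2x
      virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top) {
        const Dtype* x = bottom[0]->cpu_data();
        Dtype* y = top[0]->mutable_cpu_data();
        for (int i = 0; i < bottom[0]->count(); ++i) { y[i] = Dtype(2) * x[i]; }
      }
      // dE/dx = 2 * dE/dy
      virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
          const vector<bool>& propagate_down,
          const vector<Blob<Dtype>*>& bottom) {
        if (!propagate_down[0]) { return; }
        const Dtype* dy = top[0]->cpu_diff();
        Dtype* dx = bottom[0]->mutable_cpu_diff();
        for (int i = 0; i < top[0]->count(); ++i) { dx[i] = Dtype(2) * dy[i]; }
      }
    };

    }  // namespace caffe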

+ 141 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layer_factory.hpp

@@ -0,0 +1,141 @@
+/**
+ * @brief A layer factory that allows one to register layers.
 * During runtime, registered layers can be created by passing a LayerParameter
+ * protobuffer to the CreateLayer function:
+ *
+ *     LayerRegistry<Dtype>::CreateLayer(param);
+ *
+ * There are two ways to register a layer. Assuming that we have a layer like:
+ *
+ *   template <typename Dtype>
+ *   class MyAwesomeLayer : public Layer<Dtype> {
+ *     // your implementations
+ *   };
+ *
+ * and its type is its C++ class name, but without the "Layer" at the end
+ * ("MyAwesomeLayer" -> "MyAwesome").
+ *
+ * If the layer is going to be created simply by its constructor, in your c++
+ * file, add the following line:
+ *
+ *    REGISTER_LAYER_CLASS(MyAwesome);
+ *
+ * Or, if the layer is going to be created by another creator function, in the
+ * format of:
+ *
+ *    template <typename Dtype>
+ *    shared_ptr<Layer<Dtype> > GetMyAwesomeLayer(const LayerParameter& param) {
+ *      // your implementation
+ *    }
+ *
+ * (for example, when your layer has multiple backends, see GetConvolutionLayer
+ * for a use case), then you can register the creator function instead, like
+ *
+ * REGISTER_LAYER_CREATOR(MyAwesome, GetMyAwesomeLayer)
+ *
+ * Note that each layer type should only be registered once.
+ */
+
+#ifndef CAFFE_LAYER_FACTORY_H_
+#define CAFFE_LAYER_FACTORY_H_
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+template <typename Dtype>
+class Layer;
+
+template <typename Dtype>
+class LayerRegistry {
+ public:
+  typedef shared_ptr<Layer<Dtype> > (*Creator)(const LayerParameter&);
+  typedef std::map<string, Creator> CreatorRegistry;
+
+  static CreatorRegistry& Registry() {
+    static CreatorRegistry* g_registry_ = new CreatorRegistry();
+    return *g_registry_;
+  }
+
+  // Adds a creator.
+  static void AddCreator(const string& type, Creator creator) {
+    CreatorRegistry& registry = Registry();
+    CHECK_EQ(registry.count(type), 0)
+        << "Layer type " << type << " already registered.";
+    registry[type] = creator;
+  }
+
+  // Get a layer using a LayerParameter.
+  static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "Creating layer " << param.name();
+    }
+    const string& type = param.type();
+    CreatorRegistry& registry = Registry();
+    CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
+        << " (known types: " << LayerTypeListString() << ")";
+    return registry[type](param);
+  }
+
+  static vector<string> LayerTypeList() {
+    CreatorRegistry& registry = Registry();
+    vector<string> layer_types;
+    for (typename CreatorRegistry::iterator iter = registry.begin();
+         iter != registry.end(); ++iter) {
+      layer_types.push_back(iter->first);
+    }
+    return layer_types;
+  }
+
+ private:
+  // Layer registry should never be instantiated - everything is done with its
+  // static variables.
+  LayerRegistry() {}
+
+  static string LayerTypeListString() {
+    vector<string> layer_types = LayerTypeList();
+    string layer_types_str;
+    for (vector<string>::iterator iter = layer_types.begin();
+         iter != layer_types.end(); ++iter) {
+      if (iter != layer_types.begin()) {
+        layer_types_str += ", ";
+      }
+      layer_types_str += *iter;
+    }
+    return layer_types_str;
+  }
+};
+
+
+template <typename Dtype>
+class LayerRegisterer {
+ public:
+  LayerRegisterer(const string& type,
+                  shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&)) {
+    // LOG(INFO) << "Registering layer type: " << type;
+    LayerRegistry<Dtype>::AddCreator(type, creator);
+  }
+};
+
+
+#define REGISTER_LAYER_CREATOR(type, creator)                                  \
+  static LayerRegisterer<float> g_creator_f_##type(#type, creator<float>);     \
+  static LayerRegisterer<double> g_creator_d_##type(#type, creator<double>)    \
+
+#define REGISTER_LAYER_CLASS(type)                                             \
+  template <typename Dtype>                                                    \
+  shared_ptr<Layer<Dtype> > Creator_##type##Layer(const LayerParameter& param) \
+  {                                                                            \
+    return shared_ptr<Layer<Dtype> >(new type##Layer<Dtype>(param));           \
+  }                                                                            \
+  REGISTER_LAYER_CREATOR(type, Creator_##type##Layer)
+
+}  // namespace caffe
+
+#endif  // CAFFE_LAYER_FACTORY_H_
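
Continuing the hypothetical ScaleByTwoLayer sketch from layer.hpp above, registration in the layer's .cpp is two macro invocations, after which the type string is usable from prototxt and CreateLayer.

    #include "caffe/layer_factory.hpp"
    // plus the hypothetical ScaleByTwoLayer declaration from above

    namespace caffe {
    INSTANTIATE_CLASS(ScaleByTwoLayer);  // float and double instantiations
    REGISTER_LAYER_CLASS(ScaleByTwo);    // registers type string "ScaleByTwo"
    }  // namespace caffe

    // Later, anywhere:
    //   caffe::LayerParameter p;
    //   p.set_type("ScaleByTwo");
    //   caffe::shared_ptr<caffe::Layer<float> > layer =
    //       caffe::LayerRegistry<float>::CreateLayer(p);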

+ 68 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/absval_layer.hpp

@@ -0,0 +1,68 @@
+#ifndef CAFFE_ABSVAL_LAYER_HPP_
+#define CAFFE_ABSVAL_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes @f$ y = |x| @f$
+ *
+ * @param bottom input Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the inputs @f$ x @f$
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the computed outputs @f$ y = |x| @f$
+ */
+template <typename Dtype>
+class AbsValLayer : public NeuronLayer<Dtype> {
+ public:
+  explicit AbsValLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "AbsVal"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /// @copydoc AbsValLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the absolute value inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} =
+   *            \mathrm{sign}(x) \frac{\partial E}{\partial y}
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_ABSVAL_LAYER_HPP_
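
For reference, the forward pass of this layer is a single call into the math wrappers; the sketch below mirrors the upstream BVLC implementation of the CPU path (the vendored .cpp may differ in detail).

    #include "caffe/layers/absval_layer.hpp"
    #include "caffe/util/math_functions.hpp"

    namespace caffe {

    template <typename Dtype>
    void AbsValLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top) {
      const int count = top[0]->count();
      // y = |x|, element-wise.
      caffe_abs(count, bottom[0]->cpu_data(), top[0]->mutable_cpu_data());
    }

    }  // namespace caffe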

+ 99 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/accuracy_layer.hpp

@@ -0,0 +1,99 @@
+#ifndef CAFFE_ACCURACY_LAYER_HPP_
+#define CAFFE_ACCURACY_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes the classification accuracy for a one-of-many
+ *        classification task.
+ */
+template <typename Dtype>
+class AccuracyLayer : public Layer<Dtype> {
+ public:
+  /**
+   * @param param provides AccuracyParameter accuracy_param,
+   *     with AccuracyLayer options:
+   *   - top_k (\b optional, default 1).
+   *     Sets the maximum rank @f$ k @f$ at which a prediction is considered
+   *     correct.  For example, if @f$ k = 5 @f$, a prediction is counted
+   *     correct if the correct label is among the top 5 predicted labels.
+   */
+  explicit AccuracyLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Accuracy"; }
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+
+  // If there are two top blobs, then the second blob will contain
+  // accuracies per class.
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 2; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the predictions @f$ x @f$, a Blob with values in
+   *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
+   *      the @f$ K = CHW @f$ classes. Each @f$ x_n @f$ is mapped to a predicted
+   *      label @f$ \hat{l}_n @f$ given by its maximal index:
+   *      @f$ \hat{l}_n = \arg\max\limits_k x_{nk} @f$
+   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+   *      the labels @f$ l @f$, an integer-valued Blob with values
+   *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+   *      indicating the correct class label among the @f$ K @f$ classes
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      the computed accuracy: @f$
+   *        \frac{1}{N} \sum\limits_{n=1}^N \delta\{ \hat{l}_n = l_n \}
+   *      @f$, where @f$
+   *      \delta\{\mathrm{condition}\} = \left\{
+   *         \begin{array}{lr}
+   *            1 & \mbox{if condition} \\
+   *            0 & \mbox{otherwise}
+   *         \end{array} \right.
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+
+  /// @brief Not implemented -- AccuracyLayer cannot be used as a loss.
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    for (int i = 0; i < propagate_down.size(); ++i) {
+      if (propagate_down[i]) { NOT_IMPLEMENTED; }
+    }
+  }
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int label_axis_, outer_num_, inner_num_;
+
+  int top_k_;
+
+  /// Whether to ignore instances with a certain label.
+  bool has_ignore_label_;
+  /// The label indicating that an instance should be ignored.
+  int ignore_label_;
+  /// Keeps counts of the number of samples per class.
+  Blob<Dtype> nums_buffer_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_ACCURACY_LAYER_HPP_
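
The top_k rule documented above amounts to: a sample counts as correct if fewer than k classes score strictly higher than its true label. A self-contained sketch of that rule (not the layer's actual code path):

    #include <vector>

    // Fraction of samples whose true label is among the top-k scores.
    // scores is num x num_classes, row-major; labels has num entries.
    float topk_accuracy(const std::vector<float>& scores,
                        const std::vector<int>& labels,
                        int num_classes, int top_k) {
      int correct = 0;
      const int num = static_cast<int>(labels.size());
      for (int n = 0; n < num; ++n) {
        const float* row = &scores[n * num_classes];
        const float true_score = row[labels[n]];
        int beaten_by = 0;  // classes scoring strictly above the true label
        for (int c = 0; c < num_classes; ++c) {
          if (row[c] > true_score) { ++beaten_by; }
        }
        if (beaten_by < top_k) { ++correct; }
      }
      return num ? static_cast<float>(correct) / num : 0.f;
    }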

+ 77 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/argmax_layer.hpp

@@ -0,0 +1,77 @@
+#ifndef CAFFE_ARGMAX_LAYER_HPP_
+#define CAFFE_ARGMAX_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Compute the index of the @f$ K @f$ max values for each datum across
+ *        all dimensions @f$ (C \times H \times W) @f$.
+ *
+ * Intended for use after a classification layer to produce a prediction.
+ * If parameter out_max_val is set to true, output is a vector of pairs
+ * (max_ind, max_val) for each image. The axis parameter specifies an axis
+ * along which to maximise.
+ *
+ * NOTE: does not implement Backwards operation.
+ */
+template <typename Dtype>
+class ArgMaxLayer : public Layer<Dtype> {
+ public:
+  /**
+   * @param param provides ArgMaxParameter argmax_param,
+   *     with ArgMaxLayer options:
+   *   - top_k (\b optional uint, default 1).
+   *     the number @f$ K @f$ of maximal items to output.
+   *   - out_max_val (\b optional bool, default false).
+   *     if set, output a vector of pairs (max_ind, max_val); if axis is also
+   *     set, output max_val along the specified axis.
+   *   - axis (\b optional int).
+   *     if set, maximise along the specified axis; otherwise maximise over the
+   *     flattened trailing dimensions for each index of the first / num dimension.
+   */
+  explicit ArgMaxLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "ArgMax"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times 1 \times K) @f$ or, if out_max_val is set,
+   *      @f$ (N \times 2 \times K) @f$; if axis is set, then e.g.
+   *      @f$ (N \times K \times H \times W) @f$ for axis == 1
+   *      the computed outputs @f$
+   *       y_n = \arg\max\limits_i x_{ni}
+   *      @f$ (for @f$ K = 1 @f$).
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  /// @brief Not implemented (non-differentiable function)
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    NOT_IMPLEMENTED;
+  }
+  bool out_max_val_;
+  size_t top_k_;
+  bool has_axis_;
+  int axis_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_ARGMAX_LAYER_HPP_
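
With axis unset and top_k = 1, the layer reduces to a flat argmax over the trailing C*H*W scores of each datum; a hedged standalone sketch of that default case:

    #include <vector>

    // Per-datum argmax over dim flattened scores (axis unset, K = 1).
    std::vector<int> flat_argmax(const std::vector<float>& scores,
                                 int num, int dim) {
      std::vector<int> max_ind(num, 0);
      for (int n = 0; n < num; ++n) {
        const float* row = &scores[n * dim];
        int best = 0;
        for (int i = 1; i < dim; ++i) {
          if (row[i] > row[best]) { best = i; }
        }
        max_ind[n] = best;
      }
      return max_ind;
    }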

+ 174 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/base_conv_layer.hpp

@@ -0,0 +1,174 @@
+#ifndef CAFFE_BASE_CONVOLUTION_LAYER_HPP_
+#define CAFFE_BASE_CONVOLUTION_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/im2col.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Abstract base class that factors out the BLAS code common to
+ *        ConvolutionLayer and DeconvolutionLayer.
+ */
+template <typename Dtype>
+class BaseConvolutionLayer : public Layer<Dtype> {
+ public:
+  explicit BaseConvolutionLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline int MinBottomBlobs() const { return 1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline bool EqualNumBottomTopBlobs() const { return true; }
+
+ protected:
+  // Helper functions that abstract away the column buffer and gemm arguments.
+  // The last argument in forward_cpu_gemm is so that we can skip the im2col if
+  // we just called weight_cpu_gemm with the same input.
+  void forward_cpu_gemm(const Dtype* input, const Dtype* weights,
+      Dtype* output, bool skip_im2col = false);
+  void forward_cpu_bias(Dtype* output, const Dtype* bias);
+  void backward_cpu_gemm(const Dtype* input, const Dtype* weights,
+      Dtype* output);
+  void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype*
+      weights);
+  void backward_cpu_bias(Dtype* bias, const Dtype* input);
+
+#ifndef CPU_ONLY
+  void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
+      Dtype* output, bool skip_im2col = false);
+  void forward_gpu_bias(Dtype* output, const Dtype* bias);
+  void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
+      Dtype* col_output);
+  void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype*
+      weights);
+  void backward_gpu_bias(Dtype* bias, const Dtype* input);
+#endif
+
+  /// @brief The spatial dimensions of the input.
+  inline int input_shape(int i) {
+    return (*bottom_shape_)[channel_axis_ + i];
+  }
+  // reverse_dimensions should return true iff we are implementing deconv, so
+  // that conv helpers know which dimensions are which.
+  virtual bool reverse_dimensions() = 0;
+  // Compute height_out_ and width_out_ from other parameters.
+  virtual void compute_output_shape() = 0;
+
+  /// @brief The spatial dimensions of a filter kernel.
+  Blob<int> kernel_shape_;
+  /// @brief The spatial dimensions of the stride.
+  Blob<int> stride_;
+  /// @brief The spatial dimensions of the padding.
+  Blob<int> pad_;
+  /// @brief The spatial dimensions of the dilation.
+  Blob<int> dilation_;
+  /// @brief The spatial dimensions of the convolution input.
+  Blob<int> conv_input_shape_;
+  /// @brief The spatial dimensions of the col_buffer.
+  vector<int> col_buffer_shape_;
+  /// @brief The spatial dimensions of the output.
+  vector<int> output_shape_;
+  const vector<int>* bottom_shape_;
+
+  int num_spatial_axes_;
+  int bottom_dim_;
+  int top_dim_;
+
+  int channel_axis_;
+  int num_;
+  int channels_;
+  int group_;
+  int out_spatial_dim_;
+  int weight_offset_;
+  int num_output_;
+  bool bias_term_;
+  bool is_1x1_;
+  bool force_nd_im2col_;
+
+ private:
+  // wrap im2col/col2im so we don't have to remember the (long) argument lists
+  inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      im2col_cpu(data, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
+    } else {
+      im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),
+          col_buffer_shape_.data(), kernel_shape_.cpu_data(),
+          pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), col_buff);
+    }
+  }
+  inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      col2im_cpu(col_buff, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
+    } else {
+      col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(),
+          col_buffer_shape_.data(), kernel_shape_.cpu_data(),
+          pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), data);
+    }
+  }
+#ifndef CPU_ONLY
+  inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      im2col_gpu(data, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
+    } else {
+      im2col_nd_gpu(data, num_spatial_axes_, num_kernels_im2col_,
+          conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
+          kernel_shape_.gpu_data(), pad_.gpu_data(),
+          stride_.gpu_data(), dilation_.gpu_data(), col_buff);
+    }
+  }
+  inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      col2im_gpu(col_buff, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
+    } else {
+      col2im_nd_gpu(col_buff, num_spatial_axes_, num_kernels_col2im_,
+          conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
+          kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
+          dilation_.gpu_data(), data);
+    }
+  }
+#endif
+
+  int num_kernels_im2col_;
+  int num_kernels_col2im_;
+  int conv_out_channels_;
+  int conv_in_channels_;
+  int conv_out_spatial_dim_;
+  int kernel_dim_;
+  int col_offset_;
+  int output_offset_;
+
+  Blob<Dtype> col_buffer_;
+  Blob<Dtype> bias_multiplier_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_BASE_CONVOLUTION_LAYER_HPP_
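
The conv_im2col_cpu()/conv_col2im_cpu() wrappers above hide the core trick: every filter-sized input window is unrolled into a column, so convolution reduces to a single GEMM. A simplified 2D sketch of the same transformation (illustrative only, not Caffe's optimized im2col_cpu):

```cpp
// Simplified 2D im2col: each output column holds one filter-sized input
// window; convolution then becomes a (num_output x C*kh*kw) GEMM against
// this buffer. Illustrative sketch only.
void im2col_simple(const float* data, int channels, int height, int width,
                   int kh, int kw, int pad, int stride, float* col) {
  const int out_h = (height + 2 * pad - kh) / stride + 1;
  const int out_w = (width + 2 * pad - kw) / stride + 1;
  for (int c = 0; c < channels; ++c)
    for (int ki = 0; ki < kh; ++ki)
      for (int kj = 0; kj < kw; ++kj)           // one row per (c, ki, kj)
        for (int oi = 0; oi < out_h; ++oi)
          for (int oj = 0; oj < out_w; ++oj) {  // one column per output pixel
            const int ii = oi * stride - pad + ki;
            const int jj = oj * stride - pad + kj;
            *col++ = (ii >= 0 && ii < height && jj >= 0 && jj < width)
                         ? data[(c * height + ii) * width + jj]
                         : 0.0f;                // implicit zero padding
          }
}
```

With the column buffer laid out this way, forward_cpu_gemm() amounts to one matrix product of the (num_output x kernel_dim_) weights with the (kernel_dim_ x out_spatial_dim_) buffer.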

+ 82 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/base_data_layer.hpp

@@ -0,0 +1,82 @@
+#ifndef CAFFE_DATA_LAYERS_HPP_
+#define CAFFE_DATA_LAYERS_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/data_transformer.hpp"
+#include "caffe/internal_thread.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/blocking_queue.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Provides base for data layers that feed blobs to the Net.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class BaseDataLayer : public Layer<Dtype> {
+ public:
+  explicit BaseDataLayer(const LayerParameter& param);
+  // LayerSetUp: implements common data layer setup functionality, and calls
+  // DataLayerSetUp to do special data layer setup for individual layer types.
+  // This method may not be overridden except by the BasePrefetchingDataLayer.
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+  // Data layers have no bottoms, so reshaping is trivial.
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+
+ protected:
+  TransformationParameter transform_param_;
+  shared_ptr<DataTransformer<Dtype> > data_transformer_;
+  bool output_labels_;
+};
+
+template <typename Dtype>
+class Batch {
+ public:
+  Blob<Dtype> data_, label_;
+};
+
+template <typename Dtype>
+class BasePrefetchingDataLayer :
+    public BaseDataLayer<Dtype>, public InternalThread {
+ public:
+  explicit BasePrefetchingDataLayer(const LayerParameter& param);
+  // LayerSetUp: implements common data layer setup functionality, and calls
+  // DataLayerSetUp to do special data layer setup for individual layer types.
+  // This method may not be overridden.
+  void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+ protected:
+  virtual void InternalThreadEntry();
+  virtual void load_batch(Batch<Dtype>* batch) = 0;
+
+  vector<shared_ptr<Batch<Dtype> > > prefetch_;
+  BlockingQueue<Batch<Dtype>*> prefetch_free_;
+  BlockingQueue<Batch<Dtype>*> prefetch_full_;
+  Batch<Dtype>* prefetch_current_;
+
+  Blob<Dtype> transformed_data_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DATA_LAYERS_HPP_
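
The prefetch_free_/prefetch_full_ pair above implements a classic double-buffered producer/consumer: the internal thread pops an empty Batch from the free queue, fills it, and pushes it onto the full queue, while Forward does the reverse. A stand-alone sketch of that pattern using std:: primitives (the names here are hypothetical stand-ins, not Caffe's BlockingQueue):

```cpp
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

// Minimal blocking queue standing in for caffe::BlockingQueue.
template <typename T>
class SimpleBlockingQueue {
 public:
  void push(T t) {
    { std::lock_guard<std::mutex> g(m_); q_.push(std::move(t)); }
    cv_.notify_one();
  }
  T pop() {
    std::unique_lock<std::mutex> l(m_);
    cv_.wait(l, [this] { return !q_.empty(); });
    T t = std::move(q_.front());
    q_.pop();
    return t;
  }
 private:
  std::mutex m_;
  std::condition_variable cv_;
  std::queue<T> q_;
};

struct Batch { std::vector<float> data; };

int main() {
  SimpleBlockingQueue<Batch*> free_q, full_q;
  std::vector<Batch> pool(3);               // fixed buffer pool, like prefetch_
  for (Batch& b : pool) free_q.push(&b);    // all buffers start out free

  std::thread producer([&] {                // InternalThreadEntry analogue
    for (int i = 0; i < 5; ++i) {
      Batch* b = free_q.pop();              // wait for a free buffer
      b->data.assign(4, static_cast<float>(i));  // load_batch analogue
      full_q.push(b);                       // hand it to the consumer
    }
  });

  for (int i = 0; i < 5; ++i) {             // Forward analogue
    Batch* b = full_q.pop();                // wait for a loaded buffer
    std::cout << "batch " << b->data[0] << '\n';
    free_q.push(b);                         // recycle the buffer
  }
  producer.join();
}
```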

+ 78 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/batch_norm_layer.hpp

@@ -0,0 +1,78 @@
+#ifndef CAFFE_BATCHNORM_LAYER_HPP_
+#define CAFFE_BATCHNORM_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Normalizes the input to have 0-mean and/or unit (1) variance across
+ *        the batch.
+ *
+ * This layer computes Batch Normalization as described in [1]. For each channel
+ * in the data (i.e. axis 1), it subtracts the mean and divides by the variance,
+ * where both statistics are computed across both spatial dimensions and across
+ * the different examples in the batch.
+ *
+ * By default, during training time, the network is computing global
+ * mean/variance statistics via a running average, which is then used at test
+ * time to allow deterministic outputs for each input. You can manually toggle
+ * whether the network is accumulating or using the statistics via the
+ * use_global_stats option. For reference, these statistics are kept in the
+ * layer's three blobs: (0) mean, (1) variance, and (2) moving average factor.
+ *
+ * Note that the original paper also included a per-channel learned bias and
+ * scaling factor. To implement this in Caffe, define a `ScaleLayer` configured
+ * with `bias_term: true` after each `BatchNormLayer` to handle both the bias
+ * and scaling factor.
+ *
+ * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network
+ *     Training by Reducing Internal Covariate Shift." arXiv preprint
+ *     arXiv:1502.03167 (2015).
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class BatchNormLayer : public Layer<Dtype> {
+ public:
+  explicit BatchNormLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "BatchNorm"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<Dtype> mean_, variance_, temp_, x_norm_;
+  bool use_global_stats_;
+  Dtype moving_average_fraction_;
+  int channels_;
+  Dtype eps_;
+
+  // extra temporary variables are used to carry out sums/broadcasting
+  // using BLAS
+  Blob<Dtype> batch_sum_multiplier_;
+  Blob<Dtype> num_by_chans_;
+  Blob<Dtype> spatial_sum_multiplier_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_BATCHNORM_LAYER_HPP_
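
As a concrete reference for the description above, a minimal sketch of the per-channel normalization, with statistics taken over N, H, and W for each channel. The learned scale/bias is left to a following ScaleLayer, and the running-average bookkeeping is omitted:

```cpp
#include <cmath>
#include <vector>

// Normalize x (layout N x C x HW) in place: zero mean, unit variance
// per channel. Illustrative sketch only.
void batch_norm(std::vector<float>& x, int N, int C, int HW,
                float eps = 1e-5f) {
  for (int c = 0; c < C; ++c) {
    const int count = N * HW;
    double mean = 0.0, var = 0.0;
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < HW; ++s)
        mean += x[(n * C + c) * HW + s];
    mean /= count;
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < HW; ++s) {
        const double d = x[(n * C + c) * HW + s] - mean;
        var += d * d;
      }
    var /= count;
    const double inv_std = 1.0 / std::sqrt(var + eps);
    for (int n = 0; n < N; ++n)
      for (int s = 0; s < HW; ++s) {
        float& v = x[(n * C + c) * HW + s];
        v = static_cast<float>((v - mean) * inv_std);
      }
  }
}
```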

+ 83 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/batch_reindex_layer.hpp

@@ -0,0 +1,83 @@
+#ifndef CAFFE_BATCHREINDEX_LAYER_HPP_
+#define CAFFE_BATCHREINDEX_LAYER_HPP_
+
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Index into the input blob along its first axis.
+ *
+ * This layer can be used to select, reorder, and even replicate examples in a
+ * batch.  The second blob is cast to int and treated as an index into the
+ * first axis of the first blob.
+ */
+template <typename Dtype>
+class BatchReindexLayer : public Layer<Dtype> {
+ public:
+  explicit BatchReindexLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "BatchReindex"; }
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times ...) @f$
+   *      the inputs @f$ x_1 @f$
+   *   -# @f$ (M) @f$
+   *      the inputs @f$ x_2 @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (M \times ...) @f$:
+   *      the reindexed array @f$
+   *        y = x_1[x_2]
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the reordered input.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient
+   *        with respect to the outputs
+   *   -# @f$ (M \times ...) @f$:
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to concatenated outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2):
+   *   - @f$ \frac{\partial E}{\partial y} @f$ is de-indexed (summing where
+   *     required) back to the input x_1
+   *   - This layer cannot backprop to x_2, i.e. propagate_down[1] must be
+   *     false.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+ private:
+  struct pair_sort_first {
+    bool operator()(const std::pair<int, int> &left,
+                    const std::pair<int, int> &right) {
+      return left.first < right.first;
+    }
+  };
+  void check_batch_reindex(int initial_num, int final_num,
+                           const Dtype* ridx_data);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_BATCHREINDEX_LAYER_HPP_
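
The forward operation y = x1[x2] amounts to a gather along the first axis; a small sketch (illustrative only):

```cpp
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

// Gather rows of x1 (n rows of `inner` elements each) at positions idx.
std::vector<float> batch_reindex(const std::vector<float>& x1,
                                 int n, int inner,
                                 const std::vector<int>& idx) {
  std::vector<float> y(idx.size() * inner);
  for (std::size_t m = 0; m < idx.size(); ++m) {
    assert(idx[m] >= 0 && idx[m] < n);  // check_batch_reindex analogue
    for (int i = 0; i < inner; ++i)
      y[m * inner + i] = x1[idx[m] * inner + i];
  }
  return y;
}

int main() {
  // Two examples of length 2; select example 1 twice, then example 0.
  std::vector<float> x1 = {1, 2, 3, 4};
  std::vector<float> y = batch_reindex(x1, 2, 2, {1, 1, 0});
  for (float v : y) std::cout << v << ' ';  // prints "3 4 3 4 1 2"
  std::cout << '\n';
}
```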

+ 54 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/bias_layer.hpp

@@ -0,0 +1,54 @@
+#ifndef CAFFE_BIAS_LAYER_HPP_
+#define CAFFE_BIAS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Computes a sum of two input Blobs, with the shape of the latter Blob
+ *        "broadcast" to match the shape of the former. Equivalent to tiling
+ *        the latter Blob, then computing the elementwise sum.
+ *
+ * The second input may be omitted, in which case it's learned as a parameter
+ * of the layer. Note: in case bias and scaling are desired, both operations can
+ * be handled by `ScaleLayer` configured with `bias_term: true`.
+ */
+template <typename Dtype>
+class BiasLayer : public Layer<Dtype> {
+ public:
+  explicit BiasLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Bias"; }
+  virtual inline int MinBottomBlobs() const { return 1; }
+  virtual inline int MaxBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+ private:
+  Blob<Dtype> bias_multiplier_;
+  int outer_dim_, bias_dim_, inner_dim_, dim_;
+};
+
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_BIAS_LAYER_HPP_
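
A sketch of the broadcast addition described above, using the same outer/bias/inner decomposition as the layer's outer_dim_, bias_dim_, and inner_dim_ members (illustrative only):

```cpp
#include <vector>

// Add a length-`bias_dim` bias across an (outer x bias_dim x inner) blob,
// equivalent to tiling the bias and summing elementwise.
void add_bias(std::vector<float>& x, const std::vector<float>& bias,
              int outer, int bias_dim, int inner) {
  for (int o = 0; o < outer; ++o)
    for (int c = 0; c < bias_dim; ++c)
      for (int i = 0; i < inner; ++i)
        x[(o * bias_dim + c) * inner + i] += bias[c];
}
```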

+ 70 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/bnll_layer.hpp

@@ -0,0 +1,70 @@
+#ifndef CAFFE_BNLL_LAYER_HPP_
+#define CAFFE_BNLL_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes @f$ y = x + \log(1 + \exp(-x)) @f$ if @f$ x > 0 @f$;
+ *        @f$ y = \log(1 + \exp(x)) @f$ otherwise.
+ *
+ * @param bottom input Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the inputs @f$ x @f$
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the computed outputs @f$
+ *      y = \left\{
+ *         \begin{array}{ll}
+ *            x + \log(1 + \exp(-x)) & \mbox{if } x > 0 \\
+ *            \log(1 + \exp(x)) & \mbox{otherwise}
+ *         \end{array} \right.
+ *      @f$
+ */
+template <typename Dtype>
+class BNLLLayer : public NeuronLayer<Dtype> {
+ public:
+  explicit BNLLLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "BNLL"; }
+
+ protected:
+  /// @copydoc BNLLLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the BNLL inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x}
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_BNLL_LAYER_HPP_
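
The two branches in the formula above are the standard numerically stable evaluation of softplus, log(1 + exp(x)): for large positive x, exp(x) overflows, while the algebraically equal x + log(1 + exp(-x)) stays finite. A minimal sketch:

```cpp
#include <cmath>
#include <cstdio>

// Numerically stable softplus, matching the BNLL forward definition above.
double bnll(double x) {
  return x > 0.0 ? x + std::log1p(std::exp(-x))  // exp(-x) cannot overflow
                 : std::log1p(std::exp(x));      // exp(x) <= 1 here
}

int main() {
  // Naive log(1 + exp(800.0)) evaluates to inf in double precision;
  // the stable form returns the correct value.
  std::printf("%g %g %g\n", bnll(-800.0), bnll(0.0), bnll(800.0));
  // prints approximately: 0 0.693147 800
}
```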

+ 75 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/clip_layer.hpp

@@ -0,0 +1,75 @@
+#ifndef CAFFE_CLIP_LAYER_HPP_
+#define CAFFE_CLIP_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Clip: @f$ y = \max(min, \min(max, x)) @f$.
+ */
+template <typename Dtype>
+class ClipLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides ClipParameter clip_param,
+   *     with ClipLayer options:
+   *   - min
+   *   - max
+   */
+  explicit ClipLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "Clip"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = \max(min, \min(max, x))
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the clipped inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} = \left\{
+   *        \begin{array}{lr}
+   *            0 & \mathrm{if} \; x < min \vee x > max \\
+   *            \frac{\partial E}{\partial y} & \mathrm{if} \; x \ge min \wedge x \le max
+   *        \end{array} \right.
+   *      @f$
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_CLIP_LAYER_HPP_
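
For reference, the forward rule and the gradient rule documented above fit in a few lines (a sketch, not the Caffe kernels):

```cpp
#include <algorithm>

// y = max(min, min(max, x))
float clip_forward(float x, float lo, float hi) {
  return std::max(lo, std::min(hi, x));
}

// dE/dx = dE/dy inside [lo, hi]; 0 where the input was clipped.
float clip_backward(float x, float dy, float lo, float hi) {
  return (x >= lo && x <= hi) ? dy : 0.0f;
}
```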

+ 87 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/concat_layer.hpp

@@ -0,0 +1,87 @@
+#ifndef CAFFE_CONCAT_LAYER_HPP_
+#define CAFFE_CONCAT_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Takes at least two Blob%s and concatenates them along either the num
+ *        or channel dimension, outputting the result.
+ */
+template <typename Dtype>
+class ConcatLayer : public Layer<Dtype> {
+ public:
+  explicit ConcatLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Concat"; }
+  virtual inline int MinBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 2+)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x_1 @f$
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x_2 @f$
+   *   -# ...
+   *   - K @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x_K @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
+   *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
+   *      the concatenated output @f$
+   *        y = [\begin{array}{cccc} x_1 & x_2 & ... & x_K \end{array}]
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the concatenate inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *        respect to the outputs
+   *   -# @f$ (KN \times C \times H \times W) @f$ if axis == 0, or
+   *      @f$ (N \times KC \times H \times W) @f$ if axis == 1:
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to concatenated outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length K), into which the top gradient
+   *        @f$ \frac{\partial E}{\partial y} @f$ is deconcatenated back to the
+   *        inputs @f$
+   *        \left[ \begin{array}{cccc}
+   *          \frac{\partial E}{\partial x_1} &
+   *          \frac{\partial E}{\partial x_2} &
+   *          ... &
+   *          \frac{\partial E}{\partial x_K}
+   *        \end{array} \right] =
+   *        \frac{\partial E}{\partial y}
+   *        @f$
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int count_;
+  int num_concats_;
+  int concat_input_size_;
+  int concat_axis_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_CONCAT_LAYER_HPP_
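
The bookkeeping ConcatLayer performs reduces to offset copies; a sketch for the axis == 1 (channel) case with two inputs (illustrative only):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Concatenate a (N x Ca x HW) and b (N x Cb x HW) along the channel axis.
std::vector<float> concat_channels(const std::vector<float>& a, int Ca,
                                   const std::vector<float>& b, int Cb,
                                   int N, int HW) {
  std::vector<float> y(static_cast<std::size_t>(N) * (Ca + Cb) * HW);
  for (int n = 0; n < N; ++n) {
    // For each example n: a's channels first, then b's channels.
    std::copy(a.begin() + n * Ca * HW, a.begin() + (n + 1) * Ca * HW,
              y.begin() + n * (Ca + Cb) * HW);
    std::copy(b.begin() + n * Cb * HW, b.begin() + (n + 1) * Cb * HW,
              y.begin() + n * (Ca + Cb) * HW + Ca * HW);
  }
  return y;
}
```

The backward pass runs the same copies in the opposite direction, splitting the top gradient back into per-input slices.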

+ 101 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/contrastive_loss_layer.hpp

@@ -0,0 +1,101 @@
+#ifndef CAFFE_CONTRASTIVE_LOSS_LAYER_HPP_
+#define CAFFE_CONTRASTIVE_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes the contrastive loss @f$
+ *          E = \frac{1}{2N} \sum\limits_{n=1}^N \left(y\right) d^2 +
+ *              \left(1-y\right) \max \left(margin-d, 0\right)^2
+ *          @f$ where @f$
+ *          d = \left| \left| a_n - b_n \right| \right|_2 @f$. This can be
+ *          used to train siamese networks.
+ *
+ * @param bottom input Blob vector (length 3)
+ *   -# @f$ (N \times C \times 1 \times 1) @f$
+ *      the features @f$ a \in [-\infty, +\infty]@f$
+ *   -# @f$ (N \times C \times 1 \times 1) @f$
+ *      the features @f$ b \in [-\infty, +\infty]@f$
+ *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+ *      the binary similarity @f$ y \in \{0, 1\} @f$ (1 for similar pairs)
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+ *      the computed contrastive loss: @f$ E =
+ *          \frac{1}{2N} \sum\limits_{n=1}^N \left(y\right) d^2 +
+ *          \left(1-y\right) \max \left(margin-d, 0\right)^2
+ *          @f$ where @f$
+ *          d = \left| \left| a_n - b_n \right| \right|_2 @f$.
+ * This can be used to train siamese networks.
+ */
+template <typename Dtype>
+class ContrastiveLossLayer : public LossLayer<Dtype> {
+ public:
+  explicit ContrastiveLossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param), diff_() {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline int ExactNumBottomBlobs() const { return 3; }
+  virtual inline const char* type() const { return "ContrastiveLoss"; }
+  /**
+   * Unlike most loss layers, in the ContrastiveLossLayer we can backpropagate
+   * to the first two inputs.
+   */
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    return bottom_index != 2;
+  }
+
+ protected:
+  /// @copydoc ContrastiveLossLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the Contrastive error gradient w.r.t. the inputs.
+   *
+   * Computes the gradients with respect to the two input vectors (bottom[0] and
+   * bottom[1]), but not the similarity label (bottom[2]).
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+   *      as @f$ \lambda @f$ is the coefficient of this layer's output
+   *      @f$\ell_i@f$ in the overall Net loss
+   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+   *      (*Assuming that this top Blob is not used as a bottom (input) by any
+   *      other layer of the Net.)
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times 1 \times 1) @f$
+   *      the features @f$a@f$; Backward fills their diff with
+   *      gradients if propagate_down[0]
+   *   -# @f$ (N \times C \times 1 \times 1) @f$
+   *      the features @f$b@f$; Backward fills their diff with gradients if
+   *      propagate_down[1]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<Dtype> diff_;  // cached for backward pass
+  Blob<Dtype> dist_sq_;  // cached for backward pass
+  Blob<Dtype> diff_sq_;  // tmp storage for gpu forward pass
+  Blob<Dtype> summer_vec_;  // tmp storage for gpu forward pass
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_CONTRASTIVE_LOSS_LAYER_HPP_
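
A worked sketch of the loss above over a batch of feature pairs (y = 1 for similar pairs, 0 for dissimilar; illustrative only):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// E = (1/2N) * sum_n [ y_n * d^2 + (1 - y_n) * max(margin - d, 0)^2 ],
// where d = ||a_n - b_n||_2.
double contrastive_loss(const std::vector<std::vector<double> >& a,
                        const std::vector<std::vector<double> >& b,
                        const std::vector<int>& y, double margin) {
  double loss = 0.0;
  const std::size_t N = y.size();
  for (std::size_t n = 0; n < N; ++n) {
    double d2 = 0.0;
    for (std::size_t c = 0; c < a[n].size(); ++c) {
      const double diff = a[n][c] - b[n][c];
      d2 += diff * diff;
    }
    const double d = std::sqrt(d2);
    loss += y[n] ? d2                                       // pull similar pairs together
                 : std::pow(std::max(margin - d, 0.0), 2);  // push dissimilar apart
  }
  return loss / (2.0 * N);
}
```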

+ 84 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/conv_layer.hpp

@@ -0,0 +1,84 @@
+#ifndef CAFFE_CONV_LAYER_HPP_
+#define CAFFE_CONV_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/base_conv_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Convolves the input image with a bank of learned filters,
+ *        and (optionally) adds biases.
+ *
+ *   Caffe convolves by reduction to matrix multiplication. This achieves
+ *   high-throughput and generality of input and filter dimensions but comes at
+ *   the cost of memory for matrices. This makes use of efficiency in BLAS.
+ *
+ *   The input is "im2col" transformed into a K' x (H x W) data matrix for
+ *   multiplication with the N' x K' filter matrix to yield an
+ *   N' x (H x W) output matrix that is then "col2im" restored. K' is the
+ *   input channels * kernel height * kernel width dimension of the unrolled
+ *   inputs, so that the im2col matrix has a column for each input region to
+ *   be filtered. col2im restores the output spatial structure by rolling the
+ *   N' output-channel rows of the output matrix back into N' x H x W.
+ */
+template <typename Dtype>
+class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
+ public:
+  /**
+   * @param param provides ConvolutionParameter convolution_param,
+   *    with ConvolutionLayer options:
+   *  - num_output. The number of filters.
+   *  - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
+   *  kernel_size for square filters or kernel_h and kernel_w for rectangular
+   *  filters.
+   *  - stride / stride_h / stride_w (\b optional, default 1). The filter
+   *  stride, given by stride_size for equal dimensions or stride_h and stride_w
+   *  for different strides. By default the convolution is dense with stride 1.
+   *  - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
+   *  convolution, given by pad for equal dimensions or pad_h and pad_w for
+   *  different padding. Input padding is handled implicitly rather than by
+   *  actually padding the input.
+   *  - dilation (\b optional, default 1). The filter
+   *  dilation, given by dilation for equal dimensions or per-dimension
+   *  values otherwise. By default the convolution has dilation 1.
+   *  - group (\b optional, default 1). The number of filter groups. Group
+   *  convolution is a method for reducing parameterization by selectively
+   *  connecting input and output channels. The input and output channel dimensions must be divisible
+   *  by the number of groups. For group @f$ \geq 1 @f$, the
+   *  convolutional filters' input and output channels are separated s.t. each
+   *  group takes 1 / group of the input channels and makes 1 / group of the
+   *  output channels. Concretely 4 input channels, 8 output channels, and
+   *  2 groups separate input channels 1-2 and output channels 1-4 into the
+   *  first group and input channels 3-4 and output channels 5-8 into the second
+   *  group.
+   *  - bias_term (\b optional, default true). Whether to have a bias.
+   *  - engine: convolution has CAFFE (matrix multiplication) and CUDNN (library
+   *    kernels + stream parallelism) engines.
+   */
+  explicit ConvolutionLayer(const LayerParameter& param)
+      : BaseConvolutionLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "Convolution"; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual inline bool reverse_dimensions() { return false; }
+  virtual void compute_output_shape();
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_CONV_LAYER_HPP_
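
For reference, the output spatial extent that compute_output_shape() must produce per dimension follows the usual convolution arithmetic; a sketch covering the pad/stride/dilation options listed above:

```cpp
// Output spatial extent of a dilated convolution, per dimension;
// the deconvolution case inverts this formula.
int conv_out_dim(int in, int kernel, int pad, int stride, int dilation) {
  const int k_eff = dilation * (kernel - 1) + 1;  // effective kernel extent
  return (in + 2 * pad - k_eff) / stride + 1;
}
// e.g. conv_out_dim(224, 3, 1, 1, 1) == 224  (3x3 "same" convolution)
//      conv_out_dim(224, 3, 1, 2, 1) == 112  (stride-2 downsampling)
```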

+ 78 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/crop_layer.hpp

@@ -0,0 +1,78 @@
+#ifndef CAFFE_CROP_LAYER_HPP_
+#define CAFFE_CROP_LAYER_HPP_
+
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Takes a Blob and crops it to the shape specified by the second input
+ *  Blob, across all dimensions after the specified axis.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+
+template <typename Dtype>
+class CropLayer : public Layer<Dtype> {
+ public:
+  explicit CropLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Crop"; }
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<int> offsets;
+  Blob<int> src_strides_;
+  Blob<int> dest_strides_;
+
+ private:
+  // Recursive copy function.
+  void crop_copy(const vector<Blob<Dtype>*>& bottom,
+               const vector<Blob<Dtype>*>& top,
+               const int* offsets,
+               vector<int> indices,
+               int cur_dim,
+               const Dtype* src_data,
+               Dtype* dest_data,
+               bool is_forward);
+
+  // Recursive copy function: this is similar to crop_copy() but loops over all
+  // but the last two dimensions to allow for ND cropping while still relying on
+  // a CUDA kernel for the innermost two dimensions for performance reasons.  An
+  // alternative implementation could rely on the kernel more by passing
+  // offsets, but this is problematic because of its variable length.
+  // Since in the standard (N,C,H,W) case N and C are usually not cropped, a
+  // speedup could be achieved by not looping the application of the
+  // copy_kernel around these dimensions.
+  void crop_copy_gpu(const vector<Blob<Dtype>*>& bottom,
+                const vector<Blob<Dtype>*>& top,
+                const vector<int>& offsets,
+                vector<int> indices,
+                int cur_dim,
+                const Dtype* src_data,
+                Dtype* dest_data,
+                bool is_forward);
+};
+}  // namespace caffe
+
+#endif  // CAFFE_CROP_LAYER_HPP_

+ 72 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_conv_layer.hpp

@@ -0,0 +1,72 @@
+#ifndef CAFFE_CUDNN_CONV_LAYER_HPP_
+#define CAFFE_CUDNN_CONV_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/conv_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/*
+ * @brief cuDNN implementation of ConvolutionLayer.
+ *        Fallback to ConvolutionLayer for CPU mode.
+ *
+ * cuDNN accelerates convolution through forward kernels for filtering and bias
+ * plus backward kernels for the gradient w.r.t. the filters, biases, and
+ * inputs. Caffe + cuDNN further speeds up the computation through forward
+ * parallelism across groups and backward parallelism across gradients.
+ *
+ * The CUDNN engine does not have memory overhead for matrix buffers. For many
+ * input and filter regimes the CUDNN engine is faster than the CAFFE engine,
+ * but for fully-convolutional models and large inputs the CAFFE engine can be
+ * faster as long as it fits in memory.
+*/
+template <typename Dtype>
+class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
+ public:
+  explicit CuDNNConvolutionLayer(const LayerParameter& param)
+      : ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNConvolutionLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t* handle_;
+  cudaStream_t*  stream_;
+
+  // algorithms for forward and backwards convolutions
+  cudnnConvolutionFwdAlgo_t *fwd_algo_;
+  cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_;
+  cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_;
+
+  vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
+  cudnnTensorDescriptor_t    bias_desc_;
+  cudnnFilterDescriptor_t      filter_desc_;
+  vector<cudnnConvolutionDescriptor_t> conv_descs_;
+  int bottom_offset_, top_offset_, bias_offset_;
+
+  size_t *workspace_fwd_sizes_;
+  size_t *workspace_bwd_data_sizes_;
+  size_t *workspace_bwd_filter_sizes_;
+  size_t workspaceSizeInBytes;  // size of underlying storage
+  void *workspaceData;  // underlying storage
+  void **workspace;  // aliases into workspaceData
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_CONV_LAYER_HPP_

+ 68 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_deconv_layer.hpp

@@ -0,0 +1,68 @@
+#ifndef CAFFE_CUDNN_DECONV_LAYER_HPP_
+#define CAFFE_CUDNN_DECONV_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/deconv_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/*
+ * @brief cuDNN implementation of DeConvolutionLayer.
+ *        Fallback to DeConvolutionLayer for CPU mode.
+ *
+ * cuDNN accelerates deconvolution through forward kernels for filtering and
+ * bias plus backward kernels for the gradient w.r.t. the filters, biases, and
+ * inputs. Caffe + cuDNN further speeds up the computation through forward
+ * parallelism across groups and backward parallelism across gradients.
+*/
+template <typename Dtype>
+class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
+ public:
+  explicit CuDNNDeconvolutionLayer(const LayerParameter& param)
+    : DeconvolutionLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+                          const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+                       const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNDeconvolutionLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+                           const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+                            const vector<bool>& propagate_down,
+                            const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t* handle_;
+  cudaStream_t*  stream_;
+
+  // algorithms for forward and backwards convolutions
+  cudnnConvolutionFwdAlgo_t *fwd_algo_;
+  cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_;
+  cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_;
+
+  vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
+  cudnnTensorDescriptor_t bias_desc_;
+  cudnnFilterDescriptor_t filter_desc_;
+  vector<cudnnConvolutionDescriptor_t> conv_descs_;
+  int bottom_offset_, top_offset_, bias_offset_;
+
+  size_t *workspace_fwd_sizes_;
+  size_t *workspace_bwd_data_sizes_;
+  size_t *workspace_bwd_filter_sizes_;
+  size_t workspaceSizeInBytes;  // size of underlying storage
+  void *workspaceData;  // underlying storage
+  void **workspace;  // aliases into workspaceData
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_DECONV_LAYER_HPP_

+ 49 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_lcn_layer.hpp

@@ -0,0 +1,49 @@
+#ifndef CAFFE_CUDNN_LCN_LAYER_HPP_
+#define CAFFE_CUDNN_LCN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/lrn_layer.hpp"
+#include "caffe/layers/power_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+template <typename Dtype>
+class CuDNNLCNLayer : public LRNLayer<Dtype> {
+ public:
+  explicit CuDNNLCNLayer(const LayerParameter& param)
+      : LRNLayer<Dtype>(param), handles_setup_(false), tempDataSize(0),
+        tempData1(NULL), tempData2(NULL) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNLCNLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnLRNDescriptor_t norm_desc_;
+  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
+
+  int size_, pre_pad_;
+  Dtype alpha_, beta_, k_;
+
+  size_t tempDataSize;
+  void *tempData1, *tempData2;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_LCN_LAYER_HPP_

+ 44 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_lrn_layer.hpp

@@ -0,0 +1,44 @@
+#ifndef CAFFE_CUDNN_LRN_LAYER_HPP_
+#define CAFFE_CUDNN_LRN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/lrn_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+template <typename Dtype>
+class CuDNNLRNLayer : public LRNLayer<Dtype> {
+ public:
+  explicit CuDNNLRNLayer(const LayerParameter& param)
+      : LRNLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNLRNLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnLRNDescriptor_t norm_desc_;
+  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
+
+  int size_;
+  Dtype alpha_, beta_, k_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_LRN_LAYER_HPP_

+ 49 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_pooling_layer.hpp

@@ -0,0 +1,49 @@
+#ifndef CAFFE_CUDNN_POOLING_LAYER_HPP_
+#define CAFFE_CUDNN_POOLING_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/pooling_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/*
+ * @brief cuDNN implementation of PoolingLayer.
+ *        Fallback to PoolingLayer for CPU mode.
+*/
+template <typename Dtype>
+class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
+ public:
+  explicit CuDNNPoolingLayer(const LayerParameter& param)
+      : PoolingLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNPoolingLayer();
+  // Currently, cuDNN does not support the extra top blob.
+  virtual inline int MinTopBlobs() const { return -1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnTensorDescriptor_t bottom_desc_, top_desc_;
+  cudnnPoolingDescriptor_t  pooling_desc_;
+  cudnnPoolingMode_t        mode_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_POOLING_LAYER_HPP_

+ 46 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_relu_layer.hpp

@@ -0,0 +1,46 @@
+#ifndef CAFFE_CUDNN_RELU_LAYER_HPP_
+#define CAFFE_CUDNN_RELU_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/relu_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/**
+ * @brief CuDNN acceleration of ReLULayer.
+ */
+template <typename Dtype>
+class CuDNNReLULayer : public ReLULayer<Dtype> {
+ public:
+  explicit CuDNNReLULayer(const LayerParameter& param)
+      : ReLULayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNReLULayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
+  cudnnActivationDescriptor_t activ_desc_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_RELU_LAYER_HPP_

+ 46 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_sigmoid_layer.hpp

@@ -0,0 +1,46 @@
+#ifndef CAFFE_CUDNN_SIGMOID_LAYER_HPP_
+#define CAFFE_CUDNN_SIGMOID_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/sigmoid_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/**
+ * @brief CuDNN acceleration of SigmoidLayer.
+ */
+template <typename Dtype>
+class CuDNNSigmoidLayer : public SigmoidLayer<Dtype> {
+ public:
+  explicit CuDNNSigmoidLayer(const LayerParameter& param)
+      : SigmoidLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNSigmoidLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
+  cudnnActivationDescriptor_t activ_desc_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_SIGMOID_LAYER_HPP_

+ 45 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_softmax_layer.hpp

@@ -0,0 +1,45 @@
+#ifndef CAFFE_CUDNN_SOFTMAX_LAYER_HPP_
+#define CAFFE_CUDNN_SOFTMAX_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/softmax_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/**
+ * @brief cuDNN implementation of SoftmaxLayer.
+ *        Fallback to SoftmaxLayer for CPU mode.
+ */
+template <typename Dtype>
+class CuDNNSoftmaxLayer : public SoftmaxLayer<Dtype> {
+ public:
+  explicit CuDNNSoftmaxLayer(const LayerParameter& param)
+      : SoftmaxLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNSoftmaxLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_SOFTMAX_LAYER_HPP_

+ 46 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/cudnn_tanh_layer.hpp

@@ -0,0 +1,46 @@
+#ifndef CAFFE_CUDNN_TANH_LAYER_HPP_
+#define CAFFE_CUDNN_TANH_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/tanh_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/**
+ * @brief CuDNN acceleration of TanHLayer.
+ */
+template <typename Dtype>
+class CuDNNTanHLayer : public TanHLayer<Dtype> {
+ public:
+  explicit CuDNNTanHLayer(const LayerParameter& param)
+      : TanHLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNTanHLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t             handle_;
+  cudnnTensorDescriptor_t bottom_desc_;
+  cudnnTensorDescriptor_t top_desc_;
+  cudnnActivationDescriptor_t activ_desc_;
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_TANH_LAYER_HPP_

+ 40 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/data_layer.hpp

@@ -0,0 +1,40 @@
+#ifndef CAFFE_DATA_LAYER_HPP_
+#define CAFFE_DATA_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/data_transformer.hpp"
+#include "caffe/internal_thread.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layers/base_data_layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/db.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+class DataLayer : public BasePrefetchingDataLayer<Dtype> {
+ public:
+  explicit DataLayer(const LayerParameter& param);
+  virtual ~DataLayer();
+  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual inline const char* type() const { return "Data"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 2; }
+
+ protected:
+  void Next();
+  bool Skip();
+  virtual void load_batch(Batch<Dtype>* batch);
+
+  shared_ptr<db::DB> db_;
+  shared_ptr<db::Cursor> cursor_;
+  uint64_t offset_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DATA_LAYER_HPP_

+ 51 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/deconv_layer.hpp

@@ -0,0 +1,51 @@
+#ifndef CAFFE_DECONV_LAYER_HPP_
+#define CAFFE_DECONV_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/base_conv_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Convolve the input with a bank of learned filters, and (optionally)
+ *        add biases, treating filters and convolution parameters in the
+ *        opposite sense as ConvolutionLayer.
+ *
+ *   ConvolutionLayer computes each output value by dotting an input window with
+ *   a filter; DeconvolutionLayer multiplies each input value by a filter
+ *   elementwise, and sums over the resulting output windows. In other words,
+ *   DeconvolutionLayer is ConvolutionLayer with the forward and backward passes
+ *   reversed. DeconvolutionLayer reuses ConvolutionParameter for its
+ *   parameters, but they take the opposite sense as in ConvolutionLayer (so
+ *   padding is removed from the output rather than added to the input, and
+ *   stride results in upsampling rather than downsampling).
+ */
+template <typename Dtype>
+class DeconvolutionLayer : public BaseConvolutionLayer<Dtype> {
+ public:
+  explicit DeconvolutionLayer(const LayerParameter& param)
+      : BaseConvolutionLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "Deconvolution"; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual inline bool reverse_dimensions() { return true; }
+  virtual void compute_output_shape();
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DECONV_LAYER_HPP_
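
Because DeconvolutionLayer runs the convolution forward and backward passes in the opposite sense, its output extent per dimension is the convolution formula solved for the input; a sketch:

```cpp
// Deconvolution output extent: the conv formula inverted, so stride now
// upsamples and padding is removed from the output.
int deconv_out_dim(int in, int kernel, int pad, int stride, int dilation) {
  const int k_eff = dilation * (kernel - 1) + 1;
  return stride * (in - 1) + k_eff - 2 * pad;
}
// e.g. deconv_out_dim(112, 2, 0, 2, 1) == 224  (2x upsampling)
```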

+ 80 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/dropout_layer.hpp

@@ -0,0 +1,80 @@
+#ifndef CAFFE_DROPOUT_LAYER_HPP_
+#define CAFFE_DROPOUT_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief During training only, sets a random portion of @f$x@f$ to 0, adjusting
+ *        the rest of the vector magnitude accordingly.
+ *
+ * @param bottom input Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the inputs @f$ x @f$
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the computed outputs @f$ y @f$: randomly masked and rescaled at
+ *      training time, and simply @f$ x @f$ at test time (see below)
+ */
+template <typename Dtype>
+class DropoutLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides DropoutParameter dropout_param,
+   *     with DropoutLayer options:
+   *   - dropout_ratio (\b optional, default 0.5).
+   *     Sets the probability @f$ p @f$ that any given unit is dropped.
+   */
+  explicit DropoutLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Dropout"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs. At training time, we have @f$
+   *      y_{\mbox{train}} = \left\{
+   *         \begin{array}{ll}
+   *            \frac{x}{1 - p} & \mbox{if } u > p \\
+   *            0 & \mbox{otherwise}
+   *         \end{array} \right.
+   *      @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each
+   *      input at each iteration. At test time, we simply have
+   *      @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// the randomly generated values; divided by UINT_MAX they give @f$u\sim U(0,1)@f$
+  Blob<unsigned int> rand_vec_;
+  /// the probability @f$ p @f$ of dropping any input
+  Dtype threshold_;
+  /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$
+  Dtype scale_;
+  unsigned int uint_thres_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DROPOUT_LAYER_HPP_
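
The train-time formula documented above is "inverted" dropout: survivors are rescaled by 1/(1 - p) so that the expected output equals the input, and test time reduces to the identity. A minimal sketch of that forward pass (an illustrative helper, not the committed CPU/GPU kernels):

    #include <random>
    #include <vector>

    // Inverted dropout: each unit survives with probability 1 - p and is
    // scaled by 1 / (1 - p); at test time the layer is the identity.
    std::vector<float> dropout_forward(const std::vector<float>& x, float p,
                                       bool train, std::mt19937& rng) {
      std::vector<float> y(x.size());
      std::bernoulli_distribution keep(1.0 - p);
      const float scale = 1.0f / (1.0f - p);
      for (size_t i = 0; i < x.size(); ++i)
        y[i] = train ? (keep(rng) ? x[i] * scale : 0.0f) : x[i];
      return y;
    }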

+ 47 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/dummy_data_layer.hpp

@@ -0,0 +1,47 @@
+#ifndef CAFFE_DUMMY_DATA_LAYER_HPP_
+#define CAFFE_DUMMY_DATA_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Provides data to the Net generated by a Filler.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class DummyDataLayer : public Layer<Dtype> {
+ public:
+  explicit DummyDataLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  // Data layers have no bottoms, so reshaping is trivial.
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  virtual inline const char* type() const { return "DummyData"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int MinTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+
+  vector<shared_ptr<Filler<Dtype> > > fillers_;
+  vector<bool> refill_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DUMMY_DATA_LAYER_HPP_

+ 51 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/eltwise_layer.hpp

@@ -0,0 +1,51 @@
+#ifndef CAFFE_ELTWISE_LAYER_HPP_
+#define CAFFE_ELTWISE_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Compute elementwise operations, such as product and sum,
+ *        along multiple input Blobs.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class EltwiseLayer : public Layer<Dtype> {
+ public:
+  explicit EltwiseLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Eltwise"; }
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  EltwiseParameter_EltwiseOp op_;
+  vector<Dtype> coeffs_;
+  Blob<int> max_idx_;
+
+  bool stable_prod_grad_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_ELTWISE_LAYER_HPP_
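
Of the supported operations, coefficient-weighted SUM is the most common configuration (coeffs_ defaults to all ones). A small sketch of what that reduction computes across K equally-shaped inputs; the function name is illustrative:

    #include <cassert>
    #include <vector>

    // Coefficient-weighted elementwise SUM over K equally-sized inputs;
    // with all coefficients equal to 1 this is a plain elementwise sum.
    std::vector<float> eltwise_sum(const std::vector<std::vector<float> >& bottoms,
                                   const std::vector<float>& coeffs) {
      assert(!bottoms.empty() && bottoms.size() == coeffs.size());
      std::vector<float> top(bottoms[0].size(), 0.0f);
      for (size_t k = 0; k < bottoms.size(); ++k)
        for (size_t i = 0; i < top.size(); ++i)
          top[i] += coeffs[k] * bottoms[k][i];
      return top;
    }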

+ 86 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/elu_layer.hpp

@@ -0,0 +1,86 @@
+#ifndef CAFFE_ELU_LAYER_HPP_
+#define CAFFE_ELU_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Exponential Linear Unit non-linearity @f$
+ *        y = \left\{
+ *        \begin{array}{lr}
+ *            x                  & \mathrm{if} \; x > 0 \\
+ *            \alpha (\exp(x)-1) & \mathrm{if} \; x \le 0
+ *        \end{array} \right.
+ *      @f$.  
+ */
+template <typename Dtype>
+class ELULayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides ELUParameter elu_param,
+   *     with ELULayer options:
+   *   - alpha (\b optional, default 1).
+   *     the value @f$ \alpha @f$ that controls saturation for negative inputs.
+   */
+  explicit ELULayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "ELU"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = \left\{
+   *        \begin{array}{lr}
+   *            x                  & \mathrm{if} \; x > 0 \\
+   *            \alpha (\exp(x)-1) & \mathrm{if} \; x \le 0
+   *        \end{array} \right.
+   *      @f$.  
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the ELU inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} = \left\{
+   *        \begin{array}{lr}
+   *            1           & \mathrm{if} \; x > 0 \\
+   *            y + \alpha  & \mathrm{if} \; x \le 0
+   *        \end{array} \right.
+   *      @f$ if propagate_down[0].
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_ELU_LAYER_HPP_
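
A minimal sketch of the two formulas documented above, forward and input gradient; note that for x <= 0 the derivative alpha * exp(x) can be rewritten as y + alpha, which is exactly the form the Backward doc uses:

    #include <cmath>

    // ELU forward: x for positive inputs, alpha * (exp(x) - 1) otherwise.
    float elu(float x, float alpha) {
      return x > 0 ? x : alpha * (std::exp(x) - 1.0f);
    }

    // dy/dx: 1 for x > 0; alpha * exp(x) == y + alpha for x <= 0.
    float elu_grad(float x, float alpha) {
      return x > 0 ? 1.0f : elu(x, alpha) + alpha;
    }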

+ 52 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/embed_layer.hpp

@@ -0,0 +1,52 @@
+#ifndef CAFFE_EMBED_LAYER_HPP_
+#define CAFFE_EMBED_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief A layer for learning "embeddings" of one-hot vector input.
+ *        Equivalent to an InnerProductLayer with one-hot vectors as input, but
+ *        for efficiency the input is the "hot" index of each column itself.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class EmbedLayer : public Layer<Dtype> {
+ public:
+  explicit EmbedLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Embed"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int M_;
+  int K_;
+  int N_;
+  bool bias_term_;
+  Blob<Dtype> bias_multiplier_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_EMBED_LAYER_HPP_
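
The efficiency trick described in the brief -- feeding the "hot" index instead of the full one-hot vector -- makes the forward pass a row lookup into the K x N weight matrix. A sketch under that assumption (row-major weights; names are illustrative):

    #include <vector>

    // Forward pass of an embedding lookup: with row-major K x N weights,
    // the output for input index `hot` is simply row `hot` of the weights.
    std::vector<float> embed_forward(const std::vector<float>& weights,
                                     int N, int hot) {
      return std::vector<float>(weights.begin() + hot * N,
                                weights.begin() + (hot + 1) * N);
    }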

+ 107 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/euclidean_loss_layer.hpp

@@ -0,0 +1,107 @@
+#ifndef CAFFE_EUCLIDEAN_LOSS_LAYER_HPP_
+#define CAFFE_EUCLIDEAN_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes the Euclidean (L2) loss @f$
+ *          E = \frac{1}{2N} \sum\limits_{n=1}^N \left| \left| \hat{y}_n - y_n
+ *        \right| \right|_2^2 @f$ for real-valued regression tasks.
+ *
+ * @param bottom input Blob vector (length 2)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the predictions @f$ \hat{y} \in [-\infty, +\infty]@f$
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the targets @f$ y \in [-\infty, +\infty]@f$
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+ *      the computed Euclidean loss: @f$ E =
+ *          \frac{1}{2N} \sum\limits_{n=1}^N \left| \left| \hat{y}_n - y_n
+ *        \right| \right|_2^2 @f$
+ *
+ * This can be used for least-squares regression tasks.  An InnerProductLayer
+ * input to a EuclideanLossLayer exactly formulates a linear least squares
+ * regression problem. With non-zero weight decay the problem becomes one of
+ * ridge regression -- see src/caffe/test/test_gradient_based_solver.cpp for a concrete
+ * example wherein we check that the gradients computed for a Net with exactly
+ * this structure match hand-computed gradient formulas for ridge regression.
+ *
+ * (Note: Caffe, and SGD in general, is certainly \b not the best way to solve
+ * linear least squares problems! We use it only as an instructive example.)
+ */
+template <typename Dtype>
+class EuclideanLossLayer : public LossLayer<Dtype> {
+ public:
+  explicit EuclideanLossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param), diff_() {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "EuclideanLoss"; }
+  /**
+   * Unlike most loss layers, in the EuclideanLossLayer we can backpropagate
+   * to both inputs -- override to return true and always allow force_backward.
+   */
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    return true;
+  }
+
+ protected:
+  /// @copydoc EuclideanLossLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the Euclidean error gradient w.r.t. the inputs.
+   *
+   * Unlike other children of LossLayer, EuclideanLossLayer \b can compute
+ * gradients with respect to the label inputs bottom[1] (but will only do so
+ * if propagate_down[1] is set -- e.g. because the labels are produced by
+ * learnable parameters -- or if force_backward is set). In fact, this layer
+ * is "commutative" -- the
+   * result is the same regardless of the order of the two bottoms.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+   *      as @f$ \lambda @f$ is the coefficient of this layer's output
+   *      @f$\ell_i@f$ in the overall Net loss
+   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+   *      (*Assuming that this top Blob is not used as a bottom (input) by any
+   *      other layer of the Net.)
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the predictions @f$\hat{y}@f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial \hat{y}} =
+   *            \frac{1}{N} \sum\limits_{n=1}^N (\hat{y}_n - y_n)
+   *      @f$ if propagate_down[0]
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the targets @f$y@f$; Backward fills their diff with gradients
+   *      @f$ \frac{\partial E}{\partial y} =
+   *          \frac{1}{N} \sum\limits_{n=1}^N (y_n - \hat{y}_n)
+   *      @f$ if propagate_down[1]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<Dtype> diff_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_EUCLIDEAN_LOSS_LAYER_HPP_
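
A standalone sketch of the documented loss, E = 1/(2N) * sum_n ||yhat_n - y_n||^2, where pred and target are flattened N*C*H*W buffers; the names are illustrative:

    #include <vector>

    // E = 1/(2N) * sum over all elements of (pred - target)^2; the matching
    // per-element gradient w.r.t. the predictions is (pred - target) / N.
    float euclidean_loss(const std::vector<float>& pred,
                         const std::vector<float>& target, int N) {
      float sum = 0.0f;
      for (size_t i = 0; i < pred.size(); ++i) {
        const float d = pred[i] - target[i];
        sum += d * d;
      }
      return sum / (2.0f * N);
    }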

+ 80 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/exp_layer.hpp

@@ -0,0 +1,80 @@
+#ifndef CAFFE_EXP_LAYER_HPP_
+#define CAFFE_EXP_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes @f$ y = \gamma ^ {\alpha x + \beta} @f$,
+ *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
+ *        and base @f$ \gamma @f$.
+ */
+template <typename Dtype>
+class ExpLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides ExpParameter exp_param,
+   *     with ExpLayer options:
+   *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+   *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+   *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
+   *         the base @f$ \gamma @f$
+   */
+  explicit ExpLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Exp"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = \gamma ^ {\alpha x + \beta}
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the exp inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} =
+   *            \frac{\partial E}{\partial y} y \alpha \log_e(\gamma)
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Dtype inner_scale_, outer_scale_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_EXP_LAYER_HPP_
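
The inner_scale_ / outer_scale_ members suggest the layer evaluates gamma^(alpha*x + beta) as outer * exp(inner * x), with inner = alpha * ln(gamma) and outer = gamma^beta; that factorization is an assumption from the member names, sketched below:

    #include <cmath>

    // gamma^(alpha*x + beta) factored as outer * exp(inner * x), with
    // inner = alpha * ln(gamma) and outer = gamma^beta (assumed caching
    // scheme for the inner_scale_ / outer_scale_ members).
    float exp_forward(float x, float alpha, float beta, float gamma) {
      const float inner = alpha * std::log(gamma);
      const float outer = std::pow(gamma, beta);
      return outer * std::exp(inner * x);
    }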

+ 77 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/filter_layer.hpp

@@ -0,0 +1,77 @@
+#ifndef CAFFE_FILTER_LAYER_HPP_
+#define CAFFE_FILTER_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Takes two or more Blobs, interprets the last Blob as a selector,
+ *  and filters the remaining Blobs according to the selector data (0 means
+ * the corresponding item is filtered out; non-zero means the corresponding
+ * item is kept).
+ */
+template <typename Dtype>
+class FilterLayer : public Layer<Dtype> {
+ public:
+  explicit FilterLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Filter"; }
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MinTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 2+)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs to be filtered @f$ x_1 @f$
+   *   -# ...
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs to be filtered @f$ x_K @f$
+   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+   *      the selector blob
+   * @param top output Blob vector (length 1+)
+ *   -# @f$ (S \times C \times H \times W) @f$
+ *        the filtered output @f$ x_1 @f$
+ *   -# ...
+ *   -# @f$ (S \times C \times H \times W) @f$
+ *        the filtered output @f$ x_K @f$,
+ *      where S is the number of items that haven't been filtered out
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the forwarded inputs.
+   *
+   * @param top output Blob vector (length 1+), providing the error gradient with
+   *        respect to the outputs
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 2+), into which the top error
+   *        gradient is copied
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool first_reshape_;
+  vector<int> indices_to_forward_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_FILTER_LAYER_HPP_

+ 61 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/flatten_layer.hpp

@@ -0,0 +1,61 @@
+#ifndef CAFFE_FLATTEN_LAYER_HPP_
+#define CAFFE_FLATTEN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Reshapes the input Blob into flat vectors.
+ *
+ * Note: because this layer does not change the input values -- merely the
+ * dimensions -- it can simply copy the input. The copy happens "virtually"
+ * (thus taking effectively 0 real time) by setting, in Forward, the data
+ * pointer of the top Blob to that of the bottom Blob (see Blob::ShareData),
+ * and in Backward, the diff pointer of the bottom Blob to that of the top Blob
+ * (see Blob::ShareDiff).
+ */
+template <typename Dtype>
+class FlattenLayer : public Layer<Dtype> {
+ public:
+  explicit FlattenLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Flatten"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times CHW \times 1 \times 1) @f$
+   *      the outputs -- i.e., the (virtually) copied, flattened inputs
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the flattened inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *        respect to the outputs
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1), into which the top error
+   *        gradient is (virtually) copied
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_FLATTEN_LAYER_HPP_

+ 64 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/hdf5_data_layer.hpp

@@ -0,0 +1,64 @@
+#ifndef CAFFE_HDF5_DATA_LAYER_HPP_
+#define CAFFE_HDF5_DATA_LAYER_HPP_
+
+#include "hdf5.h"
+
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/base_data_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Provides data to the Net from HDF5 files.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class HDF5DataLayer : public Layer<Dtype> {
+ public:
+  explicit HDF5DataLayer(const LayerParameter& param)
+      : Layer<Dtype>(param), offset_() {}
+  virtual ~HDF5DataLayer();
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  // Data layers have no bottoms, so reshaping is trivial.
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  virtual inline const char* type() const { return "HDF5Data"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int MinTopBlobs() const { return 1; }
+
+ protected:
+  void Next();
+  bool Skip();
+
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+  virtual void LoadHDF5FileData(const char* filename);
+
+  std::vector<std::string> hdf_filenames_;
+  unsigned int num_files_;
+  unsigned int current_file_;
+  hsize_t current_row_;
+  std::vector<shared_ptr<Blob<Dtype> > > hdf_blobs_;
+  std::vector<unsigned int> data_permutation_;
+  std::vector<unsigned int> file_permutation_;
+  uint64_t offset_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_HDF5_DATA_LAYER_HPP_

+ 62 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/hdf5_output_layer.hpp

@@ -0,0 +1,62 @@
+#ifndef CAFFE_HDF5_OUTPUT_LAYER_HPP_
+#define CAFFE_HDF5_OUTPUT_LAYER_HPP_
+
+#include "hdf5.h"
+
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+#define HDF5_DATA_DATASET_NAME "data"
+#define HDF5_DATA_LABEL_NAME "label"
+
+/**
+ * @brief Write blobs to disk as HDF5 files.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class HDF5OutputLayer : public Layer<Dtype> {
+ public:
+  explicit HDF5OutputLayer(const LayerParameter& param)
+      : Layer<Dtype>(param), file_opened_(false) {}
+  virtual ~HDF5OutputLayer();
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  // Data layers have no bottoms, so reshaping is trivial.
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  virtual inline const char* type() const { return "HDF5Output"; }
+  // TODO: no limit on the number of blobs
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 0; }
+
+  inline std::string file_name() const { return file_name_; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void SaveBlobs();
+
+  bool file_opened_;
+  std::string file_name_;
+  hid_t file_id_;
+  Blob<Dtype> data_blob_;
+  Blob<Dtype> label_blob_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_HDF5_OUTPUT_LAYER_HPP_

+ 104 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/hinge_loss_layer.hpp

@@ -0,0 +1,104 @@
+#ifndef CAFFE_HINGE_LOSS_LAYER_HPP_
+#define CAFFE_HINGE_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes the hinge loss for a one-of-many classification task.
+ *
+ * @param bottom input Blob vector (length 2)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the predictions @f$ t @f$, a Blob with values in
+ *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
+ *      the @f$ K = CHW @f$ classes. In an SVM, @f$ t @f$ is the result of
+ *      taking the inner product @f$ X^T W @f$ of the D-dimensional features
+ *      @f$ X \in \mathcal{R}^{D \times N} @f$ and the learned hyperplane
+ *      parameters @f$ W \in \mathcal{R}^{D \times K} @f$, so a Net with just
+ *      an InnerProductLayer (with num_output = @f$ K @f$) providing predictions to a
+ *      HingeLossLayer and no other learnable parameters or losses is
+ *      equivalent to an SVM.
+ *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+ *      the labels @f$ l @f$, an integer-valued Blob with values
+ *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+ *      indicating the correct class label among the @f$ K @f$ classes
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+ *      the computed hinge loss: @f$ E =
+ *        \frac{1}{N} \sum\limits_{n=1}^N \sum\limits_{k=1}^K
+ *        [\max(0, 1 - \delta\{l_n = k\} t_{nk})] ^ p
+ *      @f$, for the @f$ L^p @f$ norm
+ *      (defaults to @f$ p = 1 @f$, the L1 norm; L2 norm, as in L2-SVM,
+ *      is also available), and @f$
+ *      \delta\{\mathrm{condition}\} = \left\{
+ *         \begin{array}{lr}
+ *            1 & \mbox{if condition} \\
+ *           -1 & \mbox{otherwise}
+ *         \end{array} \right.
+ *      @f$
+ *
+ * In an SVM, @f$ t \in \mathcal{R}^{N \times K} @f$ is the result of taking
+ * the inner product @f$ X^T W @f$ of the features
+ * @f$ X \in \mathcal{R}^{D \times N} @f$
+ * and the learned hyperplane parameters
+ * @f$ W \in \mathcal{R}^{D \times K} @f$. So, a Net with just an
+ * InnerProductLayer (with num_output = @f$ K @f$) providing predictions to a
+ * HingeLossLayer is equivalent to an SVM (assuming it has no other learned
+ * parameters outside the InnerProductLayer and no other losses outside the
+ * HingeLossLayer).
+ */
+template <typename Dtype>
+class HingeLossLayer : public LossLayer<Dtype> {
+ public:
+  explicit HingeLossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "HingeLoss"; }
+
+ protected:
+  /// @copydoc HingeLossLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the hinge loss error gradient w.r.t. the predictions.
+   *
+   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+   * if propagate_down[1] is set.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+   *      as @f$ \lambda @f$ is the coefficient of this layer's output
+   *      @f$\ell_i@f$ in the overall Net loss
+   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+   *      (*Assuming that this top Blob is not used as a bottom (input) by any
+   *      other layer of the Net.)
+   * @param propagate_down see Layer::Backward.
+   *      propagate_down[1] must be false as we can't compute gradients with
+   *      respect to the labels.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the predictions @f$t@f$; Backward computes diff
+   *      @f$ \frac{\partial E}{\partial t} @f$
+   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+   *      the labels -- ignored as we can't compute their error gradients
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_HINGE_LOSS_LAYER_HPP_
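
A sketch of the documented L1 hinge loss for a single sample with scores t[0..K-1] and true label l; with the delta convention above, the correct class contributes max(0, 1 - t_l) and every other class max(0, 1 + t_k):

    #include <algorithm>
    #include <vector>

    // L1 hinge loss for one sample: the true class contributes
    // max(0, 1 - t[label]); every other class contributes max(0, 1 + t[k]).
    float hinge_loss(const std::vector<float>& t, int label) {
      float loss = 0.0f;
      for (size_t k = 0; k < t.size(); ++k) {
        const float delta = (static_cast<int>(k) == label) ? 1.0f : -1.0f;
        loss += std::max(0.0f, 1.0f - delta * t[k]);
      }
      return loss;
    }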

+ 65 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/im2col_layer.hpp

@@ -0,0 +1,65 @@
+#ifndef CAFFE_IM2COL_LAYER_HPP_
+#define CAFFE_IM2COL_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief A helper for image operations that rearranges image regions into
+ *        column vectors.  Used by ConvolutionLayer to perform convolution
+ *        by matrix multiplication.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class Im2colLayer : public Layer<Dtype> {
+ public:
+  explicit Im2colLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Im2col"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief The spatial dimensions of a filter kernel.
+  Blob<int> kernel_shape_;
+  /// @brief The spatial dimensions of the stride.
+  Blob<int> stride_;
+  /// @brief The spatial dimensions of the padding.
+  Blob<int> pad_;
+  /// @brief The spatial dimensions of the dilation.
+  Blob<int> dilation_;
+
+  int num_spatial_axes_;
+  int bottom_dim_;
+  int top_dim_;
+
+  int channel_axis_;
+  int num_;
+  int channels_;
+
+  bool force_nd_im2col_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_IM2COL_LAYER_HPP_
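
The column buffer has channels * kernel_h * kernel_w rows and one column per output position; the number of output positions per axis follows the standard convolution shape rule, sketched below (the deconvolution rule shown earlier is its inverse):

    // Output spatial extent of a convolution sweep along one axis; im2col
    // produces one column for each such position.
    int conv_out(int in, int kernel, int stride, int pad, int dilation = 1) {
      int kernel_extent = dilation * (kernel - 1) + 1;
      return (in + 2 * pad - kernel_extent) / stride + 1;
    }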

+ 47 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/image_data_layer.hpp

@@ -0,0 +1,47 @@
+#ifndef CAFFE_IMAGE_DATA_LAYER_HPP_
+#define CAFFE_IMAGE_DATA_LAYER_HPP_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/data_transformer.hpp"
+#include "caffe/internal_thread.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layers/base_data_layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Provides data to the Net from image files.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class ImageDataLayer : public BasePrefetchingDataLayer<Dtype> {
+ public:
+  explicit ImageDataLayer(const LayerParameter& param)
+      : BasePrefetchingDataLayer<Dtype>(param) {}
+  virtual ~ImageDataLayer();
+  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "ImageData"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int ExactNumTopBlobs() const { return 2; }
+
+ protected:
+  shared_ptr<Caffe::RNG> prefetch_rng_;
+  virtual void ShuffleImages();
+  virtual void load_batch(Batch<Dtype>* batch);
+
+  vector<std::pair<std::string, int> > lines_;
+  int lines_id_;
+};
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_IMAGE_DATA_LAYER_HPP_

+ 146 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/infogain_loss_layer.hpp

@@ -0,0 +1,146 @@
+#ifndef CAFFE_INFOGAIN_LOSS_LAYER_HPP_
+#define CAFFE_INFOGAIN_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+#include "caffe/layers/softmax_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief A generalization of SoftmaxWithLossLayer that takes an
+ *        "information gain" (infogain) matrix specifying the "value" of all label
+ *        pairs.
+ *
+ * Equivalent to the SoftmaxWithLossLayer if the infogain matrix is the
+ * identity.
+ *
+ * @param bottom input Blob vector (length 2-3)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the predictions @f$ x @f$, a Blob with values in
+ *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
+ *      the @f$ K = CHW @f$ classes. This layer maps these scores to a
+ *      probability distribution over classes using the softmax function
+ *      @f$ \hat{p}_{nk} = \exp(x_{nk}) /
+ *      \left[\sum_{k'} \exp(x_{nk'})\right] @f$ (see SoftmaxLayer).
+ *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+ *      the labels @f$ l @f$, an integer-valued Blob with values
+ *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+ *      indicating the correct class label among the @f$ K @f$ classes
+ *   -# @f$ (1 \times 1 \times K \times K) @f$
+ *      (\b optional) the infogain matrix @f$ H @f$.  This must be provided as
+ *      the third bottom blob input if not provided as the infogain_mat in the
+ *      InfogainLossParameter. If @f$ H = I @f$, this layer is equivalent to the
+ *      SoftmaxWithLossLayer.
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+ *      the computed infogain multinomial logistic loss: @f$ E =
+ *        \frac{-1}{N} \sum\limits_{n=1}^N H_{l_n} \log(\hat{p}_n) =
+ *        \frac{-1}{N} \sum\limits_{n=1}^N \sum\limits_{k=1}^{K} H_{l_n,k}
+ *        \log(\hat{p}_{n,k})
+ *      @f$, where @f$ H_{l_n} @f$ denotes row @f$l_n@f$ of @f$H@f$.
+ */
+template <typename Dtype>
+class InfogainLossLayer : public LossLayer<Dtype> {
+ public:
+  explicit InfogainLossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param), infogain_() {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  // InfogainLossLayer takes 2-3 bottom Blobs; if there are 3 the third should
+  // be the infogain matrix.  (Otherwise the infogain matrix is loaded from a
+  // file specified by LayerParameter.)
+  virtual inline int ExactNumBottomBlobs() const { return -1; }
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MaxBottomBlobs() const { return 3; }
+
+  // InfogainLossLayer computes softmax prob internally.
+  // optional second "top" outputs the softmax prob
+  virtual inline int ExactNumTopBlobs() const { return -1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 2; }
+
+  virtual inline const char* type() const { return "InfogainLoss"; }
+
+ protected:
+  /// @copydoc InfogainLossLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the infogain loss error gradient w.r.t. the predictions.
+   *
+   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+   * if propagate_down[1] is set. (The same applies to the infogain matrix, if
+   * provided as bottom[2] rather than in the layer_param.)
+   *
+   * @param top output Blob vector (length 1), providing the error gradient
+   *      with respect to the outputs
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+   *      as @f$ \lambda @f$ is the coefficient of this layer's output
+   *      @f$\ell_i@f$ in the overall Net loss
+   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+   *      (*Assuming that this top Blob is not used as a bottom (input) by any
+   *      other layer of the Net.)
+   * @param propagate_down see Layer::Backward.
+   *      propagate_down[1] must be false as we can't compute gradients with
+   *      respect to the labels (similarly for propagate_down[2] and the
+   *      infogain matrix, if provided as bottom[2])
+   * @param bottom input Blob vector (length 2-3)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the predictions @f$ x @f$; Backward computes diff
+   *      @f$ \frac{\partial E}{\partial x} @f$
+   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+   *      the labels -- ignored as we can't compute their error gradients
+   *   -# @f$ (1 \times 1 \times K \times K) @f$
+   *      (\b optional) the information gain matrix -- ignored as its error
+   *      gradient computation is not implemented.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// Read the normalization mode parameter and compute the normalizer based
+  /// on the blob size.  If normalization_mode is VALID, the count of valid
+  /// outputs will be read from valid_count, unless it is -1 in which case
+  /// all outputs are assumed to be valid.
+  virtual Dtype get_normalizer(
+      LossParameter_NormalizationMode normalization_mode, int valid_count);
+  /// fill sum_rows_H_ according to matrix H
+  virtual void sum_rows_of_H(const Blob<Dtype>* H);
+
+  /// The internal SoftmaxLayer used to map predictions to a distribution.
+  shared_ptr<Layer<Dtype> > softmax_layer_;
+  /// prob stores the output probability predictions from the SoftmaxLayer.
+  Blob<Dtype> prob_;
+  /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
+  vector<Blob<Dtype>*> softmax_bottom_vec_;
+  /// top vector holder used in call to the underlying SoftmaxLayer::Forward
+  vector<Blob<Dtype>*> softmax_top_vec_;
+
+  Blob<Dtype> infogain_;
+  Blob<Dtype> sum_rows_H_;  // cache the row sums of H.
+
+  /// Whether to ignore instances with a certain label.
+  bool has_ignore_label_;
+  /// The label indicating that an instance should be ignored.
+  int ignore_label_;
+  /// How to normalize the output loss.
+  LossParameter_NormalizationMode normalization_;
+
+  int infogain_axis_, outer_num_, inner_num_, num_labels_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_INFOGAIN_LOSS_LAYER_HPP_
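
A sketch of the per-sample loss documented above, -sum_k H[l_n][k] * log(p_hat[n,k]), with probabilities clamped away from zero in the spirit of kLOG_THRESHOLD from loss_layer.hpp; with H = I this collapses to -log(p_hat[n, l_n]), the usual multinomial logistic loss:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // -sum_k H[label][k] * log(p[k]) for one sample, given softmax
    // probabilities p and an infogain matrix H; H = I recovers the plain
    // multinomial logistic loss -log(p[label]).
    float infogain_loss(const std::vector<std::vector<float> >& H,
                        const std::vector<float>& prob, int label) {
      float loss = 0.0f;
      for (size_t k = 0; k < prob.size(); ++k)
        loss -= H[label][k] * std::log(std::max(prob[k], 1e-20f));
      return loss;
    }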

+ 52 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/inner_product_layer.hpp

@@ -0,0 +1,52 @@
+#ifndef CAFFE_INNER_PRODUCT_LAYER_HPP_
+#define CAFFE_INNER_PRODUCT_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Also known as a "fully-connected" layer, computes an inner product
+ *        with a set of learned weights, and (optionally) adds biases.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class InnerProductLayer : public Layer<Dtype> {
+ public:
+  explicit InnerProductLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "InnerProduct"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int M_;
+  int K_;
+  int N_;
+  bool bias_term_;
+  Blob<Dtype> bias_multiplier_;
+  bool transpose_;  ///< if true, assume transposed weights
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_INNER_PRODUCT_LAYER_HPP_

+ 42 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/input_layer.hpp

@@ -0,0 +1,42 @@
+#ifndef CAFFE_INPUT_LAYER_HPP_
+#define CAFFE_INPUT_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Provides data to the Net by assigning tops directly.
+ *
+ * This data layer is a container that merely holds the data assigned to it;
+ * forward, backward, and reshape are all no-ops.
+ */
+template <typename Dtype>
+class InputLayer : public Layer<Dtype> {
+ public:
+  explicit InputLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  // Data layers have no bottoms, so reshaping is trivial.
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  virtual inline const char* type() const { return "Input"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int MinTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_INPUT_LAYER_HPP_

+ 82 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/log_layer.hpp

@@ -0,0 +1,82 @@
+#ifndef CAFFE_LOG_LAYER_HPP_
+#define CAFFE_LOG_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes @f$ y = log_{\gamma}(\alpha x + \beta) @f$,
+ *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
+ *        and base @f$ \gamma @f$.
+ */
+template <typename Dtype>
+class LogLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides LogParameter log_param,
+   *     with LogLayer options:
+   *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+   *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+   *   - base (\b optional, default -1 for a value of @f$ e \approx 2.718 @f$)
+   *         the base @f$ \gamma @f$
+   */
+  explicit LogLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Log"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = log_{\gamma}(\alpha x + \beta)
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the log inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} =
+   *            \frac{\partial E}{\partial y}
+   *            \frac{\alpha}{(\alpha x + \beta) \log_e(\gamma)}
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Dtype base_scale_;
+  Dtype input_scale_, input_shift_;
+  Dtype backward_num_scale_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_LOG_LAYER_HPP_
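
Numerically, y = log_gamma(alpha*x + beta) = ln(alpha*x + beta) / ln(gamma); the base_scale_ member above presumably caches 1/ln(gamma) (an assumption from the name). A one-line sketch:

    #include <cmath>

    // log_gamma(alpha*x + beta) via the change-of-base identity.
    float log_forward(float x, float alpha, float beta, float gamma) {
      return std::log(alpha * x + beta) / std::log(gamma);
    }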

+ 53 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/loss_layer.hpp

@@ -0,0 +1,53 @@
+#ifndef CAFFE_LOSS_LAYER_HPP_
+#define CAFFE_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+const float kLOG_THRESHOLD = 1e-20;
+
+/**
+ * @brief An interface for Layer%s that take two Blob%s as input -- usually
+ *        (1) predictions and (2) ground-truth labels -- and output a
+ *        singleton Blob representing the loss.
+ *
+ * LossLayers are typically only capable of backpropagating to their first input
+ * -- the predictions.
+ */
+template <typename Dtype>
+class LossLayer : public Layer<Dtype> {
+ public:
+  explicit LossLayer(const LayerParameter& param)
+     : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(
+      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(
+      const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
+
+  virtual inline int ExactNumBottomBlobs() const { return 2; }
+
+  /**
+   * @brief For convenience and backwards compatibility, instruct the Net to
+   *        automatically allocate a single top Blob for LossLayers, into which
+   *        they output their singleton loss, (even if the user didn't specify
+   *        one in the prototxt, etc.).
+   */
+  virtual inline bool AutoTopBlobs() const { return true; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+  /**
+   * We usually cannot backpropagate to the labels; ignore force_backward for
+   * these inputs.
+   */
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    return bottom_index != 1;
+  }
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_LOSS_LAYER_HPP_

+ 94 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/lrn_layer.hpp

@@ -0,0 +1,94 @@
+#ifndef CAFFE_LRN_LAYER_HPP_
+#define CAFFE_LRN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/eltwise_layer.hpp"
+#include "caffe/layers/pooling_layer.hpp"
+#include "caffe/layers/power_layer.hpp"
+#include "caffe/layers/split_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Normalize the input in a local region across or within feature maps.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class LRNLayer : public Layer<Dtype> {
+ public:
+  explicit LRNLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "LRN"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  virtual void CrossChannelForward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void CrossChannelForward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void WithinChannelForward(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void CrossChannelBackward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void CrossChannelBackward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void WithinChannelBackward(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int size_;
+  int pre_pad_;
+  Dtype alpha_;
+  Dtype beta_;
+  Dtype k_;
+  int num_;
+  int channels_;
+  int height_;
+  int width_;
+
+  // Fields used for normalization ACROSS_CHANNELS
+  // scale_ stores the intermediate summing results
+  Blob<Dtype> scale_;
+
+  // Fields used for normalization WITHIN_CHANNEL
+  shared_ptr<SplitLayer<Dtype> > split_layer_;
+  vector<Blob<Dtype>*> split_top_vec_;
+  shared_ptr<PowerLayer<Dtype> > square_layer_;
+  Blob<Dtype> square_input_;
+  Blob<Dtype> square_output_;
+  vector<Blob<Dtype>*> square_bottom_vec_;
+  vector<Blob<Dtype>*> square_top_vec_;
+  shared_ptr<PoolingLayer<Dtype> > pool_layer_;
+  Blob<Dtype> pool_output_;
+  vector<Blob<Dtype>*> pool_top_vec_;
+  shared_ptr<PowerLayer<Dtype> > power_layer_;
+  Blob<Dtype> power_output_;
+  vector<Blob<Dtype>*> power_top_vec_;
+  shared_ptr<EltwiseLayer<Dtype> > product_layer_;
+  Blob<Dtype> product_input_;
+  vector<Blob<Dtype>*> product_bottom_vec_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_LRN_LAYER_HPP_
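
For the ACROSS_CHANNELS mode, the response at channel c is divided by (k + (alpha/n) * sum of squares over an n-channel window)^beta. A sketch at a single spatial position; the names and the edge-clamping convention are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // ACROSS_CHANNELS local response normalization at one spatial position:
    // y_c = x_c / (k + (alpha / n) * sum_{j in window(c)} x_j^2)^beta.
    std::vector<float> lrn_across_channels(const std::vector<float>& x, int n,
                                           float k, float alpha, float beta) {
      const int C = static_cast<int>(x.size());
      std::vector<float> y(C);
      for (int c = 0; c < C; ++c) {
        float sum = 0.0f;
        for (int j = std::max(0, c - n / 2); j <= std::min(C - 1, c + n / 2); ++j)
          sum += x[j] * x[j];
        y[c] = x[c] / std::pow(k + alpha / n * sum, beta);
      }
      return y;
    }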

+ 154 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/lstm_layer.hpp

@@ -0,0 +1,154 @@
+#ifndef CAFFE_LSTM_LAYER_HPP_
+#define CAFFE_LSTM_LAYER_HPP_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/layers/recurrent_layer.hpp"
+#include "caffe/net.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+template <typename Dtype> class RecurrentLayer;
+
+/**
+ * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM)
+ *        [1] style recurrent neural network (RNN). Implemented by unrolling
+ *        the LSTM computation through time.
+ *
+ * The specific architecture used in this implementation is as described in
+ * "Learning to Execute" [2], reproduced below:
+ *     i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ]
+ *     f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ]
+ *     o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ]
+ *     g_t :=    \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ]
+ *     c_t := (f_t .* c_{t-1}) + (i_t .* g_t)
+ *     h_t := o_t .* \tanh[c_t]
+ * In the implementation, the i, f, o, and g computations are performed as a
+ * single inner product.
+ *
+ * Notably, this implementation lacks the "diagonal" gates, as used in the
+ * LSTM architectures described by Alex Graves [3] and others.
+ *
+ * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory."
+ *     Neural Computation 9, no. 8 (1997): 1735-1780.
+ *
+ * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute."
+ *     arXiv preprint arXiv:1410.4615 (2014).
+ *
+ * [3] Graves, Alex. "Generating sequences with recurrent neural networks."
+ *     arXiv preprint arXiv:1308.0850 (2013).
+ */
+template <typename Dtype>
+class LSTMLayer : public RecurrentLayer<Dtype> {
+ public:
+  explicit LSTMLayer(const LayerParameter& param)
+      : RecurrentLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "LSTM"; }
+
+ protected:
+  virtual void FillUnrolledNet(NetParameter* net_param) const;
+  virtual void RecurrentInputBlobNames(vector<string>* names) const;
+  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
+  virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const;
+  virtual void OutputBlobNames(vector<string>* names) const;
+};
+
+/**
+ * @brief A helper for LSTMLayer: computes a single timestep of the
+ *        non-linearity of the LSTM, producing the updated cell and hidden
+ *        states.
+ */
+template <typename Dtype>
+class LSTMUnitLayer : public Layer<Dtype> {
+ public:
+  explicit LSTMUnitLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "LSTMUnit"; }
+  virtual inline int ExactNumBottomBlobs() const { return 3; }
+  virtual inline int ExactNumTopBlobs() const { return 2; }
+
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    // Can't propagate to sequence continuation indicators.
+    return bottom_index != 2;
+  }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 3)
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the previous timestep cell state @f$ c_{t-1} @f$
+   *   -# @f$ (1 \times N \times 4D) @f$
+   *      the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$
+   *   -# @f$ (1 \times N) @f$
+   *      the sequence continuation indicators  @f$ \delta_t @f$
+   * @param top output Blob vector (length 2)
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the updated cell state @f$ c_t @f$, computed as:
+   *          i_t := \sigmoid[i_t']
+   *          f_t := \sigmoid[f_t']
+   *          o_t := \sigmoid[o_t']
+   *          g_t := \tanh[g_t']
+   *          c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the updated hidden state @f$ h_t @f$, computed as:
+   *          h_t := o_t .* \tanh[c_t]
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the LSTMUnit inputs.
+   *
+   * @param top output Blob vector (length 2), providing the error gradient with
+   *        respect to the outputs
+   *   -# @f$ (1 \times N \times D) @f$:
+   *      containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$
+   *      with respect to the updated cell state @f$ c_t @f$
+   *   -# @f$ (1 \times N \times D) @f$:
+   *      containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$
+ *      with respect to the updated hidden state @f$ h_t @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 3), into which the error gradients
+   *        with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate
+ *        inputs are computed.  Computation of the error gradients w.r.t.
+   *        the sequence indicators is not implemented.
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the error gradient w.r.t. the previous timestep cell state
+   *      @f$ c_{t-1} @f$
+   *   -# @f$ (1 \times N \times 4D) @f$
+   *      the error gradient w.r.t. the "gate inputs"
+   *      @f$ [
+   *          \frac{\partial E}{\partial i_t}
+   *          \frac{\partial E}{\partial f_t}
+   *          \frac{\partial E}{\partial o_t}
+   *          \frac{\partial E}{\partial g_t}
+   *          ] @f$
+   *   -# @f$ (1 \times N) @f$
+   *      the gradient w.r.t. the sequence continuation indicators
+   *      @f$ \delta_t @f$ is currently not computed.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief The hidden and output dimension.
+  int hidden_dim_;
+  Blob<Dtype> X_acts_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_LSTM_LAYER_HPP_
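
Reviewer note: a minimal standalone sketch of one LSTMUnit timestep, transcribing the equations in the doc comment above (illustrative only, not part of this diff; lstm_unit_step is a made-up name):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    static double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

    // One LSTMUnit timestep for a single stream with hidden dimension D.
    // gates holds the 4*D pre-activations [i', f', o', g']; cont is delta_t.
    void lstm_unit_step(const std::vector<double>& c_prev,
                        const std::vector<double>& gates, double cont,
                        std::vector<double>* c_t, std::vector<double>* h_t) {
      const size_t D = c_prev.size();
      for (size_t d = 0; d < D; ++d) {
        const double i = sigmoid(gates[0 * D + d]);
        const double f = sigmoid(gates[1 * D + d]);
        const double o = sigmoid(gates[2 * D + d]);
        const double g = std::tanh(gates[3 * D + d]);
        (*c_t)[d] = cont * f * c_prev[d] + i * g;  // c_t := cont*(f .* c_{t-1}) + i .* g
        (*h_t)[d] = o * std::tanh((*c_t)[d]);      // h_t := o .* tanh(c_t)
      }
    }

    int main() {
      std::vector<double> c_prev = {0.1, -0.2}, gates(8, 0.5), c(2), h(2);
      lstm_unit_step(c_prev, gates, /*cont=*/1.0, &c, &h);
      std::printf("c_t=(%.4f, %.4f) h_t=(%.4f, %.4f)\n", c[0], c[1], h[0], h[1]);
      return 0;
    }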

+ 63 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/memory_data_layer.hpp

@@ -0,0 +1,63 @@
+#ifndef CAFFE_MEMORY_DATA_LAYER_HPP_
+#define CAFFE_MEMORY_DATA_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/base_data_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Provides data to the Net from memory.
+ *
+ * TODO(dox): thorough documentation for Forward and proto params.
+ */
+template <typename Dtype>
+class MemoryDataLayer : public BaseDataLayer<Dtype> {
+ public:
+  explicit MemoryDataLayer(const LayerParameter& param)
+      : BaseDataLayer<Dtype>(param), has_new_data_(false) {}
+  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "MemoryData"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int ExactNumTopBlobs() const { return 2; }
+
+  virtual void AddDatumVector(const vector<Datum>& datum_vector);
+#ifdef USE_OPENCV
+  virtual void AddMatVector(const vector<cv::Mat>& mat_vector,
+      const vector<int>& labels);
+#endif  // USE_OPENCV
+
+  // Reset should accept const pointers, but can't, because the memory
+  //  will be given to Blob, which is mutable
+  void Reset(Dtype* data, Dtype* label, int n);
+  void set_batch_size(int new_size);
+
+  int batch_size() { return batch_size_; }
+  int channels() { return channels_; }
+  int height() { return height_; }
+  int width() { return width_; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  int batch_size_, channels_, height_, width_, size_;
+  Dtype* data_;
+  Dtype* labels_;
+  int n_;
+  size_t pos_;
+  Blob<Dtype> added_data_;
+  Blob<Dtype> added_label_;
+  bool has_new_data_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_MEMORY_DATA_LAYER_HPP_
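
Reviewer note: a hedged usage sketch for this layer's Reset() path (assumes a Caffe build where Net::Forward() and this Net constructor exist; the prototxt name and the position of the MemoryData layer are illustrative):

    #include <vector>
    #include <boost/shared_ptr.hpp>
    #include "caffe/caffe.hpp"
    #include "caffe/layers/memory_data_layer.hpp"

    int main() {
      // "deploy.prototxt" is a placeholder; its first layer must be MemoryData.
      caffe::Net<float> net("deploy.prototxt", caffe::TEST);
      boost::shared_ptr<caffe::MemoryDataLayer<float> > md =
          boost::dynamic_pointer_cast<caffe::MemoryDataLayer<float> >(
              net.layers()[0]);
      std::vector<float> data(md->batch_size() * md->channels() *
                              md->height() * md->width());
      std::vector<float> labels(md->batch_size(), 0.0f);
      // Reset keeps raw pointers (see the comment in the header), so the
      // buffers must outlive Forward(); n must be a multiple of the batch size.
      md->Reset(&data[0], &labels[0], md->batch_size());
      net.Forward();
      return 0;
    }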

+ 92 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/multinomial_logistic_loss_layer.hpp

@@ -0,0 +1,92 @@
+#ifndef CAFFE_MULTINOMIAL_LOGISTIC_LOSS_LAYER_HPP_
+#define CAFFE_MULTINOMIAL_LOGISTIC_LOSS_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/loss_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes the multinomial logistic loss for a one-of-many
+ *        classification task, directly taking a predicted probability
+ *        distribution as input.
+ *
+ * When predictions are not already a probability distribution, you should
+ * instead use the SoftmaxWithLossLayer, which maps predictions to a
+ * distribution using the SoftmaxLayer, before computing the multinomial
+ * logistic loss. The SoftmaxWithLossLayer should be preferred over separate
+ * SoftmaxLayer + MultinomialLogisticLossLayer
+ * as its gradient computation is more numerically stable.
+ *
+ * @param bottom input Blob vector (length 2)
+ *   -# @f$ (N \times C \times H \times W) @f$
+ *      the predictions @f$ \hat{p} @f$, a Blob with values in
+ *      @f$ [0, 1] @f$ indicating the predicted probability of each of the
+ *      @f$ K = CHW @f$ classes.  Each prediction vector @f$ \hat{p}_n @f$
+ *      should sum to 1 as in a probability distribution: @f$
+ *      \forall n \sum\limits_{k=1}^K \hat{p}_{nk} = 1 @f$.
+ *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+ *      the labels @f$ l @f$, an integer-valued Blob with values
+ *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+ *      indicating the correct class label among the @f$ K @f$ classes
+ * @param top output Blob vector (length 1)
+ *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+ *      the computed multinomial logistic loss: @f$ E =
+ *        \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n})
+ *      @f$
+ */
+template <typename Dtype>
+class MultinomialLogisticLossLayer : public LossLayer<Dtype> {
+ public:
+  explicit MultinomialLogisticLossLayer(const LayerParameter& param)
+      : LossLayer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "MultinomialLogisticLoss"; }
+
+ protected:
+  /// @copydoc MultinomialLogisticLossLayer
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the multinomial logistic loss error gradient w.r.t. the
+   *        predictions.
+   *
+   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
+   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
+   * if propagate_down[1] is set.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
+   *      as @f$ \lambda @f$ is the coefficient of this layer's output
+   *      @f$\ell_i@f$ in the overall Net loss
+   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
+   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
+   *      (*Assuming that this top Blob is not used as a bottom (input) by any
+   *      other layer of the Net.)
+   * @param propagate_down see Layer::Backward.
+   *      propagate_down[1] must be false as we can't compute gradients with
+   *      respect to the labels.
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the predictions @f$ \hat{p} @f$; Backward computes diff
+   *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
+   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+   *      the labels -- ignored as we can't compute their error gradients
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_MULTINOMIAL_LOGISTIC_LOSS_LAYER_HPP_
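
Reviewer note: the loss formula documented above, E = -(1/N) * sum_n log(p_hat[n][l_n]), as a standalone sketch; the epsilon clamp stands in for the log-threshold guard the CPU implementation applies (its exact constant is an assumption here):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    double multinomial_logistic_loss(
        const std::vector<std::vector<double> >& p_hat,
        const std::vector<int>& labels, double eps = 1e-12) {
      double loss = 0.0;
      for (size_t n = 0; n < p_hat.size(); ++n) {
        // Clamp the predicted probability away from zero before taking log.
        loss -= std::log(std::max(p_hat[n][labels[n]], eps));
      }
      return loss / p_hat.size();
    }

    int main() {
      std::vector<std::vector<double> > p = {{0.7, 0.2, 0.1}, {0.1, 0.8, 0.1}};
      std::vector<int> l = {0, 1};
      std::printf("loss = %f\n", multinomial_logistic_loss(p, l));
      return 0;
    }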

+ 48 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/mvn_layer.hpp

@@ -0,0 +1,48 @@
+#ifndef CAFFE_MVN_LAYER_HPP_
+#define CAFFE_MVN_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Normalizes the input to have 0-mean and/or unit (1) variance.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class MVNLayer : public Layer<Dtype> {
+ public:
+  explicit MVNLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "MVN"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  Blob<Dtype> mean_, variance_, temp_;
+
+  /// sum_multiplier is used to carry out sum using BLAS
+  Blob<Dtype> sum_multiplier_;
+  Dtype eps_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_MVN_LAYER_HPP_
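
Reviewer note: a standalone sketch of the per-sample normalization MVN performs (eps stands in for the layer's variance epsilon; its default value and exact placement are assumptions):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Normalize one sample to zero mean and (optionally) unit variance.
    void mvn(std::vector<double>* x, bool normalize_variance = true,
             double eps = 1e-9) {
      double mean = 0.0, var = 0.0;
      for (size_t i = 0; i < x->size(); ++i) mean += (*x)[i];
      mean /= x->size();
      for (size_t i = 0; i < x->size(); ++i) {
        const double d = (*x)[i] - mean;
        var += d * d;
      }
      var /= x->size();
      const double denom = normalize_variance ? std::sqrt(var + eps) : 1.0;
      for (size_t i = 0; i < x->size(); ++i) (*x)[i] = ((*x)[i] - mean) / denom;
    }

    int main() {
      std::vector<double> x = {1.0, 2.0, 3.0, 4.0};
      mvn(&x);
      std::printf("x[0]=%.4f x[3]=%.4f\n", x[0], x[3]);
      return 0;
    }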

+ 32 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/neuron_layer.hpp

@@ -0,0 +1,32 @@
+#ifndef CAFFE_NEURON_LAYER_HPP_
+#define CAFFE_NEURON_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief An interface for layers that take one blob as input (@f$ x @f$)
+ *        and produce one equally-sized blob as output (@f$ y @f$), where
+ *        each element of the output depends only on the corresponding input
+ *        element.
+ */
+template <typename Dtype>
+class NeuronLayer : public Layer<Dtype> {
+ public:
+  explicit NeuronLayer(const LayerParameter& param)
+     : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_NEURON_LAYER_HPP_

+ 45 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/parameter_layer.hpp

@@ -0,0 +1,45 @@
+#ifndef CAFFE_PARAMETER_LAYER_HPP_
+#define CAFFE_PARAMETER_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+class ParameterLayer : public Layer<Dtype> {
+ public:
+  explicit ParameterLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    if (this->blobs_.size() > 0) {
+      LOG(INFO) << "Skipping parameter initialization";
+    } else {
+      this->blobs_.resize(1);
+      this->blobs_[0].reset(new Blob<Dtype>());
+      this->blobs_[0]->Reshape(this->layer_param_.parameter_param().shape());
+    }
+    top[0]->Reshape(this->layer_param_.parameter_param().shape());
+  }
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) { }
+  virtual inline const char* type() const { return "Parameter"; }
+  virtual inline int ExactNumBottomBlobs() const { return 0; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    top[0]->ShareData(*(this->blobs_[0]));
+    top[0]->ShareDiff(*(this->blobs_[0]));
+  }
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom)
+  { }
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_PARAMETER_LAYER_HPP_

+ 61 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/pooling_layer.hpp

@@ -0,0 +1,61 @@
+#ifndef CAFFE_POOLING_LAYER_HPP_
+#define CAFFE_POOLING_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Pools the input image by taking the max, average, etc. within regions.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class PoolingLayer : public Layer<Dtype> {
+ public:
+  explicit PoolingLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Pooling"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  // MAX POOL layers can output an extra top blob for the mask;
+  // others can only output the pooled inputs.
+  virtual inline int MaxTopBlobs() const {
+    return (this->layer_param_.pooling_param().pool() ==
+            PoolingParameter_PoolMethod_MAX) ? 2 : 1;
+  }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int kernel_h_, kernel_w_;
+  int stride_h_, stride_w_;
+  int pad_h_, pad_w_;
+  int channels_;
+  int height_, width_;
+  int pooled_height_, pooled_width_;
+  bool global_pooling_;
+  PoolingParameter_RoundMode round_mode_;
+  Blob<Dtype> rand_idx_;
+  Blob<int> max_idx_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_POOLING_LAYER_HPP_
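
Reviewer note: the pooled output geometry implied by the kernel_/stride_/pad_ members above follows the standard formula; a standalone sketch (whether round_mode_ defaults to ceil or floor is left to the proto, so it is a parameter here):

    #include <cmath>
    #include <cstdio>

    // pooled = round((in + 2*pad - kernel) / stride) + 1, with ceil or floor
    // selected by the layer's round_mode_.
    int pooled_dim(int in, int pad, int kernel, int stride, bool use_ceil) {
      const double d = static_cast<double>(in + 2 * pad - kernel) / stride;
      return (use_ceil ? static_cast<int>(std::ceil(d))
                       : static_cast<int>(std::floor(d))) + 1;
    }

    int main() {
      // A 7x7 input, 3x3 kernel, stride 2, no padding -> 3x3 output either way.
      std::printf("pooled = %d\n", pooled_dim(7, 0, 3, 2, true));
      return 0;
    }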

+ 89 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/power_layer.hpp

@@ -0,0 +1,89 @@
+#ifndef CAFFE_POWER_LAYER_HPP_
+#define CAFFE_POWER_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes @f$ y = (\alpha x + \beta) ^ \gamma @f$,
+ *        as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$,
+ *        and power @f$ \gamma @f$.
+ */
+template <typename Dtype>
+class PowerLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides PowerParameter power_param,
+   *     with PowerLayer options:
+   *   - scale (\b optional, default 1) the scale @f$ \alpha @f$
+   *   - shift (\b optional, default 0) the shift @f$ \beta @f$
+   *   - power (\b optional, default 1) the power @f$ \gamma @f$
+   */
+  explicit PowerLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Power"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = (\alpha x + \beta) ^ \gamma
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the power inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} =
+   *            \frac{\partial E}{\partial y}
+   *            \alpha \gamma (\alpha x + \beta) ^ {\gamma - 1} =
+   *            \frac{\partial E}{\partial y}
+   *            \frac{\alpha \gamma y}{\alpha x + \beta}
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief @f$ \gamma @f$ from layer_param_.power_param()
+  Dtype power_;
+  /// @brief @f$ \alpha @f$ from layer_param_.power_param()
+  Dtype scale_;
+  /// @brief @f$ \beta @f$ from layer_param_.power_param()
+  Dtype shift_;
+  /// @brief Result of @f$ \alpha \gamma @f$
+  Dtype diff_scale_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_POWER_LAYER_HPP_
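
Reviewer note: the forward and backward formulas documented above, checked as a tiny standalone program (values are arbitrary):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double alpha = 2.0, beta = 1.0, gamma = 3.0;  // scale, shift, power
      const double x = 0.5, dE_dy = 1.0;                  // input, top diff
      // y = (alpha*x + beta)^gamma
      const double y = std::pow(alpha * x + beta, gamma);
      // dE/dx = dE/dy * alpha*gamma*(alpha*x + beta)^(gamma-1)
      const double dE_dx =
          dE_dy * alpha * gamma * std::pow(alpha * x + beta, gamma - 1);
      std::printf("y = %f, dE/dx = %f\n", y, dE_dx);  // y = 8, dE/dx = 24
      return 0;
    }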

+ 101 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/prelu_layer.hpp

@@ -0,0 +1,101 @@
+#ifndef CAFFE_PRELU_LAYER_HPP_
+#define CAFFE_PRELU_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Parameterized Rectified Linear Unit non-linearity @f$
+ *        y_i = \max(0, x_i) + a_i \min(0, x_i)
+ *        @f$. The differences from ReLULayer are 1) negative slopes are
+ *        learnable though backprop and 2) negative slopes can vary across
+ *        channels. The number of axes of input blob should be greater than or
+ *        equal to 2. The 1st axis (0-based) is seen as channels.
+ */
+template <typename Dtype>
+class PReLULayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides PReLUParameter prelu_param,
+   *     with PReLULayer options:
+   *   - filler (\b optional, FillerParameter,
+   *     default {'type': constant 'value':0.25}).
+   *   - channel_shared (\b optional, default false). If true, the negative
+   *     slopes are shared across channels.
+   */
+  explicit PReLULayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "PReLU"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times ...) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times ...) @f$
+   *      the computed outputs for each channel @f$i@f$ @f$
+   *        y_i = \max(0, x_i) + a_i \min(0, x_i)
+   *      @f$.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the PReLU inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times ...) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times ...) @f$
+   *      the inputs @f$ x @f$; For each channel @f$i@f$, backward fills their
+   *      diff with gradients @f$
+   *        \frac{\partial E}{\partial x_i} = \left\{
+   *        \begin{array}{lr}
+   *            a_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
+   *            \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i > 0
+   *        \end{array} \right.
+   *      @f$.
+   *      If param_propagate_down_[0] is true, it fills the diff with gradients
+   *      @f$
+   *        \frac{\partial E}{\partial a_i} = \left\{
+   *        \begin{array}{lr}
+   *            \sum_{x_i} x_i \frac{\partial E}{\partial y_i} & \mathrm{if} \; x_i \le 0 \\
+   *            0 & \mathrm{if} \; x_i > 0
+   *        \end{array} \right.
+   *      @f$.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  bool channel_shared_;
+  Blob<Dtype> multiplier_;  // dot multiplier for backward computation of params
+  Blob<Dtype> backward_buff_;  // temporary buffer for backward computation
+  Blob<Dtype> bottom_memory_;  // memory for in-place computation
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_PRELU_LAYER_HPP_
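
Reviewer note: the per-channel PReLU rule above, sketched standalone (the [channel][spatial] layout and shapes are illustrative; Caffe's actual blobs are flat arrays):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // y_i = max(0, x_i) + a_i * min(0, x_i), one learnable slope per channel.
    void prelu_forward(const std::vector<std::vector<double> >& x,
                       const std::vector<double>& a,
                       std::vector<std::vector<double> >* y) {
      for (size_t c = 0; c < x.size(); ++c)
        for (size_t i = 0; i < x[c].size(); ++i)
          (*y)[c][i] = std::max(0.0, x[c][i]) + a[c] * std::min(0.0, x[c][i]);
    }

    int main() {
      std::vector<std::vector<double> > x = {{-1.0, 2.0}}, y = {{0.0, 0.0}};
      std::vector<double> a = {0.25};
      prelu_forward(x, a, &y);
      std::printf("y = (%f, %f)\n", y[0][0], y[0][1]);  // (-0.25, 2.0)
      return 0;
    }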

+ 55 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/python_layer.hpp

@@ -0,0 +1,55 @@
+#ifndef CAFFE_PYTHON_LAYER_HPP_
+#define CAFFE_PYTHON_LAYER_HPP_
+
+#include <boost/python.hpp>
+#include <vector>
+
+#include "caffe/layer.hpp"
+
+namespace bp = boost::python;
+
+namespace caffe {
+
+template <typename Dtype>
+class PythonLayer : public Layer<Dtype> {
+ public:
+  PythonLayer(PyObject* self, const LayerParameter& param)
+      : Layer<Dtype>(param), self_(bp::handle<>(bp::borrowed(self))) { }
+
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    // Disallow PythonLayer in MultiGPU training stage, due to GIL issues
+    // Details: https://github.com/BVLC/caffe/issues/2936
+    if (this->phase_ == TRAIN && Caffe::solver_count() > 1
+        && !Caffe::multiprocess()) {
+      LOG(FATAL) << "PythonLayer does not support CLI Multi-GPU, use train.py";
+    }
+    self_.attr("param_str") = bp::str(
+        this->layer_param_.python_param().param_str());
+    self_.attr("phase") = static_cast<int>(this->phase_);
+    self_.attr("setup")(bottom, top);
+  }
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    self_.attr("reshape")(bottom, top);
+  }
+
+  virtual inline const char* type() const { return "Python"; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+    self_.attr("forward")(bottom, top);
+  }
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+    self_.attr("backward")(top, propagate_down, bottom);
+  }
+
+ private:
+  bp::object self_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_PYTHON_LAYER_HPP_

+ 187 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/recurrent_layer.hpp

@@ -0,0 +1,187 @@
+#ifndef CAFFE_RECURRENT_LAYER_HPP_
+#define CAFFE_RECURRENT_LAYER_HPP_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/net.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/util/format.hpp"
+
+namespace caffe {
+
+template <typename Dtype> class RecurrentLayer;
+
+/**
+ * @brief An abstract class for implementing recurrent behavior inside of an
+ *        unrolled network.  This Layer type cannot be instantiated -- instead,
+ *        you should use one of its implementations which defines the recurrent
+ *        architecture, such as RNNLayer or LSTMLayer.
+ */
+template <typename Dtype>
+class RecurrentLayer : public Layer<Dtype> {
+ public:
+  explicit RecurrentLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reset();
+
+  virtual inline const char* type() const { return "Recurrent"; }
+  virtual inline int MinBottomBlobs() const {
+    int min_bottoms = 2;
+    if (this->layer_param_.recurrent_param().expose_hidden()) {
+      vector<string> inputs;
+      this->RecurrentInputBlobNames(&inputs);
+      min_bottoms += inputs.size();
+    }
+    return min_bottoms;
+  }
+  virtual inline int MaxBottomBlobs() const { return MinBottomBlobs() + 1; }
+  virtual inline int ExactNumTopBlobs() const {
+    int num_tops = 1;
+    if (this->layer_param_.recurrent_param().expose_hidden()) {
+      vector<string> outputs;
+      this->RecurrentOutputBlobNames(&outputs);
+      num_tops += outputs.size();
+    }
+    return num_tops;
+  }
+
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    // Can't propagate to sequence continuation indicators.
+    return bottom_index != 1;
+  }
+
+ protected:
+  /**
+   * @brief Fills net_param with the recurrent network architecture.  Subclasses
+   *        should define this -- see RNNLayer and LSTMLayer for examples.
+   */
+  virtual void FillUnrolledNet(NetParameter* net_param) const = 0;
+
+  /**
+   * @brief Fills names with the names of the 0th timestep recurrent input
+   *        Blob&s.  Subclasses should define this -- see RNNLayer and LSTMLayer
+   *        for examples.
+   */
+  virtual void RecurrentInputBlobNames(vector<string>* names) const = 0;
+
+  /**
+   * @brief Fills shapes with the shapes of the recurrent input Blob&s.
+   *        Subclasses should define this -- see RNNLayer and LSTMLayer
+   *        for examples.
+   */
+  virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const = 0;
+
+  /**
+   * @brief Fills names with the names of the Tth timestep recurrent output
+   *        Blob&s.  Subclasses should define this -- see RNNLayer and LSTMLayer
+   *        for examples.
+   */
+  virtual void RecurrentOutputBlobNames(vector<string>* names) const = 0;
+
+  /**
+   * @brief Fills names with the names of the output blobs, concatenated across
+   *        all timesteps.  Should return a name for each top Blob.
+   *        Subclasses should define this -- see RNNLayer and LSTMLayer for
+   *        examples.
+   */
+  virtual void OutputBlobNames(vector<string>* names) const = 0;
+
+  /**
+   * @param bottom input Blob vector (length 2-3)
+   *
+   *   -# @f$ (T \times N \times ...) @f$
+   *      the time-varying input @f$ x @f$.  After the first two axes, whose
+   *      dimensions must correspond to the number of timesteps @f$ T @f$ and
+   *      the number of independent streams @f$ N @f$, respectively, its
+   *      dimensions may be arbitrary.  Note that the ordering of dimensions --
+   *      @f$ (T \times N \times ...) @f$, rather than
+   *      @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$
+   *      independent input streams must be "interleaved".
+   *
+   *   -# @f$ (T \times N) @f$
+   *      the sequence continuation indicators @f$ \delta @f$.
+   *      These inputs should be binary (0 or 1) indicators, where
+   *      @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream
+   *      @f$ n @f$ is the beginning of a new sequence, and hence the previous
+   *      hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$
+   *      and has no effect on the cell's output at timestep @f$ t @f$, and
+   *      a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of
+   *      stream @f$ n @f$ is a continuation from the previous timestep
+   *      @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the
+   *      updated hidden state and output.
+   *
+   *   -# @f$ (N \times ...) @f$ (optional)
+   *      the static (non-time-varying) input @f$ x_{static} @f$.
+   *      After the first axis, whose dimension must be the number of
+   *      independent streams, its dimensions may be arbitrary.
+   *      This is mathematically equivalent to using a time-varying input of
+   *      @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input
+   *      across the @f$ T @f$ timesteps and concatenating with the time-varying
+   *      input.  Note that if this input is used, all timesteps in a single
+   *      batch within a particular one of the @f$ N @f$ streams must share the
+   *      same static input, even if the sequence continuation indicators
+   *      suggest that different sequences are ending and beginning within a
+   *      single batch.  This may require padding and/or truncation for uniform
+   *      length.
+   *
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (T \times N \times D) @f$
+   *      the time-varying output @f$ y @f$, where @f$ D @f$ is
+   *      <code>recurrent_param.num_output()</code>.
+   *      Refer to documentation for particular RecurrentLayer implementations
+   *      (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief A Net to implement the Recurrent functionality.
+  shared_ptr<Net<Dtype> > unrolled_net_;
+
+  /// @brief The number of independent streams to process simultaneously.
+  int N_;
+
+  /**
+   * @brief The number of timesteps in the layer's input, and the number of
+   *        timesteps over which to backpropagate through time.
+   */
+  int T_;
+
+  /// @brief Whether the layer has a "static" input copied across all timesteps.
+  bool static_input_;
+
+  /**
+   * @brief The last layer to run in the network. (Any later layers are losses
+   *        added to force the recurrent net to do backprop.)
+   */
+  int last_layer_index_;
+
+  /**
+   * @brief Whether the layer's hidden state at the first and last timesteps
+   *        are layer inputs and outputs, respectively.
+   */
+  bool expose_hidden_;
+
+  vector<Blob<Dtype>* > recur_input_blobs_;
+  vector<Blob<Dtype>* > recur_output_blobs_;
+  vector<Blob<Dtype>* > output_blobs_;
+  Blob<Dtype>* x_input_blob_;
+  Blob<Dtype>* x_static_input_blob_;
+  Blob<Dtype>* cont_input_blob_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_RECURRENT_LAYER_HPP_
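
Reviewer note: a standalone sketch of assembling the sequence continuation indicators delta described above (the T x N layout matches the doc comment; everything else is illustrative):

    #include <cstdio>
    #include <vector>

    // delta[t][n] = 0 where a new sequence starts in stream n, 1 elsewhere.
    std::vector<std::vector<int> > make_cont(
        const std::vector<std::vector<bool> >& seq_start) {
      const size_t T = seq_start.size(), N = seq_start[0].size();
      std::vector<std::vector<int> > cont(T, std::vector<int>(N, 1));
      for (size_t t = 0; t < T; ++t)
        for (size_t n = 0; n < N; ++n)
          if (seq_start[t][n]) cont[t][n] = 0;
      return cont;
    }

    int main() {
      // Two streams over three timesteps; stream 1 restarts at t = 2.
      std::vector<std::vector<bool> > start = {
          {true, true}, {false, false}, {false, true}};
      std::vector<std::vector<int> > cont = make_cont(start);
      std::printf("cont[2] = (%d, %d)\n", cont[2][0], cont[2][1]);  // (1, 0)
      return 0;
    }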

+ 59 - 0
src/api/perception_lidar_cnn_segmentation/caffe/layers/reduction_layer.hpp

@@ -0,0 +1,59 @@
+#ifndef CAFFE_REDUCTION_LAYER_HPP_
+#define CAFFE_REDUCTION_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+/**
+ * @brief Compute "reductions" -- operations that return a scalar output Blob
+ *        for an input Blob of arbitrary size, such as the sum, absolute sum,
+ *        and sum of squares.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class ReductionLayer : public Layer<Dtype> {
+ public:
+  explicit ReductionLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Reduction"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief the reduction operation performed by the layer
+  ReductionParameter_ReductionOp op_;
+  /// @brief a scalar coefficient applied to all outputs
+  Dtype coeff_;
+  /// @brief the index of the first input axis to reduce
+  int axis_;
+  /// @brief the number of reductions performed
+  int num_;
+  /// @brief the input size of each reduction
+  int dim_;
+  /// @brief a helper Blob used for summation (op_ == SUM)
+  Blob<Dtype> sum_multiplier_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_REDUCTION_LAYER_HPP_
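
Reviewer note: a standalone sketch of the reductions this layer describes, each scaled by coeff_ (the op list SUM, ASUM, SUMSQ, MEAN is taken from ReductionParameter in caffe.proto; treating that enum as given):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    enum class Op { SUM, ASUM, SUMSQ, MEAN };

    double reduce(const std::vector<double>& x, Op op, double coeff = 1.0) {
      double r = 0.0;
      for (size_t i = 0; i < x.size(); ++i) {
        switch (op) {
          case Op::SUM:   r += x[i];                break;
          case Op::ASUM:  r += std::fabs(x[i]);     break;
          case Op::SUMSQ: r += x[i] * x[i];         break;
          case Op::MEAN:  r += x[i];                break;
        }
      }
      if (op == Op::MEAN) r /= x.size();
      return coeff * r;  // the layer applies its scalar coefficient last
    }

    int main() {
      std::vector<double> x = {1.0, -2.0, 3.0};
      std::printf("sum=%f asum=%f sumsq=%f mean=%f\n",
                  reduce(x, Op::SUM), reduce(x, Op::ASUM),
                  reduce(x, Op::SUMSQ), reduce(x, Op::MEAN));
      return 0;
    }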

Some files were not shown because too many files changed in this diff