@@ -0,0 +1,395 @@
+# 2022/10/26 by ausk
+"""
+An example that uses TensorRT's Python API to run YOLOP inference.
+"""
+import ctypes
+import os
+import shutil
+import random
+import sys
+import time
+import cv2
+import numpy as np
+import pycuda.autoinit
+import pycuda.driver as cuda
+import tensorrt as trt
+
+CONF_THRESH = 0.5
+IOU_THRESHOLD = 0.4
+
+
+def get_img_path_batches(batch_size, img_dir):
+    """Walk img_dir recursively and group the image paths into batches of batch_size."""
+    ret = []
+    batch = []
+    for root, dirs, files in os.walk(img_dir):
+        for name in files:
+            if len(batch) == batch_size:
+                ret.append(batch)
+                batch = []
+            batch.append(os.path.join(root, name))
+    if len(batch) > 0:
+        ret.append(batch)
+    return ret
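+# Usage sketch (the directory name is illustrative): with batch_size=4 and 10 images
+# under "samples/", this returns [[4 paths], [4 paths], [2 paths]]. A final partial
+# batch is smaller than batch_size and is not padded by YolopTRT.infer() below; with
+# an engine built for batch size 1, every batch is full and this never arises.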
+
+def plot_one_box(x, img, color=None, label=None, line_thickness=None):
+    """
+    description: Plots one bounding box on image img;
+                 this function comes from the YOLOv5 project.
+    """
+    tl = (line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1)  # line/font thickness
+    color = color or [random.randint(0, 255) for _ in range(3)]
+    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
+    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
+    if label:
+        tf = max(tl - 1, 1)  # font thickness
+        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
+        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
+        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
+        cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
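+# Minimal usage sketch for plot_one_box (the values are made up for illustration):
+#   img = np.zeros((360, 640, 3), dtype=np.uint8)
+#   plot_one_box([100, 50, 200, 150], img, label="car:0.87")
+# draws a box from (100, 50) to (200, 150) and a filled "car:0.87" caption above it.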
+
+class YolopTRT(object):
+    """
+    description: Wraps TensorRT ops, preprocess and postprocess ops.
+    """
+
+    def __init__(self, engine_file_path):
+        # Create a CUDA context on device 0.
+        self.ctx = cuda.Device(0).make_context()
+        stream = cuda.Stream()
+        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
+        runtime = trt.Runtime(TRT_LOGGER)
+
+        # Deserialize the engine from file
+        with open(engine_file_path, "rb") as f:
+            engine = runtime.deserialize_cuda_engine(f.read())
+        context = engine.create_execution_context()
+
+        host_inputs = []
+        cuda_inputs = []
+        host_outputs = []
+        cuda_outputs = []
+        bindings = []
+
+        for binding in engine:
+            print('binding: ', binding, engine.get_binding_shape(binding))
+            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
+            dtype = trt.nptype(engine.get_binding_dtype(binding))
+            # Allocate host and device buffers
+            host_mem = cuda.pagelocked_empty(size, dtype)
+            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
+            # Append the device buffer to device bindings.
+            bindings.append(int(cuda_mem))
+            # Append to the appropriate list.
+            if engine.binding_is_input(binding):
+                self.input_w = engine.get_binding_shape(binding)[-1]
+                self.input_h = engine.get_binding_shape(binding)[-2]
+                host_inputs.append(host_mem)
+                cuda_inputs.append(cuda_mem)
+            else:
+                host_outputs.append(host_mem)
+                cuda_outputs.append(cuda_mem)
+
+        # Hard-coded network input size and segmentation output size
+        # (these override the values read from the bindings above).
+        self.input_h = 384
+        self.input_w = 640
+        self.img_h = 360
+        self.img_w = 640
+
+        # Store
+        self.stream = stream
+        self.context = context
+        self.engine = engine
+        self.host_inputs = host_inputs
+        self.cuda_inputs = cuda_inputs
+        self.host_outputs = host_outputs
+        self.cuda_outputs = cuda_outputs
+        self.bindings = bindings
+        self.batch_size = engine.max_batch_size
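+        # Buffer sizing sketch (assuming a 1x3x384x640 float32 input binding):
+        # trt.volume gives 3 * 384 * 640 = 737280 elements, and with max_batch_size = 1
+        # the page-locked host buffer and its device mirror are 737280 * 4 bytes each,
+        # i.e. about 2.8 MiB.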
+
+    def infer(self, raw_image_generator):
+        # Make self the active context, pushing it on top of the context stack.
+        self.ctx.push()
+        # Restore
+        stream = self.stream
+        context = self.context
+        engine = self.engine
+        host_inputs = self.host_inputs
+        cuda_inputs = self.cuda_inputs
+        host_outputs = self.host_outputs
+        cuda_outputs = self.cuda_outputs
+        bindings = self.bindings
+        # Do image preprocess
+        batch_image_raw = []
+        batch_origin_h = []
+        batch_origin_w = []
+        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w], dtype=np.float32)
+        for i, image_raw in enumerate(raw_image_generator):
+            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
+            batch_image_raw.append(image_raw)
+            batch_origin_h.append(origin_h)
+            batch_origin_w.append(origin_w)
+            np.copyto(batch_input_image[i], input_image)
+        batch_input_image = np.ascontiguousarray(batch_input_image)
+
+        # Copy input image to host buffer
+        np.copyto(host_inputs[0], batch_input_image.ravel())
+        start = time.time()
+        # Transfer input data to the GPU.
+        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
+        # Run inference.
+        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
+        # Transfer predictions back from the GPU.
+        for i in range(len(host_outputs)):
+            cuda.memcpy_dtoh_async(host_outputs[i], cuda_outputs[i], stream)
+        # Synchronize the stream
+        stream.synchronize()
+        end = time.time()
+        # Remove any context from the top of the context stack, deactivating it.
+        self.ctx.pop()
+
+        # Split the three outputs: detections, drivable-area mask and lane mask.
+        detout = host_outputs[0]
+        segout = host_outputs[1].reshape((self.batch_size, self.img_h, self.img_w))
+        laneout = host_outputs[2].reshape((self.batch_size, self.img_h, self.img_w))
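+        # Layout sketch for host_outputs[0], matching the 6001-float slot consumed in
+        # post_process below: each image owns 6001 floats, where slot[0] is the number
+        # of valid boxes and the rest holds up to 1000 detections of 6 floats each
+        # (1 + 1000 * 6 = 6001): [center_x, center_y, w, h, conf, cls_id] in
+        # network-input coordinates. This is an assumption based on the slot size.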
+
+        # Do postprocess
+        for i in range(self.batch_size):
+            result_boxes, result_scores, result_classid = self.post_process(
+                detout[i * 6001: (i + 1) * 6001], batch_origin_h[i], batch_origin_w[i]
+            )
+
+            # Draw rectangles and labels on the original image
+            img = batch_image_raw[i]
+            nh = img.shape[0]
+            nw = img.shape[1]
+            for j in range(len(result_boxes)):
+                box = result_boxes[j]
+                # NOTE: relies on the global `categories` list defined in __main__.
+                label = "{}:{:.2f}".format(categories[int(result_classid[j])], result_scores[j])
+                plot_one_box(box, img, label=label)
+
+            # Overlay the drivable-area mask (green) and the lane mask (red).
+            seg = cv2.resize(segout[i], (nw, nh), interpolation=cv2.INTER_NEAREST)
+            lane = cv2.resize(laneout[i], (nw, nh), interpolation=cv2.INTER_NEAREST)
+            color_area = np.zeros_like(img)
+            color_area[seg == 1] = (0, 255, 0)
+            color_area[lane == 1] = (0, 0, 255)
+            color_mask = np.mean(color_area, 2)
+            img[color_mask != 0] = img[color_mask != 0] * 0.5 + color_area[color_mask != 0] * 0.5
+            img = img.astype(np.uint8)
+
+        return batch_image_raw, end - start
+
+    def destroy(self):
+        # Remove any context from the top of the context stack, deactivating it.
+        self.ctx.pop()
+
+    def get_raw_image(self, image_path_batch):
+        """Yield images read from a batch of image paths."""
+        for img_path in image_path_batch:
+            yield cv2.imread(img_path)
+
+    def get_raw_image_zeros(self, image_path_batch=None):
+        """Yield a batch of all-zero images, used for GPU warm-up."""
+        for _ in range(self.batch_size):
+            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
+
+    def preprocess_image(self, raw_bgr_image):
+        """Letterbox-resize a BGR image to the network input size and normalize it."""
+        image_raw = raw_bgr_image
+        h, w, c = image_raw.shape
+        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
+        # Calculate width and height and paddings
+        r_w = self.input_w / w
+        r_h = self.input_h / h
+        if r_h > r_w:
+            tw = self.input_w
+            th = int(r_w * h)
+            tx1 = tx2 = 0
+            ty1 = int((self.input_h - th) / 2)
+            ty2 = self.input_h - th - ty1
+        else:
+            tw = int(r_h * w)
+            th = self.input_h
+            tx1 = int((self.input_w - tw) / 2)
+            tx2 = self.input_w - tw - tx1
+            ty1 = ty2 = 0
+        # Resize along the long side while maintaining the aspect ratio
+        image = cv2.resize(image, (tw, th))
+        # Pad the short side with (114, 114, 114)
+        image = cv2.copyMakeBorder(
+            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (114, 114, 114)
+        )
+        image = image.astype(np.float32)
+        # Normalize to [0,1], then standardize with the ImageNet mean and std
+        image /= 255.0
+        image = (image - (0.485, 0.456, 0.406)) / (0.229, 0.224, 0.225)
+        # HWC to CHW format:
+        image = np.transpose(image, [2, 0, 1])
+        # CHW to NCHW format
+        image = np.expand_dims(image, axis=0)
+        # Convert the image to row-major order, also known as "C order":
+        image = np.ascontiguousarray(image)
+        return image, image_raw, h, w
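+    # Worked letterbox example (a hypothetical 1280x720 input frame):
+    # r_w = 640/1280 = 0.5 and r_h = 384/720 ~ 0.533, so r_h > r_w and width is the
+    # limiting side: tw = 640, th = int(0.5 * 720) = 360, tx1 = tx2 = 0,
+    # ty1 = int((384 - 360) / 2) = 12 and ty2 = 384 - 360 - 12 = 12, i.e. a 12-pixel
+    # gray band above and below the resized image.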
+
+    def xywh2xyxy(self, origin_h, origin_w, x):
+        """
+        description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2], where xy1 = top-left
+                     and xy2 = bottom-right, mapping them back to the original image.
+        param:
+            origin_h: height of original image
+            origin_w: width of original image
+            x: a numpy array of boxes, each row is a box [center_x, center_y, w, h]
+        return:
+            y: a numpy array of boxes, each row is a box [x1, y1, x2, y2]
+        """
+        y = np.zeros_like(x)
+        r_w = self.input_w / origin_w
+        r_h = self.input_h / origin_h
+        if r_h > r_w:
+            y[:, 0] = x[:, 0] - x[:, 2] / 2
+            y[:, 2] = x[:, 0] + x[:, 2] / 2
+            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
+            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
+            y /= r_w
+        else:
+            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
+            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
+            y[:, 1] = x[:, 1] - x[:, 3] / 2
+            y[:, 3] = x[:, 1] + x[:, 3] / 2
+            y /= r_h
+
+        return y
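+    # Continuing the hypothetical 1280x720 example: a detection at network-input
+    # coordinates (cx=320, cy=192, w=64, h=48) has the 12-pixel vertical pad removed
+    # from y and is divided by r_w = 0.5, giving x1 = (320 - 32) / 0.5 = 576,
+    # y1 = (192 - 24 - 12) / 0.5 = 312, x2 = (320 + 32) / 0.5 = 704 and
+    # y2 = (192 + 24 - 12) / 0.5 = 408 on the original image.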
+
+    def post_process(self, output, origin_h, origin_w):
+        # Get the number of boxes detected
+        num = int(output[0])
+        # Reshape the remainder to a two-dimensional ndarray, one row per box
+        pred = np.reshape(output[1:], (-1, 6))[:num, :]
+        # Do nms
+        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
+        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
+        result_scores = boxes[:, 4] if len(boxes) else np.array([])
+        result_classid = boxes[:, 5] if len(boxes) else np.array([])
+        return result_boxes, result_scores, result_classid
+
+    def bbox_iou(self, box1, box2, x1y1x2y2=True):
+        """
+        description: compute the IoU of two batches of bounding boxes
+        param:
+            box1: box coordinates (can be (x1, y1, x2, y2) or (x, y, w, h))
+            box2: box coordinates (can be (x1, y1, x2, y2) or (x, y, w, h))
+            x1y1x2y2: select the coordinate format
+        return:
+            iou: the computed IoU
+        """
+        if not x1y1x2y2:
+            # Transform from center and width to exact coordinates
+            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
+            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
+            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
+            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
+        else:
+            # Get the coordinates of bounding boxes
+            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
+            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
+
+        # Get the coordinates of the intersection rectangle
+        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
+        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
+        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
+        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
+        # Intersection area
+        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
+                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
+        # Union area
+        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
+        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
+
+        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
+
+        return iou
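+    # Worked IoU example with the +1 pixel convention used above: box1 = [0, 0, 10, 10]
+    # and box2 = [5, 5, 15, 15] (x1y1x2y2=True) intersect over (10 - 5 + 1)^2 = 36
+    # pixels, each has area 11 * 11 = 121, the union is 121 + 121 - 36 = 206, and
+    # IoU = 36 / 206 ~ 0.17, below the 0.4 threshold, so neither suppresses the other.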
+
+    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
+        """
+        description: Removes detections with an object confidence score lower than 'conf_thres'
+                     and performs Non-Maximum Suppression to further filter detections.
+        param:
+            prediction: detections, (center_x, center_y, w, h, conf, cls_id)
+            origin_h: original image height
+            origin_w: original image width
+            conf_thres: a confidence threshold to filter detections
+            nms_thres: an IoU threshold to filter detections
+        return:
+            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
+        """
+        # Keep the boxes with score >= conf_thres
+        boxes = prediction[prediction[:, 4] >= conf_thres]
+        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
+        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
+        # Clip the coordinates to the original image
+        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
+        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
+        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
+        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
+        # Object confidence
+        confs = boxes[:, 4]
+        # Sort by descending confidence
+        boxes = boxes[np.argsort(-confs)]
+        # Perform non-maximum suppression
+        keep_boxes = []
+        while boxes.shape[0]:
+            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
+            label_match = boxes[0, -1] == boxes[:, -1]
+            # Indices of boxes with lower confidence scores, large IoUs and matching labels
+            invalid = large_overlap & label_match
+            keep_boxes += [boxes[0]]
+            boxes = boxes[~invalid]
+        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
+        return boxes
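+    # Behaviour sketch: suppression is class-aware via label_match, so with
+    # nms_thres = 0.4 two overlapping boxes of the same class with IoU 0.6 collapse
+    # into the higher-confidence one, while boxes of different class ids survive
+    # even at the same overlap.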
+
+
+if __name__ == "__main__":
+    # Load the custom plugin library and the serialized engine
+    PLUGIN_LIBRARY = "build/libmyplugins.so"
+    engine_file_path = "build/yolop.trt"
+
+    print("usage: xxx <engine file> <plugin file> <image dir>")
+    print("[WARN] prepare your image_dir, such as: samples, or /home/user/jetson/tmp/YOLOP/inference/images")
+    IMAGE_DIR = "/home/user/jetson/tmp/YOLOP/inference/images"
+
+    if len(sys.argv) > 1:
+        engine_file_path = sys.argv[1]
+    if len(sys.argv) > 2:
+        PLUGIN_LIBRARY = sys.argv[2]
+    if len(sys.argv) > 3:
+        IMAGE_DIR = sys.argv[3]
+
+    ctypes.CDLL(PLUGIN_LIBRARY)
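+    # Loading the plugin library above must happen before YolopTRT.__init__
+    # deserializes the engine, so that TensorRT can resolve the custom plugin ops
+    # compiled into it.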
+
+    # Detection class labels for this engine
+    categories = ["car"]
+
+    # Recreate the output directory
+    if os.path.exists('output/'):
+        shutil.rmtree('output/')
+    os.makedirs('output/')
+
+    # Create a YolopTRT instance
+    yolop_wrapper = YolopTRT(engine_file_path)
+
+    try:
+        print('batch size is', yolop_wrapper.batch_size)
+
+        image_dir = IMAGE_DIR
+        image_path_batches = get_img_path_batches(yolop_wrapper.batch_size, image_dir)
+
+        # Warm up the GPU with a batch of zero images
+        for i in range(1):
+            batch_image_raw, use_time = yolop_wrapper.infer(yolop_wrapper.get_raw_image_zeros())
+            print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
+
+        for batch in image_path_batches:
+            batch_image_raw, use_time = yolop_wrapper.infer(yolop_wrapper.get_raw_image(batch))
+            for i, img_path in enumerate(batch):
+                parent, filename = os.path.split(img_path)
+                save_name = os.path.join('output', filename)
+                # Save image
+                cv2.imwrite(save_name, batch_image_raw[i])
+            print('input->{}, time->{:.2f}ms, saving into output/'.format(batch, use_time * 1000))
+    finally:
+        # Destroy the instance
+        yolop_wrapper.destroy()
+
+    print("done!")
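+# Example invocation (the script name and paths are illustrative):
+#   python3 yolop_trt.py build/yolop.trt build/libmyplugins.so samples/
+# Annotated images are written to output/.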