
Lane line detection

fujiankuan committed 2 years ago
parent
commit
54c07eda80
32 files changed, 3711 additions and 0 deletions
  1. src/detection/laneATT_trt/README.md (+119, -0)
  2. src/detection/laneATT_trt/cuda_utils.h (+14, -0)
  3. src/detection/laneATT_trt/gen_wts.py (+38, -0)
  4. src/detection/laneATT_trt/laneATT_trt.pro (+68, -0)
  5. src/detection/laneATT_trt/laneatt.cc (+360, -0)
  6. src/detection/laneATT_trt/laneatt.hh (+72, -0)
  7. src/detection/laneATT_trt/laneatt_to_onnx.py (+84, -0)
  8. src/detection/laneATT_trt/logging.h (+24, -0)
  9. src/detection/laneATT_trt/nms.cpp (+62, -0)
  10. src/detection/laneATT_trt/nms_kernel.cu (+193, -0)
  11. src/detection/laneATT_trt/para.hpp (+17, -0)
  12. src/detection/laneATT_trt/python/calibrate_camera.py (+163, -0)
  13. src/detection/laneATT_trt/python/calibrate_camera_new.py (+196, -0)
  14. src/detection/laneATT_trt/python/calibrate_camera_new_.py (+183, -0)
  15. src/detection/laneATT_trt/python/cameraID.py (+11, -0)
  16. src/detection/laneATT_trt/python/select_area.py (+58, -0)
  17. src/detection/laneATT_trt/python/show_coordinate.py (+100, -0)
  18. src/detection/laneATT_trt/python/show_coordinate_new.py (+101, -0)
  19. src/detection/laneATT_trt/python/show_coordinate_shot.py (+64, -0)
  20. src/detection/laneATT_trt/utils.h (+54, -0)
  21. src/detection/laneATT_trt/yaml/camera.yaml (+8, -0)
  22. src/detection/laneATT_trt/yaml/camera_middle_640_360.yaml (+12, -0)
  23. src/detection/laneATT_trt/yaml/init.yaml (+16, -0)
  24. src/detection/laneATT_trt/yaml/pers.yaml (+2, -0)
  25. src/detection/laneATT_trt/yaml/pers_ori.yaml (+10, -0)
  26. src/detection/laneATT_trt/yololayer.cu (+333, -0)
  27. src/detection/laneATT_trt/yololayer.h (+36, -0)
  28. src/detection/laneATT_trt/yolop.cpp (+490, -0)
  29. src/detection/laneATT_trt/yolop.hpp (+239, -0)
  30. src/detection/laneATT_trt/yolop_model.cpp (+4, -0)
  31. src/detection/laneATT_trt/yolop_model.hpp (+185, -0)
  32. src/detection/laneATT_trt/yolop_trt.py (+395, -0)

+ 119 - 0
src/detection/laneATT_trt/README.md

@@ -0,0 +1,119 @@
+YoloP
+=====
+
+The original pytorch model is from [hustvl/YOLOP](https://github.com/hustvl/YOLOP)
+
+## Authors
+
+<a href="https://github.com/ausk"><img src="https://avatars.githubusercontent.com/u/4545060?v=4?s=48" width="40px;" alt=""/></a>
+<a href="https://github.com/aliceint"><img src="https://avatars.githubusercontent.com/u/15520773?v=4?s=48" width="40px;" alt=""/></a>
+
+## 1. Prepare building environments
+
+Make sure you have installed `c++` (with C++11 support), `cmake`, `opencv` (4.x), `cuda` (10.x), and `nvinfer` (7.x).
+
+
+## 2. Build yolop
+
+Go to `yolop`.
+
+```
+mkdir build
+cd build
+
+cmake ..
+make
+```
+
+Now you can get `yolop` and `libmyplugins.so`.
+
+
+## 3. Test in C++
+
+Go to `yolop/build`.
+
+### 3.1 generate yolop.wts
+Download/Clone [YOLOP](https://github.com/hustvl/YOLOP)
+
+Edit `gen_wts.py` and change `YOLOP_BASE_DIR` to the real path of `YOLOP`.
+
+```
+# [WARN] Please download/clone YOLOP, then set YOLOP_BASE_DIR to the root of YOLOP
+python3 ../gen_wts.py
+```
+
+### 3.2 generate yolop.trt
+```
+./yolop -s yolop.wts  yolop.trt
+```
+
+Now you should have the following files: `libmyplugins.so`, `yolop`, `yolop.wts`, `yolop.trt`.
+
+
+### 3.3 test yolop.trt
+```
+mkdir ../results
+
+YOLOP_BASE_DIR=/home/user/jetson/tmp/YOLOP
+./yolop -d yolop.trt  $YOLOP_BASE_DIR/inference/images/
+```
+
+If successful, the output looks like the following (tested on `Jetson Xavier NX - Jetpack 4.4`):
+```
+1601ms # the first time is slow
+26ms   # then it is faster
+29ms
+27ms
+29ms
+29ms
+```
+
+![](https://user-images.githubusercontent.com/4545060/197756635-38348dc5-d8e7-4ae3-be56-6b231dd2f5db.jpg)
+
+
+## 4. Test in python3
+Go to `yolop`.
+
+Make sure you have installed `pycuda` and `tensorrt`, and modify `image_dir` to point to your image directory.
+
+```
+# usage: xxx <engine file> <plugin file> <image dir>
+
+python3 yolop_trt.py  build/yolop.trt  build/libmyplugins.so /home/user/jetson/tmp/YOLOP/inference/images
+```
+
+If successful, the output looks like the following (tested on `Jetson Xavier NX - Jetpack 4.4`):
+```
+usage: xxx <engine file> <plugin file> <image dir>
+[WARN] preaprea you image_dir, such as: samples, or /home/user/jetson/tmp/YOLOP/inference/images
+bingding:  data (3, 384, 640)
+bingding:  det (6001, 1, 1)
+bingding:  seg (1, 360, 640)
+bingding:  lane (1, 360, 640)
+batch size is 1
+warm_up->(384, 640, 3), time->1070.87ms
+input->['/home/user/jetson/tmp/YOLOP/inference/images/3c0e7240-96e390d2.jpg'], time->25.94ms, saving into output/
+input->['/home/user/jetson/tmp/YOLOP/inference/images/adb4871d-4d063244.jpg'], time->25.34ms, saving into output/
+input->['/home/user/jetson/tmp/YOLOP/inference/images/8e1c1ab0-a8b92173.jpg'], time->25.03ms, saving into output/
+input->['/home/user/jetson/tmp/YOLOP/inference/images/7dd9ef45-f197db95.jpg'], time->25.45ms, saving into output/
+input->['/home/user/jetson/tmp/YOLOP/inference/images/9aa94005-ff1d4c9a.jpg'], time->24.93ms, saving into output/
+input->['/home/user/jetson/tmp/YOLOP/inference/images/0ace96c3-48481887.jpg'], time->25.33ms, saving into output/
+done!
+```
+
+![](https://user-images.githubusercontent.com/4545060/198003852-204f3bae-18ad-44fb-9ecd-4a2a07a726a3.jpg)
+
+
+**Notice**: The C++ and Python results are not aligned for now!
+
+----------------------------------------
+
+```BibTeX
+@misc{2108.11250,
+Author = {Dong Wu and Manwen Liao and Weitian Zhang and Xinggang Wang},
+Title = {YOLOP: You Only Look Once for Panoptic Driving Perception},
+Year = {2021},
+Eprint = {arXiv:2108.11250},
+}
+```
+

+ 14 - 0
src/detection/laneATT_trt/cuda_utils.h

@@ -0,0 +1,14 @@
+#pragma once
+#include <cuda_runtime_api.h>
+#include <cassert>   // assert() used by CUDA_CHECK
+#include <iostream>  // std::cerr used by CUDA_CHECK
+
+#ifndef CUDA_CHECK
+#define CUDA_CHECK(callstr)\
+    {\
+        cudaError_t error_code = callstr;\
+        if (error_code != cudaSuccess) {\
+            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
+            assert(0);\
+        }\
+    }
+#endif  // CUDA_CHECK
+
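For reference, a minimal usage sketch of the `CUDA_CHECK` macro defined above. The buffer size is purely illustrative; any runtime call that returns `cudaError_t` can be wrapped the same way.

```cpp
#include <cassert>    // assert() used inside CUDA_CHECK
#include <iostream>   // std::cerr used inside CUDA_CHECK
#include "cuda_utils.h"

int main() {
    void* device_buffer = nullptr;
    const size_t buffer_bytes = 3 * 360 * 640 * sizeof(float);  // illustrative input-tensor size

    // A failing call prints the error code with file/line and aborts via assert(0).
    CUDA_CHECK(cudaMalloc(&device_buffer, buffer_bytes));
    CUDA_CHECK(cudaMemset(device_buffer, 0, buffer_bytes));
    CUDA_CHECK(cudaFree(device_buffer));
    return 0;
}
```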

+ 38 - 0
src/detection/laneATT_trt/gen_wts.py

@@ -0,0 +1,38 @@
+import os, sys
+import torch
+import struct
+
+# TODO: YOLOP_BASE_DIR is the root of YOLOP
+print("[WARN] Please download/clone YOLOP, then set YOLOP_BASE_DIR to the root of YOLOP")
+
+#YOLOP_BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+YOLOP_BASE_DIR = "/home/catarc/YOLOP"
+
+sys.path.append(YOLOP_BASE_DIR)
+from lib.models import get_net
+from lib.config import cfg
+
+
+# Initialize
+device = torch.device('cpu')
+# Load model
+model = get_net(cfg)
+checkpoint = torch.load(YOLOP_BASE_DIR + '/weights/End-to-end.pth', map_location=device)
+model.load_state_dict(checkpoint['state_dict'])
+# load to FP32
+model.float()
+model.to(device).eval()
+
+f = open('yolop.wts', 'w')
+f.write('{}\n'.format(len(model.state_dict().keys())))
+for k, v in model.state_dict().items():
+    vr = v.reshape(-1).cpu().numpy()
+    f.write('{} {} '.format(k, len(vr)))
+    for vv in vr:
+        f.write(' ')
+        f.write(struct.pack('>f',float(vv)).hex())
+    f.write('\n')
+
+f.close()
+
+print("save as yolop.wts")

+ 68 - 0
src/detection/laneATT_trt/laneATT_trt.pro

@@ -0,0 +1,68 @@
+QT -= gui
+CONFIG += c++11
+CONFIG -= app_bundle
+
+DEFINES += QT_DEPRECATED_WARNINGS
+
+QMAKE_LFLAGS += -no-pie
+
+
+
+#DEFINES += CPU_ONLY
+
+
+#system("cd ./caffe/proto ; protoc *.proto -I=./ --cpp_out=./ ; cd ../../ ; echo $PWD")
+
+SOURCES += yolop.cpp \
+            laneatt.cc \
+    ../../include/msgtype/rawpic.pb.cc \
+    ../../include/msgtype/lanearray.pb.cc
+
+
+
+INCLUDEPATH += \
+               /usr/include/aarch64-linux-gnu \
+               /usr/include/opencv4 \
+               /usr/local/cuda/include \
+               /usr/src/tensorrt/samples/common \
+               /usr/local/include \
+               /usr/include/opencv2 \
+               /usr/include/opencv \
+    ../../include/msgtype/rawpic.pb.h
+
+LIBS += /usr/local/cuda/lib64/libcudart.so \
+/usr/lib/aarch64-linux-gnu/libnvparsers.so \
+/usr/lib/aarch64-linux-gnu/libnvinfer.so \
+/usr/lib/aarch64-linux-gnu/libnvonnxparser.so \
+/usr/lib/aarch64-linux-gnu/libnvinfer_plugin.so
+
+LIBS += -L"/usr/local/lib" -lprotobuf
+
+HEADERS +=  laneatt.hh \
+    ../../include/msgtype/lanearray.pb.h
+
+
+LIBS += /usr/lib/aarch64-linux-gnu/libopencv_highgui.so \
+        /usr/lib/aarch64-linux-gnu/libopencv_core.so    \
+        /usr/lib/aarch64-linux-gnu/libopencv_imgproc.so \
+        /usr/lib/aarch64-linux-gnu/libopencv_imgcodecs.so \
+        /usr/lib/aarch64-linux-gnu/libopencv_videoio.so \
+        /usr/lib/aarch64-linux-gnu/libopencv_video.so \
+        /usr/lib/aarch64-linux-gnu/libopencv_calib3d.so \
+        /usr/local/cuda-10.2/lib64/*.so
+#LIBS += -L/usr/local/lib -lboost_system \
+
+
+!include(../../../include/common.pri ) {
+    error( "Couldn't find the common.pri file!" )
+}
+
+!include(../../../include/ivprotobuf.pri ) {
+    error( "Couldn't find the ivprotobuf.pri file!" )
+}
+!include(../../../include/ivopencv.pri ) {
+    error( "Couldn't find the ivopencv.pri file!" )
+}
+INCLUDEPATH += $$PWD/../../include/msgtype
+LIBS += -L/usr/lib/aarch64-linux-gnu/ -lglog
+

+ 360 - 0
src/detection/laneATT_trt/laneatt.cc

@@ -0,0 +1,360 @@
+#include "laneatt.hh"
+
+#include <fstream>
+#include <NvInferPlugin.h>
+#include <cuda_runtime_api.h>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+#include <map>
+
+#define INPUT_H 360
+#define INPUT_W 640
+#define N_OFFSETS 72
+#define N_STRIPS (N_OFFSETS - 1)
+#define MAX_COL_BLOCKS 1000
+
+//std::vector<cv::Point2f> warped_point, origin_point;
+
+//float METER_PER_PIXEL;
+
+//float BOARD_SIDE_LENGTH = 0.5;
+
+//cv::Mat get_pers_mat(std::string pers_file)
+//{
+//    cv::Point2f src[4];
+//    cv::Point2f dst[4];
+//    cv::FileStorage pf(pers_file, cv::FileStorage::READ);
+//    pf["board_left_top"]>>src[0];
+//    pf["board_left_bottom"]>>src[1];
+//    pf["board_right_top"]>>src[2];
+//    pf["board_right_bottom"]>>src[3];
+
+//    dst[1].x = src[0].x;
+//    dst[1].y = src[1].y;
+
+//    dst[3].x = src[2].x;
+//    dst[3].y = src[3].y;
+
+//    int side_length = dst[3].x - dst[1].x;
+
+//    METER_PER_PIXEL = BOARD_SIDE_LENGTH / side_length;
+
+//    dst[0].x = src[0].x;
+//    dst[0].y = dst[1].y - side_length;
+
+//    dst[2].x = src[2].x;
+//    dst[2].y = dst[3].y - side_length;
+
+//    cv::Mat pers_mat = getPerspectiveTransform(dst, src);
+//    cv::Mat pers_mat_inv = pers_mat.inv();
+
+//    return pers_mat_inv;
+//}
+
+//float DX (float x1, float x2, float dis) {
+//    return dis / (x2 - x1);
+//}
+
+//float DY (float y1, float y2, float dis) {
+//    return dis / (y2 - y1);
+//}
+
+
+
+//std::string PERS_FILE = "./yaml/pers.yaml";
+//cv::Mat pers_mat_inv = get_pers_mat(PERS_FILE);  // obtain the perspective transform
+
+LaneATT::LaneATT(const std::string& plan_path) {
+    buffer_size_[0] = 3 * INPUT_H * INPUT_W;
+    buffer_size_[1] = MAX_COL_BLOCKS * (5 + N_OFFSETS);
+
+    cudaMalloc(&buffers_[0], buffer_size_[0] * sizeof(float));
+    cudaMalloc(&buffers_[1], buffer_size_[1] * sizeof(float));
+    image_data_.resize(buffer_size_[0]);
+    detections_.resize(MAX_COL_BLOCKS);
+
+    cudaStreamCreate(&stream_);
+    LoadEngine(plan_path);
+}
+
+LaneATT::~LaneATT() {
+    cudaStreamDestroy(stream_);
+    for (auto& buffer : buffers_) {
+        cudaFree(buffer);
+    }
+    if (context_ != nullptr) {
+        context_->destroy();
+        engine_->destroy();
+    }
+}
+
+std::vector<std::vector<cv::Point2f>> LaneATT::DetectLane(const cv::Mat& raw_image) {
+    // Preprocessing
+    cv::Mat img_resize;
+    cv::resize(raw_image, img_resize, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
+    // img_resize.convertTo(img_resize, CV_32FC3, 1.0);
+    uint8_t* data_hwc = reinterpret_cast<uint8_t*>(img_resize.data);
+    float* data_chw = image_data_.data();
+    for (int c = 0; c < 3; ++c) {
+        for (unsigned j = 0, img_size = INPUT_H * INPUT_W; j < img_size; ++j) {
+            data_chw[c * img_size + j] = data_hwc[j * 3 + c] / 255.f;
+        }
+    }
+
+    // Do inference
+    cudaMemcpyAsync(buffers_[0], image_data_.data(), buffer_size_[0] * sizeof(float), cudaMemcpyHostToDevice, stream_);
+    cudaStreamSynchronize(stream_);  // make sure the input copy has finished before execute()
+    context_->execute(1, &buffers_[0]);
+    cudaMemcpyAsync(detections_.data(), buffers_[1], buffer_size_[1] * sizeof(float), cudaMemcpyDeviceToHost, stream_);
+    cudaStreamSynchronize(stream_);  // make sure detections_ is fully copied back before post-processing
+
+    // NMS and decoding
+    cv::Mat rst;
+    std::vector<std::vector<cv::Point2f>> lanes;
+    lanes = PostProcess(img_resize, raw_image);
+    return lanes;
+}
+
+void LaneATT::LoadEngine(const std::string& engine_file) {
+    std::ifstream in_file(engine_file, std::ios::binary);
+    if (!in_file.is_open()) {
+        std::cerr << "Failed to open engine file: " << engine_file << std::endl;
+        return;
+    }
+    in_file.seekg(0, in_file.end);
+    int length = in_file.tellg();
+    in_file.seekg(0, in_file.beg);
+    std::unique_ptr<char[]> trt_model_stream(new char[length]);
+    in_file.read(trt_model_stream.get(), length);
+    in_file.close();
+
+    // FIXME: getPluginCreator could not find plugin: ScatterND version: 1
+    initLibNvInferPlugins(&g_logger_, "");
+    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(g_logger_);
+    assert(runtime != nullptr);
+    engine_ = runtime->deserializeCudaEngine(trt_model_stream.get(), length, nullptr);
+    assert(engine_ != nullptr);
+    context_ = engine_->createExecutionContext();
+    assert(context_ != nullptr);
+
+    runtime->destroy();
+}
+
+float LaneIoU(const Detection& a, const Detection& b) {
+    int start_a = static_cast<int>(a.start_y * N_STRIPS + 0.5f);
+    int start_b = static_cast<int>(b.start_y * N_STRIPS + 0.5f);
+    int start = std::max(start_a, start_b);
+    int end_a = start_a + static_cast<int>(a.length + 0.5f) - 1;
+    int end_b = start_b + static_cast<int>(b.length + 0.5f) - 1;
+    int end = std::min(std::min(end_a, end_b), N_STRIPS);
+    // if (end < start) {
+    //     return 1.0f / 0.0f;
+    // }
+    float dist = 0.0f;
+    for (int i = start; i <= end; ++i) {
+        dist += fabs(a.lane_xs[i] - b.lane_xs[i]);
+    }
+    dist /= static_cast<float>(end - start + 1);
+    return dist;
+}
+
+std::vector<std::vector<cv::Point2f>> LaneATT::PostProcess(cv::Mat& lane_image, cv::Mat raw_image, float conf_thresh, float nms_thresh, int nms_topk) { // defaults (see laneatt.hh): conf_thresh=0.5, nms_thresh=50, nms_topk=4
+    // 1.Do NMS
+    std::vector<Detection> candidates;
+    std::vector<Detection> proposals;
+    for (auto det : detections_) {
+        if (det.score > conf_thresh) {
+            candidates.push_back(det);
+        }
+    }
+    // std::cout << candidates.size() << std::endl;
+    std::sort(candidates.begin(), candidates.end(), [=](const Detection& a, const Detection& b) { return a.score > b.score; });
+    for (int i = 0; i < candidates.size(); ++i) {
+        if (candidates[i].score < 0.0f) {
+            continue;
+        }
+        proposals.push_back(candidates[i]);
+        if (proposals.size() == nms_topk) {
+            break;
+        }
+        for (int j = i + 1; j < candidates.size(); ++j) {
+            if (candidates[j].score > 0.0f && LaneIoU(candidates[j], candidates[i]) < nms_thresh) {
+                candidates[j].score = -1.0f;
+            }
+        }
+    }
+
+    // 2.Decoding
+    std::vector<float> anchor_ys;
+    for (int i = 0; i < N_OFFSETS; ++i) {
+        anchor_ys.push_back(1.0f - i / float(N_STRIPS));
+    }
+    std::vector<std::vector<cv::Point2f>> lanes;
+    for (const auto& lane: proposals) { 
+        int start = static_cast<int>(lane.start_y * N_STRIPS + 0.5f);
+        int end = start + static_cast<int>(lane.length + 0.5f) - 1;
+        end = std::min(end, N_STRIPS);
+        std::vector<cv::Point2f> points;
+        for (int i = start; i <= end; ++i) {
+            points.push_back(cv::Point2f(lane.lane_xs[i], anchor_ys[i] * INPUT_H));
+        }
+        lanes.push_back(points);
+    }
+
+    std::vector<std::vector<cv::Point2f>> lanes1;
+    float y_ratio =  float(raw_image.size().height) / float(INPUT_H);
+    float x_ratio =  float(raw_image.size().width) / float(INPUT_W);
+    cv::Point2f pp;
+    for (const auto& lane_points : lanes) {
+        std::vector<cv::Point2f> points;
+        for (int i=0; i<lane_points.size(); i++)  {
+            pp.x =float(lane_points[i].x) *  x_ratio;
+            pp.y =float(lane_points[i].y) * y_ratio;
+            points.push_back(pp);
+        }
+        lanes1.push_back(points);
+    }
+    return lanes1;
+
+
+//    float y_ratio =  float(raw_image.size().height) / float(INPUT_H);
+//    float x_ratio =  float(raw_image.size().width) / float(INPUT_W);
+
+//    // 3.PerspectiveTransform
+
+//    std::vector<cv::Point2f> leftLane;
+//    std::vector<cv::Point2f> rightLane;
+//    std::vector<std::vector<cv::Point2f>> warped_lanes;
+
+//    for (int i = 0; i < lanes.size(); i++) {
+//        cv::perspectiveTransform(lanes[i], warped_point , pers_mat_inv); //  coordinate transform (input points, output points, transform matrix)
+//        warped_lanes.push_back(warped_point);
+//        double sum = 0.0;
+//        for (int i = 0 ; i < 5 ; i++){
+//            sum += warped_point[i].x;
+//        }
+//        double mean = sum / 5;
+//        if (mean < INPUT_W / 2){
+//            leftLane.push_back(cv::Point2f(mean, i));
+//        }else{
+//            rightLane.push_back(cv::Point2f(mean, i));
+//        }
+//    }
+//    float left_min = 0;
+//    float right_max = INPUT_W;
+//    int left_num = -1;
+//    int right_num = -1;
+//    int left_left = -1;
+//    int right_right = -1;
+//    for (const auto& points : leftLane){
+//        if(points.x > left_min){
+//            left_num = (int) round(points.y);
+//            left_min = points.x;
+//        }
+//    }
+//    for (const auto& points : leftLane){
+//        if((int)round(points.y) != left_num){
+//            left_left = points.y;
+//        }
+//    }
+//    if (left_num == -1){
+//        std::cout<<"left lane failed"<<std::endl;
+//    }
+//    for (const auto& points : rightLane){
+//        if(points.x < right_max){
+//            right_num = (int) round(points.y);
+//            right_max = points.x;
+//        }
+//    }
+//    for (const auto& points : rightLane){
+//        if((int)round(points.y) != right_num){
+//            right_right = points.y;
+//        }
+//    }
+//    if (right_num == -1){
+//        std::cout<<"right lane failed"<<std::endl;
+//    }
+
+//    // 4.Visualize
+
+//    cv::Point2f pp;
+//    cv::Point3f ppp;
+//    int flag = 0;
+//    cv::Mat img_warp;
+//    int DX, DY;
+//    std::vector<std::vector<cv::Point3f>> world_lanes;
+//    cv::warpPerspective(raw_image, img_warp, pers_mat_inv, raw_image.size());        //  image warp (input image, output image, transform matrix)
+//    for (const auto& lane_points : warped_lanes) {
+//        if(flag == left_num || flag == right_num){
+//            std::vector<cv::Point3f> world_points;
+//            for (const auto& point : lane_points) {
+//                pp.x =float(point.x);
+//                pp.y =float(point.y);
+//                cv::circle(img_warp, pp, 1, cv::Scalar(0, 255, 0), -1);
+//                ppp.x = (point.x - INPUT_W / 2) * DX;
+//                ppp.y = float((INPUT_H - point.y) * DY);
+//                ppp.z = float(0);
+//                world_points.push_back(ppp);
+//            }
+//            world_lanes.push_back(world_points);
+//        }else{
+//            std::vector<cv::Point3f> world_points;
+//            for (const auto& point : lane_points) {
+//                pp.x =float(point.x);
+//                pp.y =float(point.y);
+//                cv::circle(img_warp, pp, 1, cv::Scalar(0, 0, 255), -1);
+//                ppp.x = (point.x - INPUT_W / 2) * DX;
+//                ppp.y = float((INPUT_H - point.y) * DY);
+//                ppp.z = float(0);
+//                world_points.push_back(ppp);
+//            }
+//            world_lanes.push_back(world_points);
+//        }
+//        flag++;
+//    }
+//    cv::imshow("warp",img_warp);
+
+
+
+//    flag = 0;
+//    for (const auto& lane_points : lanes) {
+//        if(flag == left_num || flag == right_num){
+//            for (int i=0; i<lane_points.size()-1; i++)  {
+//                pp.x =float(lane_points[i].x) *  x_ratio;
+//                pp.y =float(lane_points[i].y) * y_ratio;
+//                cv::circle(raw_image, pp, 1, cv::Scalar(0, 255, 0), -1);
+
+//            }
+//        }else{
+//            for (int i=0; i<lane_points.size()-1; i++) {
+//                pp.x =float(lane_points[i].x) *  x_ratio;
+//                pp.y =float(lane_points[i].y) * y_ratio;
+//                cv::circle(raw_image, pp, 1, cv::Scalar(0, 0, 255), -1);
+//            }
+//        }
+//        flag++;
+//    }
+//    cv::Point2f p1, p2;
+//    flag = 0;
+//    for (const auto& lane_points : lanes) {
+//        if(flag == left_num || flag == right_num){
+//            for (int i=0; i<lane_points.size()-1; i++) {
+//                p1.x =float(lane_points[i].x) *  x_ratio;
+//                p1.y =float(lane_points[i].y) * y_ratio;
+//                p2.x =float(lane_points[i+1].x) *  x_ratio;
+//                p2.y =float(lane_points[i+1].y) *  y_ratio;
+//                cv::line(raw_image, p1, p2, cv::Scalar(0, 255, 0), 1);
+
+//            }
+//        }else{
+//            for (int i=0; i<lane_points.size()-1; i++) {
+//                p1.x =float(lane_points[i].x) *  x_ratio;
+//                p1.y =float(lane_points[i].y) * y_ratio;
+//                p2.x =float(lane_points[i+1].x) *  x_ratio;
+//                p2.y =float(lane_points[i+1].y) *  y_ratio;
+//                cv::line(raw_image, p1, p2, cv::Scalar(0, 0, 255), 1);
+//            }
+//        }
+//        flag++;
+//    }
+
+//    return raw_image;
+}

+ 72 - 0
src/detection/laneATT_trt/laneatt.hh

@@ -0,0 +1,72 @@
+#pragma once
+
+#include <NvInfer.h>
+#include <iostream>
+#include <memory>
+#include <opencv2/core.hpp>
+
+
+
+class Logger : public nvinfer1::ILogger {
+  public:
+    explicit Logger(Severity severity = Severity::kWARNING)
+        : reportable_severity(severity) {}
+
+    void log(Severity severity, const char* msg) noexcept {
+        if (severity > reportable_severity) {
+            return;
+        }
+        switch (severity) {
+            case Severity::kINTERNAL_ERROR:
+                std::cerr << "INTERNAL_ERROR: ";
+                break;
+            case Severity::kERROR:
+                std::cerr << "ERROR: ";
+                break;
+            case Severity::kWARNING:
+                std::cerr << "WARNING: ";
+                break;
+            case Severity::kINFO:
+                std::cerr << "INFO: ";
+                break;
+            default:
+                std::cerr << "UNKNOWN: ";
+                break;
+        }
+        std::cerr << msg << std::endl;
+    }
+    Severity reportable_severity;
+};
+
+struct Detection {
+    float unknown;
+    float score;
+    float start_y;
+    float start_x;
+    float length;
+    float lane_xs[72];
+};
+
+class LaneATT {
+  public:
+    LaneATT(const std::string& plan_path);
+
+    ~LaneATT();
+
+    std::vector<std::vector<cv::Point2f>> DetectLane(const cv::Mat& raw_image);
+
+  private:
+    void LoadEngine(const std::string& engine_file);
+
+    std::vector<std::vector<cv::Point2f>> PostProcess(cv::Mat& lane_image, cv::Mat raw_image, float conf_thresh=0.5f, float nms_thresh=50.f, int nms_topk=4);
+
+    Logger g_logger_;
+    cudaStream_t stream_;
+    nvinfer1::ICudaEngine* engine_;
+    nvinfer1::IExecutionContext* context_;
+    
+    void* buffers_[2];
+    int buffer_size_[2];
+    std::vector<float> image_data_;
+    std::vector<Detection> detections_;
+};
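A minimal usage sketch of the `LaneATT` class declared above, assuming a serialized TensorRT engine at `laneatt.trt` and a test image `test.jpg` (both paths are illustrative):

```cpp
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include "laneatt.hh"

int main() {
    LaneATT detector("laneatt.trt");               // load the serialized engine
    cv::Mat image = cv::imread("test.jpg");
    if (image.empty()) return 1;

    // Each returned lane is a polyline of points already rescaled by PostProcess
    // to the original image resolution, so it can be drawn directly.
    std::vector<std::vector<cv::Point2f>> lanes = detector.DetectLane(image);
    for (const auto& lane : lanes) {
        for (const auto& pt : lane) {
            cv::circle(image, pt, 2, cv::Scalar(0, 255, 0), -1);
        }
    }
    cv::imshow("lanes", image);
    cv::waitKey(0);
    return 0;
}
```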

+ 84 - 0
src/detection/laneATT_trt/laneatt_to_onnx.py

@@ -0,0 +1,84 @@
+import torch
+
+from lib.models.laneatt import LaneATT
+
+
+class LaneATTONNX(torch.nn.Module):
+    def __init__(self, model):
+        super(LaneATTONNX, self).__init__()
+        # Params
+        self.fmap_h = model.fmap_h  # 11
+        self.fmap_w = model.fmap_w  # 20
+        self.anchor_feat_channels = model.anchor_feat_channels  # 64
+        self.anchors = model.anchors
+        self.cut_xs = model.cut_xs
+        self.cut_ys = model.cut_ys
+        self.cut_zs = model.cut_zs
+        self.invalid_mask = model.invalid_mask
+        # Layers
+        self.feature_extractor = model.feature_extractor
+        self.conv1 = model.conv1
+        self.cls_layer = model.cls_layer
+        self.reg_layer = model.reg_layer
+        self.attention_layer = model.attention_layer
+
+        # Exporting the operator eye to ONNX opset version 11 is not supported
+        attention_matrix = torch.eye(1000)
+        self.non_diag_inds = torch.nonzero(attention_matrix == 0., as_tuple=False)
+        self.non_diag_inds = self.non_diag_inds[:, 1] + 1000 * self.non_diag_inds[:, 0]  # 999000
+
+    def forward(self, x):
+        batch_features = self.feature_extractor(x)
+        batch_features = self.conv1(batch_features)
+        # batch_anchor_features = self.cut_anchor_features(batch_features)
+        batch_anchor_features = batch_features[0].flatten()
+        # h, w = batch_features.shape[2:4]  # 12, 20
+        batch_anchor_features = batch_anchor_features[self.cut_xs + 20 * self.cut_ys + 12 * 20 * self.cut_zs].\
+            view(1000, self.anchor_feat_channels, self.fmap_h, 1)
+        # batch_anchor_features[self.invalid_mask] = 0
+        batch_anchor_features = batch_anchor_features * torch.logical_not(self.invalid_mask)
+
+        # Join proposals from all images into a single proposals features batch
+        batch_anchor_features = batch_anchor_features.view(-1, self.anchor_feat_channels * self.fmap_h)
+
+        # Add attention features
+        softmax = torch.nn.Softmax(dim=1)
+        scores = self.attention_layer(batch_anchor_features)
+        attention = softmax(scores)
+        attention_matrix = torch.zeros(1000 * 1000, device=x.device)
+        attention_matrix[self.non_diag_inds] = attention.flatten()  # ScatterND
+        attention_matrix = attention_matrix.view(1000, 1000)
+        attention_features = torch.matmul(torch.transpose(batch_anchor_features, 0, 1),
+                                          torch.transpose(attention_matrix, 0, 1)).transpose(0, 1)
+        batch_anchor_features = torch.cat((attention_features, batch_anchor_features), dim=1)
+
+        # Predict
+        cls_logits = self.cls_layer(batch_anchor_features)
+        reg = self.reg_layer(batch_anchor_features)
+
+        # Add offsets to anchors (1000, 2+2+73)
+        reg_proposals = torch.cat([softmax(cls_logits), self.anchors[:, 2:4], self.anchors[:, 4:] + reg], dim=1)
+
+        return reg_proposals
+
+
+def export_onnx(onnx_file_path):
+    # e.g. laneatt_r18_culane
+    backbone_name = 'resnet18'
+    checkpoint_file_path = 'experiments/laneatt_r18_culane/models/model_0015.pt'
+    anchors_freq_path = 'culane_anchors_freq.pt'
+
+    # Load specified checkpoint
+    model = LaneATT(backbone=backbone_name, anchors_freq_path=anchors_freq_path, topk_anchors=1000)
+    checkpoint = torch.load(checkpoint_file_path)
+    model.load_state_dict(checkpoint['model'])
+    model.eval()
+
+    # Export to ONNX
+    onnx_model = LaneATTONNX(model)
+    dummy_input = torch.randn(1, 3, 360, 640)
+    torch.onnx.export(onnx_model, dummy_input, onnx_file_path, opset_version=11)
+
+
+if __name__ == '__main__':
+    export_onnx('./LaneATT_test.onnx')

+ 24 - 0
src/detection/laneATT_trt/logging.h

@@ -0,0 +1,24 @@
+// created by ausk(jinlj) 2022/10/25
+#pragma once
+
+#include "NvInferRuntimeCommon.h"
+#include <cassert>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+using Severity = nvinfer1::ILogger::Severity;
+
+class Logger : public nvinfer1::ILogger
+{
+public:
+    void log(Severity severity, const char* msg) noexcept override
+    {
+        if (severity < Severity::kINFO) {
+            std::cout << msg << std::endl;
+        }
+    }
+};

+ 62 - 0
src/detection/laneATT_trt/nms.cpp

@@ -0,0 +1,62 @@
+/* Copyright (c) 2018, Grégoire Payen de La Garanderie, Durham University
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <torch/extension.h>
+#include <torch/types.h>
+#include <iostream>
+
+std::vector<at::Tensor> nms_cuda_forward(
+        at::Tensor boxes,
+        at::Tensor idx,
+        float nms_overlap_thresh,
+        unsigned long top_k);
+
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+std::vector<at::Tensor> nms_forward(
+        at::Tensor boxes,
+        at::Tensor scores,
+        float thresh,
+        unsigned long top_k) {
+
+
+    auto idx = std::get<1>(scores.sort(0,true));
+
+    CHECK_INPUT(boxes);
+    CHECK_INPUT(idx);
+
+    return nms_cuda_forward(boxes, idx, thresh, top_k);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("nms_forward", &nms_forward, "NMS");
+}
+

+ 193 - 0
src/detection/laneATT_trt/nms_kernel.cu

@@ -0,0 +1,193 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+#include <iostream>
+
+// Hard-coded maximum. Increase if needed.
+#define MAX_COL_BLOCKS 1000
+#define STRIDE 4
+#define N_OFFSETS 72 // if you use more than 73 offsets you will have to adjust this value
+#define N_STRIPS (N_OFFSETS - 1)
+#define PROP_SIZE (5 + N_OFFSETS)
+#define DATASET_OFFSET 0
+
+#define DIVUP(m,n) (((m)+(n)-1) / (n))
+int64_t const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+// The functions below originates from Fast R-CNN
+// See https://github.com/rbgirshick/py-faster-rcnn
+// Copyright (c) 2015 Microsoft
+// Licensed under The MIT License
+// Written by Shaoqing Ren
+
+template <typename scalar_t>
+// __device__ inline scalar_t devIoU(scalar_t const * const a, scalar_t const * const b) {
+__device__ inline bool devIoU(scalar_t const * const a, scalar_t const * const b, const float threshold) {
+  const int start_a = (int) (a[2] * N_STRIPS - DATASET_OFFSET + 0.5); // 0.5 rounding trick
+  const int start_b = (int) (b[2] * N_STRIPS - DATASET_OFFSET + 0.5);
+  const int start = max(start_a, start_b);
+  const int end_a = start_a + a[4] - 1 + 0.5 - ((a[4] - 1) < 0); //  - (x<0) trick to adjust for negative numbers (in case length is 0)
+  const int end_b = start_b + b[4] - 1 + 0.5 - ((b[4] - 1) < 0);
+  const int end = min(min(end_a, end_b), N_OFFSETS - 1);
+  // if (end < start) return 1e9;
+  if (end < start) return false;
+  scalar_t dist = 0;
+  for(unsigned char i = 5 + start; i <= 5 + end; ++i) {
+    if (a[i] < b[i]) {
+      dist += b[i] - a[i];
+    } else {
+      dist += a[i] - b[i];
+    }
+  }
+  // return (dist / (end - start + 1)) < threshold;
+  return dist < (threshold * (end - start + 1));
+  // return dist / (end - start + 1);
+}
+
+template <typename scalar_t>
+__global__ void nms_kernel(const int64_t n_boxes, const scalar_t nms_overlap_thresh,
+                           const scalar_t *dev_boxes, const int64_t *idx, int64_t *dev_mask) {
+  const int64_t row_start = blockIdx.y;
+  const int64_t col_start = blockIdx.x;
+
+  if (row_start > col_start) return;
+
+  const int row_size =
+        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  __shared__ scalar_t block_boxes[threadsPerBlock * PROP_SIZE];
+  if (threadIdx.x < col_size) {
+    for (int i = 0; i <  PROP_SIZE; ++i) {
+      block_boxes[threadIdx.x * PROP_SIZE + i] = dev_boxes[idx[(threadsPerBlock * col_start + threadIdx.x)] * PROP_SIZE + i];
+    }
+  //   block_boxes[threadIdx.x * 4 + 0] =
+  //       dev_boxes[idx[(threadsPerBlock * col_start + threadIdx.x)] * 4 + 0];
+  //   block_boxes[threadIdx.x * 4 + 1] =
+  //       dev_boxes[idx[(threadsPerBlock * col_start + threadIdx.x)] * 4 + 1];
+  //   block_boxes[threadIdx.x * 4 + 2] =
+  //       dev_boxes[idx[(threadsPerBlock * col_start + threadIdx.x)] * 4 + 2];
+  //   block_boxes[threadIdx.x * 4 + 3] =
+  //       dev_boxes[idx[(threadsPerBlock * col_start + threadIdx.x)] * 4 + 3];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const scalar_t *cur_box = dev_boxes + idx[cur_box_idx] * PROP_SIZE;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (devIoU(cur_box, block_boxes + i * PROP_SIZE, nms_overlap_thresh)) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+
+__global__ void nms_collect(const int64_t boxes_num, const int64_t col_blocks, int64_t top_k, const int64_t *idx, const int64_t *mask, int64_t *keep, int64_t *parent_object_index, int64_t *num_to_keep) {
+  int64_t remv[MAX_COL_BLOCKS];
+  int64_t num_to_keep_ = 0;
+
+  for (int i = 0; i < col_blocks; i++) {
+      remv[i] = 0;
+  }
+
+  for (int i = 0; i < boxes_num; ++i) {
+      parent_object_index[i] = 0;
+  }
+
+  for (int i = 0; i < boxes_num; i++) {
+    int nblock = i / threadsPerBlock;
+    int inblock = i % threadsPerBlock;
+
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      int64_t idxi = idx[i];
+      keep[num_to_keep_] = idxi;
+      const int64_t *p = &mask[0] + i * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {
+        remv[j] |= p[j];
+      }
+      for (int j = i; j < boxes_num; j++) {
+        int nblockj = j / threadsPerBlock;
+        int inblockj = j % threadsPerBlock;
+        if (p[nblockj] & (1ULL << inblockj))
+            parent_object_index[idx[j]] = num_to_keep_+1;
+      }
+      parent_object_index[idx[i]] = num_to_keep_+1;
+
+      num_to_keep_++;
+
+      if (num_to_keep_==top_k)
+          break;
+    }
+  }
+
+  // Initialize the rest of the keep array to avoid uninitialized values.
+  for (int i = num_to_keep_; i < boxes_num; ++i)
+      keep[i] = 0;
+
+  *num_to_keep = min(top_k,num_to_keep_);
+}
+
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+
+std::vector<at::Tensor> nms_cuda_forward(
+        at::Tensor boxes,
+        at::Tensor idx,
+        float nms_overlap_thresh,
+        unsigned long top_k) {
+
+  const auto boxes_num = boxes.size(0);
+  TORCH_CHECK(boxes.size(1) == PROP_SIZE, "Wrong number of offsets. Please adjust `PROP_SIZE`");
+
+  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
+
+  AT_ASSERTM (col_blocks < MAX_COL_BLOCKS, "The number of column blocks must be less than MAX_COL_BLOCKS. Increase the MAX_COL_BLOCKS constant if needed.");
+
+  auto longOptions = torch::TensorOptions().device(torch::kCUDA).dtype(torch::kLong);
+  auto mask = at::empty({boxes_num * col_blocks}, longOptions);
+
+  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
+              DIVUP(boxes_num, threadsPerBlock));
+  dim3 threads(threadsPerBlock);
+
+  CHECK_CONTIGUOUS(boxes);
+  CHECK_CONTIGUOUS(idx);
+  CHECK_CONTIGUOUS(mask);
+
+  AT_DISPATCH_FLOATING_TYPES(boxes.type(), "nms_cuda_forward", ([&] {
+    nms_kernel<<<blocks, threads>>>(boxes_num,
+                                    (scalar_t)nms_overlap_thresh,
+                                    boxes.data<scalar_t>(),
+                                    idx.data<int64_t>(),
+                                    mask.data<int64_t>());
+  }));
+
+  auto keep = at::empty({boxes_num}, longOptions);
+  auto parent_object_index = at::empty({boxes_num}, longOptions);
+  auto num_to_keep = at::empty({}, longOptions);
+
+  nms_collect<<<1, 1>>>(boxes_num, col_blocks, top_k,
+                        idx.data<int64_t>(),
+                        mask.data<int64_t>(),
+                        keep.data<int64_t>(),
+                        parent_object_index.data<int64_t>(),
+                        num_to_keep.data<int64_t>());
+
+
+  return {keep,num_to_keep,parent_object_index};
+}
+

+ 17 - 0
src/detection/laneATT_trt/para.hpp

@@ -0,0 +1,17 @@
+#include <chrono>
+#include "cuda_utils.h"
+#include "logging.h"
+#include "utils.h"
+
+static const int INPUT_H = Yolo::INPUT_H;
+static const int INPUT_W = Yolo::INPUT_W;
+static const int IMG_H = Yolo::IMG_H;
+static const int IMG_W = Yolo::IMG_W;
+static const int CLASS_NUM = Yolo::CLASS_NUM;
+//static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1;  // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
+static const int OUTPUT_SIZE = 77000;
+const char* INPUT_BLOB_NAME = "input";
+const char* OUTPUT_DET_NAME = "split_for_trace_model_pred";
+const char* OUTPUT_SEG_NAME = "seg";
+const char* OUTPUT_LANE_NAME = "ll";
+static Logger gLogger;
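For orientation, a sketch of how the blob names and sizes above are typically resolved against a deserialized engine. This is not the commit's code (`yolop.cpp` is the authoritative consumer); it assumes `yololayer.h` provides the `Yolo::` constants used by `para.hpp` and that the seg/lane outputs are single-channel `IMG_H x IMG_W` maps.

```cpp
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include "yololayer.h"   // assumed to define the Yolo:: constants referenced by para.hpp
#include "para.hpp"

// Sketch: look up the binding indices for the blob names declared in para.hpp and
// allocate one device buffer per binding. getBindingIndex returns -1 if a name is
// not present in the engine.
void AllocateBindings(nvinfer1::ICudaEngine* engine, void* buffers[4]) {
    const int input_index = engine->getBindingIndex(INPUT_BLOB_NAME);   // "input"
    const int det_index   = engine->getBindingIndex(OUTPUT_DET_NAME);   // "split_for_trace_model_pred"
    const int seg_index   = engine->getBindingIndex(OUTPUT_SEG_NAME);   // "seg"
    const int lane_index  = engine->getBindingIndex(OUTPUT_LANE_NAME);  // "ll"

    CUDA_CHECK(cudaMalloc(&buffers[input_index], 3 * INPUT_H * INPUT_W * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers[det_index],   OUTPUT_SIZE * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers[seg_index],   IMG_H * IMG_W * sizeof(float)));  // assumed single-channel map
    CUDA_CHECK(cudaMalloc(&buffers[lane_index],  IMG_H * IMG_W * sizeof(float)));  // assumed single-channel map
}
```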

+ 163 - 0
src/detection/laneATT_trt/python/calibrate_camera.py

@@ -0,0 +1,163 @@
+#!/usr/bin/python3
+"""
+~~~~~~~~~~~~~~~~~~
+Camera calibration
+~~~~~~~~~~~~~~~~~~
+
+Usage:
+    python calib.py \
+        -i /dev/video0 \
+        -grid 9x6 \
+        -out fisheye.yaml \
+        -framestep 20 \
+        -resolution 640x480
+        --fisheye
+"""
+import argparse
+import os
+import numpy as np
+import yaml
+import cv2
+import sys
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    # parser.add_argument("-i", "--input", type=int, default=0,
+                        # help="input camera device")
+    parser.add_argument("-grid", "--grid", default="10x7", help="size of the grid (rows x cols)")
+    parser.add_argument("-framestep", type=int, default=20, help="use every nth frame in the video")
+    parser.add_argument("-o", "--output", default="./yaml",
+                        help="path to output yaml file")
+    # parser.add_argument("-resolution", "--resolution", default="640x480",
+    #                     help="resolution of the camera")
+    parser.add_argument("-fisheye", "--fisheye", default=True,action="store_true",
+                        help="set ture if this is a fisheye camera")
+
+    args = parser.parse_args()
+
+   
+    cap = "v4l2src device=/dev/video0 ! video/x-raw, width=(int)1280, height=(int)720 ! videoconvert ! appsink"
+
+    if not os.path.exists(args.output):
+        os.mkdir(args.output)
+
+    
+    source = cv2.VideoCapture(cap)
+
+    W=1280
+    H=720
+    source.set(3, W)
+    source.set(4, H)
+
+    grid_size = tuple(int(x) for x in args.grid.split("x"))
+    grid_points = np.zeros((np.prod(grid_size), 3), np.float32)
+    grid_points[:, :2] = np.indices(grid_size).T.reshape(-1, 2)
+
+    objpoints = []  # 3d point in real world space
+    imgpoints = []  # 2d points in image plane
+
+    quit = False
+    do_calib = False
+    i = -1
+    while True:
+        i += 1
+        retcode, img = source.read()
+        if not retcode:
+            raise ValueError("cannot read frame from video")
+        H, W, _ = img.shape
+        if i % args.framestep != 0:
+            continue
+
+        print("searching for chessboard corners in frame " + str(i) + "...")
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        found, corners = cv2.findChessboardCorners(
+            gray,
+            grid_size,
+            cv2.CALIB_CB_ADAPTIVE_THRESH +
+            cv2.CALIB_CB_NORMALIZE_IMAGE +
+            cv2.CALIB_CB_FILTER_QUADS
+        )
+        if found:
+            term = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_COUNT, 30, 0.01)
+            cv2.cornerSubPix(gray, corners, (5, 5), (-1, -1), term)
+            print("OK")
+            imgpoints.append(corners.reshape(1, -1, 2))
+            objpoints.append(grid_points.reshape(1, -1, 3))
+            cv2.drawChessboardCorners(img, grid_size, corners, found)
+        text1 = "press c to calibrate"
+        text2 = "press q to quit"
+        text3 = "device: {}".format(cap)
+        fontscale = 0.6
+        cv2.putText(img, text1, (20, 70), cv2.FONT_HERSHEY_SIMPLEX, fontscale, (255, 200, 0), 2)
+        cv2.putText(img, text2, (20, 110), cv2.FONT_HERSHEY_SIMPLEX, fontscale, (255, 200, 0), 2)
+        cv2.putText(img, text3, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, fontscale, (255, 200, 0), 2)
+        cv2.imshow("corners", img)
+        key = cv2.waitKey(1) & 0xFF
+        if key == ord("c"):
+            do_calib = True
+            break
+
+        elif key == ord("q"):
+            quit = True
+            break
+
+    if quit:
+        source.release()
+        cv2.destroyAllWindows()
+
+    if do_calib:
+        print("\nPerforming calibration...\n")
+        N_OK = len(objpoints)
+        if N_OK < 12:
+            print("Less than 12 corners detected, calibration failed")
+            return
+
+        K = np.zeros((3, 3))
+        D = np.zeros((4, 1))
+        rvecs = [np.zeros((1, 1, 3), dtype=np.float64) for _ in range(N_OK)]
+        tvecs = [np.zeros((1, 1, 3), dtype=np.float64) for _ in range(N_OK)]
+        calibration_flags = (cv2.fisheye.CALIB_RECOMPUTE_EXTRINSIC +
+                             cv2.fisheye.CALIB_CHECK_COND +
+                             cv2.fisheye.CALIB_FIX_SKEW)
+
+        # Compute the intrinsic matrix and distortion coefficients
+        if True:
+            ret, K, D, rvecs, tvecs = cv2.fisheye.calibrate(
+                objpoints,
+                imgpoints,
+                (W, H),
+                K,
+                D,
+                rvecs,
+                tvecs,
+                calibration_flags,
+                (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 1e-6)
+            )
+        else:
+            ret, K, D, rvecs, tvecs = cv2.calibrateCamera(
+                objpoints,
+                imgpoints,
+                (W, H),
+                None,
+                None)
+        if ret:
+            print(ret)
+            print(K)
+            data = {"dim": np.array([W, H]).tolist(), "K": K.tolist(), "D": D.tolist()}
+            fname = "./yaml/camera.yaml"
+            print(fname)
+            with open(fname, "w") as f:
+                yaml.safe_dump(data,f)
+            print("succesfully saved camera data")
+
+            cv2.putText(img, "Success!", (220 , 240), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 2)
+        else:
+            cv2.putText(img, "Failed!", (220, 240), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 2)
+
+        cv2.imshow("corners", img)
+        cv2.waitKey(0)
+
+
+if __name__ == "__main__":
+    main()

+ 196 - 0
src/detection/laneATT_trt/python/calibrate_camera_new.py

@@ -0,0 +1,196 @@
+#!/usr/bin/python3
+"""
+~~~~~~~~~~~~~~~~~~
+Camera calibration
+~~~~~~~~~~~~~~~~~~
+
+Usage:
+    python calib.py \
+        -i /dev/video0 \
+        -grid 9x6 \
+        -out fisheye.yaml \
+        -framestep 20 \
+        -resolution 640x480
+        --fisheye
+"""
+import argparse
+import os
+import numpy as np
+import yaml
+import cv2
+import sys
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    # parser.add_argument("-i", "--input", type=int, default=0,
+                        # help="input camera device")
+    parser.add_argument("-grid", "--grid", default="10x7", help="size of the grid (rows x cols)")
+    parser.add_argument("-framestep", type=int, default=20, help="use every nth frame in the video")
+    parser.add_argument("-o", "--output", default="./yaml",
+                        help="path to output yaml file")
+    # parser.add_argument("-resolution", "--resolution", default="640x480",
+    #                     help="resolution of the camera")
+    parser.add_argument("-fisheye", "--fisheye", default=True,action="store_true",
+                        help="set ture if this is a fisheye camera")
+
+    args = parser.parse_args()
+
+   
+    cap = "v4l2src device=/dev/video0 ! video/x-raw, width=(int)1280, height=(int)720 ! videoconvert ! appsink"
+
+    if not os.path.exists(args.output):
+        os.mkdir(args.output)
+
+    
+    source = cv2.VideoCapture(cap)
+
+    W=1280
+    H=720
+    source.set(3, W)
+    source.set(4, H)
+
+    grid_size = tuple(int(x) for x in args.grid.split("x"))
+    grid_points = np.zeros((np.prod(grid_size), 3), np.float32)
+    grid_points[:, :2] = np.indices(grid_size).T.reshape(-1, 2)
+
+    objpoints = []  # 3d point in real world space
+    imgpoints = []  # 2d points in image plane
+
+    quit = False
+    do_calib = False
+    i = -1
+    while True:
+        i += 1
+        retcode, img = source.read()
+        if not retcode:
+            raise ValueError("cannot read frame from video")
+        #img = img[120:480, 240:880]
+        # print(img.shape)
+        H, W, _ = img.shape
+        if i % args.framestep != 0:
+            continue
+
+        print("searching for chessboard corners in frame " + str(i) + "...")
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        found, corners = cv2.findChessboardCorners(
+            gray,
+            grid_size,
+            cv2.CALIB_CB_ADAPTIVE_THRESH +
+            cv2.CALIB_CB_NORMALIZE_IMAGE +
+            cv2.CALIB_CB_FILTER_QUADS
+        )
+        if found:
+            term = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_COUNT, 30, 0.01)
+            cv2.cornerSubPix(gray, corners, (5, 5), (-1, -1), term)
+            print("OK")
+            imgpoints.append(corners.reshape(1, -1, 2))
+            objpoints.append(grid_points.reshape(1, -1, 3))
+            cv2.drawChessboardCorners(img, grid_size, corners, found)
+        text1 = "press c to calibrate"
+        text2 = "press q to quit"
+        text3 = "device: {}".format(cap)
+        fontscale = 0.6
+        cv2.putText(img, text1, (20, 70), cv2.FONT_HERSHEY_SIMPLEX, fontscale, (255, 200, 0), 2)
+        cv2.putText(img, text2, (20, 110), cv2.FONT_HERSHEY_SIMPLEX, fontscale, (255, 200, 0), 2)
+        cv2.putText(img, text3, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, fontscale, (255, 200, 0), 2)
+        cv2.imshow("corners", img)
+
+        print("\nPerforming calibration...\n")
+        N_OK = len(objpoints)
+        if N_OK > 12:
+            print("More than 12 corners detected")
+
+            K = np.zeros((3, 3))
+            D = np.zeros((4, 1))
+            rvecs = [np.zeros((1, 1, 3), dtype=np.float64) for _ in range(N_OK)]
+            tvecs = [np.zeros((1, 1, 3), dtype=np.float64) for _ in range(N_OK)]
+            calibration_flags = (cv2.fisheye.CALIB_RECOMPUTE_EXTRINSIC +
+                                cv2.fisheye.CALIB_CHECK_COND +
+                                cv2.fisheye.CALIB_FIX_SKEW)
+
+            try:
+                ret, K, D, rvecs, tvecs = cv2.fisheye.calibrate(
+                    objpoints,
+                    imgpoints,
+                    (W, H),
+                    K,
+                    D,
+                    rvecs,
+                    tvecs,
+                    calibration_flags,
+                    (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 1e-5)
+                )
+            except:
+                print("Do not press C!!!!")
+
+
+        key = cv2.waitKey(1) & 0xFF
+        if key == ord("c"):
+            do_calib = True
+            break
+
+        elif key == ord("q"):
+            quit = True
+            break
+
+    if quit:
+        source.release()
+        cv2.destroyAllWindows()
+
+    if do_calib:
+        print("\nPerforming calibration...\n")
+        N_OK = len(objpoints)
+        if N_OK < 12:
+            print("Less than 12 corners detected, calibration failed")
+            return
+
+        K = np.zeros((3, 3))
+        D = np.zeros((4, 1))
+        rvecs = [np.zeros((1, 1, 3), dtype=np.float64) for _ in range(N_OK)]
+        tvecs = [np.zeros((1, 1, 3), dtype=np.float64) for _ in range(N_OK)]
+        calibration_flags = (cv2.fisheye.CALIB_RECOMPUTE_EXTRINSIC +
+                             cv2.fisheye.CALIB_CHECK_COND +
+                             cv2.fisheye.CALIB_FIX_SKEW)
+
+        # Compute the intrinsic matrix and distortion coefficients
+        if True:
+            ret, K, D, rvecs, tvecs = cv2.fisheye.calibrate(
+                objpoints,
+                imgpoints,
+                (W, H),
+                K,
+                D,
+                rvecs,
+                tvecs,
+                calibration_flags,
+                (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 1e-5)
+            )
+
+        else:
+            ret, K, D, rvecs, tvecs = cv2.calibrateCamera(
+                objpoints,
+                imgpoints,
+                (W, H),
+                None,
+                None)
+        if ret:
+            print(ret)
+            print(K)
+            data = {"dim": np.array([W, H]).tolist(), "K": K.tolist(), "D": D.tolist()}
+            fname = "./yaml/camera.yaml"
+            print(fname)
+            with open(fname, "w") as f:
+                yaml.safe_dump(data,f)
+            print("succesfully saved camera data")
+
+            cv2.putText(img, "Success!", (220 , 240), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 2)
+        else:
+            cv2.putText(img, "Failed!", (220, 240), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 2)
+
+        cv2.imshow("corners", img)
+        cv2.waitKey(0)
+
+
+if __name__ == "__main__":
+    main()

+ 183 - 0
src/detection/laneATT_trt/python/calibrate_camera_new_.py

@@ -0,0 +1,183 @@
+#!/usr/bin/python3
+"""
+~~~~~~~~~~~~~~~~~~
+Camera calibration
+~~~~~~~~~~~~~~~~~~
+
+Usage:
+    python calib.py \
+        -i /dev/video0 \
+        -grid 9x6 \
+        -out fisheye.yaml \
+        -framestep 20 \
+        -resolution 640x480
+        --fisheye
+"""
+import argparse
+import os
+import numpy as np
+import yaml
+import cv2
+import sys
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    # parser.add_argument("-i", "--input", type=int, default=0,
+                        # help="input camera device")
+    parser.add_argument("-grid", "--grid", default="10x7", help="size of the grid (rows x cols)")
+    parser.add_argument("-framestep", type=int, default=20, help="use every nth frame in the video")
+    parser.add_argument("-o", "--output", default="./yaml",
+                        help="path to output yaml file")
+    # parser.add_argument("-resolution", "--resolution", default="640x480",
+    #                     help="resolution of the camera")
+    parser.add_argument("-fisheye", "--fisheye", default=True,action="store_true",
+                        help="set ture if this is a fisheye camera")
+
+    args = parser.parse_args()
+
+   
+    cap = "v4l2src device=/dev/video0 ! video/x-raw, width=(int)1280, height=(int)720 ! videoconvert ! appsink"
+
+    if not os.path.exists(args.output):
+        os.mkdir(args.output)
+
+    
+    source = cv2.VideoCapture(cap)
+
+    W=1280
+    H=720
+    source.set(3, W)
+    source.set(4, H)
+
+    grid_size = tuple(int(x) for x in args.grid.split("x"))
+    grid_points = np.zeros((np.prod(grid_size), 3), np.float32)
+    grid_points[:, :2] = np.indices(grid_size).T.reshape(-1, 2)
+
+    objpoints = []  # 3d point in real world space
+    imgpoints = []  # 2d points in image plane
+
+    quit = False
+    do_calib = False
+    i = -1
+    while True:
+        i += 1
+        retcode, img = source.read()
+        if not retcode:
+            raise ValueError("cannot read frame from video")
+        img = img[410-90:590+90, 380+35:1020+35]
+        #print(img.shape)
+        H, W, _ = img.shape
+        if i % args.framestep != 0:
+            continue
+
+        print("searching for chessboard corners in frame " + str(i) + "...")
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        found, corners = cv2.findChessboardCorners(
+            gray,
+            grid_size,
+            cv2.CALIB_CB_ADAPTIVE_THRESH +
+            cv2.CALIB_CB_NORMALIZE_IMAGE +
+            cv2.CALIB_CB_FILTER_QUADS
+        )
+        if found:
+            term = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_COUNT, 30, 0.01)
+            cv2.cornerSubPix(gray, corners, (5, 5), (-1, -1), term)
+            print("OK")
+            imgpoints.append(corners.reshape(1, -1, 2))
+            objpoints.append(grid_points.reshape(1, -1, 3))
+            cv2.drawChessboardCorners(img, grid_size, corners, found)
+        text1 = "press c to calibrate"
+        text2 = "press q to quit"
+        text3 = "device: {}".format(cap)
+        fontscale = 0.6
+        cv2.putText(img, text1, (20, 70), cv2.FONT_HERSHEY_SIMPLEX, fontscale, (255, 200, 0), 2)
+        cv2.putText(img, text2, (20, 110), cv2.FONT_HERSHEY_SIMPLEX, fontscale, (255, 200, 0), 2)
+        cv2.putText(img, text3, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, fontscale, (255, 200, 0), 2)
+        cv2.imshow("corners", img)
+
+        print("\nPerforming calibration...\n")
+        N_OK = len(objpoints)
+        if N_OK > 12:
+            print("More than 12 corners detected")
+
+            K = np.zeros((3, 3))
+            D = np.zeros((4, 1))
+            rvecs = [np.zeros((1, 1, 3), dtype=np.float64) for _ in range(N_OK)]
+            tvecs = [np.zeros((1, 1, 3), dtype=np.float64) for _ in range(N_OK)]
+
+
+            try:
+                ret, K, D, rvecs, tvecs = cv2.calibrateCamera(
+                    objpoints,
+                    imgpoints,
+                    (W, H),
+                    None,
+                    None
+                )
+            except:
+                print("Do not press C!!!!")
+
+
+        key = cv2.waitKey(1) & 0xFF
+        if key == ord("c"):
+            do_calib = True
+            break
+
+        elif key == ord("q"):
+            quit = True
+            break
+
+    if quit:
+        source.release()
+        cv2.destroyAllWindows()
+
+    if do_calib:
+        print("\nPerforming calibration...\n")
+        N_OK = len(objpoints)
+        if N_OK < 12:
+            print("Less than 12 corners detected, calibration failed")
+            return
+
+        K = np.zeros((3, 3))
+        D = np.zeros((4, 1))
+        rvecs = [np.zeros((1, 1, 3), dtype=np.float64) for _ in range(N_OK)]
+        tvecs = [np.zeros((1, 1, 3), dtype=np.float64) for _ in range(N_OK)]
+        
+
+        # Compute the intrinsic matrix and distortion coefficients
+        if True:
+            ret, K, D, rvecs, tvecs = cv2.calibrateCamera(
+                objpoints,
+                imgpoints,
+                (W, H),
+                None,
+                None)
+
+        else:
+            ret, K, D, rvecs, tvecs = cv2.calibrateCamera(
+                objpoints,
+                imgpoints,
+                (W, H),
+                None,
+                None)
+        if ret:
+            print(ret)
+            print(K)
+            data = {"dim": np.array([W, H]).tolist(), "K": K.tolist(), "D": D.tolist()}
+            fname = "./yaml/camera.yaml"
+            print(fname)
+            with open(fname, "w") as f:
+                yaml.safe_dump(data,f)
+            print("succesfully saved camera data")
+
+            cv2.putText(img, "Success!", (220 , 240), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 2)
+        else:
+            cv2.putText(img, "Failed!", (220, 240), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 2)
+
+        cv2.imshow("corners", img)
+        cv2.waitKey(0)
+
+
+if __name__ == "__main__":
+    main()

+ 11 - 0
src/detection/laneATT_trt/python/cameraID.py

@@ -0,0 +1,11 @@
+import cv2
+
+# Probe device indices until one returns a frame, then print its ID.
+# Note: cv2.CAP_DSHOW is the DirectShow backend (Windows only).
+ID = 0
+while True:
+    cap = cv2.VideoCapture(ID, cv2.CAP_DSHOW)
+    # try to grab a frame to check whether this device works
+    ret, frame = cap.read()
+    cap.release()
+    if not ret:
+        ID += 1
+    else:
+        print(ID)
+        break

+ 58 - 0
src/detection/laneATT_trt/python/select_area.py

@@ -0,0 +1,58 @@
+import cv2
+import yaml
+import numpy as np
+import sys
+import os
+
+
+def test():
+
+    cap = cv2.VideoCapture("v4l2src device=/dev/video0 ! video/x-raw, width=(int)1280, height=(int)720 ! videoconvert ! appsink")
+    while True:
+        ret, frame = cap.read()
+        #frame = frame[410-90:590+90, 380+35:1020+35]
+        cv2.imshow("capture", frame)
+        if cv2.waitKey(10) & 0xFF == ord('q'):
+            img = frame
+            cv2.imwrite('img.jpg', frame)  
+            break
+    cap.release()
+    cv2.destroyAllWindows()
+    #img = cv2.imread('img.jpg')
+
+
+    def on_EVENT_LBUTTONDOWN(event, x, y, flags, param):
+        if event == cv2.EVENT_LBUTTONDOWN:
+            xy = "%d,%d" % (x, y)
+            cv2.circle(img, (x, y), 1, (255, 0, 0), thickness=-1)
+            print((x,y))
+            cv2.putText(img, xy, (x, y), cv2.FONT_HERSHEY_PLAIN,
+                        1.5, (255, 255, 255), thickness=2)
+            cv2.imshow("image", img)
+
+    cv2.namedWindow("image", cv2.WINDOW_AUTOSIZE)
+    cv2.setMouseCallback("image", on_EVENT_LBUTTONDOWN)
+    cv2.imshow("image", img)
+
+    while True:
+        try:
+            key = cv2.waitKey(10) & 0xFF  # poll the keyboard once per iteration
+            if key == ord('q'):
+                break
+            elif key == ord('s'):
+                cv2.imwrite('img_select.jpg', img)
+                break
+        except Exception:
+            cv2.destroyWindow("image")
+            break
+
+
+
+if __name__=='__main__':
+    test()
+
+
+
+
+
+
+

+ 100 - 0
src/detection/laneATT_trt/python/show_coordinate.py

@@ -0,0 +1,100 @@
+import cv2
+import yaml
+import numpy as np
+import sys
+import os
+
+
+def test():
+
+
+    camera_files = "./yaml/camera_ori.yaml"
+    print("files",camera_files)
+    #W, H = 640, 480
+    W, H = 1280, 720
+    init_files=(os.path.realpath(sys.path[0]) + "/yaml/init.yaml")
+    
+    with open(init_files, "r") as f:
+        init_data = yaml.safe_load(f)
+   
+    source = init_data["source"]
+    print("source=",source)
+
+
+
+
+    #cap = cv2.VideoCapture("v4l2src device=/dev/video0 ! video/x-raw, width=(int)1280, height=(int)720 ! videoconvert ! appsink")
+    cap = cv2.VideoCapture("/home/nvidia/modularization/src/detection/laneATT_trt/test/camera_test.mp4")
+
+
+
+
+
+    #with open(camera_files, "r") as f:
+    #    data = yaml.load(f,Loader=yaml.FullLoader)
+    #K = np.array(data["K"])
+    #D = np.array(data["D"])    
+    
+    data = cv2.FileStorage(camera_files, cv2.FILE_STORAGE_READ)
+    K = data.getNode("cameraMatrix").mat()
+    D = data.getNode("distCoeffs").mat()
+    data.release()
+    print("K=",K)
+    print("D=",D)
+
+    map1, map2 = cv2.fisheye.initUndistortRectifyMap(
+            K,
+            D,
+            np.eye(3),
+            K,
+            (W, H),
+            cv2.CV_16SC2
+    )
+    
+
+    while True:
+        ret, frame = cap.read()
+        if not ret:  # end of the video or a failed read
+            break
+        frame = cv2.resize(frame, (W, H))
+        #frame = cv2.remap(frame, map1, map2, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT)
+        cv2.imshow("capture", frame)
+        if cv2.waitKey(10) & 0xFF == ord('q'):
+            img = frame
+            cv2.imwrite('img.jpg', frame)  # save the frame as an image
+            break
+    cap.release()
+    cv2.destroyAllWindows()
+    #img = cv2.imread('img.jpg')
+
+
+    def on_EVENT_LBUTTONDOWN(event, x, y, flags, param):
+        if event == cv2.EVENT_LBUTTONDOWN:
+            xy = "%d,%d" % (x, y)
+            cv2.circle(img, (x, y), 1, (255, 0, 0), thickness=-1)
+            print((x,y))
+            cv2.putText(img, xy, (x, y), cv2.FONT_HERSHEY_PLAIN,
+                        1.5, (255, 255, 255), thickness=2)
+            cv2.imshow("image", img)
+
+    cv2.namedWindow("image", cv2.WINDOW_AUTOSIZE)
+    cv2.setMouseCallback("image", on_EVENT_LBUTTONDOWN)
+    cv2.imshow("image", img)
+
+    while (True):
+        try:
+            if cv2.waitKey(10) & 0xFF == ord('q'):
+                break
+        except Exception:
+            cv2.destroyWindow("image")
+            break
+
+
+
+if __name__=='__main__':
+    test()
+
+
+
+
+
+
+

+ 101 - 0
src/detection/laneATT_trt/python/show_coordinate_new.py

@@ -0,0 +1,101 @@
+import cv2
+import yaml
+import numpy as np
+import sys
+import os
+
+
+def test():
+
+
+    camera_files = "./yaml/camera_middle_640_360.yaml"
+    print("files",camera_files)
+    W, H = 640, 360
+    #W, H = 1280, 720
+    init_files=(os.path.realpath(sys.path[0]) + "/yaml/init.yaml")
+    
+    with open(init_files, "r") as f:
+        init_data = yaml.safe_load(f)
+   
+    source = init_data["source"]
+    print("source=",source)
+
+
+
+
+    cap = cv2.VideoCapture("v4l2src device=/dev/video0 ! video/x-raw, width=(int)1280, height=(int)720 ! videoconvert ! appsink")
+   # cap = cv2.VideoCapture(1)
+
+
+
+
+
+    #with open(camera_files, "r") as f:
+    #    data = yaml.load(f,Loader=yaml.FullLoader)
+    #K = np.array(data["K"])
+    #D = np.array(data["D"])    
+    
+    data = cv2.FileStorage(camera_files, cv2.FILE_STORAGE_READ)
+    K = data.getNode("cameraMatrix").mat()
+    D = data.getNode("distCoeffs").mat()
+    data.release()
+    print("K=",K)
+    print("D=",D)
+
+    map1, map2 = cv2.initUndistortRectifyMap(
+            K,
+            D,
+            np.eye(3),
+            K,
+            (W, H),
+            cv2.CV_16SC2
+    )
+    
+
+    while True:
+        ret, frame = cap.read()
+        #frame = frame[410-90:590+90, 380+35:1020+35]
+        #frame = cv2.resize(frame,(W,H))
+        #frame = cv2.remap(frame, map1, map2, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT)
+        cv2.imshow("capture", frame)
+        if cv2.waitKey(10) & 0xFF == ord('q'):
+            img = frame
+            cv2.imwrite('img_cali.jpg', frame)  # save the frame as an image
+            break
+    cap.release()
+    cv2.destroyAllWindows()
+    #img = cv2.imread('img.jpg')
+
+
+    def on_EVENT_LBUTTONDOWN(event, x, y, flags, param):
+        if event == cv2.EVENT_LBUTTONDOWN:
+            xy = "%d,%d" % (x, y)
+            cv2.circle(img, (x, y), 1, (255, 0, 0), thickness=-1)
+            print((x,y))
+            cv2.putText(img, xy, (x, y), cv2.FONT_HERSHEY_PLAIN,
+                        1.5, (255, 255, 255), thickness=2)
+            cv2.imshow("image", img)
+
+    cv2.namedWindow("image", cv2.WINDOW_AUTOSIZE)
+    cv2.setMouseCallback("image", on_EVENT_LBUTTONDOWN)
+    cv2.imshow("image", img)
+
+    while (True):
+        try:
+            if cv2.waitKey(10) & 0xFF == ord('q'):
+                break
+        except Exception:
+            cv2.destroyWindow("image")
+            break
+
+
+
+if __name__=='__main__':
+    test()
+
+
+
+
+
+
+

+ 64 - 0
src/detection/laneATT_trt/python/show_coordinate_shot.py

@@ -0,0 +1,64 @@
+import cv2
+import yaml
+import numpy as np
+import sys
+import os
+
+def test():
+
+
+    pers_file = "./yaml/pers.yaml"
+    W, H = 640, 360
+    init_files="./yaml/init.yaml"
+    
+    with open(init_files, "r") as f:
+        init_data = yaml.safe_load(f)
+   
+    source = init_data["source"]
+
+
+
+
+
+    
+
+
+    img = cv2.imread('./img.jpg')
+
+
+    def on_EVENT_LBUTTONDOWN(event, x, y, flags, param):
+        if event == cv2.EVENT_LBUTTONDOWN:
+            xy = "%d,%d" % (x, y)
+            cv2.circle(img, (x, y), 1, (255, 0, 0), thickness=-1)
+            print((x,y))
+            cv2.putText(img, xy, (x, y), cv2.FONT_HERSHEY_PLAIN,
+                        1.5, (255, 255, 255), thickness=2)
+            cv2.imshow("image", img)
+
+    cv2.namedWindow("image", cv2.WINDOW_AUTOSIZE)
+    cv2.setMouseCallback("image", on_EVENT_LBUTTONDOWN)
+    cv2.imshow("image", img)
+
+    #fs = cv2.FileStorage(pers_file, cv2.FILE_STORAGE_WRITE)
+    
+
+
+    while (True):
+        try:
+            if cv2.waitKey(10) & 0xFF == ord('q'):
+                break
+        except Exception:
+            cv2.destroyWindow("image")
+            break
+
+
+
+if __name__=='__main__':
+    test()
+
+
+
+
+
+
+

+ 54 - 0
src/detection/laneATT_trt/utils.h

@@ -0,0 +1,54 @@
+#pragma once
+
+#include <dirent.h>
+#include <opencv2/opencv.hpp>
+
+#include <iostream>
+#include "common.hpp"
+
+#define SHOW_IMG
+
+static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
+    int w, h, x, y;
+    float r_w = input_w / (img.cols*1.0);
+    float r_h = input_h / (img.rows*1.0);
+    if (r_h > r_w) {
+        w = input_w;
+        h = r_w * img.rows;
+        x = 0;
+        y = (input_h - h) / 2;
+    } else {
+        w = r_h * img.cols;
+        h = input_h;
+        x = (input_w - w) / 2;
+        y = 0;
+    }
+    cv::Mat re(h, w, CV_8UC3);
+    cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
+    cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(114, 114, 114));
+    re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
+    cv::Mat tensor;
+    out.convertTo(tensor, CV_32FC3, 1.f / 255.f);
+
+    cv::subtract(tensor, cv::Scalar(0.485, 0.456, 0.406), tensor, cv::noArray(), -1);
+    cv::divide(tensor, cv::Scalar(0.229, 0.224, 0.225), tensor, 1, -1);
+    return tensor;
+}
+
+static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
+    DIR *p_dir = opendir(p_dir_name);
+    if (p_dir == nullptr) {
+        return -1;
+    }
+
+    struct dirent* p_file = nullptr;
+    while ((p_file = readdir(p_dir)) != nullptr) {
+        if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) {
+            std::string cur_file_name(p_file->d_name);
+            file_names.push_back(cur_file_name);
+        }
+    }
+
+    closedir(p_dir);
+    return 0;
+}
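
For reference, a rough Python/OpenCV equivalent of the letterbox-and-normalize preprocessing implemented in `preprocess_img` above (a sketch only, not part of the build):

```
import cv2
import numpy as np

def preprocess_img(img, input_w, input_h):
    # scale to fit while keeping the aspect ratio, pad the border with gray (114)
    r = min(input_w / img.shape[1], input_h / img.shape[0])
    w, h = int(r * img.shape[1]), int(r * img.shape[0])
    x, y = (input_w - w) // 2, (input_h - h) // 2
    out = np.full((input_h, input_w, 3), 114, dtype=np.uint8)
    out[y:y + h, x:x + w] = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR)
    # scale to [0, 1], then apply the ImageNet mean/std used in utils.h
    tensor = out.astype(np.float32) / 255.0
    tensor = (tensor - (0.485, 0.456, 0.406)) / (0.229, 0.224, 0.225)
    return tensor
```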

+ 8 - 0
src/detection/laneATT_trt/yaml/camera.yaml

@@ -0,0 +1,8 @@
+D:
+- [-23.425988880936135, -21.80458632893629, -0.5387917768358769, 0.4939328755112909,
+  0.8515868920493874]
+K:
+- [6628.560676825491, 0.0, 176.13848220061746]
+- [0.0, 6611.257267601054, 321.4390302745282]
+- [0.0, 0.0, 1.0]
+dim: [360, 640]

+ 12 - 0
src/detection/laneATT_trt/yaml/camera_middle_640_360.yaml

@@ -0,0 +1,12 @@
+%YAML:1.0
+
+cameraMatrix:
+    rows: 3
+    cols: 3
+    dt: f
+    data: [6628.560676825491, 0.0, 176.13848220061746,0.0, 6611.257267601054, 321.4390302745282, 0.0, 0.0, 1.0]
+distCoeffs: 
+    rows: 1
+    cols: 5
+    dt: f
+    data: [-23.425988880936135, -21.80458632893629, -0.5387917768358769, 0.4939328755112909,0.8515868920493874]

+ 16 - 0
src/detection/laneATT_trt/yaml/init.yaml

@@ -0,0 +1,16 @@
+source:
+- 0
+
+src:
+- [140, 370]
+- [220, 298]
+- [410, 292]
+- [478, 360]
+dst:
+- [125, 371]
+- [125, 218]
+- [484, 218]
+- [484, 370]
+
+meters: 2.0
+pixels: 640
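
The `src`/`dst` quadrilaterals describe the same four ground points in the camera image and in the desired top-down view, while `meters`/`pixels` give a rough scale. A short sketch of how such a config could be consumed (an assumption about its intended use; the C++ pipeline in yolop.cpp builds its transform from pers_ori.yaml instead):

```
import cv2
import numpy as np
import yaml

with open("./yaml/init.yaml", "r") as f:
    cfg = yaml.safe_load(f)

src = np.float32(cfg["src"])               # quadrilateral in the camera image
dst = np.float32(cfg["dst"])               # the same corners in the top-down view
M = cv2.getPerspectiveTransform(src, dst)  # image -> bird's-eye-view homography
meter_per_pixel = cfg["meters"] / cfg["pixels"]

img = cv2.imread("img.jpg")                # placeholder input frame
warped = cv2.warpPerspective(img, M, (img.shape[1], img.shape[0]))
```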

+ 2 - 0
src/detection/laneATT_trt/yaml/pers.yaml

@@ -0,0 +1,2 @@
+%YAML:1.0
+---

+ 10 - 0
src/detection/laneATT_trt/yaml/pers_ori.yaml

@@ -0,0 +1,10 @@
+%YAML:1.0
+
+
+board_left_top: [549,557]
+
+board_left_bottom: [341,690]
+
+board_right_top: [740,558]
+
+board_right_bottom: [941,696]

+ 333 - 0
src/detection/laneATT_trt/yololayer.cu

@@ -0,0 +1,333 @@
+#include <assert.h>
+#include <vector>
+#include <iostream>
+#include "yololayer.h"
+#include "cuda_utils.h"
+
+namespace Tn
+{
+    template<typename T>
+    void write(char*& buffer, const T& val)
+    {
+        *reinterpret_cast<T*>(buffer) = val;
+        buffer += sizeof(T);
+    }
+
+    template<typename T>
+    void read(const char*& buffer, T& val)
+    {
+        val = *reinterpret_cast<const T*>(buffer);
+        buffer += sizeof(T);
+    }
+}
+
+using namespace Yolo;
+
+namespace nvinfer1
+{
+    YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel)
+    {
+        mClassCount = classCount;
+        mNetWidth = netWidth;
+        mNetHeight = netHeight;
+        mMaxOutObject = maxOut;
+        mYoloKernel = vYoloKernel;
+        mKernelCount = vYoloKernel.size();
+
+        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
+        size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2;
+        for (int ii = 0; ii < mKernelCount; ii++)
+        {
+            CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
+            const auto& yolo = mYoloKernel[ii];
+            CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
+        }
+    }
+    YoloLayerPlugin::~YoloLayerPlugin()
+    {
+        for (int ii = 0; ii < mKernelCount; ii++)
+        {
+            CUDA_CHECK(cudaFree(mAnchor[ii]));
+        }
+        CUDA_CHECK(cudaFreeHost(mAnchor));
+    }
+
+    // create the plugin at runtime from a byte stream
+    YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
+    {
+        using namespace Tn;
+        const char *d = reinterpret_cast<const char *>(data), *a = d;
+        read(d, mClassCount);
+        read(d, mThreadCount);
+        read(d, mKernelCount);
+        read(d, mNetWidth);
+        read(d, mNetHeight);
+        read(d, mMaxOutObject);
+        mYoloKernel.resize(mKernelCount);
+        auto kernelSize = mKernelCount * sizeof(YoloKernel);
+        memcpy(mYoloKernel.data(), d, kernelSize);
+        d += kernelSize;
+        CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
+        size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2;
+        for (int ii = 0; ii < mKernelCount; ii++)
+        {
+            CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
+            const auto& yolo = mYoloKernel[ii];
+            CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
+        }
+        assert(d == a + length);
+    }
+
+    void YoloLayerPlugin::serialize(void* buffer) const
+    {
+        using namespace Tn;
+        char* d = static_cast<char*>(buffer), *a = d;
+        write(d, mClassCount);
+        write(d, mThreadCount);
+        write(d, mKernelCount);
+        write(d, mNetWidth);
+        write(d, mNetHeight);
+        write(d, mMaxOutObject);
+        auto kernelSize = mKernelCount * sizeof(YoloKernel);
+        memcpy(d, mYoloKernel.data(), kernelSize);
+        d += kernelSize;
+
+        assert(d == a + getSerializationSize());
+    }
+
+    size_t YoloLayerPlugin::getSerializationSize() const
+    {
+        return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size() + sizeof(mNetWidth) + sizeof(mNetHeight) + sizeof(mMaxOutObject);
+    }
+
+    int YoloLayerPlugin::initialize()
+    {
+        return 0;
+    }
+
+    Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
+    {
+        //output the result to channel
+        int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float);
+
+        return Dims3(totalsize + 1, 1, 1);
+    }
+
+    // Set plugin namespace
+    void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace)
+    {
+        mPluginNamespace = pluginNamespace;
+    }
+
+    const char* YoloLayerPlugin::getPluginNamespace() const
+    {
+        return mPluginNamespace;
+    }
+
+    // Return the DataType of the plugin output at the requested index
+    DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
+    {
+        return DataType::kFLOAT;
+    }
+
+    // Return true if output tensor is broadcast across a batch.
+    bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const
+    {
+        return false;
+    }
+
+    // Return true if plugin can use input that is broadcast across batch without replication.
+    bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const
+    {
+        return false;
+    }
+
+    void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
+    {
+    }
+
+    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
+    void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
+    {
+    }
+
+    // Detach the plugin object from its execution context.
+    void YoloLayerPlugin::detachFromContext() {}
+
+    const char* YoloLayerPlugin::getPluginType() const
+    {
+        return "YoloLayer_TRT";
+    }
+
+    const char* YoloLayerPlugin::getPluginVersion() const
+    {
+        return "1";
+    }
+
+    void YoloLayerPlugin::destroy()
+    {
+        delete this;
+    }
+
+    // Clone the plugin
+    IPluginV2IOExt* YoloLayerPlugin::clone() const
+    {
+        YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mNetWidth, mNetHeight, mMaxOutObject, mYoloKernel);
+        p->setPluginNamespace(mPluginNamespace);
+        return p;
+    }
+
+    __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); };
+
+    __global__ void CalDetection(const float *input, float *output, int noElements,
+        const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT * 2], int classes, int outputElem)
+    {
+
+        int idx = threadIdx.x + blockDim.x * blockIdx.x;
+        if (idx >= noElements) return;
+
+        int total_grid = yoloWidth * yoloHeight;
+        int bnIdx = idx / total_grid;
+        idx = idx - total_grid * bnIdx;
+        int info_len_i = 5 + classes;
+        const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);
+
+        for (int k = 0; k < 3; ++k) {
+            float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
+            if (box_prob < IGNORE_THRESH) continue;
+            int class_id = 0;
+            float max_cls_prob = 0.0;
+            for (int i = 5; i < info_len_i; ++i) {
+                float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
+                if (p > max_cls_prob) {
+                    max_cls_prob = p;
+                    class_id = i - 5;
+                }
+            }
+            float *res_count = output + bnIdx * outputElem;
+            int count = (int)atomicAdd(res_count, 1);
+            if (count >= maxoutobject) return;
+            char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection);
+            Detection* det = (Detection*)(data);
+
+            int row = idx / yoloWidth;
+            int col = idx % yoloWidth;
+
+            //Location
+            // pytorch:
+            //  y = x[i].sigmoid()
+            //  y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
+            //  y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
+            //  X: (sigmoid(tx) + cx)/FeaturemapW *  netwidth
+            det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth;
+            det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight;
+
+            // W: (Pw * e^tw) / FeaturemapW * netwidth
+            // v5: https://github.com/ultralytics/yolov5/issues/471
+            det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]);
+            det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
+            det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]);
+            det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];
+            det->conf = box_prob * max_cls_prob;
+            det->class_id = class_id;
+        }
+    }
+
+    void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize)
+    {
+        int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
+        for (int idx = 0; idx < batchSize; ++idx) {
+            CUDA_CHECK(cudaMemset(output + idx * outputElem, 0, sizeof(float)));
+        }
+        int numElem = 0;
+        for (unsigned int i = 0; i < mYoloKernel.size(); ++i)
+        {
+            const auto& yolo = mYoloKernel[i];
+            numElem = yolo.width*yolo.height*batchSize;
+            if (numElem < mThreadCount)
+                mThreadCount = numElem;
+
+            //printf("Net: %d  %d \n", mNetWidth, mNetHeight);
+            CalDetection << < (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount >> >
+                (inputs[i], output, numElem, mNetWidth, mNetHeight, mMaxOutObject, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem);
+        }
+    }
+
+
+    int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
+    {
+        forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
+        return 0;
+    }
+
+    PluginFieldCollection YoloPluginCreator::mFC{};
+    std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
+
+    YoloPluginCreator::YoloPluginCreator()
+    {
+        mPluginAttributes.clear();
+
+        mFC.nbFields = mPluginAttributes.size();
+        mFC.fields = mPluginAttributes.data();
+    }
+
+    const char* YoloPluginCreator::getPluginName() const
+    {
+        return "YoloLayer_TRT";
+    }
+
+    const char* YoloPluginCreator::getPluginVersion() const
+    {
+        return "1";
+    }
+
+    const PluginFieldCollection* YoloPluginCreator::getFieldNames()
+    {
+        return &mFC;
+    }
+
+    IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
+    {
+        int class_count = -1;
+        int input_w = -1;
+        int input_h = -1;
+        int max_output_object_count = -1;
+        std::vector<Yolo::YoloKernel> yolo_kernels(3);
+
+        const PluginField* fields = fc->fields;
+        for (int i = 0; i < fc->nbFields; i++) {
+            if (strcmp(fields[i].name, "netdata") == 0) {
+                assert(fields[i].type == PluginFieldType::kFLOAT32);
+                int *tmp = (int*)(fields[i].data);
+                class_count = tmp[0];
+                input_w = tmp[1];
+                input_h = tmp[2];
+                max_output_object_count = tmp[3];
+            } else if (strstr(fields[i].name, "yolodata") != NULL) {
+                assert(fields[i].type == PluginFieldType::kFLOAT32);
+                int *tmp = (int*)(fields[i].data);
+                YoloKernel kernel;
+                kernel.width = tmp[0];
+                kernel.height = tmp[1];
+                for (int j = 0; j < fields[i].length - 2; j++) {
+                    kernel.anchors[j] = tmp[j + 2];
+                }
+                yolo_kernels[2 - (fields[i].name[8] - '1')] = kernel;
+            }
+        }
+        assert(class_count > 0 && input_w > 0 && input_h > 0 && max_output_object_count > 0);
+        YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, yolo_kernels);
+        obj->setPluginNamespace(mNamespace.c_str());
+        return obj;
+    }
+
+    IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
+    {
+        // This object will be deleted when the network is destroyed, which will
+        // call YoloLayerPlugin::destroy()
+        YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
+        obj->setPluginNamespace(mNamespace.c_str());
+        return obj;
+    }
+}
+

+ 36 - 0
src/detection/laneATT_trt/yololayer.h

@@ -0,0 +1,36 @@
+#ifndef _YOLO_LAYER_H
+#define _YOLO_LAYER_H
+
+#include <vector>
+#include <string>
+#include "NvInfer.h"
+
+namespace Yolo
+{
+    static constexpr int CHECK_COUNT = 3;
+    static constexpr float IGNORE_THRESH = 0.1f;
+    struct YoloKernel
+    {
+        int width;
+        int height;
+        float anchors[CHECK_COUNT * 2];
+    };
+    static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
+    static constexpr int CLASS_NUM = 1;
+    static constexpr int INPUT_H = 360;
+    static constexpr int INPUT_W = 640;
+    static constexpr int IMG_H = 360;
+    static constexpr int IMG_W = 640;
+
+    static constexpr int LOCATIONS = 4;
+    struct alignas(float) Detection {
+        //center_x center_y w h
+        float bbox[LOCATIONS];
+        float conf;  // bbox_conf * cls_conf
+        float class_id;
+    };
+}
+
+
+
+#endif
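
As serialized by the plugin in yololayer.cu, each image's output is a flat float buffer whose first element is the detection count, followed by up to `MAX_OUTPUT_BBOX_COUNT` fixed-size `Detection` records (4 bbox values, conf, class_id). A minimal host-side decode sketch in Python/numpy, assuming the buffer has already been copied back from the GPU:

```
import numpy as np

def decode_yolo_output(output, max_boxes=1000):
    # output: 1-D float32 array of length 1 + max_boxes * 6 for one image
    count = min(int(output[0]), max_boxes)
    dets = np.asarray(output[1:1 + count * 6], dtype=np.float32).reshape(count, 6)
    boxes = dets[:, :4]                     # cx, cy, w, h in network-input coordinates
    conf = dets[:, 4]                       # box_prob * best class probability
    class_id = dets[:, 5].astype(np.int32)
    return boxes, conf, class_id
```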

+ 490 - 0
src/detection/laneATT_trt/yolop.cpp

@@ -0,0 +1,490 @@
+#include "modulecomm.h"
+#include "xmlparam.h"
+#include "ivfault.h"
+#include "ivlog.h"
+#include "ivexit.h"
+#include "ivversion.h"
+#include "rawpic.pb.cc"
+#include <QCoreApplication>
+#include <thread>
+#include "ivversion.h"
+#include <chrono>
+#include "opencv2/imgcodecs/legacy/constants_c.h"
+#include "qmutex.h"
+#include "condition_variable"
+#include "laneatt.hh"
+#include <opencv2/opencv.hpp>
+#include "lanearray.pb.h"
+
+#define INPUT_H 360
+#define INPUT_W 640
+#define N_OFFSETS 72
+#define N_STRIPS (N_OFFSETS - 1)
+#define MAX_COL_BLOCKS 1000
+
+std::vector<cv::Point2f> warped_point, origin_point;
+cv::Point2i transed_O;
+
+float METER_PER_PIXEL_X,METER_PER_PIXEL_Y;
+
+float BOARD_SIDE_LENGTH_X = 3.7;
+float BOARD_SIDE_LENGTH_Y = 10.0;
+
+
+bool gbstate = true;
+void * gp;
+void * gpdetect;
+iv::Ivfault *gfault = nullptr;
+iv::Ivlog *givlog = nullptr;
+
+std::thread * gpthread;
+
+std::string gstrinput;
+std::string gstroutput;
+QMutex mMutex;
+std::condition_variable cond;
+
+cv::Mat result,img_input;
+
+cv::Mat camera_matrix,dist_coe,map1,map2;
+cv::Mat img_warp,raw_image;
+
+std::vector<std::vector<cv::Point2f>> lanes;
+
+LaneATT model("./LaneATT_test.trt8");
+
+cv::VideoWriter outputVideo;
+
+
+void exitfunc()
+{
+//    gbstate = false;
+//    gpthread->join();
+//    std::cout<<"state thread closed."<<std::endl;
+return;
+}
+
+cv::Mat get_pers_mat(std::string pers_file)
+{
+    cv::Point2f src[4];
+    cv::Point2f dst[4];
+    cv::FileStorage pf(pers_file, cv::FileStorage::READ);
+    pf["board_left_top"]>>src[0];
+    pf["board_left_bottom"]>>src[1];
+    pf["board_right_top"]>>src[2];
+    pf["board_right_bottom"]>>src[3];
+
+//    pf["board_left_top_dst"]>>dst[0];
+//    pf["board_left_bottom_dst"]>>dst[1];
+//    pf["board_right_top_dst"]>>dst[2];
+//    pf["board_right_bottom_dst"]>>dst[3];
+
+/* -------------------------------------zhengke
+    dst[1].x = src[0].x;
+    dst[1].y = src[1].y;
+
+    dst[3].x = src[2].x;
+    dst[3].y = src[3].y;
+
+    int side_length = dst[3].x - dst[1].x;
+
+    METER_PER_PIXEL = BOARD_SIDE_LENGTH / side_length;
+
+    dst[0].x = src[0].x;
+    dst[0].y = dst[1].y - side_length;
+
+    dst[2].x = src[2].x;
+    dst[2].y = dst[3].y - side_length;
+
+    cv::Mat pers_mat = getPerspectiveTransform(dst, src);
+    cv::Mat pers_mat_inv = pers_mat.inv();
+
+    return pers_mat_inv;
+*///-------------------------------------zhengke
+    dst[1].x = 130*2;
+    dst[1].y = 330*2;
+
+    dst[3].x = 510*2;
+    dst[3].y = 330*2;
+
+    int side_length_x = dst[3].x - dst[1].x;
+
+    METER_PER_PIXEL_X = BOARD_SIDE_LENGTH_X / float(side_length_x);
+
+    dst[0].x = dst[1].x;
+    dst[0].y = 10*2;
+
+    dst[2].x = dst[3].x;
+    dst[2].y = 10*2;
+
+    int side_length_y = dst[1].y - dst[0].y;
+
+    METER_PER_PIXEL_Y = BOARD_SIDE_LENGTH_Y / float(side_length_y);
+
+    transed_O.x = (dst[1].x + dst[3].x)/2;
+    transed_O.y = (dst[1].y + dst[3].y)/2;
+
+    cv::Mat pers_mat = getPerspectiveTransform(dst, src);
+    cv::Mat pers_mat_inv = pers_mat.inv();
+
+    return pers_mat_inv;
+}
+
+std::string PERS_FILE = "./yaml/pers_ori.yaml";
+cv::Mat pers_mat_inv = get_pers_mat(PERS_FILE);  // obtain the perspective transform (image -> bird's-eye view)
+
+void transform (std::vector<std::vector<cv::Point2f>> lanes) {
+
+    // PerspectiveTransform
+
+    std::vector<cv::Point2f> leftLane;
+    std::vector<cv::Point2f> rightLane;
+    std::vector<std::vector<cv::Point2f>> warped_lanes;
+
+    for (int i = 0; i < lanes.size(); i++) {
+        cv::perspectiveTransform(lanes[i], warped_point, pers_mat_inv); // transform points (input points, output points, transform matrix)
+        warped_lanes.push_back(warped_point);
+        double sum = 0.0;
+        for (int j = 0; j < 5; j++){   // average the first 5 x-coordinates; j avoids shadowing the outer loop index i
+            sum += warped_point[j].x;
+        }
+        double mean = sum / 5;
+        if (mean < raw_image.size().width / 2){
+            leftLane.push_back(cv::Point2f(mean, i));
+        }else{
+            rightLane.push_back(cv::Point2f(mean, i));
+        }
+    }
+    float left_min = 0;
+    float right_max = INPUT_W;
+    int left_num = -1;
+    int right_num = -1;
+    int left_left = -1;
+    int right_right = -1;
+    for (const auto& points : leftLane){
+        if(points.x > left_min){
+            left_num = (int) round(points.y);
+            left_min = points.x;
+        }
+    }
+    for (const auto& points : leftLane){
+        if((int)round(points.y) != left_num){
+            left_left = points.y;
+        }
+    }
+//    if (left_num == -1){
+//        std::cout<<"left lane failed"<<std::endl;
+//    }
+    for (const auto& points : rightLane){
+        if(points.x < right_max){
+            right_num = (int) round(points.y);
+            right_max = points.x;
+        }
+    }
+    for (const auto& points : rightLane){
+        if((int)round(points.y) != right_num){
+            right_right = points.y;
+        }
+    }
+//    if (right_num == -1){
+//        std::cout<<"right lane failed"<<std::endl;
+//    }
+
+    // Visualize
+
+    /*-----------------warp image and points ----------------------*/
+    cv::Point2f pp;
+    cv::Point2f ppp;
+    int flag = 0;
+    float DX = 0.055, DY = 0.26; // ratio between pixels and real-world distance
+    std::vector<std::vector<cv::Point2f>> world_lanes;
+    cv::warpPerspective(raw_image, img_warp, pers_mat_inv, raw_image.size());        //  warp the image (input image, output image, transform matrix)
+    for (const auto& lane_points : warped_lanes) {
+        if(flag == left_num || flag == right_num){
+            std::vector<cv::Point2f> world_points;
+            for (const auto& point : lane_points) {
+                pp.x =float(point.x);
+                pp.y =float(point.y);
+                cv::circle(img_warp, pp, 3, cv::Scalar(0, 255, 0), -1);
+                //ppp.x = float((point.x - img_warp.size().width / 2) * METER_PER_PIXEL_X);
+                //ppp.y = float((img_warp.size().height - point.y) * METER_PER_PIXEL_Y);
+                ppp.x = float((point.x - transed_O.x) * METER_PER_PIXEL_X);
+                ppp.y = float((transed_O.y - point.y) * METER_PER_PIXEL_Y);
+                world_points.push_back(ppp);
+            }
+            world_lanes.push_back(world_points);
+        }else{
+            std::vector<cv::Point2f> world_points;
+            for (const auto& point : lane_points) {
+                pp.x =float(point.x);
+                pp.y =float(point.y);
+                //std::cout<<pp.x<<" "<<pp.y<<std::endl;
+                cv::circle(img_warp, pp, 3, cv::Scalar(0, 0, 255), -1);
+                ppp.x = float((point.x - transed_O.x) * METER_PER_PIXEL_X);
+                ppp.y = float((transed_O.y - point.y) * METER_PER_PIXEL_Y);
+                world_points.push_back(ppp);
+                //std::cout<<ppp.x<<" "<<ppp.y<<std::endl;
+            }
+            world_lanes.push_back(world_points);
+        }
+        flag++;
+    }
+    //cv::imshow("warp",img_warp);
+
+
+   /*------------------------show circle--------------------------*/
+    flag = 0;
+    for (const auto& lane_points : lanes) {
+        if(flag == left_num || flag == right_num){
+            for (int i=0; i<lane_points.size(); i++)  {
+                pp.x =float(lane_points[i].x);
+                pp.y =float(lane_points[i].y);
+                cv::circle(raw_image, pp, 3, cv::Scalar(0, 255, 0), -1);
+
+            }
+        }else if(flag == left_left){
+            for (int i=0; i<lane_points.size(); i++) {
+                pp.x =float(lane_points[i].x);
+                pp.y =float(lane_points[i].y);
+                cv::circle(raw_image, pp, 3, cv::Scalar(0, 0, 255), -1);
+            }
+        }else {
+            for (int i=0; i<lane_points.size(); i++) {
+                pp.x =float(lane_points[i].x);
+                pp.y =float(lane_points[i].y);
+                cv::circle(raw_image, pp, 3, cv::Scalar(255, 0, 0), -1);
+            }
+        }
+        flag++;
+    }
+/*------------------------show lanes_line------------------------------*/
+//    cv::Point2f p1, p2;
+//    flag = 0;
+//    for (const auto& lane_points : lanes) {
+//        if(flag == left_num || flag == right_num){
+//            for (int i=0; i<lane_points.size()-1; i++) {
+//                p1.x =float(lane_points[i].x) *  x_ratio;
+//                p1.y =float(lane_points[i].y) * y_ratio;
+//                p2.x =float(lane_points[i+1].x) *  x_ratio;
+//                p2.y =float(lane_points[i+1].y) *  y_ratio;
+//                cv::line(raw_image, p1, p2, cv::Scalar(0, 255, 0), 1);
+
+//            }
+//        }else{
+//            for (int i=0; i<lane_points.size()-1; i++) {
+//                p1.x =float(lane_points[i].x) *  x_ratio;
+//                p1.y =float(lane_points[i].y) * y_ratio;
+//                p2.x =float(lane_points[i+1].x) *  x_ratio;
+//                p2.y =float(lane_points[i+1].y) *  y_ratio;
+//                cv::line(raw_image, p1, p2, cv::Scalar(0, 0, 255), 1);
+//            }
+//        }
+//        flag++;
+//    }
+
+    //cv::imshow("raw",raw_image);
+
+    /*---------------------- protobuf ------------------------*/
+
+    iv::vision::Linearray line_array;
+
+    flag = 0;
+    for (const auto& lane_points : world_lanes) {
+        iv::vision::Line *line = line_array.add_line();
+
+        if(flag == left_left){
+            line->set_index(1);
+            for (int i=0; i<lane_points.size(); i++)  {
+                iv::vision::Point2f *points = line->add_linepoint();
+                points->set_x(lane_points[i].x);
+                points->set_y(lane_points[i].y);
+
+            }
+        }else if(flag == left_num){
+            line->set_index(2);
+            for (int i=0; i<lane_points.size(); i++) {
+                iv::vision::Point2f *points = line->add_linepoint();
+                points->set_x(lane_points[i].x);
+                points->set_y(lane_points[i].y);
+            }
+        }else if(flag == right_num){
+            line->set_index(3);
+            for (int i=0; i<lane_points.size(); i++) {
+                iv::vision::Point2f *points = line->add_linepoint();
+                points->set_x(lane_points[i].x);
+                points->set_y(lane_points[i].y);
+            }
+        }else{
+            line->set_index(4);
+            for (int i=0; i<lane_points.size(); i++) {
+                iv::vision::Point2f *points = line->add_linepoint();
+                points->set_x(lane_points[i].x);
+                points->set_y(lane_points[i].y);
+            }
+        }
+        flag++;
+    }
+    /*-----------------------shareMsg-------------------------*/
+
+    int size = line_array.ByteSize();
+    char * strdata = new char[line_array.ByteSize()];
+    if(line_array.SerializeToArray(strdata, size))
+    {
+        iv::modulecomm::ModuleSendMsg(gpdetect, strdata, size);
+        std::cout<<"lane serialize success."<<std::endl;
+    }
+    else
+    {
+        std::cout<<"lane serialize error."<<std::endl;
+    }
+    delete[] strdata;
+
+}
+
+void Listenpic(const char * strdata,const unsigned int nSize,const unsigned int index,const QDateTime * dt,const char * strmemname)
+{
+    mMutex.lock();
+
+    std::cout<<"one time"<<std::endl;
+    if(nSize<1000)return;
+    iv::vision::rawpic pic;
+
+    if(false == pic.ParseFromArray(strdata,nSize))
+    {
+        std::cout<<"picview Listenpic fail."<<std::endl;
+        return;
+    }
+
+    cv::Mat mat(pic.height(),pic.width(),pic.mattype());
+    if(pic.type() == 1)
+        memcpy(mat.data,pic.picdata().data(),mat.rows*mat.cols*mat.elemSize());
+    else
+    {
+       mat.release();
+       std::vector<unsigned char> buff(pic.picdata().data(),pic.picdata().data() + pic.picdata().size());
+       mat = cv::imdecode(buff,CV_LOAD_IMAGE_COLOR);
+    }
+
+    cv::Mat img_camera=mat;
+
+    // save video
+//    outputVideo.write(img_camera);
+
+    img_camera = img_camera(cv::Range(410-90,590+90), cv::Range(380+35,1020+35));//height width
+    //img_camera = img_camera(cv::Range(240-30,600-30), cv::Range(320-70,960-70));//height width
+    //img_camera = img_camera(cv::Range(120,480), cv::Range(240,880));//height width
+//    std::cout<<img_camera<<std::endl;
+    //cv::remap(img_camera,img_camera,map1,map2,cv::INTER_LINEAR,cv::BORDER_CONSTANT);
+//    img_camera = img_camera(cv::Range(120,360), cv::Range(105,535));
+//    cv::resize(img_camera,img_camera,cv::Size(640,360));
+    raw_image = img_camera;
+    std::cout<<raw_image.size()<<std::endl;
+
+//    img_camera = img_camera.resize(640,360);
+
+
+    lanes = model.DetectLane(img_camera);
+    std::cout<<"lane_num is: "<<lanes.size()<<std::endl;
+    transform (lanes);
+    std::cout<<img_warp.size()<<std::endl;
+
+
+    mat.release();
+    mMutex.unlock();
+}
+
+
+
+int main(int argc, char *argv[]) {
+
+// ==============================================================================
+    showversion("laneatt");
+    QCoreApplication a(argc, argv);
+
+    gfault = new iv::Ivfault("laneATT");
+    givlog = new iv::Ivlog("laneATT");
+
+    gfault->SetFaultState(0,0,"laneATT initialize.");
+
+
+//================================== camera calib init ==========================
+
+    cv::FileStorage calib_file("./yaml/camera_middle_640_360.yaml", cv::FileStorage::READ);
+    calib_file["cameraMatrix"]>>camera_matrix;
+    calib_file["distCoeffs"]>>dist_coe;
+    cv::Mat R = cv::Mat::eye(3, 3, CV_64F);
+    //cv::Size imgsize=cv::Size(640,480);
+    cv::Size imgsize=cv::Size(640,360);
+
+    cv::initUndistortRectifyMap(camera_matrix, dist_coe, R, camera_matrix,imgsize,CV_16SC2,map1,map2);
+
+    std::string outputVideoPath = "./test/wri.mp4";
+
+//    outputVideo.open(outputVideoPath, cv::VideoWriter::fourcc('D','I','V','X'), 24, cv::Size(1280, 720));
+//==============================================================================
+
+/*-------------- input: camera -------------*/
+
+    //gp = iv::modulecomm::RegisterRecv("image00",Listenpic);
+
+
+    gpdetect = iv::modulecomm::RegisterSend("linearray",10000000,1);
+
+//    cv::waitKey(1000);
+
+//    while(true){
+//        if(cv::waitKey(10)==27)
+//            break;
+//        if(raw_image.empty()==1 || img_warp.empty()==1) continue;
+//        cv::imshow("rst",raw_image);
+//        //cv::imshow("rst2",img_warp );
+//        cv::waitKey(10);
+//    }
+//    outputVideo.release();
+
+/*-------------- input: image --------------*/
+
+//    cv::Mat img_camera;
+//    img_camera = cv::imread("./test/1.png");
+//    lanes = model.DetectLane(img_camera);
+//    transform (lanes, img_camera);
+
+
+/*-------------- input: video -----------------*/
+
+    cv::VideoCapture cap("v4l2src device=/dev/video0 ! video/x-raw, width=(int)1280, height=(int)720 ! videoconvert ! appsink");
+    //cv::VideoCapture cap("/home/nvidia/code/modularization/src/detection/laneATT_trt/test/camera_test1.mp4");
+    if(!cap.isOpened()){
+        std::cout<<"open failed"<<std::endl;
+        return -1;
+    }
+    cv::Mat frame;
+    cv::waitKey(1000);
+    cv::Mat img_crop;
+    while(true)
+    {
+        auto start = std::chrono::system_clock::now();  // start timing
+        cap>>frame;
+        img_crop = frame;
+        //img_crop = frame(cv::Range(410-90,590+90), cv::Range(380+35,1020+35));//height width
+        //cv::remap(img_crop,img_crop,map1,map2,cv::INTER_LINEAR,cv::BORDER_CONSTANT);
+
+        raw_image = img_crop;
+        lanes = model.DetectLane(img_crop);
+        transform (lanes);
+        auto end = std::chrono::system_clock::now();  // stop timing
+        std::cout <<"total time lane : "<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+        cv::imshow("rst",raw_image);
+//        cv::imshow("rst2",img_warp );
+        if (cv::waitKey(10) == 'q')
+            break;
+    }
+    cv::destroyAllWindows();
+
+/*-------------------------------------------------------*/
+
+    iv::ivexit::RegIVExitCall(exitfunc);
+
+    return a.exec();
+}
+

+ 239 - 0
src/detection/laneATT_trt/yolop.hpp

@@ -0,0 +1,239 @@
+#pragma once
+
+#include <chrono>
+#include "cuda_utils.h"
+#include "logging.h"
+#include "utils.h"
+
+#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
+#define DEVICE 0  // GPU id
+#define NMS_THRESH 0.45
+#define CONF_THRESH 0.25
+#define BATCH_SIZE 1
+
+// stuff we know about the network and the input/output blobs
+static const int INPUT_H = Yolo::INPUT_H;
+static const int INPUT_W = Yolo::INPUT_W;
+static const int IMG_H = Yolo::IMG_H;
+static const int IMG_W = Yolo::IMG_W;
+static const int CLASS_NUM = Yolo::CLASS_NUM;
+static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1;  // we assume the yolo layer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes with conf >= 0.1
+const char* INPUT_BLOB_NAME = "data";
+const char* OUTPUT_DET_NAME = "det";
+const char* OUTPUT_SEG_NAME = "seg";
+const char* OUTPUT_LANE_NAME = "lane";
+static Logger gLogger;
+
+
+
+ICudaEngine* build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, std::string& wts_name) {
+    INetworkDefinition* network = builder->createNetworkV2(0U);
+
+    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
+    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
+    assert(data);
+
+    std::map<std::string, Weights> weightMap = loadWeights(wts_name);
+    Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
+
+    // yolop backbone
+    // auto focus0 = focus(network, weightMap, *shuffle->getOutput(0), 3, 32, 3, "model.0");
+    auto focus0 = focus(network, weightMap, *data, 3, 32, 3, "model.0");
+    auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 64, 3, 2, 1, "model.1");
+    auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 64, 64, 1, true, 1, 0.5, "model.2");
+    auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 128, 3, 2, 1, "model.3");
+    auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.4");
+    auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 256, 3, 2, 1, "model.5");
+    auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 256, 256, 3, true, 1, 0.5, "model.6");
+    auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 512, 3, 2, 1, "model.7");
+    auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 512, 512, 5, 9, 13, "model.8");
+
+    // yolop head
+    auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.9");
+    auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 256, 1, 1, 1, "model.10");
+
+    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
+    for (int i = 0; i < 256 * 2 * 2; i++) {
+        deval[i] = 1.0;
+    }
+    Weights deconvwts11{ DataType::kFLOAT, deval, 256 * 2 * 2 };
+    IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 256, DimsHW{ 2, 2 }, deconvwts11, emptywts);
+    deconv11->setStrideNd(DimsHW{ 2, 2 });
+    deconv11->setNbGroups(256);
+    weightMap["deconv11"] = deconvwts11;
+
+    ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
+    auto cat12 = network->addConcatenation(inputTensors12, 2);
+    auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 512, 256, 1, false, 1, 0.5, "model.13");
+    auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 128, 1, 1, 1, "model.14");
+
+    Weights deconvwts15{ DataType::kFLOAT, deval, 128 * 2 * 2 };
+    IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts15, emptywts);
+    deconv15->setStrideNd(DimsHW{ 2, 2 });
+    deconv15->setNbGroups(128);
+
+    ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
+    auto cat16 = network->addConcatenation(inputTensors16, 2);
+    auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 256, 128, 1, false, 1, 0.5, "model.17");
+    IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
+
+    auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 128, 3, 2, 1, "model.18");
+    ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
+    auto cat19 = network->addConcatenation(inputTensors19, 2);
+    auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 256, 256, 1, false, 1, 0.5, "model.20");
+    IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
+
+    auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 256, 3, 2, 1, "model.21");
+    ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
+    auto cat22 = network->addConcatenation(inputTensors22, 2);
+    auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.23");
+    IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
+
+    auto detect24 = addYoLoLayer(network, weightMap, det0, det1, det2);
+    detect24->getOutput(0)->setName(OUTPUT_DET_NAME);
+
+    auto conv25 = convBlock(network, weightMap, *cat16->getOutput(0), 128, 3, 1, 1, "model.25");
+    // upsample 26
+    Weights deconvwts26{ DataType::kFLOAT, deval, 128 * 2 * 2 };
+    IDeconvolutionLayer* deconv26 = network->addDeconvolutionNd(*conv25->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts26, emptywts);
+    deconv26->setStrideNd(DimsHW{ 2, 2 });
+    deconv26->setNbGroups(128);
+
+    auto bottleneck_csp27 = bottleneckCSP(network, weightMap, *deconv26->getOutput(0), 128, 64, 1, false, 1, 0.5, "model.27");
+    auto conv28 = convBlock(network, weightMap, *bottleneck_csp27->getOutput(0), 32, 3, 1, 1, "model.28");
+    // upsample 29
+    Weights deconvwts29{ DataType::kFLOAT, deval, 32 * 2 * 2 };
+    IDeconvolutionLayer* deconv29 = network->addDeconvolutionNd(*conv28->getOutput(0), 32, DimsHW{ 2, 2 }, deconvwts29, emptywts);
+    deconv29->setStrideNd(DimsHW{ 2, 2 });
+    deconv29->setNbGroups(32);
+
+    auto conv30 = convBlock(network, weightMap, *deconv29->getOutput(0), 16, 3, 1, 1, "model.30");
+    auto bottleneck_csp31 = bottleneckCSP(network, weightMap, *conv30->getOutput(0), 16, 8, 1, false, 1, 0.5, "model.31");
+
+    // upsample32
+    Weights deconvwts32{ DataType::kFLOAT, deval, 8 * 2 * 2 };
+    IDeconvolutionLayer* deconv32 = network->addDeconvolutionNd(*bottleneck_csp31->getOutput(0), 8, DimsHW{ 2, 2 }, deconvwts32, emptywts);
+    deconv32->setStrideNd(DimsHW{ 2, 2 });
+    deconv32->setNbGroups(8);
+
+    auto conv33 = convBlock(network, weightMap, *deconv32->getOutput(0), 2, 3, 1, 1, "model.33");
+    // segmentation output
+    ISliceLayer *slicelayer = network->addSlice(*conv33->getOutput(0), Dims3{ 0, (Yolo::INPUT_H - Yolo::IMG_H) / 2, 0 }, Dims3{ 2, Yolo::IMG_H, Yolo::IMG_W }, Dims3{ 1, 1, 1 });
+    auto segout = network->addTopK(*slicelayer->getOutput(0), TopKOperation::kMAX, 1, 1);
+    segout->getOutput(1)->setName(OUTPUT_SEG_NAME);
+
+    auto conv34 = convBlock(network, weightMap, *cat16->getOutput(0), 128, 3, 1, 1, "model.34");
+
+    // upsample35
+    Weights deconvwts35{ DataType::kFLOAT, deval, 128 * 2 * 2 };
+    IDeconvolutionLayer* deconv35 = network->addDeconvolutionNd(*conv34->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts35, emptywts);
+    deconv35->setStrideNd(DimsHW{ 2, 2 });
+    deconv35->setNbGroups(128);
+
+    auto bottleneck_csp36 = bottleneckCSP(network, weightMap, *deconv35->getOutput(0), 128, 64, 1, false, 1, 0.5, "model.36");
+    auto conv37 = convBlock(network, weightMap, *bottleneck_csp36->getOutput(0), 32, 3, 1, 1, "model.37");
+
+    // upsample38
+    Weights deconvwts38{ DataType::kFLOAT, deval, 32 * 2 * 2 };
+    IDeconvolutionLayer* deconv38 = network->addDeconvolutionNd(*conv37->getOutput(0), 32, DimsHW{ 2, 2 }, deconvwts38, emptywts);
+    deconv38->setStrideNd(DimsHW{ 2, 2 });
+    deconv38->setNbGroups(32);
+
+    auto conv39 = convBlock(network, weightMap, *deconv38->getOutput(0), 16, 3, 1, 1, "model.39");
+    auto bottleneck_csp40 = bottleneckCSP(network, weightMap, *conv39->getOutput(0), 16, 8, 1, false, 1, 0.5, "model.40");
+
+    // upsample41
+    Weights deconvwts41{ DataType::kFLOAT, deval, 8 * 2 * 2 };
+    IDeconvolutionLayer* deconv41 = network->addDeconvolutionNd(*bottleneck_csp40->getOutput(0), 8, DimsHW{ 2, 2 }, deconvwts41, emptywts);
+    deconv41->setStrideNd(DimsHW{ 2, 2 });
+    deconv41->setNbGroups(8);
+
+    auto conv42 = convBlock(network, weightMap, *deconv41->getOutput(0), 2, 3, 1, 1, "model.42");
+    // lane-det output
+    ISliceLayer *laneSlice = network->addSlice(*conv42->getOutput(0), Dims3{ 0, (Yolo::INPUT_H - Yolo::IMG_H) / 2, 0 }, Dims3{ 2, Yolo::IMG_H, Yolo::IMG_W }, Dims3{ 1, 1, 1 });
+    auto laneout = network->addTopK(*laneSlice->getOutput(0), TopKOperation::kMAX, 1, 1);
+    laneout->getOutput(1)->setName(OUTPUT_LANE_NAME);
+
+    // detection output
+    network->markOutput(*detect24->getOutput(0));
+    // segmentation output
+    network->markOutput(*segout->getOutput(1));
+    // lane output
+    network->markOutput(*laneout->getOutput(1));
+
+    //assert(false);
+
+    // Build engine
+    builder->setMaxBatchSize(maxBatchSize);
+    config->setMaxWorkspaceSize(2L * (1L << 30));  // 2GB
+#if defined(USE_FP16)
+    config->setFlag(BuilderFlag::kFP16);
+#endif
+
+    std::cout << "Building engine, please wait for a while..." << std::endl;
+    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
+    std::cout << "Build engine successfully!" << std::endl;
+
+    // Don't need the network any more
+    network->destroy();
+
+    // Release host memory
+    for (auto& mem : weightMap)
+    {
+        free((void*)(mem.second.values));
+    }
+
+    return engine;
+}
+
+void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, std::string& wts_name) {
+    // Create builder
+    IBuilder* builder = createInferBuilder(gLogger);
+    IBuilderConfig* config = builder->createBuilderConfig();
+
+    // Create model to populate the network, then set the outputs and create an engine
+    ICudaEngine* engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, wts_name);
+    assert(engine != nullptr);
+
+    // Serialize the engine
+    (*modelStream) = engine->serialize();
+
+    // Close everything down
+    engine->destroy();
+    builder->destroy();
+    config->destroy();
+}
+
+void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* det_output, int* seg_output, int* lane_output, int batchSize) {
+    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
+    // CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
+    context.enqueue(batchSize, buffers, stream, nullptr);
+    CUDA_CHECK(cudaMemcpyAsync(det_output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(seg_output, buffers[2], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(lane_output, buffers[3], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    cudaStreamSynchronize(stream);
+}
+
+void doInferenceCpu(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* det_output, int* seg_output, int* lane_output, int batchSize) {
+    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
+    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
+    context.enqueue(batchSize, buffers, stream, nullptr);
+    CUDA_CHECK(cudaMemcpyAsync(det_output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(seg_output, buffers[2], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(lane_output, buffers[3], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    cudaStreamSynchronize(stream);
+}
+
+bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir) {
+    if (argc < 4) return false;
+    if (std::string(argv[1]) == "-s" && argc == 4) {
+        wts = std::string(argv[2]);
+        engine = std::string(argv[3]);
+    } else if (std::string(argv[1]) == "-d" && argc == 4) {
+        engine = std::string(argv[2]);
+        img_dir = std::string(argv[3]);
+    } else {
+        return false;
+    }
+    return true;
+}

+ 4 - 0
src/detection/laneATT_trt/yolop_model.cpp

@@ -0,0 +1,4 @@
+
+#include "yolop_model.hpp"
+#include "para.hpp"
+

+ 185 - 0
src/detection/laneATT_trt/yolop_model.hpp

@@ -0,0 +1,185 @@
+#pragma once
+
+#include <chrono>
+#include "cuda_utils.h"
+#include "logging.h"
+#include "utils.h"
+#include "para.hpp"
+#include "NvOnnxParser.h"
+#include "parserOnnxConfig.h"
+#include "NvInfer.h"
+#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
+#define DEVICE 0  // GPU id
+#define NMS_THRESH 0.45
+#define CONF_THRESH 0.25
+#define BATCH_SIZE 1
+
+struct InferDeleter
+{
+    template <typename T>
+    void operator()(T* obj) const
+    {
+        if (obj)
+        {
+            obj->destroy();
+        }
+    }
+};
+
+class Yolop{
+template <typename T>
+using nvUniquePtr = std::unique_ptr<T, InferDeleter>;
+public:
+// stuff we know about the network and the input/output blobs
+Yolop();
+IRuntime* runtime;
+ICudaEngine* engine;
+IExecutionContext* context;
+float input_data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
+// Create stream
+cudaStream_t stream;
+void* buffers[4];
+
+
+
+void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* det_output, int* seg_output, int* lane_output, int batchSize);
+
+void doInferenceCpu(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* det_output, float* seg_output, float* lane_output, int batchSize);
+
+
+void SaveEngine(IHostMemory& trtModelStream, std::string& engine_filepath);
+
+bool isFileExists_ifstream(const string& name);
+
+ICudaEngine* LoadEngine(IRuntime& runtime);
+
+int cuda_init();
+
+};
+
+
+int Yolop::cuda_init(){
+
+
+//    std::string onnxfile = "./weights/yolopv2_384x640.onnx";
+
+
+
+    IRuntime* runtime = createInferRuntime(gLogger);
+    assert(runtime != nullptr);
+    this->engine = LoadEngine(*runtime);
+
+    runtime->destroy();
+    this->context = this->engine->createExecutionContext();
+    if(this->context == nullptr)
+    {
+        std::cout<<"load engine fail"<<std::endl;
+        return false;
+    }
+
+    // deserialize the .engine and run inference
+
+
+//    assert(this->engine->getNbBindings() == 4);
+
+    // In order to bind the buffers, we need to know the names of the input and output tensors.
+    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
+    const int inputIndex = this->engine->getBindingIndex(INPUT_BLOB_NAME);
+    const int output_det_index = this->engine->getBindingIndex(OUTPUT_DET_NAME);
+    const int output_seg_index = this->engine->getBindingIndex(OUTPUT_SEG_NAME);
+    const int output_lane_index = this->engine->getBindingIndex(OUTPUT_LANE_NAME);
+//    Dims k=this->engine->getBindingDimensions(output_seg_index);
+//    DataType m=this->engine->getBindingDataType(output_det_index);
+//    TensorFormat f= this->engine->getBindingFormat(output_det_index);
+//    assert(inputIndex == 0);
+//    assert(output_det_index == 1);
+//    assert(output_seg_index == 2);
+//    assert(output_lane_index == 3);
+    // Create GPU buffers on device
+    CUDA_CHECK(cudaMalloc(&this->buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&this->buffers[output_det_index], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&this->buffers[output_seg_index], BATCH_SIZE * IMG_H * IMG_W * sizeof(int)));
+    CUDA_CHECK(cudaMalloc(&this->buffers[output_lane_index], BATCH_SIZE * IMG_H * IMG_W * sizeof(int)));
+
+    CUDA_CHECK(cudaStreamCreate(&this->stream));
+
+    return true;
+}
+
+
+Yolop::Yolop(){}
+
+bool Yolop::isFileExists_ifstream(const string& name) {
+    ifstream f(name.c_str());
+    return f.good();   // the ifstream closes itself when it goes out of scope
+}
+
+
+
+
+
+void Yolop::SaveEngine(IHostMemory& trtModelStream, std::string& engine_filepath)
+{
+    std::ofstream file;
+    file.open(engine_filepath, std::ios::binary | std::ios::out);
+    if(!file.is_open())
+    {
+        std::cout << "read create engine file" << engine_filepath <<" failed" << std::endl;
+        return;
+    }
+    file.write((const char*)trtModelStream.data(), std::streamsize(trtModelStream.size()));
+    file.close();
+}
+
+
+
+ICudaEngine* Yolop::LoadEngine(IRuntime& runtime)
+{
+    const std::string engine_filepath = "./LaneATT_test.trt8";
+
+    std::ifstream file;
+    file.open(engine_filepath, ios::binary | ios::in);
+    if(!file)
+    {
+        cout<<engine_filepath<<" not found!"<<endl;
+        return nullptr;
+    }
+    file.seekg(0, ios::end);
+    std::size_t length = std::size_t(file.tellg());
+    file.seekg(0, ios::beg);
+
+    std::shared_ptr<char> data(new char[length], std::default_delete<char[]>());
+    file.read(data.get(), std::streamsize(length));
+    file.close();
+
+    // runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
+    ICudaEngine* engine = runtime.deserializeCudaEngine(data.get(), length, nullptr);
+    return engine;
+}
+
+
+
+
+void Yolop::doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* det_output, int* seg_output, int* lane_output, int batchSize) {
+    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
+    // CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
+    context.enqueue(batchSize, buffers, stream, nullptr);
+    CUDA_CHECK(cudaMemcpyAsync(det_output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(seg_output, buffers[2], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(lane_output, buffers[3], batchSize * IMG_H * IMG_W * sizeof(int), cudaMemcpyDeviceToHost, stream));
+    cudaStreamSynchronize(stream);
+}
+
+void Yolop::doInferenceCpu(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* det_output, float* seg_output, float* lane_output, int batchSize) {
+    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
+    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
+    context.enqueue(batchSize, buffers, stream, nullptr);
+    CUDA_CHECK(cudaMemcpyAsync(seg_output, buffers[2], batchSize * 2 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(det_output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(lane_output, buffers[3], batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    cudaStreamSynchronize(stream);
+}
+
+

+ 395 - 0
src/detection/laneATT_trt/yolop_trt.py

@@ -0,0 +1,395 @@
+# 2022/10/26 by ausk
+"""
+An example that uses TensorRT's Python API to run YOLOP inference.
+"""
+import ctypes
+import os
+import shutil
+import random
+import sys
+import time
+import cv2
+import numpy as np
+import pycuda.autoinit
+import pycuda.driver as cuda
+import tensorrt as trt
+
+CONF_THRESH = 0.5
+IOU_THRESHOLD = 0.4
+
+
+def get_img_path_batches(batch_size, img_dir):
+    ret = []
+    batch = []
+    for root, dirs, files in os.walk(img_dir):
+        for name in files:
+            if len(batch) == batch_size:
+                ret.append(batch)
+                batch = []
+            batch.append(os.path.join(root, name))
+    if len(batch) > 0:
+        ret.append(batch)
+    return ret
+
+def plot_one_box(x, img, color=None, label=None, line_thickness=None):
+    """
+    description: Plots one bounding box on image img,
+                 this function comes from the YOLOv5 project.
+    """
+    tl = ( line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1)  # line/font thickness
+    color = color or [random.randint(0, 255) for _ in range(3)]
+    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
+    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
+    if label:
+        tf = max(tl - 1, 1)  # font thickness
+        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
+        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
+        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
+        cv2.putText( img, label,  (c1[0], c1[1] - 2), 0,  tl / 3, [225, 255, 255], thickness=tf,  lineType=cv2.LINE_AA)
+
+class YolopTRT(object):
+    """
+    description: Wraps TensorRT inference plus the preprocess and postprocess steps.
+    """
+
+    def __init__(self, engine_file_path):
+        # Create a Context on this device,
+        self.ctx = cuda.Device(0).make_context()
+        stream = cuda.Stream()
+        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
+        runtime = trt.Runtime(TRT_LOGGER)
+
+        # Deserialize the engine from file
+        with open(engine_file_path, "rb") as f:
+            engine = runtime.deserialize_cuda_engine(f.read())
+        context = engine.create_execution_context()
+
+        host_inputs = []
+        cuda_inputs = []
+        host_outputs = []
+        cuda_outputs = []
+        bindings = []
+
+        for binding in engine:
+            print('binding: ', binding, engine.get_binding_shape(binding))
+            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
+            dtype = trt.nptype(engine.get_binding_dtype(binding))
+            # Allocate host and device buffers
+            host_mem = cuda.pagelocked_empty(size, dtype)
+            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
+            # Append the device buffer to device bindings.
+            bindings.append(int(cuda_mem))
+            # Append to the appropriate list.
+            if engine.binding_is_input(binding):
+                self.input_w = engine.get_binding_shape(binding)[-1]
+                self.input_h = engine.get_binding_shape(binding)[-2]
+                host_inputs.append(host_mem)
+                cuda_inputs.append(cuda_mem)
+            else:
+                host_outputs.append(host_mem)
+                cuda_outputs.append(cuda_mem)
+
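+        # The shapes below are hard-coded for this model (384x640 network input,
+        # 360x640 segmentation maps) and override the values read from the bindings above.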
+        self.input_h = 384
+        self.input_w = 640
+        self.img_h = 360
+        self.img_w = 640
+
+        # Store
+        self.stream = stream
+        self.context = context
+        self.engine = engine
+        self.host_inputs = host_inputs
+        self.cuda_inputs = cuda_inputs
+        self.host_outputs = host_outputs
+        self.cuda_outputs = cuda_outputs
+        self.bindings = bindings
+        self.batch_size = engine.max_batch_size
+
+    def infer(self, raw_image_generator):
+        # Make self the active context, pushing it on top of the context stack.
+        self.ctx.push()
+        # Restore
+        stream = self.stream
+        context = self.context
+        engine = self.engine
+        host_inputs = self.host_inputs
+        cuda_inputs = self.cuda_inputs
+        host_outputs = self.host_outputs
+        cuda_outputs = self.cuda_outputs
+        bindings = self.bindings
+        # Do image preprocess
+        batch_image_raw = []
+        batch_origin_h = []
+        batch_origin_w = []
+        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
+        for i, image_raw in enumerate(raw_image_generator):
+            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
+            batch_image_raw.append(image_raw)
+            batch_origin_h.append(origin_h)
+            batch_origin_w.append(origin_w)
+            np.copyto(batch_input_image[i], input_image)
+        batch_input_image = np.ascontiguousarray(batch_input_image)
+
+        # Copy input image to host buffer
+        np.copyto(host_inputs[0], batch_input_image.ravel())
+        start = time.time()
+        # Transfer input data  to the GPU.
+        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
+        # Run inference.
+        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
+        # Transfer predictions back from the GPU.
+        for i in range(len(host_outputs)):
+            cuda.memcpy_dtoh_async(host_outputs[i], cuda_outputs[i], stream)
+        # Synchronize the stream
+        stream.synchronize()
+        end = time.time()
+        # Remove any context from the top of the context stack, deactivating it.
+        self.ctx.pop()
+        # With batch_size = 1, each host output holds a single image's results
+
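+        # Assumed layout: host_outputs[0] packs, per image, a detection count followed by
+        # fixed-size rows of (cx, cy, w, h, conf, cls) - 6001 floats in total - while
+        # host_outputs[1]/[2] are per-pixel drivable-area and lane masks of size img_h x img_w.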
+        detout = host_outputs[0]
+        segout = host_outputs[1].reshape( (self.batch_size, self.img_h,self.img_w))
+        laneout = host_outputs[2].reshape( (self.batch_size, self.img_h,self.img_w))
+
+        # Do postprocess
+        for i in range(self.batch_size):
+            result_boxes, result_scores, result_classid = self.post_process(
+                detout[i * 6001: (i + 1) * 6001], batch_origin_h[i], batch_origin_w[i]
+            )
+
+            # Draw rectangles and labels on the original image
+            img = batch_image_raw[i]
+            nh = img.shape[0]
+            nw = img.shape[1]
+            for j in range(len(result_boxes)):
+                box = result_boxes[j]
+                label="{}:{:.2f}".format( categories[int(result_classid[j])], result_scores[j])
+                plot_one_box( box, img, label=label)
+
+            seg  = cv2.resize(segout[i], (nw, nh), interpolation=cv2.INTER_NEAREST)
+            lane = cv2.resize(laneout[i], (nw, nh), interpolation=cv2.INTER_NEAREST)
+            color_area = np.zeros_like(img)
+            color_area[seg==1]  = (0,255,0)
+            color_area[lane==1] = (0,0,255)
+            color_mask = np.mean(color_area, 2)
+            img[color_mask != 0] = img[color_mask != 0] * 0.5 + color_area[color_mask != 0] * 0.5
+            img = img.astype(np.uint8)
+
+        return batch_image_raw, end - start
+
+    def destroy(self):
+        # Remove any context from the top of the context stack, deactivating it.
+        self.ctx.pop()
+
+    def get_raw_image(self, image_path_batch):
+        for img_path in image_path_batch:
+            yield cv2.imread(img_path)
+
+    def get_raw_image_zeros(self, image_path_batch=None):
+        for _ in range(self.batch_size):
+            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
+
+    def preprocess_image(self, raw_bgr_image):
+        image_raw = raw_bgr_image
+        h, w, c = image_raw.shape
+        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
+        # Calculate width, height and paddings
+        r_w = self.input_w / w
+        r_h = self.input_h / h
+        if r_h > r_w:
+            tw = self.input_w
+            th = int(r_w * h)
+            tx1 = tx2 = 0
+            ty1 = int((self.input_h - th) / 2)
+            ty2 = self.input_h - th - ty1
+        else:
+            tw = int(r_h * w)
+            th = self.input_h
+            tx1 = int((self.input_w - tw) / 2)
+            tx2 = self.input_w - tw - tx1
+            ty1 = ty2 = 0
+        # Resize along the long side while maintaining the aspect ratio
+        image = cv2.resize(image, (tw, th))
+        # Pad the short side with (114,114,114)
+        image = cv2.copyMakeBorder(
+            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (114, 114, 114)
+        )
+        image = image.astype(np.float32)
+        # Normalize to [0,1]
+        image /= 255.0
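+        # Standardize with the ImageNet channel means and standard deviations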
+        image = (image - (0.485, 0.456, 0.406)) /(0.229, 0.224, 0.225)
+        # HWC to CHW format:
+        image = np.transpose(image, [2, 0, 1])
+        # CHW to NCHW format
+        image = np.expand_dims(image, axis=0)
+        # Convert the image to row-major order, also known as "C order":
+        image = np.ascontiguousarray(image)
+        return image, image_raw, h, w
+
+    def xywh2xyxy(self, origin_h, origin_w, x):
+        """
+        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
+        param:
+            origin_h:   height of original image
+            origin_w:   width of original image
+            x:          A boxes numpy, each row is a box [center_x, center_y, w, h]
+        return:
+            y:          A boxes numpy, each row is a box [x1, y1, x2, y2]
+        """
+        y = np.zeros_like(x)
+        r_w = self.input_w / origin_w
+        r_h = self.input_h / origin_h
+        if r_h > r_w:
+            y[:, 0] = x[:, 0] - x[:, 2] / 2
+            y[:, 2] = x[:, 0] + x[:, 2] / 2
+            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
+            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
+            y /= r_w
+        else:
+            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
+            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
+            y[:, 1] = x[:, 1] - x[:, 3] / 2
+            y[:, 3] = x[:, 1] + x[:, 3] / 2
+            y /= r_h
+
+        return y
+
+    def post_process(self, output, origin_h, origin_w):
+        # Get the num of boxes detected
+        num = int(output[0])
+        # Reshape to a two-dimensional ndarray
+        pred = np.reshape(output[1:], (-1, 6))[:num, :]
+        # Do nms
+        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
+        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
+        result_scores = boxes[:, 4] if len(boxes) else np.array([])
+        result_classid = boxes[:, 5] if len(boxes) else np.array([])
+        return result_boxes, result_scores, result_classid
+
+    def bbox_iou(self, box1, box2, x1y1x2y2=True):
+        """
+        description: compute the IoU of two bounding boxes
+        param:
+            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
+            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
+            x1y1x2y2: select the coordinate format
+        return:
+            iou: computed iou
+        """
+        if not x1y1x2y2:
+            # Transform from center and width to exact coordinates
+            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
+            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
+            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
+            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
+        else:
+            # Get the coordinates of bounding boxes
+            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
+            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
+
+        # Get the coordinates of the intersection rectangle
+        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
+        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
+        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
+        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
+        # Intersection area
+        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
+                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
+        # Union Area
+        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
+        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
+
+        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
+
+        return iou
+
+    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
+        """
+        description: Removes detections with lower object confidence score than 'conf_thres' and performs
+        Non-Maximum Suppression to further filter detections.
+        param:
+            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
+            origin_h: original image height
+            origin_w: original image width
+            conf_thres: a confidence threshold to filter detections
+            nms_thres: an IoU threshold to filter detections
+        return:
+            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
+        """
+        # Get the boxes that score > CONF_THRESH
+        boxes = prediction[prediction[:, 4] >= conf_thres]
+        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
+        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
+        # clip the coordinates
+        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w -1)
+        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w -1)
+        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h -1)
+        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h -1)
+        # Object confidence
+        confs = boxes[:, 4]
+        # Sort by the confs
+        boxes = boxes[np.argsort(-confs)]
+        # Perform non-maximum suppression
+        keep_boxes = []
+        while boxes.shape[0]:
+            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
+            label_match = boxes[0, -1] == boxes[:, -1]
+            # Indices of boxes with lower confidence scores, large IOUs and matching labels
+            invalid = large_overlap & label_match
+            keep_boxes += [boxes[0]]
+            boxes = boxes[~invalid]
+        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
+        return boxes
+
+
+if __name__ == "__main__":
+    # load custom plugin and engine
+    PLUGIN_LIBRARY = "build/libmyplugins.so"
+    engine_file_path = "build/yolop.trt"
+
+    print("usage: xxx <engine file> <plugin file> <image dir>")
+    print("[WARN] preaprea you image_dir, such as: samples, or /home/user/jetson/tmp/YOLOP/inference/images")
+    IMAGE_DIR =  "/home/user/jetson/tmp/YOLOP/inference/images"
+
+    if len(sys.argv) > 1:
+        engine_file_path = sys.argv[1]
+    if len(sys.argv) > 2:
+        PLUGIN_LIBRARY = sys.argv[2]
+    if len(sys.argv) > 3:
+        IMAGE_DIR = sys.argv[3]
+
+    ctypes.CDLL(PLUGIN_LIBRARY)
+
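+    # Assumes the detection head was trained for a single vehicle class, hence the one label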
+    categories = ["car"]
+
+    if os.path.exists('output/'):
+        shutil.rmtree('output/')
+    os.makedirs('output/')
+
+    # a YolopTRT instance
+    yolop_wrapper = YolopTRT(engine_file_path)
+
+    try:
+        print('batch size is', yolop_wrapper.batch_size)
+
+        image_dir = IMAGE_DIR
+        image_path_batches = get_img_path_batches(yolop_wrapper.batch_size, image_dir)
+
+        for i in range(1):
+            batch_image_raw, use_time = yolop_wrapper.infer(yolop_wrapper.get_raw_image_zeros())
+            print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
+
+        for batch in image_path_batches:
+            batch_image_raw, use_time = yolop_wrapper.infer(yolop_wrapper.get_raw_image(batch))
+            for i, img_path in enumerate(batch):
+                parent, filename = os.path.split(img_path)
+                save_name = os.path.join('output', filename)
+                # Save image
+                cv2.imwrite(save_name, batch_image_raw[i])
+            print('input->{}, time->{:.2f}ms, saving into output/'.format(batch, use_time * 1000))
+
+    finally:
+        # destroy the instance
+        yolop_wrapper.destroy()
+
+    print("done!")