/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef TRT_SAMPLE_UTILS_H
#define TRT_SAMPLE_UTILS_H

#include <algorithm>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include <cuda.h>
#if CUDA_VERSION < 10000
#include <half.h>
#else
#include <cuda_fp16.h>
#endif

#include "NvInfer.h"

#include "common.h"
#include "logger.h"
#include "sampleDevice.h"
#include "sampleOptions.h"

namespace sample
{

inline int dataTypeSize(nvinfer1::DataType dataType)
{
    switch (dataType)
    {
    case nvinfer1::DataType::kINT32:
    case nvinfer1::DataType::kFLOAT: return 4;
    case nvinfer1::DataType::kHALF: return 2;
    case nvinfer1::DataType::kBOOL:
    case nvinfer1::DataType::kINT8: return 1;
    }
    return 0;
}

template <typename T>
inline T roundUp(T m, T n)
{
    return ((m + n - 1) / n) * n;
}

inline int volume(const nvinfer1::Dims& d)
{
    return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int>());
}

//! comps is the number of components in a vector. Ignored if vecDim < 0.
inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, int vecDim, int comps, int batch)
{
    int maxNbElems = 1;
    for (int i = 0; i < dims.nbDims; ++i)
    {
        // Get effective length of axis.
        int d = dims.d[i];
        // If any dimension is 0, it is an empty tensor.
        if (d == 0)
        {
            return 0;
        }
        if (i == vecDim)
        {
            d = samplesCommon::divUp(d, comps);
        }
        maxNbElems = std::max(maxNbElems, d * strides.d[i]);
    }
    return static_cast<int64_t>(maxNbElems) * batch * (vecDim < 0 ? 1 : comps);
}

inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch)
{
    if (vecDim != -1)
    {
        dims.d[vecDim] = roundUp(dims.d[vecDim], comps);
    }
    return volume(dims) * std::max(batch, 1);
}

inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims)
{
    for (int i = 0; i < dims.nbDims; ++i)
    {
        os << (i ? "x" : "") << dims.d[i];
    }
    return os;
}

inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole role)
{
    switch (role)
    {
    case nvinfer1::WeightsRole::kKERNEL: os << "Kernel"; break;
    case nvinfer1::WeightsRole::kBIAS: os << "Bias"; break;
    case nvinfer1::WeightsRole::kSHIFT: os << "Shift"; break;
    case nvinfer1::WeightsRole::kSCALE: os << "Scale"; break;
    case nvinfer1::WeightsRole::kCONSTANT: os << "Constant"; break;
    case nvinfer1::WeightsRole::kANY: os << "Any"; break;
    }
    return os;
}

inline std::ostream& operator<<(std::ostream& os, const std::vector<int>& vec)
{
    for (int i = 0, e = static_cast<int>(vec.size()); i < e; ++i)
    {
        os << (i ? "x" : "") << vec[i];
    }
    return os;
}

inline nvinfer1::Dims toDims(const std::vector<int>& vec)
{
    int limit = static_cast<int>(nvinfer1::Dims::MAX_DIMS);
    if (static_cast<int>(vec.size()) > limit)
    {
        sample::gLogWarning << "Vector too long, only first " << limit << " elements are used in dimension."
                            << std::endl;
    }
    // Pick only the first nvinfer1::Dims::MAX_DIMS elements.
    nvinfer1::Dims dims{std::min(static_cast<int>(vec.size()), limit), {}};
    std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
    return dims;
}
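
//! Illustrative sketch (not part of the original sample) of how the helpers
//! above compose for a linear, non-vectorized tensor:
//!
//!   nvinfer1::Dims d = toDims({2, 3, 4});  // streams as "2x3x4"
//!   int n = volume(d);                     // 24 elements
//!   int64_t total = volume(d, /*vecDim*/ -1, /*comps*/ 1, /*batch*/ 2);  // 48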
template <typename T>
inline void fillBuffer(void* buffer, int64_t volume, T min, T max)
{
    T* typedBuffer = static_cast<T*>(buffer);
    std::default_random_engine engine;
    if (std::is_integral<T>::value)
    {
        std::uniform_int_distribution<int> distribution(min, max);
        auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
        std::generate(typedBuffer, typedBuffer + volume, generator);
    }
    else
    {
        std::uniform_real_distribution<float> distribution(min, max);
        auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
        std::generate(typedBuffer, typedBuffer + volume, generator);
    }
}

// Specialization needed for custom type __half
template <typename H>
inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max)
{
    H* typedBuffer = static_cast<H*>(buffer);
    std::default_random_engine engine;
    std::uniform_real_distribution<float> distribution(min, max);
    auto generator = [&engine, &distribution]() { return static_cast<H>(distribution(engine)); };
    std::generate(typedBuffer, typedBuffer + volume, generator);
}

template <>
#if CUDA_VERSION < 10000
inline void fillBuffer<half_float::half>(void* buffer, int64_t volume, half_float::half min, half_float::half max)
#else
inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max)
#endif
{
    fillBufferHalf(buffer, volume, min, max);
}

template <typename T>
inline void dumpBuffer(const void* buffer, const std::string& separator, std::ostream& os, const nvinfer1::Dims& dims,
    const nvinfer1::Dims& strides, int32_t vectorDim, int32_t spv)
{
    const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, int64_t{1}, std::multiplies<int64_t>());
    const T* typedBuffer = static_cast<const T*>(buffer);
    std::string sep;
    for (int64_t v = 0; v < volume; ++v)
    {
        int64_t curV = v;
        int32_t dataOffset = 0;
        for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex)
        {
            int32_t dimVal = curV % dims.d[dimIndex];
            if (dimIndex == vectorDim)
            {
                dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv;
            }
            else
            {
                dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv);
            }
            curV /= dims.d[dimIndex];
            ASSERT(curV >= 0);
        }

        os << sep << typedBuffer[dataOffset];
        sep = separator;
    }
}
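
//! Worked example (illustrative): for a vectorized format such as NC/2HW2
//! (vectorDim == 1, spv == 2), strides are expressed per vector, so the loop
//! above computes the flat offset of element (n, c, h, w) as
//!
//!   n*strides[0]*2 + (c/2)*strides[1]*2 + c%2 + h*strides[2]*2 + w*strides[3]*2
//!
//! For a non-vectorized tensor (vectorDim == -1), this degenerates to the
//! usual dot product of the index with the strides.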
struct Binding
{
    bool isInput{false};
    MirroredBuffer buffer;
    int64_t volume{0};
    nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT};

    void fill(const std::string& fileName)
    {
        std::ifstream file(fileName, std::ios::in | std::ios::binary);
        if (file.is_open())
        {
            file.read(static_cast<char*>(buffer.getHostBuffer()), buffer.getSize());
            file.close();
        }
    }

    void fill()
    {
        switch (dataType)
        {
        case nvinfer1::DataType::kBOOL: fillBuffer<bool>(buffer.getHostBuffer(), volume, 0, 1); break;
        case nvinfer1::DataType::kINT32: fillBuffer<int32_t>(buffer.getHostBuffer(), volume, -128, 127); break;
        case nvinfer1::DataType::kINT8: fillBuffer<int8_t>(buffer.getHostBuffer(), volume, -128, 127); break;
        case nvinfer1::DataType::kFLOAT: fillBuffer<float>(buffer.getHostBuffer(), volume, -1.0F, 1.0F); break;
        case nvinfer1::DataType::kHALF:
        {
#if CUDA_VERSION < 10000
            fillBuffer<half_float::half>(buffer.getHostBuffer(), volume, static_cast<half_float::half>(-1.0F),
                static_cast<half_float::half>(1.0F));
#else
            fillBuffer<__half>(buffer.getHostBuffer(), volume, -1.0F, 1.0F);
#endif
            break;
        }
        }
    }

    void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv,
        const std::string separator = " ") const
    {
        switch (dataType)
        {
        case nvinfer1::DataType::kBOOL:
            dumpBuffer<bool>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
            break;
        case nvinfer1::DataType::kINT32:
            dumpBuffer<int32_t>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
            break;
        case nvinfer1::DataType::kINT8:
            dumpBuffer<int8_t>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
            break;
        case nvinfer1::DataType::kFLOAT:
            dumpBuffer<float>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
            break;
        case nvinfer1::DataType::kHALF:
        {
#if CUDA_VERSION < 10000
            dumpBuffer<half_float::half>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
#else
            dumpBuffer<__half>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
#endif
            break;
        }
        }
    }
};
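
//! Usage sketch for Binding (illustrative; the buffer must be allocated before
//! either fill() overload is called, as Bindings::addBinding does below):
//!
//!   Binding b;
//!   b.isInput = true;
//!   b.volume = 16;
//!   b.dataType = nvinfer1::DataType::kFLOAT;
//!   b.buffer.allocate(b.volume * dataTypeSize(b.dataType));
//!   b.fill();             // random values in [-1, 1]
//!   b.fill("input.bin");  // or: raw bytes from a file of matching size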
class Bindings
{
public:
    void addBinding(int b, const std::string& name, bool isInput, int64_t volume, nvinfer1::DataType dataType,
        const std::string& fileName = "")
    {
        while (mBindings.size() <= static_cast<size_t>(b))
        {
            mBindings.emplace_back();
            mDevicePointers.emplace_back();
        }
        mNames[name] = b;
        mBindings[b].isInput = isInput;
        // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
        // even for empty tensors, so allocate a dummy byte.
        if (volume == 0)
        {
            mBindings[b].buffer.allocate(1);
        }
        else
        {
            mBindings[b].buffer.allocate(static_cast<size_t>(volume) * static_cast<size_t>(dataTypeSize(dataType)));
        }
        mBindings[b].volume = volume;
        mBindings[b].dataType = dataType;
        mDevicePointers[b] = mBindings[b].buffer.getDeviceBuffer();
        if (isInput)
        {
            if (fileName.empty())
            {
                fill(b);
            }
            else
            {
                fill(b, fileName);
            }
        }
    }

    void** getDeviceBuffers()
    {
        return mDevicePointers.data();
    }

    void transferInputToDevice(TrtCudaStream& stream)
    {
        for (auto& b : mNames)
        {
            if (mBindings[b.second].isInput)
            {
                mBindings[b.second].buffer.hostToDevice(stream);
            }
        }
    }

    void transferOutputToHost(TrtCudaStream& stream)
    {
        for (auto& b : mNames)
        {
            if (!mBindings[b.second].isInput)
            {
                mBindings[b.second].buffer.deviceToHost(stream);
            }
        }
    }

    void fill(int binding, const std::string& fileName)
    {
        mBindings[binding].fill(fileName);
    }

    void fill(int binding)
    {
        mBindings[binding].fill();
    }

    void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const
    {
        const auto dims = context.getBindingDimensions(binding);
        // Do not add a newline terminator, because the caller may be outputting a JSON string.
        os << dims;
    }

    void dumpBindingValues(const nvinfer1::IExecutionContext& context, int binding, std::ostream& os,
        const std::string& separator = " ", int32_t batch = 1) const
    {
        nvinfer1::Dims dims = context.getBindingDimensions(binding);
        nvinfer1::Dims strides = context.getStrides(binding);
        int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding);
        const int32_t spv = context.getEngine().getBindingComponentsPerElement(binding);

        if (context.getEngine().hasImplicitBatchDimension())
        {
            auto insertN = [](nvinfer1::Dims& d, int32_t bs) {
                const int32_t nbDims = d.nbDims;
                ASSERT(nbDims < nvinfer1::Dims::MAX_DIMS);
                std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]);
                d.d[0] = bs;
                d.nbDims = nbDims + 1;
            };
            int32_t batchStride = 0;
            for (int32_t i = 0; i < strides.nbDims; ++i)
            {
                if (strides.d[i] * dims.d[i] > batchStride)
                {
                    batchStride = strides.d[i] * dims.d[i];
                }
            }
            insertN(dims, batch);
            insertN(strides, batchStride);
            vectorDim = (vectorDim == -1) ? -1 : vectorDim + 1;
        }

        mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator);
    }

    void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const
    {
        auto isInput = [](const Binding& b) { return b.isInput; };
        dumpBindings(context, isInput, os);
    }

    void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const
    {
        auto isOutput = [](const Binding& b) { return !b.isInput; };
        dumpBindings(context, isOutput, os);
    }

    void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const
    {
        auto all = [](const Binding& b) { return true; };
        dumpBindings(context, all, os);
    }

    void dumpBindings(
        const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const
    {
        for (const auto& n : mNames)
        {
            const auto binding = n.second;
            if (predicate(mBindings[binding]))
            {
                os << n.first << ": (";
                dumpBindingDimensions(binding, context, os);
                os << ")" << std::endl;
                dumpBindingValues(context, binding, os);
                os << std::endl;
            }
        }
    }

    std::unordered_map<std::string, int> getInputBindings() const
    {
        auto isInput = [](const Binding& b) { return b.isInput; };
        return getBindings(isInput);
    }

    std::unordered_map<std::string, int> getOutputBindings() const
    {
        auto isOutput = [](const Binding& b) { return !b.isInput; };
        return getBindings(isOutput);
    }

    std::unordered_map<std::string, int> getBindings() const
    {
        auto all = [](const Binding& b) { return true; };
        return getBindings(all);
    }

    std::unordered_map<std::string, int> getBindings(bool (*predicate)(const Binding& b)) const
    {
        std::unordered_map<std::string, int> bindings;
        for (const auto& n : mNames)
        {
            const auto binding = n.second;
            if (predicate(mBindings[binding]))
            {
                bindings.insert(n);
            }
        }
        return bindings;
    }

private:
    std::unordered_map<std::string, int> mNames;
    std::vector<Binding> mBindings;
    std::vector<void*> mDevicePointers;
};
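
//! Usage sketch for Bindings (illustrative; assumes engine, context, and
//! stream objects created by the caller, e.g. by the surrounding trtexec
//! harness):
//!
//!   Bindings bindings;
//!   for (int b = 0; b < engine->getNbBindings(); ++b)
//!   {
//!       bindings.addBinding(b, engine->getBindingName(b), engine->bindingIsInput(b),
//!           volume(context->getBindingDimensions(b)), engine->getBindingDataType(b));
//!   }
//!   bindings.transferInputToDevice(stream);
//!   context->enqueueV2(bindings.getDeviceBuffers(), stream.get(), nullptr);
//!   bindings.transferOutputToHost(stream);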
template <typename T>
struct TrtDestroyer
{
    void operator()(T* t)
    {
        t->destroy();
    }
};

template <typename T>
using TrtUniquePtr = std::unique_ptr<T, TrtDestroyer<T>>;

inline bool broadcastIOFormats(const std::vector<IOFormat>& formats, size_t nbBindings, bool isInput = true)
{
    bool broadcast = formats.size() == 1;
    bool validFormatsCount = broadcast || (formats.size() == nbBindings);
    if (!formats.empty() && !validFormatsCount)
    {
        if (isInput)
        {
            throw std::invalid_argument(
                "The number of inputIOFormats must match network's inputs or be one for broadcasting.");
        }
        else
        {
            throw std::invalid_argument(
                "The number of outputIOFormats must match network's outputs or be one for broadcasting.");
        }
    }
    return broadcast;
}

inline std::vector<char> loadTimingCacheFile(const std::string& inFileName)
{
    std::ifstream iFile(inFileName, std::ios::in | std::ios::binary);
    if (!iFile)
    {
        sample::gLogWarning << "Could not read timing cache from: " << inFileName
                            << ". A new timing cache will be generated and written." << std::endl;
        return std::vector<char>();
    }
    iFile.seekg(0, std::ifstream::end);
    size_t fsize = iFile.tellg();
    iFile.seekg(0, std::ifstream::beg);
    std::vector<char> content(fsize);
    iFile.read(content.data(), fsize);
    iFile.close();
    sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl;
    return content;
}

inline void saveTimingCacheFile(const std::string& outFileName, const nvinfer1::IHostMemory* blob)
{
    std::ofstream oFile(outFileName, std::ios::out | std::ios::binary);
    if (!oFile)
    {
        sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl;
        return;
    }
    oFile.write(static_cast<const char*>(blob->data()), blob->size());
    oFile.close();
    sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl;
}

} // namespace sample

#endif // TRT_SAMPLE_UTILS_H