/*
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef TRT_SAMPLE_UTILS_H
#define TRT_SAMPLE_UTILS_H

#include <algorithm>
#include <fstream>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include <cuda.h>
#if CUDA_VERSION < 10000
#include <half.h>
#else
#include <cuda_fp16.h>
#endif

#include "NvInfer.h"

#include "common.h"
#include "logger.h"
#include "sampleDevice.h"
#include "sampleOptions.h"

namespace sample
{

inline int dataTypeSize(nvinfer1::DataType dataType)
{
    switch (dataType)
    {
    case nvinfer1::DataType::kINT32:
    case nvinfer1::DataType::kFLOAT: return 4;
    case nvinfer1::DataType::kHALF: return 2;
    case nvinfer1::DataType::kBOOL:
    case nvinfer1::DataType::kINT8: return 1;
    }
    return 0;
}
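
// Round m up to the nearest multiple of n, e.g. roundUp(13, 4) == 16.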
template <typename T>
inline T roundUp(T m, T n)
{
    return ((m + n - 1) / n) * n;
}
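
// Product of all dimensions; assumes the dims are fully specified (no -1 wildcards).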
inline int volume(const nvinfer1::Dims& d)
{
    return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int>());
}
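
// Example for the strided overload below: dims = 2x3, strides = 3x1, vecDim = -1,
// batch = 1 gives max(2 * 3, 3 * 1) * 1 * 1 = 6 elements.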
//! comps is the number of components in a vector. Ignored if vecDim < 0.
inline int64_t volume(const nvinfer1::Dims& dims, const nvinfer1::Dims& strides, int vecDim, int comps, int batch)
{
    int maxNbElems = 1;
    for (int i = 0; i < dims.nbDims; ++i)
    {
        // Get effective length of axis.
        int d = dims.d[i];
        // If any dimension is 0, the tensor is empty.
        if (d == 0)
        {
            return 0;
        }
        if (i == vecDim)
        {
            d = samplesCommon::divUp(d, comps);
        }
        maxNbElems = std::max(maxNbElems, d * strides.d[i]);
    }
    return static_cast<int64_t>(maxNbElems) * batch * (vecDim < 0 ? 1 : comps);
}

inline int64_t volume(nvinfer1::Dims dims, int vecDim, int comps, int batch)
{
    if (vecDim != -1)
    {
        dims.d[vecDim] = roundUp(dims.d[vecDim], comps);
    }
    return volume(dims) * std::max(batch, 1);
}
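
// Prints dims as "d0xd1x...", e.g. {4, {1, 3, 224, 224}} becomes "1x3x224x224".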
inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims)
{
    for (int i = 0; i < dims.nbDims; ++i)
    {
        os << (i ? "x" : "") << dims.d[i];
    }
    return os;
}

inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole role)
{
    switch (role)
    {
    case nvinfer1::WeightsRole::kKERNEL:
    {
        os << "Kernel";
        break;
    }
    case nvinfer1::WeightsRole::kBIAS:
    {
        os << "Bias";
        break;
    }
    case nvinfer1::WeightsRole::kSHIFT:
    {
        os << "Shift";
        break;
    }
    case nvinfer1::WeightsRole::kSCALE:
    {
        os << "Scale";
        break;
    }
    case nvinfer1::WeightsRole::kCONSTANT:
    {
        os << "Constant";
        break;
    }
    case nvinfer1::WeightsRole::kANY:
    {
        os << "Any";
        break;
    }
    }
    return os;
}

inline std::ostream& operator<<(std::ostream& os, const std::vector<int>& vec)
{
    for (int i = 0, e = static_cast<int>(vec.size()); i < e; ++i)
    {
        os << (i ? "x" : "") << vec[i];
    }
    return os;
}
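
// e.g. toDims({1, 3, 224, 224}) yields nbDims == 4 with d == {1, 3, 224, 224};
// anything beyond nvinfer1::Dims::MAX_DIMS elements is dropped with a warning.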
inline nvinfer1::Dims toDims(const std::vector<int>& vec)
{
    int limit = static_cast<int>(nvinfer1::Dims::MAX_DIMS);
    if (static_cast<int>(vec.size()) > limit)
    {
        sample::gLogWarning << "Vector too long; only the first " << limit << " elements are used in dimension."
                            << std::endl;
    }
    // Pick the first nvinfer1::Dims::MAX_DIMS elements.
    nvinfer1::Dims dims{std::min(static_cast<int>(vec.size()), limit), {}};
    std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d));
    return dims;
}

template <typename T>
inline void fillBuffer(void* buffer, int64_t volume, T min, T max)
{
    T* typedBuffer = static_cast<T*>(buffer);
    std::default_random_engine engine;
    if (std::is_integral<T>::value)
    {
        std::uniform_int_distribution<int> distribution(min, max);
        auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
        std::generate(typedBuffer, typedBuffer + volume, generator);
    }
    else
    {
        std::uniform_real_distribution<float> distribution(min, max);
        auto generator = [&engine, &distribution]() { return static_cast<T>(distribution(engine)); };
        std::generate(typedBuffer, typedBuffer + volume, generator);
    }
}

// Specialization needed for custom type __half
template <typename H>
inline void fillBufferHalf(void* buffer, int64_t volume, H min, H max)
{
    H* typedBuffer = static_cast<H*>(buffer);
    std::default_random_engine engine;
    std::uniform_real_distribution<float> distribution(min, max);
    auto generator = [&engine, &distribution]() { return static_cast<H>(distribution(engine)); };
    std::generate(typedBuffer, typedBuffer + volume, generator);
}

template <>
#if CUDA_VERSION < 10000
inline void fillBuffer<half_float::half>(void* buffer, int64_t volume, half_float::half min, half_float::half max)
#else
inline void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max)
#endif
{
    fillBufferHalf(buffer, volume, min, max);
}
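
// Dump a strided, possibly vectorized buffer as text. spv is the number of scalars
// per vector; within the vectorized dimension, scalar i is stored at
// (i / spv) * stride * spv + i % spv, matching TensorRT's layout for vectorized
// formats such as kCHW4.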
template <typename T>
inline void dumpBuffer(const void* buffer, const std::string& separator, std::ostream& os, const Dims& dims,
    const Dims& strides, int32_t vectorDim, int32_t spv)
{
    const int64_t volume = std::accumulate(dims.d, dims.d + dims.nbDims, int64_t{1}, std::multiplies<int64_t>());
    const T* typedBuffer = static_cast<const T*>(buffer);
    std::string sep;
    for (int64_t v = 0; v < volume; ++v)
    {
        int64_t curV = v;
        int32_t dataOffset = 0;
        for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex)
        {
            int32_t dimVal = curV % dims.d[dimIndex];
            if (dimIndex == vectorDim)
            {
                dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv;
            }
            else
            {
                dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 1 : spv);
            }
            curV /= dims.d[dimIndex];
            ASSERT(curV >= 0);
        }
        os << sep << typedBuffer[dataOffset];
        sep = separator;
    }
}
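
// One engine binding: a mirrored host/device buffer plus the metadata needed to
// fill it with input data and dump its contents.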
struct Binding
{
    bool isInput{false};
    MirroredBuffer buffer;
    int64_t volume{0};
    nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT};

    void fill(const std::string& fileName)
    {
        std::ifstream file(fileName, std::ios::in | std::ios::binary);
        if (file.is_open())
        {
            file.read(static_cast<char*>(buffer.getHostBuffer()), buffer.getSize());
            file.close();
        }
    }

    void fill()
    {
        switch (dataType)
        {
        case nvinfer1::DataType::kBOOL:
        {
            fillBuffer<bool>(buffer.getHostBuffer(), volume, 0, 1);
            break;
        }
        case nvinfer1::DataType::kINT32:
        {
            fillBuffer<int32_t>(buffer.getHostBuffer(), volume, -128, 127);
            break;
        }
        case nvinfer1::DataType::kINT8:
        {
            fillBuffer<int8_t>(buffer.getHostBuffer(), volume, -128, 127);
            break;
        }
        case nvinfer1::DataType::kFLOAT:
        {
            fillBuffer<float>(buffer.getHostBuffer(), volume, -1.0, 1.0);
            break;
        }
        case nvinfer1::DataType::kHALF:
        {
#if CUDA_VERSION < 10000
            fillBuffer<half_float::half>(buffer.getHostBuffer(), volume, static_cast<half_float::half>(-1.0),
                static_cast<half_float::half>(1.0));
#else
            fillBuffer<__half>(buffer.getHostBuffer(), volume, -1.0, 1.0);
#endif
            break;
        }
        }
    }

    void dump(std::ostream& os, Dims dims, Dims strides, int32_t vectorDim, int32_t spv,
        const std::string& separator = " ") const
    {
        switch (dataType)
        {
        case nvinfer1::DataType::kBOOL:
        {
            dumpBuffer<bool>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
            break;
        }
        case nvinfer1::DataType::kINT32:
        {
            dumpBuffer<int32_t>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
            break;
        }
        case nvinfer1::DataType::kINT8:
        {
            dumpBuffer<int8_t>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
            break;
        }
        case nvinfer1::DataType::kFLOAT:
        {
            dumpBuffer<float>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
            break;
        }
        case nvinfer1::DataType::kHALF:
        {
#if CUDA_VERSION < 10000
            dumpBuffer<half_float::half>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
#else
            dumpBuffer<__half>(buffer.getHostBuffer(), separator, os, dims, strides, vectorDim, spv);
#endif
            break;
        }
        }
    }
};
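
// Owns the buffers for all bindings of an engine, plus the contiguous array of
// device pointers that IExecutionContext::enqueue()/enqueueV2() expects.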
class Bindings
{
public:
    void addBinding(int b, const std::string& name, bool isInput, int64_t volume, nvinfer1::DataType dataType,
        const std::string& fileName = "")
    {
        while (mBindings.size() <= static_cast<size_t>(b))
        {
            mBindings.emplace_back();
            mDevicePointers.emplace_back();
        }
        mNames[name] = b;
        mBindings[b].isInput = isInput;
        // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
        // even for empty tensors, so allocate a dummy byte.
        if (volume == 0)
        {
            mBindings[b].buffer.allocate(1);
        }
        else
        {
            mBindings[b].buffer.allocate(static_cast<size_t>(volume) * static_cast<size_t>(dataTypeSize(dataType)));
        }
        mBindings[b].volume = volume;
        mBindings[b].dataType = dataType;
        mDevicePointers[b] = mBindings[b].buffer.getDeviceBuffer();
        if (isInput)
        {
            if (fileName.empty())
            {
                fill(b);
            }
            else
            {
                fill(b, fileName);
            }
        }
    }

    void** getDeviceBuffers()
    {
        return mDevicePointers.data();
    }

    void transferInputToDevice(TrtCudaStream& stream)
    {
        for (auto& b : mNames)
        {
            if (mBindings[b.second].isInput)
            {
                mBindings[b.second].buffer.hostToDevice(stream);
            }
        }
    }

    void transferOutputToHost(TrtCudaStream& stream)
    {
        for (auto& b : mNames)
        {
            if (!mBindings[b.second].isInput)
            {
                mBindings[b.second].buffer.deviceToHost(stream);
            }
        }
    }

    void fill(int binding, const std::string& fileName)
    {
        mBindings[binding].fill(fileName);
    }

    void fill(int binding)
    {
        mBindings[binding].fill();
    }

    void dumpBindingDimensions(int binding, const nvinfer1::IExecutionContext& context, std::ostream& os) const
    {
        const auto dims = context.getBindingDimensions(binding);
        // Do not add a newline terminator, because the caller may be outputting a JSON string.
        os << dims;
    }
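
    // For implicit-batch engines, getBindingDimensions() omits N, so the batch size
    // (and a matching outermost stride) is prepended below before dumping.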
    void dumpBindingValues(const nvinfer1::IExecutionContext& context, int binding, std::ostream& os,
        const std::string& separator = " ", int32_t batch = 1) const
    {
        Dims dims = context.getBindingDimensions(binding);
        Dims strides = context.getStrides(binding);
        int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding);
        const int32_t spv = context.getEngine().getBindingComponentsPerElement(binding);

        if (context.getEngine().hasImplicitBatchDimension())
        {
            auto insertN = [](Dims& d, int32_t bs) {
                const int32_t nbDims = d.nbDims;
                ASSERT(nbDims < Dims::MAX_DIMS);
                std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]);
                d.d[0] = bs;
                d.nbDims = nbDims + 1;
            };
            int32_t batchStride = 0;
            for (int32_t i = 0; i < strides.nbDims; ++i)
            {
                if (strides.d[i] * dims.d[i] > batchStride)
                {
                    batchStride = strides.d[i] * dims.d[i];
                }
            }
            insertN(dims, batch);
            insertN(strides, batchStride);
            vectorDim = (vectorDim == -1) ? -1 : vectorDim + 1;
        }

        mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator);
    }

    void dumpInputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const
    {
        auto isInput = [](const Binding& b) { return b.isInput; };
        dumpBindings(context, isInput, os);
    }

    void dumpOutputs(const nvinfer1::IExecutionContext& context, std::ostream& os) const
    {
        auto isOutput = [](const Binding& b) { return !b.isInput; };
        dumpBindings(context, isOutput, os);
    }

    void dumpBindings(const nvinfer1::IExecutionContext& context, std::ostream& os) const
    {
        auto all = [](const Binding& b) { return true; };
        dumpBindings(context, all, os);
    }

    void dumpBindings(
        const nvinfer1::IExecutionContext& context, bool (*predicate)(const Binding& b), std::ostream& os) const
    {
        for (const auto& n : mNames)
        {
            const auto binding = n.second;
            if (predicate(mBindings[binding]))
            {
                os << n.first << ": (";
                dumpBindingDimensions(binding, context, os);
                os << ")" << std::endl;
                dumpBindingValues(context, binding, os);
                os << std::endl;
            }
        }
    }

    std::unordered_map<std::string, int> getInputBindings() const
    {
        auto isInput = [](const Binding& b) { return b.isInput; };
        return getBindings(isInput);
    }

    std::unordered_map<std::string, int> getOutputBindings() const
    {
        auto isOutput = [](const Binding& b) { return !b.isInput; };
        return getBindings(isOutput);
    }

    std::unordered_map<std::string, int> getBindings() const
    {
        auto all = [](const Binding& b) { return true; };
        return getBindings(all);
    }

    std::unordered_map<std::string, int> getBindings(bool (*predicate)(const Binding& b)) const
    {
        std::unordered_map<std::string, int> bindings;
        for (const auto& n : mNames)
        {
            const auto binding = n.second;
            if (predicate(mBindings[binding]))
            {
                bindings.insert(n);
            }
        }
        return bindings;
    }

private:
    std::unordered_map<std::string, int> mNames;
    std::vector<Binding> mBindings;
    std::vector<void*> mDevicePointers;
};
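
// TensorRT objects (pre-TensorRT 8 ownership model) are released via destroy()
// rather than delete, so std::unique_ptr needs this custom deleter.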
template <typename T>
struct TrtDestroyer
{
    void operator()(T* t)
    {
        t->destroy();
    }
};

template <typename T>
using TrtUniquePtr = std::unique_ptr<T, TrtDestroyer<T>>;
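
// Usage sketch (assumes 'logger' implements nvinfer1::ILogger):
//   TrtUniquePtr<nvinfer1::IBuilder> builder{nvinfer1::createInferBuilder(logger)};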
inline bool broadcastIOFormats(const std::vector<IOFormat>& formats, size_t nbBindings, bool isInput = true)
{
    bool broadcast = formats.size() == 1;
    bool validFormatsCount = broadcast || (formats.size() == nbBindings);
    if (!formats.empty() && !validFormatsCount)
    {
        if (isInput)
        {
            throw std::invalid_argument(
                "The number of inputIOFormats must match the number of network inputs or be one for broadcasting.");
        }
        else
        {
            throw std::invalid_argument(
                "The number of outputIOFormats must match the number of network outputs or be one for broadcasting.");
        }
    }
    return broadcast;
}

inline std::vector<char> loadTimingCacheFile(const std::string& inFileName)
{
    std::ifstream iFile(inFileName, std::ios::in | std::ios::binary);
    if (!iFile)
    {
        sample::gLogWarning << "Could not read timing cache from: " << inFileName
                            << ". A new timing cache will be generated and written." << std::endl;
        return std::vector<char>();
    }
    iFile.seekg(0, std::ifstream::end);
    size_t fsize = iFile.tellg();
    iFile.seekg(0, std::ifstream::beg);
    std::vector<char> content(fsize);
    iFile.read(content.data(), fsize);
    iFile.close();
    sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl;
    return content;
}

inline void saveTimingCacheFile(const std::string& outFileName, const IHostMemory* blob)
{
    std::ofstream oFile(outFileName, std::ios::out | std::ios::binary);
    if (!oFile)
    {
        sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl;
        return;
    }
    oFile.write(static_cast<const char*>(blob->data()), blob->size());
    oFile.close();
    sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl;
}
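
// Usage sketch for the helpers above (TensorRT 8 timing-cache API, illustrative):
//   auto blob = loadTimingCacheFile(path);
//   auto* cache = config->createTimingCache(blob.data(), blob.size());
//   config->setTimingCache(*cache, false);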

} // namespace sample

#endif // TRT_SAMPLE_UTILS_H