3D Sparse Convolution: spconv Source Code Analysis (Part 3)

Building the Rulebook

Let's look at ops.get_indice_pairs, located in spconv/ops.py.

Building the Rulebook is completed through the ops.get_indice_pairs interface.

The concrete implementation of the get_indice_pairs function:

def get_indice_pairs(indices,      # (N, 4) voxel grid coordinate indices
                     batch_size,
                     spatial_shape,
                     ksize=3,
                     stride=1,
                     padding=0,
                     dilation=1,
                     out_padding=0,
                     subm=False,
                     transpose=False,
                     grid=None,
                     use_hash=False):
    ndim = indices.shape[1] - 1  # 4 -> 3
    if not isinstance(ksize, (list, tuple)):
        ksize = [ksize] * ndim  # 3 -> [3,3,3], a 3x3x3 kernel
    if not isinstance(stride, (list, tuple)):
        stride = [stride] * ndim  # 1 -> [1,1,1]
    if not isinstance(padding, (list, tuple)):
        padding = [padding] * ndim  # 0 -> [0,0,0]
    if not isinstance(dilation, (list, tuple)):
        dilation = [dilation] * ndim  # 1 -> [1,1,1]
    if not isinstance(out_padding, (list, tuple)):
        out_padding = [out_padding] * ndim  # 0 -> [0,0,0]
    for d, s in zip(dilation, stride):
        # stride and dilation must not both differ from 1
        assert any([s == 1, d == 1]), "don't support this."  # any is true if at least one is true
    if not subm:  # regular sparse convolution
        if transpose:  # False
            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
                                               padding, dilation, out_padding)
        else:
            # compute the output shape of a regular sparse convolution
            out_shape = get_conv_output_size(spatial_shape, ksize, stride,
                                             padding, dilation)
    else:  # submanifold sparse convolution
        out_shape = spatial_shape  # output shape equals input shape
    if grid is None:  # None
        # src/spconv/all.cc registers the underlying C++ API through PyTorch's OP Register mechanism.
        # The .so file is loaded via torch.ops.load_library, so torch.ops.spconv.get_indice_pairs
        # calls the getIndicePairs function in src/spconv/spconv_ops.cc.
        res = torch.ops.spconv.get_indice_pairs(indices, batch_size, out_shape,
                                                spatial_shape, ksize, stride,
                                                padding, dilation, out_padding,
                                                int(subm), int(transpose),
                                                int(use_hash))
        return res
    else:
        if ndim == 2:
            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_2d
        elif ndim == 3:
            get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_3d
        else:
            raise NotImplementedError
        return get_indice_pairs_func(indices, grid, batch_size, out_shape,
                                     spatial_shape, ksize, stride, padding,
                                     dilation, out_padding, int(subm),
                                     int(transpose), int(use_hash))

This function mainly performs parameter validation and preprocessing. For regular 3D sparse convolution, the output shape is computed from the input shape, kernel size, stride and the other parameters; for submanifold sparse convolution no computation is needed, because the output shape is identical to the input shape.
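For intuition, the output shape of a regular sparse convolution follows the standard convolution arithmetic applied per spatial dimension. A minimal Python sketch (the helper name below is hypothetical; the real get_conv_output_size in spconv/ops.py may differ in details):

```python
# A minimal sketch of how the output spatial shape can be derived; illustrative only.
def conv_output_size_sketch(spatial_shape, ksize, stride, padding, dilation):
    out_shape = []
    for i, k, s, p, d in zip(spatial_shape, ksize, stride, padding, dilation):
        # standard convolution arithmetic, applied independently to each spatial dimension
        out_shape.append((i + 2 * p - d * (k - 1) - 1) // s + 1)
    return out_shape

# e.g. [41, 1440, 1440] with a 3x3x3 kernel, stride 2, padding 1, dilation 1
print(conv_output_size_sketch([41, 1440, 1440], [3, 3, 3], [2, 2, 2], [1, 1, 1], [1, 1, 1]))
# -> [21, 720, 720]
```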

Once the parameters are prepared, we enter the core get_indice_pairs function. Because spconv registers its operators by loading a .so file via torch.ops.load_library, the function is called here as torch.ops.spconv.get_indice_pairs.

**Operator registration:** In src/spconv/all.cc, the underlying C++ API is registered through PyTorch's OP Register mechanism, so the C++ operators can be called through a Python interface.

static auto registry =
    torch::RegisterOperators().op("spconv::get_indice_pairs", &spconv::getIndicePairs)

Like the C++ extension approach, OP Register is another low-level mechanism PyTorch provides for registering custom operators. A registered operator can be called as torch.xxx or tensor.xxx, and this approach is likewise decoupled from the PyTorch source tree: adding or modifying an operator does not require recompiling PyTorch. Registering a new operator this way is straightforward: write the C++ implementation of the operator, then register it through PyTorch's low-level registration interface (torch::RegisterOperators).
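For reference, this is roughly how the registered operator is reached from Python; the .so path below is a placeholder and depends on how spconv was built and installed:

```python
import torch

# Hypothetical path to the compiled extension; the actual file name and location
# are determined by the spconv build.
torch.ops.load_library("libspconv.so")

# After loading, the operator registered under "spconv::get_indice_pairs"
# becomes callable as torch.ops.spconv.get_indice_pairs(...).
op = torch.ops.spconv.get_indice_pairs
print(op)  # a Python-callable wrapper around the C++ getIndicePairs function
```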

Building the Rulebook therefore goes through the Python interface get_indice_pairs, which calls the getIndicePairs function in src/spconv/spconv_ops.cc.

The code is located in src/spconv/spconv_ops.cc:

std::vector<torch::Tensor>
getIndicePairs(torch::Tensor indices,                // torch.Size([N, 4])
               int64_t batchSize,                    // 4
               std::vector<int64_t> outSpatialShape, // [41, 1440, 1440]
               std::vector<int64_t> spatialShape,    // [41, 1440, 1440]
               std::vector<int64_t> kernelSize,      // [3,3,3]
               std::vector<int64_t> stride,          // [1,1,1]
               std::vector<int64_t> padding,         // [1,1,1]
               std::vector<int64_t> dilation,        // [1,1,1]
               std::vector<int64_t> outPadding,      // [0,0,0]
               int64_t _subM,                        // 1 for SubMConv3d, 0 for SparseConv3d
               int64_t _transpose,                   // 0
               int64_t _useHash                      // 0
) {
  // auto timer = spconv::CudaContextTimer<>();
  bool subM = _subM != 0;           // 1 for SubMConv3d, 0 for SparseConv3d
  bool transpose = _transpose != 0; // False
  auto NDim = kernelSize.size();    // 3
  // CPU always use hash (tsl::robin_map).
  bool useHash = _useHash != 0 || indices.device().type() == torch::kCPU; // _useHash defaults to False
  auto numAct = indices.size(0);      // torch.Size([N,4]) -> N, the number of active input sites
  auto coorDim = indices.size(1) - 1; // torch.Size([N,4]) -> 3
  TV_ASSERT_RT_ERR(NDim == coorDim, "error");                   // 3 == 3
  TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");      // 3 == 3
  TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error"); // 3 == 3
  TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");          // 3 == 3
  TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");         // 3 == 3
  TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");      // 3 == 3
  TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");        // 3 == 3
  // [3,3,3] -> 3*3*3 = 27
  auto kernelVolume = kernelSize[0]; // 3
  for (int i = 1; i < kernelSize.size(); ++i) {
    kernelVolume *= kernelSize[i];
  }
  TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
  // [41, 1440, 1440] -> 41*1440*1440
  auto outputVolume = outSpatialShape[0];
  for (int i = 1; i < outSpatialShape.size(); ++i) {
    outputVolume *= outSpatialShape[i];
  }
  std::string msg = "due to limits of cuda hash, the volume of dense space include batch size ";
  msg += "must less than std::numeric_limits<int>::max() = 2e9";
  TV_ASSERT_RT_ERR(batchSize * outputVolume < std::numeric_limits<int>::max(), msg);
  // indicePairs: torch.Size([2,27,N]), filled with -1.
  // 2 stands for the input and output directions; kernelVolume is the volume of the kernel,
  // e.g. 27 (3*3*3) for a 3x3x3 kernel; numAct is the number of active input features.
  torch::Tensor indicePairs =
      torch::full({2, kernelVolume, numAct}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  // indiceNum: torch.Size([27]). Stores how many computations happen at each kernel position,
  // because in sparse convolution different kernel elements may touch different numbers of active sites.
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
  // 4*41*1440*1440
  auto gridSize = batchSize * outputVolume;
  if (useHash) {          // False by default (GPU path)
    gridSize = batchSize; // only when useHash is true, or when running on CPU
  }
  // torch.Size([4*41*1440*1440])
  torch::Tensor gridOut = torch::full(
      {gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
  // torch.Size([4, 41*1440*1440])
  gridOut = gridOut.view({batchSize, -1});
  int64_t numActOut = -1;
  // Submanifold sparse convolution keeps input and output shapes identical,
  // so force stride = 1 and derive padding from the kernel size.
  for (int i = 0; i < NDim; ++i) {
    if (subM) {
      padding[i] = kernelSize[i] / 2;
      stride[i] = 1;
    }
  }
  // tv::ssprint("prepare", timer.report() / 1000.0);
  if (subM) { // submanifold sparse convolution
    if (indices.device().type() == torch::kCPU) { // False
      ...... // cpu
    }
#ifdef TV_CUDA
    else if (indices.device().type() == torch::kCUDA) { // True
      numActOut = create_submconv_indice_pair_cuda(
          indices, gridOut, indicePairs, indiceNum, kernelSize, stride, padding,
          dilation, outSpatialShape, transpose, false, useHash);
      if (numActOut == -1) {
        auto device = indices.device();
        indicePairs = indicePairs.to({torch::kCPU});
        indiceNum = indiceNum.to({torch::kCPU});
        indices = indices.to({torch::kCPU});
        numActOut = create_submconv_indice_pair_cpu(
            indices, gridOut, indicePairs, indiceNum, kernelSize, stride,
            padding, dilation, outSpatialShape, transpose, false, useHash);
        return {indices.to(device), indicePairs.to(device),
                indiceNum.to(device)};
      }
    }
#endif
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
    // tv::ssprint("subm", timer.report() / 1000.0);
    return {indices, indicePairs, indiceNum};
  } else { // regular sparse convolution: initialize indicePairUnique and outInds
    auto indicePairUnique = torch::full(
        {indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
        torch::dtype(torch::kInt32).device(indices.device()));
    torch::Tensor outInds =
        torch::zeros({numAct * kernelVolume, coorDim + 1},
                     torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) { // CPU
      ...... // cpu
    }
#ifdef TV_CUDA
    else if (indices.device().type() == torch::kCUDA) { // GPU
      numActOut = create_conv_indice_pair_p1_cuda(
          indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
          padding, dilation, outSpatialShape, transpose);
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
        numActOut = create_conv_indice_pair_p2_cuda(
            indices, outInds, gridOut, indicePairs, indiceNum, indicePairUnique,
            outSpatialShape, transpose, false, useHash);
        if (numActOut == -1) {
          auto device = indices.device();
          outInds = outInds.to({torch::kCPU});
          indicePairs = indicePairs.to({torch::kCPU});
          indiceNum = indiceNum.to({torch::kCPU});
          indices = indices.to({torch::kCPU});
          numActOut = create_conv_indice_pair_cpu(
              indices, outInds, gridOut, indicePairs, indiceNum, kernelSize,
              stride, padding, dilation, outSpatialShape, transpose, false,
              useHash);
          return {outInds.to(device).slice(0, 0, numActOut),
                  indicePairs.to(device), indiceNum.to(device)};
        }
      }
    }
#endif
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
    return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
  }
}

When analyzing getIndicePairs we focus directly on the GPU path, and discuss submanifold 3D sparse convolution and regular 3D sparse convolution separately, starting with the submanifold case.

The three most important variables in the code are indicePairs, indiceNum and gridOut. They are created as follows:

  // torch.Size([2,27,N]), filled with -1.
  // 2 stands for the input and output directions; kernelVolume is the volume of the kernel,
  // e.g. 27 (3*3*3) for a 3x3x3 kernel; numAct is the number of active input features.
  torch::Tensor indicePairs =
      torch::full({2, kernelVolume, numAct}, -1,
                  torch::dtype(torch::kInt32).device(indices.device()));
  // indiceNum: torch.Size([27]). Stores how many computations happen at each kernel position,
  // because in sparse convolution different kernel elements may touch different numbers of active sites.
  torch::Tensor indiceNum = torch::zeros(
      {kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
  // 4*41*1440*1440
  auto gridSize = batchSize * outputVolume;

indicePairs encodes the input-output mapping rules of the sparse convolution, i.e. the Input Hash Table and the Output Hash Table. The theoretical maximum amount of memory is allocated here: its shape is {2, kernelVolume, numAct}, where 2 stands for the input and output directions, kernelVolume is the volume of the kernel (for a 3x3x3 kernel the volume is 27 = 3*3*3), and numAct is the number of active input features. indiceNum stores the total number of computations at each kernel position; it corresponds to the count column in the figure.

(Figure: rulebook illustration, pic/rulebook.png)
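To make the layout concrete, here is a small NumPy sketch of what these buffers hold, using a hypothetical numAct = 5 and a 3x3x3 kernel; the values are illustrative only:

```python
import numpy as np

kernel_volume = 3 * 3 * 3   # 27 for a 3x3x3 kernel
num_act = 5                 # hypothetical number of active input sites

# indice_pairs[0, k, j]: index of the j-th input site paired at kernel offset k
# indice_pairs[1, k, j]: index of the corresponding output site
# -1 marks unused slots, since most kernel offsets pair far fewer than num_act sites.
indice_pairs = np.full((2, kernel_volume, num_act), -1, dtype=np.int32)

# indice_num[k]: how many valid (input, output) pairs were recorded for kernel offset k,
# i.e. how many entries of indice_pairs[:, k, :] are actually used ("count" in the figure).
indice_num = np.zeros(kernel_volume, dtype=np.int32)

# Example: kernel offset 13 (the center) pairs input site 2 with output site 2.
indice_pairs[0, 13, 0] = 2
indice_pairs[1, 13, 0] = 2
indice_num[13] += 1
```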

On the GPU, building the rulebook is done by calling the create_submconv_indice_pair_cuda function; let's analyze it in detail below.

Submanifold Sparse Convolution

Submanifold sparse convolution builds the rulebook by calling the create_submconv_indice_pair_cuda function:

int create_submconv_indice_pair_cuda(
    torch::Tensor indicesIn,              // torch.Size([N, 4]) voxel space indices
    torch::Tensor gridsOut,               // torch.Size([4, 41*1440*1440])
    torch::Tensor indicePairs,            // torch.Size([2,27,N]), filled with -1, stores the rulebook
    torch::Tensor indiceNum,              // torch.Size([27]), counts computations per kernel position
    std::vector<int64_t> kernelSize,      // [3,3,3]
    std::vector<int64_t> stride,          // [1,1,1]
    std::vector<int64_t> padding,         // [1,1,1]
    std::vector<int64_t> dilation,        // [1,1,1]
    std::vector<int64_t> outSpatialShape, // [41, 1440, 1440]
    bool transpose,                       // False
    bool resetGrid,                       // false
    bool useHash                          // False
) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto ndim = outSpatialShape.size();    // 3
  auto numActIn = indicesIn.size(0);     // N, the number of active input features
  int batchSize = gridsOut.size(0);      // 4
  auto kernelVolume = indiceNum.size(0); // 3x3x3 => 27
  if (numActIn == 0)
    return 0;
  bool failed = false;
  tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
    using Index = TV_DECLTYPE(IndexValue); // type deduction
    using IndexGrid = int32_t;
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
      constexpr int NDim = TV_DECLTYPE(I)::value; // 3
      // copy the parameters into tv::SimpleVector variables
      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(), outSpatialShape.end());
      Index spatialVolume = 1; // output volume
      for (int i = 0; i < NDim; ++i) {
        spatialVolume *= outSpatialShape[i]; // 41*1440*1440
      }
      if (useHash) {
        // ...omitted...
      } else {
        // auto timer = spconv::CudaContextTimer<>();
        // block_size is tv::cuda::CUDA_NUM_THREADS = 1024; grid_size is computed by
        // tv::cuda::getBlocks(numActIn), where numActIn is the number of active input sites.
        // prepareSubMGridKernel essentially builds a hash table from output tensor coordinates
        // (encoded as a flat voxel index) to output sequence numbers.
        prepareSubMGridKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                tv::torch2tv<const Index>(indicesIn),
                tv::torch2tv<IndexGrid>(gridsOut), ou, spatialVolume);
        // tv::ssprint("prepareSubMGridKernel", timer.report() / 1000.0);
        TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
        // when dilation all one, we use a simple kernel to calc result
        bool dilation_one = true;
        for (int i = 0; i < NDim; ++i) {
          dilation_one &= di[i] == 1; // True
        }
        auto found = false;
        if (dilation_one && (NDim == 2 || NDim == 3)) {
          auto indiceNumCpu = indiceNum.cpu();
          if (NDim == 2) {
            // ...omitted...
          } else if (NDim == 3) {
            tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
                                           outSpatialShape.end()); // output shape
            tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
              tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
                tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[2], [&](auto K2C) {
                  // kernel sizes along each dimension
                  constexpr int K0 = TV_DECLTYPE(K0C)::value;
                  constexpr int K1 = TV_DECLTYPE(K1C)::value;
                  constexpr int K2 = TV_DECLTYPE(K2C)::value;
                  found = true;
                  // block_size is tv::cuda::CUDA_NUM_THREADS = 1024;
                  // grid_size is computed by tv::cuda::getBlocks(numActIn);
                  // spatialVolume is the product of the output spatial dimensions.
                  getSubMIndicePairsKernel3<Index, IndexGrid, K0, K1, K2>
                      <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                          tv::torch2tv<const Index>(indicesIn),
                          tv::torch2tv<IndexGrid>(gridsOut),
                          tv::torch2tv<Index>(indicePairs),
                          tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
                });
              });
            });
          }
        }
        if (!found) {
          // ...omitted...
        }
        // tv::ssprint("getSubMIndicePairsKernel", timer.report() / 1000.0);
      }
      if (resetGrid && (!useHash)) {
        resetGridSubMKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                indicesIn.data_ptr<Index>(), tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn);
        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
      }
    });
  });
  if (failed) {
    return -1;
  }
  return numActIn;
}

Inside create_submconv_indice_pair_cuda there is no need to dig into how the following dynamic dispatch machinery works:

tv::dispatch_torch(indicesIn.scalar_type(), [&](auto IndexValue) {
  ....
  tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
    ....
  });
});

Let's focus directly on the kernel functions:

        prepareSubMGridKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                tv::torch2tv<const Index>(indicesIn),
                tv::torch2tv<IndexGrid>(gridsOut), ou, spatialVolume);

In the prepareSubMGridKernel launch, grid_size and block_size are both plain integers. block_size is tv::cuda::CUDA_NUM_THREADS, defined in include/tensorview/cuda_utils.h with a value of 1024, while grid_size is computed by tv::cuda::getBlocks(numActIn), where numActIn is the number of active input sites.

template <typename T1, typename T2> inline int DivUp(const T1 a, const T2 b) {
  return (a + b - 1) / b;
}

// Use 1024 threads per block, which requires cuda sm_2x or above
constexpr int CUDA_NUM_THREADS = 1024;

inline int getNumThreads(const int N) {
  if (N > CUDA_NUM_THREADS) {
    return CUDA_NUM_THREADS;
  }
  return DivUp(N, 32) * 32;
}

// CUDA: number of blocks for threads.
inline int getBlocks(const int N) {
  TV_ASSERT_RT_ERR(N > 0,
                   "CUDA kernel launch blocks must be positive, but got N=", N);
  return DivUp(N, getNumThreads(N));
}
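As a quick sanity check, here is a Python transcription of these helpers; the value numActIn = 160000 is just an example size:

```python
def div_up(a, b):
    # integer ceiling division, same as DivUp above
    return (a + b - 1) // b

CUDA_NUM_THREADS = 1024

def get_num_threads(n):
    # at most 1024 threads per block, otherwise rounded up to a multiple of 32 (warp size)
    return CUDA_NUM_THREADS if n > CUDA_NUM_THREADS else div_up(n, 32) * 32

def get_blocks(n):
    assert n > 0
    return div_up(n, get_num_threads(n))

num_act_in = 160000                 # example number of active input sites
print(get_num_threads(num_act_in))  # 1024 -> block_size
print(get_blocks(num_act_in))       # 157  -> grid_size
```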

What prepareSubMGridKernel does: build a hash table mapping output tensor coordinates (encoded as a flat index) to output sequence numbers.

See include/spconv/indice.cu.h:

// Build a hash table from output tensor coordinates (encoded as a flat index) to output sequence numbers.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
    tv::TensorView<const Index> indicesIn,              // input voxel coordinates (N, 4)
    tv::TensorView<IndexGrid> gridsOut,                 // output grid (4, 41*1440*1440)
    const tv::SimpleVector<Index, NDim> outSpatialShape, // output spatial shape [41, 1440, 1440]
    Index spatialVolume                                  // product of the output dimensions, 41*1440*1440
) {
  auto numActIn = indicesIn.dim(0); // torch.Size([N,4]) => N
  Index index = 0;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // NDim is 3; index is the flattened output coordinate, ix is the output sequence number.
    // Flatten to 1D; the template-plus-recursion style just looks complicated.
    // (batch_id, z, y, x) --> index: index = ((z * shape[1] + y) * shape[2] + x) + batch_id * spatialVolume
    index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
                indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) +
            spatialVolume * indicesIn(ix, 0); // indicesIn(ix, 0) is the batch index
    gridsOut[index] = ix;
  }
}
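In plain terms, the kernel fills a dense lookup table so that each occupied output location can later be mapped back to its row number. A rough sequential NumPy equivalent, with toy sizes and hypothetical coordinates:

```python
import numpy as np

batch_size, D, H, W = 1, 4, 4, 4          # toy output spatial shape
spatial_volume = D * H * W

# coords: (N, 4) rows of (batch_id, z, y, x), like indicesIn
coords = np.array([[0, 1, 2, 3],
                   [0, 0, 0, 1],
                   [0, 3, 1, 2]], dtype=np.int32)

grids_out = np.full(batch_size * spatial_volume, -1, dtype=np.int32)
for ix, (b, z, y, x) in enumerate(coords):
    # row-major flattening of (z, y, x) plus a per-batch offset,
    # matching ArrayIndexRowMajor + spatialVolume * batch_id
    index = (z * H + y) * W + x + b * spatial_volume
    grids_out[index] = ix    # maps "occupied output location" -> row number ix

print(grids_out[(1 * H + 2) * W + 3])  # -> 0, the row of coordinate (0, 1, 2, 3)
```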

The index computation here is simply written in a template-plus-recursion style, which makes it look more complicated than it is. Let new_indicesIn = indicesIn.data() and new_outSpatialShape = outSpatialShape.data(); the index can then be expanded as:

index = (new_indicesIn[1 + 4*ix] * new_outSpatialShape[1]      // z
         + new_indicesIn[2 + 4*ix]) * new_outSpatialShape[2]   // + y, then * W
         + new_indicesIn[3 + 4*ix]                             // + x
         + spatialVolume * indicesIn(ix, 0);                   // + batch offset

ArrayIndexRowMajor lives in include/tensorview/tensorview.h; its recursive implementation is as follows:

template <int N, int Ndim> struct ArrayIndexRowMajor {
  // this array index provide almost same compiled code. compile it in
  // https://godbolt.org/ for more details.
  template <typename TShape, typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned
  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
    return ArrayIndexRowMajor<N - 1, Ndim>::runPtrs(
        indexes, shape, (indexes[Ndim - N] + start) * shape[Ndim - N + 1]);
  }
};

template <int Ndim> struct ArrayIndexRowMajor<1, Ndim> {
  template <typename TShape, typename Tinit>
  TV_HOST_DEVICE_INLINE static unsigned
  runPtrs(const TShape *indexes, const TShape *shape, Tinit start) {
    return start + indexes[Ndim - 1];
  }
};
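A small Python transcription of this recursion (illustrative only) makes the expansion above easy to verify:

```python
def array_index_row_major(indexes, shape, start=0):
    # Python equivalent of ArrayIndexRowMajor<N, Ndim>::runPtrs for N == Ndim == len(shape):
    # repeatedly fold in one coordinate, multiplying by the next dimension's size.
    for dim in range(len(shape) - 1):
        start = (indexes[dim] + start) * shape[dim + 1]
    return start + indexes[-1]

shape = [41, 1440, 1440]          # outSpatialShape (D, H, W)
z, y, x = 5, 100, 200             # hypothetical voxel coordinate
flat = array_index_row_major([z, y, x], shape)
assert flat == (z * shape[1] + y) * shape[2] + x   # matches the expanded formula
print(flat)
```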

Next, look at the kernel getSubMIndicePairsKernel3:

                  getSubMIndicePairsKernel3<Index, IndexGrid, K0, K1, K2>
                      <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
                          tv::torch2tv<const Index>(indicesIn),
                          tv::torch2tv<IndexGrid>(gridsOut),
                          tv::torch2tv<Index>(indicePairs),
                          tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);

It is located in include/spconv/indice.cu.h:

template <typename Index, typename IndexGrid, unsigned K0, unsigned K1, unsigned K2>
__global__ void getSubMIndicePairsKernel3(
    tv::TensorView<const Index> indicesIn,        // input coordinates, e.g. [160000, 4]
    tv::TensorView<IndexGrid> gridsOut,           // flattened output grid built by prepareSubMGridKernel
    tv::TensorView<Index> indicePairs,            // input/output mapping rules (rulebook), shape [2, 27, 160000]
    tv::TensorView<Index> indiceNum,              // computation count per kernel position, shape [27]
    const tv::SimpleVector<Index, 3> outSpatialShape, // output shape
    Index spatialVolume                               // product of the output spatial dimensions
) {
  auto numActIn = indicesIn.dim(0); // number of active input sites
  Index point[3];
  Index index = 0;
  Index offset;
  constexpr unsigned KV = K0 * K1 * K2; // kernel volume, 3*3*3 = 27
  // kernel volume and its center position
  constexpr unsigned center = KV / 2; // 27/2 = 13
  // For submanifold sparse convolution the kernel center always interacts with
  // every active input element, so set the center slot of indiceNum to numActIn.
  *(indiceNum.data() + center) = numActIn;
  for (int ix : tv::KernelLoopX<int>(numActIn)) { // similar to a C++11 range-based for loop
    // the input coordinate row for index ix
    const Index *indice_data = indicesIn.data() + ix * (3 + 1);
    // The 3 nested loops walk the kernel's D, H and W dimensions of sizes K0, K1 and K2.
    // #pragma unroll tells the compiler to unroll the loop at compile time.
#pragma unroll
    for (int i = 0; i < K0; ++i) {
#pragma unroll
      for (int j = 0; j < K1; ++j) {
#pragma unroll
        for (int k = 0; k < K2; ++k) {
          // Offset inside the kernel. For a 3x3x3 kernel (K0=K1=K2=3) offset runs from 0 to 26,
          // but the loop skips when offset > center (13), so only offsets up to 13 are processed.
          offset = i * K1 * K2 + j * K2 + k;
          if (offset > center) {
            continue;
          }
          // The kernel center interacts with every input element; when offset == center
          // the input index and the output index are both ix.
          if (center == offset) {
            // center of subm indice pairs dont need atomicadd
            indicePairs(1, offset, ix) = ix;
            indicePairs(0, offset, ix) = ix;
          } else {
            point[2] = indice_data[3] - k + K2 / 2;
            point[1] = indice_data[2] - j + K1 / 2;
            point[0] = indice_data[1] - i + K0 / 2;
            if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
                point[2] >= 0 && point[2] < outSpatialShape[2] &&
                point[0] >= 0 && point[0] < outSpatialShape[0]) {
              index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(point, outSpatialShape.data(), 0) +
                      spatialVolume * indice_data[0];
              if (gridsOut[index] != -1) {
                // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
                Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
                atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
                // The core of rulebook construction. Offset only needs to go up to center
                // because of the symmetry of submanifold sparse convolution, which boils
                // down to the following 4 lines:
                indicePairs(1, offset, oldNum) = gridsOut[index];          // output index
                indicePairs(0, offset, oldNum) = ix;                       // input index
                indicePairs(1, KV - offset - 1, oldNum) = ix;              // output index (mirrored offset)
                indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index]; // input index (mirrored offset)
              }
            }
          }
        }
      }
    }
  }
}

Look at:

for (int ix : tv::KernelLoopX<int>(numActIn)) {
  ......
}

This pattern is just the familiar grid-stride loop; see include/tensorview/kernel_utils.h for details:

template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
  return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
                               gridDim.x * blockDim.x * NumILP, count);
}

With NumILP at its default value of 1, the stride is gridDim.x * blockDim.x, and the loop index stays below the total number of launched threads blockDim.x * gridDim.x. Functionally it is equivalent to the common pattern:

{
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = idx; i < num; i += stride) {
    // ... computation ...
  }
}
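Putting the pieces together, here is a rough sequential NumPy sketch of the logic implemented by prepareSubMGridKernel and getSubMIndicePairsKernel3, using a toy single-batch grid and hypothetical coordinates; it illustrates the data flow, not the parallel CUDA implementation:

```python
import numpy as np

D, H, W = 4, 4, 4                      # toy output spatial shape (single batch)
K0 = K1 = K2 = 3                       # 3x3x3 kernel
KV = K0 * K1 * K2                      # 27
center = KV // 2                       # 13
spatial_volume = D * H * W

# hypothetical active sites, rows of (batch_id, z, y, x)
coords = np.array([[0, 1, 1, 1],
                   [0, 1, 1, 2],
                   [0, 2, 1, 1]], dtype=np.int32)
num_act = len(coords)

# step 1: prepareSubMGridKernel -- dense grid from flattened coordinate to row number
grid = np.full(spatial_volume, -1, dtype=np.int32)
for ix, (b, z, y, x) in enumerate(coords):
    grid[(z * H + y) * W + x + b * spatial_volume] = ix

# step 2: getSubMIndicePairsKernel3 -- fill the rulebook
indice_pairs = np.full((2, KV, num_act), -1, dtype=np.int32)
indice_num = np.zeros(KV, dtype=np.int32)
indice_num[center] = num_act           # the kernel center pairs every active site with itself

for ix, (b, z, y, x) in enumerate(coords):
    for i in range(K0):
        for j in range(K1):
            for k in range(K2):
                offset = i * K1 * K2 + j * K2 + k
                if offset > center:
                    continue           # symmetry: the upper half is filled via KV - offset - 1
                if offset == center:
                    indice_pairs[0, offset, ix] = ix
                    indice_pairs[1, offset, ix] = ix
                    continue
                pz, py, px = z - i + K0 // 2, y - j + K1 // 2, x - k + K2 // 2
                if 0 <= pz < D and 0 <= py < H and 0 <= px < W:
                    hit = grid[(pz * H + py) * W + px + b * spatial_volume]
                    if hit != -1:
                        n = indice_num[offset]
                        indice_num[offset] += 1
                        indice_num[KV - offset - 1] += 1
                        indice_pairs[0, offset, n] = ix       # input row
                        indice_pairs[1, offset, n] = hit      # output row
                        indice_pairs[0, KV - offset - 1, n] = hit
                        indice_pairs[1, KV - offset - 1, n] = ix

print(indice_num.reshape(K0, K1, K2))  # per-kernel-position pair counts ("count")
```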

Reference: https://blog.csdn.net/ChuiGeDaQiQiu/article/details/127680713
