How to Implement a Permute/Transpose Operator 6x Faster than PyTorch?

1 Naive Implementation
The permute operation rearranges a tensor's dimensions in the order given by dims, for example:

import oneflow as flow

x = flow.randn(2, 3)
y = x.permute(1, 0)
y.shape
(3, 2)

The semantics of permute are equivalent to the following loops:

for row in range(x.shape[0]):
    for col in range(x.shape[1]):
        y[col][row] = x[row][col]

The general flow of a Permute kernel is:
1. Compute the corresponding high-dimensional index from the one-dimensional offset of the current output element.
2. Rearrange the output index according to the dims parameter to obtain the input index.
3. Convert the input index into an input offset.
4. Finally, perform the data movement, as the sketch below illustrates.
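To make the four steps concrete, here is a small host-side sketch (illustrative only, not OneFlow's actual code) that prints the element mapping for the x.permute(1, 0) example above:

#include <cstdio>

int main() {
  // x has shape (2, 3); y = x.permute(1, 0) has shape (3, 2).
  const int src_dims[2] = {2, 3};
  const int dst_dims[2] = {3, 2};
  const int permutation[2] = {1, 0};  // output dim i is taken from input dim permutation[i]
  const int count = 6;                // total number of elements
  for (int offset = 0; offset < count; ++offset) {
    // 1. output offset -> output high-dimensional index
    int dst_index[2] = {offset / dst_dims[1], offset % dst_dims[1]};
    // 2. rearrange to obtain the input index
    int src_index[2];
    for (int d = 0; d < 2; ++d) { src_index[permutation[d]] = dst_index[d]; }
    // 3. input index -> input offset
    int src_offset = src_index[0] * src_dims[1] + src_index[1];
    // 4. the data movement would be dst[offset] = src[src_offset]
    printf("dst[%d] = src[%d]\n", offset, src_offset);
  }
  return 0;
}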


2 NdIndexOffsetHelper
The NdIndexToOffset method converts a high-dimensional index into a one-dimensional offset.
The OffsetToNdIndex method converts a one-dimensional offset into a high-dimensional index.
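As a rough sketch of what this helper does (the real NdIndexOffsetHelper in OneFlow is a template with device-side specializations; the names below are illustrative), both methods are plain row-major stride arithmetic:

template<typename IndexType, int kMaxDims>
struct NdIndexOffsetHelperSketch {
  IndexType stride[kMaxDims];
  int num_dims;

  // Precompute row-major strides, e.g. dims (3, 4, 5) -> strides (20, 5, 1).
  NdIndexOffsetHelperSketch(const IndexType* dims, int n) : num_dims(n) {
    stride[n - 1] = 1;
    for (int i = n - 2; i >= 0; --i) { stride[i] = stride[i + 1] * dims[i + 1]; }
  }

  IndexType NdIndexToOffset(const IndexType* index) const {
    IndexType offset = 0;
    for (int i = 0; i < num_dims; ++i) { offset += index[i] * stride[i]; }
    return offset;
  }

  void OffsetToNdIndex(IndexType offset, IndexType* index) const {
    for (int i = 0; i < num_dims; ++i) {
      index[i] = offset / stride[i];
      offset -= index[i] * stride[i];
    }
  }
};

Built on this helper, the Permute kernel is implemented as follows: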
template<size_t num_dims, size_t movement_size, typename IndexType>
__global__ void PermuteKernel(PermuteKernelParams<num_dims, IndexType> params) {
  using T = typename std::aligned_storage<movement_size, movement_size>::type;
  const T* src = reinterpret_cast<const T*>(params.src);
  T* dst = reinterpret_cast<T*>(params.dst);
  IndexType src_index[num_dims];
  IndexType dst_index[num_dims];
  CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) {
    params.dst_index_helper.OffsetToNdIndex(i, dst_index);
#pragma unroll
    for (size_t dim = 0; dim < num_dims; ++dim) {
      src_index[params.permutation[dim]] = dst_index[dim];
    }
    IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index);
    dst[i] = src[src_offset];
  }
}
PermuteKernelParams is a struct holding the initialized NdIndexOffsetHelper (one each for src and dst), the total element count count, and the permuted dimension order permutation.
First we obtain dst_index, the high-dimensional index of the output element currently being processed, and scatter it into the permuted input index src_index.
Then the input index is converted into the one-dimensional offset src_offset, and the input element is fetched and assigned to the corresponding output.
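Based on that description, the parameter struct presumably looks roughly like this (a sketch; the exact field types and layout in OneFlow may differ):

template<size_t num_dims, typename IndexType>
struct PermuteKernelParams {
  NdIndexOffsetHelper<IndexType, num_dims> src_index_helper;  // built from the input dims
  NdIndexOffsetHelper<IndexType, num_dims> dst_index_helper;  // built from the permuted dims
  int permutation[num_dims];  // output dim i reads from input dim permutation[i]
  IndexType count;            // total number of elements
  const void* src;
  void* dst;
};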
3 Optimizations for the General Case
1. Dimensions of size 1 can be removed directly.
2. Dimensions that stay contiguous after the permutation can be merged into a single dimension; see the example below.
# (0, 1, 2, 3) -> (2, 3, 0, 1)
x = flow.randn(3, 4, 5, 6)
y = x.permute(2, 3, 0, 1)
y.shape
(5, 6, 3, 4)

# Equivalently: (0, 1, 2, 3) -> ((2, 3), (0, 1))
x2 = x.reshape(x.shape[0] * x.shape[1], x.shape[2] * x.shape[3])
y2 = x2.permute(1, 0)
y2 = y2.reshape(x.shape[2], x.shape[3], x.shape[0], x.shape[1])
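The merging rule behind this example can be sketched as follows (illustrative only; OneFlow's actual simplification logic also removes size-1 dimensions and handles more cases). Runs of dimensions that are consecutive in the source and remain consecutive, in order, in the permuted output are coalesced into one dimension:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// E.g. dims = (3, 4, 5, 6), perm = (2, 3, 0, 1)  ->  out_dims = (12, 30), out_perm = (1, 0).
void SimplifyPermutation(const std::vector<int64_t>& dims, const std::vector<int>& perm,
                         std::vector<int64_t>* out_dims, std::vector<int>* out_perm) {
  const int n = static_cast<int>(dims.size());
  std::vector<int64_t> run_size;  // merged size of each run, in output order
  std::vector<int> run_start;     // first source dim of each run
  for (int i = 0; i < n; ++i) {
    if (i > 0 && perm[i] == perm[i - 1] + 1) {
      run_size.back() *= dims[perm[i]];   // extends the current run
    } else {
      run_size.push_back(dims[perm[i]]);  // starts a new run
      run_start.push_back(perm[i]);
    }
  }
  // Order the runs by their position in the source to build the simplified shape,
  // then record where each output run sits in that source order.
  const int m = static_cast<int>(run_size.size());
  std::vector<int> order(m);
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(), [&](int a, int b) { return run_start[a] < run_start[b]; });
  out_dims->resize(m);
  out_perm->resize(m);
  for (int k = 0; k < m; ++k) {
    (*out_dims)[k] = run_size[order[k]];
    (*out_perm)[order[k]] = k;  // output run order[k] is the k-th source dimension
  }
}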
3. Use a larger access granularity.
CUDA supports access granularities of 1B, 2B, 4B, 8B, and 16B; the larger the granularity, the better the performance. The granularity can be raised when:
The last dimension is moved as a whole, i.e. the permutation keeps it in place (permutation[n-1] == n-1), and its byte size is a multiple of the new access granularity.
The data pointers satisfy the alignment requirement of the new access granularity.
A sketch of how the movement size might be chosen follows.
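Here GetMovementSize is a hypothetical helper for illustration, not OneFlow's actual API:

#include <cstddef>
#include <cstdint>

// Pick the largest movement size in bytes such that the byte size of the last
// dimension is a multiple of it and both pointers are aligned to it. The caller
// has already checked that the permutation keeps the last dimension in place.
size_t GetMovementSize(size_t elem_size, int64_t last_dim_elems, const void* src, void* dst) {
  const uintptr_t src_addr = reinterpret_cast<uintptr_t>(src);
  const uintptr_t dst_addr = reinterpret_cast<uintptr_t>(dst);
  const uint64_t last_dim_bytes = elem_size * static_cast<uint64_t>(last_dim_elems);
  for (size_t size = 16; size > elem_size; size /= 2) {  // try 16B, 8B, 4B, 2B
    if (last_dim_bytes % size == 0 && src_addr % size == 0 && dst_addr % size == 0) { return size; }
  }
  return elem_size;  // fall back to the element's own size
}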
Taking the permutation (0, 1, 2, 3) -> (0, 2, 1, 3) as a benchmark case, the optimized Permute here even achieves slightly higher bandwidth than a native copy. This is because the copy kernel does no unrolled instruction-level-parallelism optimization while the Permute kernel does, so the comparison is for reference only.
4 Optimizing Special Cases: BatchTranspose
After the simplifications above, a permutation that merely swaps the last two dimensions is a (possibly batched) matrix transpose:
(0, 1) -> (1, 0)
(0, 1, 2) -> (0, 2, 1)
Such cases can be handled with a tiled shared-memory transpose kernel (kBlockRows below is the blockDim.y of the launch, 8 in OneFlow's configuration):
template<size_t num_dims, size_t movement_size, size_t tile_size, typename IndexType>
__global__ void BatchTransposeKernel(const void* src_ptr, void* dst_ptr, IndexType H, IndexType W,
                                     IndexType num_tile_rows, IndexType num_tile_cols,
                                     int32_t block_nums) {
  using T = typename std::aligned_storage<movement_size, movement_size>::type;
  __shared__ T tile[tile_size][tile_size + 1];  // To avoid bank conflict.
  const T* src = reinterpret_cast<const T*>(src_ptr);
  T* dst = reinterpret_cast<T*>(dst_ptr);
  IndexType batch_num_tile = num_tile_rows * num_tile_cols;
  for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) {
    const IndexType batch_index = i / batch_num_tile;  // the index of batch.
    const IndexType flatten_index =
        i - batch_index * batch_num_tile;  // the flatten index of tile in a batch.
    const IndexType row_index = flatten_index / num_tile_cols;  // the row index of tile in a batch.
    const IndexType col_index =
        flatten_index
        - row_index * num_tile_cols;  // equal to flatten_index % num_tile_cols. the col index of tile in a batch.
    const IndexType offset = batch_index * H * W;
    IndexType x = col_index * tile_size + threadIdx.x;
    IndexType y = row_index * tile_size + threadIdx.y;
    if (x < W) {
      IndexType y_range =
          ((tile_size - threadIdx.y) < (H - y)) ? (tile_size - threadIdx.y) : (H - y);
#pragma unroll
      // `i < y_range` equals to: `threadIdx.y + i < tile_size && y + i < H`.
      for (int i = 0; i < y_range; i += kBlockRows) {
        tile[threadIdx.y + i][threadIdx.x] = src[offset + (y + i) * W + x];
      }
    }
    __syncthreads();
    x = row_index * tile_size + threadIdx.x;
    y = col_index * tile_size + threadIdx.y;
    if (x < H) {
      IndexType x_range =
          ((tile_size - threadIdx.y) < (W - y)) ? (tile_size - threadIdx.y) : (W - y);
#pragma unroll
      // `i < x_range` equals to: `threadIdx.y + i < tile_size && y + i < W`.
      for (int i = 0; i < x_range; i += kBlockRows) {
        dst[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i];
      }
    }
    __syncthreads();
  }
}
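For reference, a launch of this kernel might look roughly as follows (a sketch assuming tile_size = 32 and a 32x8 thread block, so kBlockRows = 8 matches the loop stride inside the kernel; LaunchBatchTranspose and the grid-size cap are illustrative, not OneFlow's dispatch code):

#include <algorithm>
#include <cuda_runtime.h>

constexpr size_t kTileSize = 32;
constexpr int32_t kBlockRows = 8;

// Transpose num_batches independent H x W matrices of 4-byte elements.
template<typename IndexType>
void LaunchBatchTranspose(const void* src, void* dst, IndexType num_batches, IndexType H,
                          IndexType W, cudaStream_t stream) {
  const IndexType num_tile_rows = (H + kTileSize - 1) / kTileSize;
  const IndexType num_tile_cols = (W + kTileSize - 1) / kTileSize;
  const int32_t block_nums = num_batches * num_tile_rows * num_tile_cols;
  const dim3 block_dim(kTileSize, kBlockRows);  // 32 x 8 threads, each thread copies 4 rows
  // Cap the grid; the grid-stride loop in the kernel covers the remaining tiles.
  const int32_t grid_size = std::min<int32_t>(block_nums, 4096);
  BatchTransposeKernel<3, 4, kTileSize, IndexType>
      <<<grid_size, block_dim, 0, stream>>>(src, dst, H, W, num_tile_rows, num_tile_cols,
                                            block_nums);
}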
Explicitly unrolling the loop
In an earlier version, our for loop was written as follows:

#pragma unroll
for (int i = 0; threadIdx.y + i < tile_size && y + i < H; i += kBlockRows) {
  ...
}

Even with the #pragma unroll directive, the assembly shown in Nsight Compute contains only two related instructions, meaning the loop was not actually unrolled: its termination condition depends on the runtime values y and H, so the compiler cannot determine the trip count at compile time. The fix is to fold the two conditions into a single bound y_range computed before the loop:

IndexType y_range = ((tile_size - threadIdx.y) < (H - y)) ? (tile_size - threadIdx.y) : (H - y);
#pragma unroll
for (int i = 0; i < y_range; i += kBlockRows) {
  ...
}
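Since y_range never exceeds tile_size, with tile_size = 32 and kBlockRows = 8 the unrolled loop is semantically equivalent to four guarded copies (an illustration of the unrolling, not actual compiler output):

if (0 < y_range)  { tile[threadIdx.y +  0][threadIdx.x] = src[offset + (y +  0) * W + x]; }
if (8 < y_range)  { tile[threadIdx.y +  8][threadIdx.x] = src[offset + (y +  8) * W + x]; }
if (16 < y_range) { tile[threadIdx.y + 16][threadIdx.x] = src[offset + (y + 16) * W + x]; }
if (24 < y_range) { tile[threadIdx.y + 24][threadIdx.x] = src[offset + (y + 24) * W + x]; }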

Optimizing the half2 version
When the element type is half, two adjacent half values can be packed into one half2 and moved together, so global-memory loads and stores happen at half2 (4B) granularity. After the transpose, however, the two halves of one output half2 come from two different tile rows, so the shared-memory tile is declared as a union that can be written as half2 and read back as individual half elements:

template<size_t num_dims, size_t tile_size, typename IndexType>
__global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr, IndexType rows,
                                              IndexType cols, IndexType num_tile_rows,
                                              IndexType num_tile_cols, int32_t block_nums) {
  static_assert(tile_size % 2 == 0);
  using T_MOV2 = typename std::aligned_storage<2, 2>::type;
  using T_MOV4 = typename std::aligned_storage<4, 4>::type;
  const T_MOV4* src = reinterpret_cast<const T_MOV4*>(src_ptr);
  T_MOV4* dst = reinterpret_cast<T_MOV4*>(dst_ptr);
  // Use a union so the tile can be loaded as half2 but read back as half.
  __shared__ union {
    T_MOV2 tile_m2[tile_size][tile_size + 2];      // half [64][66]
    T_MOV4 tile_m4[tile_size][tile_size / 2 + 1];  // half2 [64][33]
  } tile_mem;
  IndexType batch_num_tile = num_tile_rows * num_tile_cols;
  for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) {
    const IndexType batch_index = i / batch_num_tile;  // the index of batch.
    const IndexType flatten_index =
        i - batch_index * batch_num_tile;  // the flatten index of tile in a batch.
    const IndexType row_index = flatten_index / num_tile_cols;  // the row index of tile in a batch.
    const IndexType col_index =
        flatten_index
        - row_index * num_tile_cols;  // equal to flatten_index % num_tile_cols. the col index of tile in a batch.
    const IndexType offset = batch_index * rows * cols;
    // Each thread processes one half2 element, so threadIdx.x is multiplied by 2.
    IndexType x = col_index * tile_size + threadIdx.x * 2;
    IndexType y = row_index * tile_size + threadIdx.y;
    if (x < cols) {
      IndexType y_range =
          ((tile_size - threadIdx.y) < (rows - y)) ? (tile_size - threadIdx.y) : (rows - y);
#pragma unroll
      // `i < y_range` equals to: `threadIdx.y + i < tile_size && y + i < rows`.
      for (int i = 0; i < y_range; i += kBlockRows) {
        // Each thread loads one half2.
        tile_mem.tile_m4[threadIdx.y + i][threadIdx.x] = src[(offset + (y + i) * cols + x) / 2];
      }
    }
    __syncthreads();
    x = row_index * tile_size + threadIdx.x * 2;
    y = col_index * tile_size + threadIdx.y;
    if (x < rows) {
      IndexType x_range =
          ((tile_size - threadIdx.y) < (cols - y)) ? (tile_size - threadIdx.y) : (cols - y);
#pragma unroll
      // `i < x_range` equals to: `threadIdx.y + i < tile_size && y + i < cols`.
      for (int i = 0; i < x_range; i += kBlockRows) {
        /*
        When writing back a transposed column, the two neighboring halves come from
        different tile rows, so they cannot be read from the tile as one half2.
        We read 2 half elements separately and pack them into a half2 for the store.
        */
        union {
          T_MOV4 m4;
          T_MOV2 m2[2];
        } tmp_storage;
        tmp_storage.m2[0] = tile_mem.tile_m2[threadIdx.x * 2][threadIdx.y + i];
        tmp_storage.m2[1] = tile_mem.tile_m2[threadIdx.x * 2 + 1][threadIdx.y + i];
        dst[(offset + (y + i) * rows + x) / 2] = tmp_storage.m4;
      }
    }
    __syncthreads();
  }
}


5 Future Optimization Directions
6 Outlook
References
https://developer.nvidia.com/blog/cuda-pro-tip-increase-performance-with-vectorized-memory-access/
https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/
https://on-demand.gputechconf.com/gtc/2018/presentation/s81006-volta-architecture-and-performance-optimization.pdf