Skip to content
版 本

asc_copy_gm2l1_dn2nz

产 品 支 持 情 况

产品 | 是否支持
Ascend 950PR/Ascend 950DT | √

功 能 说 明

将矩阵数据从Global Memory搬运到L1 Buffer,并在搬运过程中执行DN->NZ/NCHW->NC1HWC0/NCHW->C1HWNC0格式转换。若D未对齐到C0的大小,则会在L1 Buffer的最内层维度填充0值。GM中的数据以DN格式存储;L1中的数据以NZ格式存储。

DN->NZ的 搬 运 形 式 如 下 图:

函 数 原 型

  • 常 规 搬 运

    C++
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ int8_t* dst, __gm__ int8_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ uint8_t* dst, __gm__ uint8_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ hifloat8_t* dst, __gm__ hifloat8_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ fp8_e5m2_t* dst, __gm__ fp8_e5m2_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ fp8_e4m3fn_t* dst, __gm__ fp8_e4m3fn_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ int16_t* dst, __gm__ int16_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ uint16_t* dst, __gm__ uint16_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value,  uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ half* dst, __gm__ half* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ bfloat16_t* dst, __gm__ bfloat16_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ int32_t* dst, __gm__ int32_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ uint32_t* dst, __gm__ uint32_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz(__cbuf__ float* dst, __gm__ float* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    
  • 同 步 搬 运

    C++
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ int8_t* dst, __gm__ int8_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ uint8_t* dst, __gm__ uint8_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ hifloat8_t* dst, __gm__ hifloat8_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ fp8_e5m2_t* dst, __gm__ fp8_e5m2_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ fp8_e4m3fn_t* dst, __gm__ fp8_e4m3fn_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ int16_t* dst, __gm__ int16_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ uint16_t* dst, __gm__ uint16_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value,  uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ half* dst, __gm__ half* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ bfloat16_t* dst, __gm__ bfloat16_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ int32_t* dst, __gm__ int32_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ uint32_t* dst, __gm__ uint32_t* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    __aicore__ inline void asc_copy_gm2l1_dn2nz_sync(__cbuf__ float* dst, __gm__ float* src, uint64_t loop1_src_stride, uint8_t l2_cache_ctl, uint16_t n_value, uint32_t d_value, uint64_t loop4_src_stride, bool smallc0_en)
    

参 数 说 明

参数名 | 输入/输出 | 描述
dst | 输出 | 目的操作数(矢量)的起始地址。
src | 输入 | 源操作数(矢量)的起始地址。
loop1_src_stride | 输入 | 搬运过程中,最内层循环相邻两次迭代间源操作数数据块的间隔,单位为字节。
l2_cache_ctl | 输入 | 配置数据在L2 Cache中的管理策略。取值说明如下:
• 0:DISABLE模式,适用于仅需访问一次的数据。
• 1:NORMAL模式,适用于重用模式未知或不极端的数据。
• 2:LAST模式,适用于高频重复访问的数据。
• 4:PERSISTENT模式,适用于需要长期驻留在缓存中的数据。
n_value | 输入 | 源操作数DN排布中的N值。
d_value | 输入 | 源操作数DN排布中的D值。
loop4_src_stride | 输入 | 搬运过程中,最外层循环相邻两次迭代间源操作数数据块的间隔,单位为字节。
smallc0_en | 输入 | SmallC0模式开关:
• true:C0_SIZE会被pad至32个字节。
• false:C0_SIZE会被pad为4个channel。

返回值说明

无

流 水 类 型

PIPE_MTE2

约 束 说 明

  • 注意:仅当D(d_value)<= 4时,smallc0_en才会生效;D > 4时,该开关的设置无效。

调 用 示 例

C++
//搬 运 关 于 最 内 层 循 环 的 步 长 为4个 字 节
constexpr uint64_t loop1_src_stride = 4;
//L2 Cache采 用NORMAL模 式
constexpr uint8_t l2_cache_ctl = 1;
//N值 为3
constexpr uint16_t n_value = 3;
//D值 为3
constexpr uint32_t d_value = 3;
//最外层循环的源操作数步长为2个字节
constexpr uint64_t loop4_src_stride = 2;
//每 一 个C0数 据 块 都 会 被pad至32个 字 节
constexpr bool smallc0_en = true;
__gm__ half src[256];
__cbuf__ half dst[256];
asc_copy_gm2l1_dn2nz(dst, src, loop1_src_stride, l2_cache_ctl, n_value, d_value, loop4_src_stride, smallc0_en);

免 责 声 明:本 站 内 容 由 asc-devkit 仓 master 分 支 自 动 编 译 生 成,属 于 持 续 开 发 版 本,可 能 存 在 缺 陷,仅 供 预 览 与 参 考。如 需 稳 定 及 商 用 资 料,请 查 阅 官 方 昇 腾 社 区