Skip to content
版 本

asc_copy_gm2l1_align

产 品 支 持 情 况

产 品是 否 支 持
Ascend 950PR/Ascend 950DT

功 能 说 明

对 矩 阵 数 据 进 行 实 时padding,完 成padding后 将 数 据 从Global Memory搬 运 到L1 Buffer。需 要 与asc_set_gm2l1_loop_size、asc_set_gm2l1_loop1_stride、asc_set_gm2l1_loop2_stride、asc_set_gm2l1_pad配 合 使 用。

函 数 原 型

  • 常 规 搬 运

    C++
    __aicore__ inline void asc_copy_gm2l1_align(__cbuf__ int8_t* dst, __gm__ int8_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align(__cbuf__ uint8_t* dst, __gm__ uint8_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align(__cbuf__ hifloat8_t* dst, __gm__ hifloat8_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align(__cbuf__ int16_t* dst, __gm__ int16_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align(__cbuf__ uint16_t* dst, __gm__ uint16_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align(__cbuf__ half* dst, __gm__ half* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align(__cbuf__ bfloat16_t* dst, __gm__ bfloat16_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align(__cbuf__ int32_t* dst, __gm__ int32_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align(__cbuf__ uint32_t* dst, __gm__ uint32_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count,uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align(__cbuf__ float* dst, __gm__ float* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    
  • 同 步 搬 运

    C++
    __aicore__ inline void asc_copy_gm2l1_align_sync(__cbuf__ int8_t* dst, __gm__ int8_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align_sync(__cbuf__ uint8_t* dst, __gm__ uint8_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align_sync(__cbuf__ hifloat8_t* dst, __gm__ hifloat8_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align_sync(__cbuf__ int16_t* dst, __gm__ int16_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align_sync(__cbuf__ uint16_t* dst, __gm__ uint16_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align_sync(__cbuf__ half* dst, __gm__ half* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align_sync(__cbuf__ bfloat16_t* dst, __gm__ bfloat16_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align_sync(__cbuf__ int32_t* dst, __gm__ int32_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align_sync(__cbuf__ uint32_t* dst, __gm__ uint32_t* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count,uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    __aicore__ inline void asc_copy_gm2l1_align_sync(__cbuf__ float* dst, __gm__ float* src, uint32_t n_burst, uint32_t len_burst, uint8_t left_padding_count, uint8_t right_padding_count, bool data_select_bit, uint8_t l2_cache_ctl, uint64_t burst_src_stride, uint32_t burst_dst_stride)
    

参 数 说 明

参 数 名 | 输 入/输 出 | 描 述
dst | 输 出 | 目 的 操 作 数(矢 量)的 起 始 地 址。
src | 输 入 | 源 操 作 数(矢 量)的 起 始 地 址。
n_burst | 输 入 | 待 搬 运 的 连 续 传 输 数 据 块 个 数。
len_burst | 输 入 | 待 搬 运 的 每 个 连 续 传 输 数 据 块 的 长 度,单 位 为32个 字 节。对 位 宽 为16的 数 据 类 型,该 参 数 取 值 应 为2的 倍 数,对 位 宽 为32的 数 据 类 型,该 参 数 取 值 应 为4的 倍 数。
left_padding_count | 输 入 | 数 据 左 侧 的padding元 素 数。对 位 宽 为8的 数 据 类 型,该 参 数 的 最 大 值 为32,对 位 宽 为16的 数 据 类 型,该 参 数 的 最 大 值 为16,对 位 宽 为32的 数 据 类 型,该 参 数 的 最 大 值 为8。
right_padding_count | 输 入 | 数 据 右 侧 的padding元 素 数。对 位 宽 为8的 数 据 类 型,该 参 数 的 最 大 值 为32,对 位 宽 为16的 数 据 类 型,该 参 数 的 最 大 值 为16,对 位 宽 为32的 数 据 类 型,该 参 数 的 最 大 值 为8。
data_select_bit | 输 入 | padding数 据 选 择 位,当 前 只 支 持 设 置 为false,将padding值 设 置 为 数 据 块 的 第 一 个 元 素。
l2_cache_ctl | 输 入 | 配 置 数 据 在L2 Cache中 的 管 理 策 略。该 参 数 取 值 说 明 如 下:
• 0:DISABLE模 式,适 用 于 仅 需 访 问 一 次 的 数 据。
• 1:NORMAL模 式,适 用 于 重 用 模 式 未 知 或 不 极 端 的 数 据。
• 2:LAST模 式,适 用 于 高 频 重 复 访 问 的 数 据。
• 4:PERSISTENT模 式,适 用 于 需 要 长 期 驻 留 在 缓 存 中 的 数 据。
burst_src_stride | 输 入 | 输 入 数 据 中 两 个 相 邻 的burst所 对 应 的 连 续 数 据 块 头 与 头 之 间 的 距 离。
burst_dst_stride | 输 入 | 输 出 数 据 中 两 个 相 邻 的burst所 对 应 的 连 续 数 据 块 头 与 头 之 间 的 距 离。

返 回 值 说 明

无

流 水 类 型

PIPE_MTE2

约 束 说 明

  • 目 的 操 作 数 不 得 存 在 地 址 重 叠。对 于 存 在 地 址 重 叠 的 操 作,硬 件 将 不 返 回 任 何 警 告 或 错 误,也 无 法 保 证 地 址 重 叠 数 据 的 正 确 性。

调 用 示 例

C++
//待 搬 运 的 连 续 传 输 数 据 块 个 数 为2
constexpr uint32_t n_burst = 2;
//待 搬 运 的 每 个 连 续 传 输 数 据 块 的 长 度 为64个 字 节
constexpr uint32_t len_burst = 2;
//数 据 左 右 侧padding的 元 素 数 为0
constexpr uint8_t left_padding_count = 0;
constexpr uint8_t right_padding_count = 0;
//padding值 取 数 据 块 的 第 一 个 元 素
constexpr bool data_select_bit = false;
//l2 cache采 用DISABLE模 式
constexpr uint8_t l2_cache_ctl = 0;
//输 入 输 出 数 据 中 两 个 相 邻 连 续 数 据 块 之 间 的 距 离 为0
constexpr uint64_t burst_src_stride = 0;
constexpr uint32_t burst_dst_stride = 0;
__gm__ half src[256];
__cbuf__ half dst[256];
asc_copy_gm2l1_align(dst, src, n_burst, len_burst,  left_padding_count, right_padding_count, data_select_bit, l2_cache_ctl, burst_src_stride, burst_dst_stride);

免 责 声 明:本 站 内 容 由 asc-devkit 仓 master 分 支 自 动 编 译 生 成,属 于 持 续 开 发 版 本,可 能 存 在 缺 陷,仅 供 预 览 与 参 考。如 需 稳 定 及 商 用 资 料,请 查 阅 官 方 昇 腾 社 区