Skip to content
版 本

asc_copy_l0c2l1

产 品 支 持 情 况

产 品是 否 支 持
Atlas A3 训 练 系 列 产 品/Atlas A3 推 理 系 列 产 品
Atlas A2 训 练 系 列 产 品/Atlas A2 推 理 系 列 产 品

功 能 说 明

矩 阵 计 算 完 成 后,对 结 果 进 行 量 化 处 理,之 后 将 处 理 结 果 搬 运 到L1 Buffer中。量 化 模 式 共 分 为9种,分 别 为:

  • NoQuant:不 开 启 量 化 功 能。
  • DEQF16:int32_t量化成half,scalar量化。量化结果不支持INF_NAN模式。
  • F322BF16:float量 化 成bfloat16_t。量 化 结 果 不 支 持INF_NAN模 式。
  • F322F16:float量 化 成half。量 化 结 果 不 支 持INF_NAN模 式。
  • QF322B8_PRE:float量 化 成uint8_t/int8_t。scalar量 化。
  • REQ8:int32_t量 化 成uint8_t/int8_t。scalar量 化。
  • VDEQF16:int32_t量化成half,矢量量化。量化结果不支持INF_NAN模式。
  • VQF322B8_PRE:float量 化 成uint8_t/int8_t。矢 量 量 化。
  • VREQ8:int32_t量 化 成uint8_t/int8_t。矢 量 量 化。

此 外,该 搬 运 指 令 还 支 持int4b_t类 型 的 量 化,相 关 量 化 类 型 分 为 两 种:int32_t量 化 为int4b_t;float量 化 为int4b_t。

函 数 原 型

  • 常 规 搬 运

    C++
    __aicore__ inline void asc_copy_l0c2l1(__cbuf__ half* dst, __cc__ float* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1(__cbuf__ bfloat16_t* dst, __cc__ float* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1(__cbuf__ int8_t* dst, __cc__ float* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1(__cbuf__ half* dst, __cc__ int32_t* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1(__cbuf__ int16_t* dst, __cc__ int32_t* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1(__cbuf__ int8_t* dst, __cc__ int32_t* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1(__cbuf__ uint8_t* dst, __cc__ int32_t* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1(__cbuf__ void* dst, __cc__ float* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1(__cbuf__ void* dst, __cc__ int32_t* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    
  • 同 步 搬 运

    C++
    __aicore__ inline void asc_copy_l0c2l1_sync(__cbuf__ half* dst, __cc__ float* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1_sync(__cbuf__ bfloat16_t* dst, __cc__ float* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1_sync(__cbuf__ int8_t* dst, __cc__ float* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1_sync(__cbuf__ half* dst, __cc__ int32_t* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1_sync(__cbuf__ int16_t* dst, __cc__ int32_t* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1_sync(__cbuf__ int8_t* dst, __cc__ int32_t* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1_sync(__cbuf__ uint8_t* dst, __cc__ int32_t* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1_sync(__cbuf__ void* dst, __cc__ float* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    __aicore__ inline void asc_copy_l0c2l1_sync(__cbuf__ void* dst, __cc__ int32_t* src, uint16_t n_size,uint16_t m_size, uint32_t dst_stride, uint16_t src_stride, uint8_t unit_flag_mode, uint64_t quant_pre, uint8_t relu_pre, bool enable_channel_split, bool enable_nd2nz)
    

参 数 说 明

参 数 名输 入/输 出描 述
dst输 出目 的 操 作 数(矢 量)的 起 始 地 址。
src输 入源 操 作 数 的 起 始 地 址。
n_size输 入源NZ矩 阵 在N方 向 上 的 大 小。
•不 开 启NZ2ND功 能,取 值 范 围:[1, 4095];
•开 启NZ2ND功 能,取 值 范 围:[1, 4095]。
m_size输 入源NZ矩 阵 在M方 向 上 的 大 小。
•不 开 启NZ2ND功 能,取 值 范 围:[1, 65535];
•开 启NZ2ND功 能,取 值 范 围:[1, 8192]。
dst_stride输 入
•不开启NZ2ND功能,目的NZ矩阵中相邻Z排布的起始地址偏移,取值不为0,单位:element;
•开启NZ2ND/NZ2DN功能,目的ND矩阵每一行中的元素个数,取值不为0,单位:element。
src_stride输 入源NZ矩 阵 中 相 邻Z排 布 的 起 始 地 址 偏 移,取 值 范 围:[0, 65535],单 位:C0_Size(16*sizeof(T), T为src的 数 据 类 型)。
unit_flag_mode输 入与unit_flag参 数 相 关,取 值 如 下:
•0保 留 值;
•2 开 启unit_flag,硬 件 执 行 完 指 令 之 后,不 会 设 置 寄 存 器;
•3 开 启unit_flag,硬 件 执 行 完 指 令 后,会 将unit_flag关 闭。
quant_pre输入量化参数,取值为“功能说明”中列出的量化模式之一。
relu_pre输 入开 启relu。
enable_channel_split输 入是 否 开 启 通 道 拆 分 的 功 能,默 认false,不 开 启 该 功 能。仅 在src和dst都 为float时 才 能 开 启 通 道 拆 分,且 不 能 同 时 开 启channel_split和NZ2ND功 能。
enable_nd2nz输入是否开启NZ2ND功能(即n_size、m_size、dst_stride说明中的NZ2ND开关,参数名以接口原型为准)。
•false:不 开 启;
•true:开 启。

返回值说明

无

流 水 类 型

PIPE_FIX

约 束 说 明

  • src的 起 始 地 址 要 求 按 照 对 应 数 据 类 型 所 占 字 节 数 对 齐。
  • dst的 起 始 地 址 要 求32字 节 对 齐。
  • 如 果 需 要 执 行 多 条asc_copy_l0c2l1指 令,且asc_copy_l0c2l1指 令 的 目 的 地 址 存 在 重 叠,需 要 插 入 同 步 指 令,保 证 多 个asc_copy_l0c2l1指 令 的 串 行 化,防 止 出 现 异 常 数 据。

调 用 示 例

C++
// dst src分 别 对 应 目 的 操 作 数 的 输 出 地 址 和 源 操 作 数 的 输 入 地 址
__cbuf__ half dst[total_length];
__cc__ int32_t src[total_length];
// 其余入参均以默认数值传入
uint16_t n_size = 16;
uint16_t m_size = 16;
uint32_t dst_stride = 8;
uint16_t src_stride = 8;
uint8_t unit_flag_mode = 0;
uint64_t quant_pre = DEQF16;
uint8_t relu_pre = 0;
bool enable_channel_split = false;
bool enable_nd2nz = false;
asc_copy_l0c2l1(dst, src, n_size, m_size, dst_stride, src_stride, unit_flag_mode, quant_pre, relu_pre, enable_channel_split, enable_nd2nz);

免 责 声 明:本 站 内 容 由 asc-devkit 仓 master 分 支 自 动 编 译 生 成,属 于 持 续 开 发 版 本,可 能 存 在 缺 陷,仅 供 预 览 与 参 考。如 需 稳 定 及 商 用 资 料,请 查 阅 官 方 昇 腾 社 区