Skip to content
版 本

DataStoreBarrier

产 品 支 持 情 况

产 品

是 否 支 持

Ascend 950PR/Ascend 950DT

Atlas A3 训 练 系 列 产 品/Atlas A3 推 理 系 列 产 品

Atlas A2 训 练 系 列 产 品/Atlas A2 推 理 系 列 产 品

Atlas 200I/500 A2 推 理 产 品

x

Atlas 推 理 系 列 产 品AI Core

x

Atlas 推 理 系 列 产 品Vector Core

x

Atlas 训 练 系 列 产 品

x

功 能 说 明

数 据 同 步 屏 障 指 令,该 指 令 会 阻 塞 当 前 线 程 执 行,确 保 所 有 先 前 的 写 内 存 操 作 完 成 并 对 其 它 硬 件 单 元 可 见 后,才 继 续 执 行 后 续 指 令。用 于AI CPU与AI Core多 核 之 间 的 数 据 一 致 性。

需 要 包 含 的 头 文 件

Text
#include "aicpu_api.h"

函 数 原 型

Text
DataStoreBarrier(void)

参 数 说 明

返 回 值 说 明

约 束 说 明

  • 该 接 口 仅 支 持 通 过<<<...>>>调 用,并 在 异 构 编 译 场 景 使 用。

调 用 示 例

在AI CPU算 子Kernel侧 实 现 代 码 中 调 用AscendC::DataStoreBarrier(),确 保AI CPU算 子 对Tiling数 据 的 修 改 写 入 内 存,使 得AI Core算 子 能 够 正 确 读 取Tiling数 据:

Text
struct TilingInfo {
    uint64_t lock; // AI CPU/AI Core之 间 同 步 的 锁
    int8_t type;
    int8_t mode;
    int8_t len;
};
struct KernelArgs {
    uint32_t *xDevice;
    uint32_t *yDevice;
    uint32_t *zDevice;
    TilingInfo *ti; // 与AI Core共 享 的 参 数,用 于 同 步tiling选 择
};

template<typename T, int8_t mode, int8_t len>
__aicore__ void hello_world_impl(GM_ADDR m)
{
    if constexpr (std::is_same_v<T, float>) {
       AscendC::printf("Hello World: float mode %u len %u.\n", mode, len);
    } else if constexpr (std::is_same_v<T, int>) {
       AscendC::printf("Hello World: int mode %u len %u.\n", mode, len);
    }
}

// AI Core算 子 总 入 口
// tilingInfo: 和AI CPU算 子 共 同 传 递 的 参 数,用 于 数 据 共 享
template<typename T, int8_t mode, int8_t len>
__mix__(1,2) __global__ __aicore__ void hello_world(GM_ADDR m, GM_ADDR TilingPtr)
{
     __gm__ struct KernelInfo::TilingInfo *ti = (__gm__ struct KernelInfo::TilingInfo *)TilingPtr;
    AscendC::GlobalTensor<uint64_t> lock;
    lock.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t *>(&ti->lock));
    if ASCEND_IS_AIV {
        if (AscendC::GetBlockIdx() == 0) {
            while (*reinterpret_cast<volatile __gm__ uint64_t*>(lock.GetPhyAddr(0)) == 0) {   // 下 沉 模 式,AI Core等 待AICPU tiling计 算 完 成
                AscendC::DataCacheCleanAndInvalid<uint64_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                    AscendC::DcciDst::CACHELINE_OUT>(lock);    //直 接 访 问Global Memory,获 取 最 新 数 据
            }
        }
    }
    // 上 面 是1个 核 等 待AI CPU tiling计 算 完 成,这 里 进 行 核 间 同 步
    AscendC::SyncAll<false>();
    // 根 据tiling参 数 值 选 择 不 同 模 板
    if (ti->type ==0 && ti->mode == 1 && ti->len == 2) {
        hello_world_impl<float, 1, 2>(m);
    } else if (ti->type == 1 && ti->mode == 2 && ti->len == 4) {
        hello_world_impl<int, 2, 4>(m);
    }
    // 执 行 完 留 一 个 核 释 放lock
    if ASCEND_IS_AIV {
        if (AscendC::GetBlockIdx() == 0) {
            lock.SetValue(0, 0);  // 刷 新 lock
            AscendC::DataCacheCleanAndInvalid<uint64_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                AscendC::DcciDst::CACHELINE_OUT>(lock);    //刷 新Dcache,同 步 与GM之 间 的 数 据
        }
    }
}

extern "C" __global__ __aicpu__ uint32_t MyAicpuKernel(void *arg)
{
    KernelArgs* cfg = (KernelArgs*)arg;
    AscendC::printf("MyAicpuKernel inited!\n");
    cfg->ti->lock = 1;
    cfg->ti->type = 1;
    cfg->ti->mode = 2;
    cfg->ti->len = 4;
    AscendC::DataStoreBarrier(); // 对tilingInfo进 行 写 同 步
    AscendC::printf("MyAicpuKernel inited type %u mode %u len %u end!\n", cfg->ti->type, cfg->ti->mode, cfg->ti->len);
    return 0;
}

免 责 声 明:本 站 内 容 由 asc-devkit 仓 master 分 支 自 动 编 译 生 成,属 于 持 续 开 发 版 本,可 能 存 在 缺 陷,仅 供 预 览 与 参 考。如 需 稳 定 及 商 用 资 料,请 查 阅 官 方 昇 腾 社 区