From 6881fdb522a9f925f6ef4e455b497b3966efeace Mon Sep 17 00:00:00 2001 From: menfeifei <894242714@qq.com> Date: Tue, 2 Sep 2025 07:47:43 +0000 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20sample?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/README.md | 144 ------------- sample/build/build.sh | 38 ---- sample/normal_sample/.keep | 0 sample/normal_sample/cube_only/Makefile | 28 --- sample/normal_sample/cube_only/main.cpp | 171 --------------- .../normal_sample/cube_only/matmul_kernel.cpp | 100 --------- sample/normal_sample/mix/Makefile | 29 --- sample/normal_sample/mix/main.cpp | 201 ------------------ .../mix/matmul_leakyrelu_kernel.cpp | 188 ---------------- sample/normal_sample/vec_only/Makefile | 27 --- sample/normal_sample/vec_only/add_kernel.cpp | 103 --------- sample/normal_sample/vec_only/main.cpp | 89 -------- sample/pytorch_adapter/README.md | 53 ----- sample/pytorch_adapter/jit_compile/Makefile | 20 -- .../jit_compile/add_adapter.cpp | 128 ----------- .../jit_compile/add_kernel.cpp | 106 --------- sample/pytorch_adapter/jit_compile/main.py | 70 ------ .../pytorch_adapter/with_setuptools/Makefile | 20 -- .../with_setuptools/add_adapter.cpp | 128 ----------- .../with_setuptools/add_kernel.cpp | 106 --------- .../pytorch_adapter/with_setuptools/setup.py | 51 ----- .../pytorch_adapter/with_setuptools/test.py | 34 --- sample/sanitizer_sample/Racecheck/Makefile | 28 --- sample/sanitizer_sample/Racecheck/main.cpp | 42 ---- .../Racecheck/raw_error_kernel.cpp | 28 --- .../memcheck/illegal_align/Makefile | 28 --- .../illegal_align/illegal_align_kernel.cpp | 26 --- .../memcheck/illegal_align/main.cpp | 39 ---- .../memcheck/illegal_read_and_write/Makefile | 28 --- .../illegal_read_and_write_kernel.cpp | 29 --- .../memcheck/illegal_read_and_write/main.cpp | 39 ---- .../memcheck/out_of_bound/Makefile | 28 --- .../memcheck/out_of_bound/main.cpp | 39 ---- .../out_of_bound/out_of_bound_kernel.cpp | 27 --- 34 files changed, 2215 deletions(-) delete mode 100644 sample/README.md delete mode 100644 sample/build/build.sh delete mode 100644 sample/normal_sample/.keep delete mode 100644 sample/normal_sample/cube_only/Makefile delete mode 100644 sample/normal_sample/cube_only/main.cpp delete mode 100644 sample/normal_sample/cube_only/matmul_kernel.cpp delete mode 100644 sample/normal_sample/mix/Makefile delete mode 100644 sample/normal_sample/mix/main.cpp delete mode 100644 sample/normal_sample/mix/matmul_leakyrelu_kernel.cpp delete mode 100644 sample/normal_sample/vec_only/Makefile delete mode 100644 sample/normal_sample/vec_only/add_kernel.cpp delete mode 100644 sample/normal_sample/vec_only/main.cpp delete mode 100644 sample/pytorch_adapter/README.md delete mode 100644 sample/pytorch_adapter/jit_compile/Makefile delete mode 100644 sample/pytorch_adapter/jit_compile/add_adapter.cpp delete mode 100644 sample/pytorch_adapter/jit_compile/add_kernel.cpp delete mode 100644 sample/pytorch_adapter/jit_compile/main.py delete mode 100644 sample/pytorch_adapter/with_setuptools/Makefile delete mode 100644 sample/pytorch_adapter/with_setuptools/add_adapter.cpp delete mode 100644 sample/pytorch_adapter/with_setuptools/add_kernel.cpp delete mode 100644 sample/pytorch_adapter/with_setuptools/setup.py delete mode 100644 sample/pytorch_adapter/with_setuptools/test.py delete mode 100644 sample/sanitizer_sample/Racecheck/Makefile delete mode 100644 sample/sanitizer_sample/Racecheck/main.cpp delete mode 100644 
sample/sanitizer_sample/Racecheck/raw_error_kernel.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_align/Makefile
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_align/illegal_align_kernel.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_align/main.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_read_and_write/Makefile
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_read_and_write/illegal_read_and_write_kernel.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_read_and_write/main.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/out_of_bound/Makefile
 delete mode 100644 sample/sanitizer_sample/memcheck/out_of_bound/main.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/out_of_bound/out_of_bound_kernel.cpp

diff --git a/sample/README.md b/sample/README.md
deleted file mode 100644
index 15238cb9f..000000000
--- a/sample/README.md
+++ /dev/null
@@ -1,144 +0,0 @@
-# Sample library overview
-
-## Description
-This sample library mainly demonstrates the capabilities of the AscendC operator development tools, so the operator projects are deliberately stripped down and focus on showcasing the companion tooling.
-
-For production integration, prefer the AscendC sample projects shipped with the CANN package, e.g. the ascendc_kernel_cmake directory. The projects here are simplified from those and are intended for quick verification only.
-
-Note: within this sample directory, every leaf directory is a complete, standalone sample project. The projects may differ from one another because their dependencies differ.
-
-## Dependencies
-- For the hardware environment, see the [Ascend product configurations](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F80RC22%2Fquickstart%2Fquickstart%2Fquickstart_18_0002.html).
-- For the software environment, follow the [CANN Software Installation Guide](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F80RC22%2Fsoftwareinst%2Finstg%2Finstg_0000.html%3FMode%3DPmIns%26OS%3DUbuntu%26Software%3DcannToolKit) to install the Ascend development or runtime environment, i.e. the toolkit package.
-
-Choose the versions of these dependencies that match your actual environment.
-
-### Version compatibility
-| Item | Requirement |
-|---|---|
-| CANN version | >= 8.0.RC1.alpha001 |
-| Hardware | Atlas 800T A2 training server |
-
-- AscendPyTorch 1.11.0 and later are supported. For the PyTorch/CANN and PyTorch/Python compatibility matrix, see [Ascend Extension for PyTorch](https://gitee.com/ascend/pytorch).
-- The firmware/driver version must be the one supported by the matching CANN release; obtain the firmware and driver for your product model and CANN version from the [Ascend community firmware & drivers](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fhardware%2Ffirmware-drivers%2Fcommunity%3Fproduct%3D2%26model%3D28%26cann%3D8.0.RC3.alpha003%26driver%3D1.0.25.alpha) page.
-
-## Directory layout
-The overall structure is as follows:
-```
-- sample
- |- build # builds and runs all samples (use as needed; its commands are a useful reference)
 |- normal_sample # minimal pure C/C++ AscendC single-operator projects, usable with msdebug and msprof
- |- cube_only # minimal AscendC single-operator project, AIC (cube) only
- |- mix # minimal AscendC single-operator project with a mixed cube/vector operator
- |- vec_only # minimal AscendC single-operator project, AIV (vector) only
- |- pytorch_adapter # minimal AscendC single-operator projects adapted to PyTorch, usable with msdebug and msprof
- |- jit_compile # JIT mode: compiled at run time
- |- with_setuptools # built into a wheel package and installed
- |- sanitizer_sample # defective samples, for use with the mssanitizer tool
- |- Racecheck # samples containing data races
- |- xx # other defective samples
-```
-
-If you are interested in adapting custom operators to the PyTorch framework, see [pytorch_adapter/README.md](./pytorch_adapter/README.md).
-
-
-## Operator debugging with msdebug
-Debugging on device with msdebug requires a few extra adjustments:
-1. Build stage: in the ```Makefile``` under ```sample/normal_sample/vec_only```, change the following:
-   + Enhance debug information and enlarge the stack:
-     ```
-     COMPILER_FLAG := -xcce -O2 -std=c++17
-     change to:
-     COMPILER_FLAG := -xcce -O0 -std=c++17 -g --cce-ignore-always-inline=true
-     ```
-
-2. Run stage:
-```
-msdebug ./*.fatbin
-```
-
-## Memory checking with sanitizer
-1. Build stage: add the ```--cce-enable-sanitizer -g``` options to the compile step and ```--cce-enable-sanitizer``` to the link step (the sample Makefiles already include both), then run:
-```
-make
-```
-
-2. Run stage:
-```
-mssanitizer ./*.fatbin # runs memcheck by default
-```
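-
-For orientation, each memcheck sample plants a single defect. The sketch below is illustrative only, modeled on `add_kernel.cpp` rather than taken from the shipped `out_of_bound_kernel.cpp` (whose body is not reproduced here); the loop deliberately runs one tile too far, so the final `DataCopy` touches global memory this core does not own:
-```
-#include "kernel_operator.h"
-using namespace AscendC;
-constexpr int32_t LEN = 2048;  // elements owned by one core
-constexpr int32_t TILE = 256;  // elements moved per DataCopy
-
-// Hypothetical out-of-bounds kernel for illustration.
-extern "C" __global__ __aicore__ void oob_read_sketch(GM_ADDR x, GM_ADDR z)
-{
-    GlobalTensor<int16_t> xGm, zGm;
-    xGm.SetGlobalBuffer((__gm__ int16_t *)x + LEN * GetBlockIdx(), LEN);
-    zGm.SetGlobalBuffer((__gm__ int16_t *)z + LEN * GetBlockIdx(), LEN);
-    TPipe pipe;
-    TQue<QuePosition::VECIN, 1> inQueue;
-    pipe.InitBuffer(inQueue, 1, TILE * sizeof(int16_t));
-    for (int32_t i = 0; i <= LEN / TILE; i++) { // '<=' runs one tile too far
-        LocalTensor<int16_t> xLocal = inQueue.AllocTensor<int16_t>();
-        DataCopy(xLocal, xGm[i * TILE], TILE);  // i == 8 reads [2048, 2304)
-        inQueue.EnQue(xLocal);
-        LocalTensor<int16_t> t = inQueue.DeQue<int16_t>();
-        DataCopy(zGm[i * TILE], t, TILE);       // and the write overruns too
-        inQueue.FreeTensor(t);
-    }
-}
-```
-Running `mssanitizer` on a binary built from such a kernel should report the illegal global-memory read and write, with line information when the ```-g``` flag above is present.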
-
-## Operator tuning
-The operator tuning tool supports both on-device and simulator-based tuning. The walkthrough below uses the vec_only operator as the running example.
-
-### On-device tuning
-1. Compile the operator from the unmodified sample code to obtain add.fatbin:
-   ```
-   cd ./sample/normal_sample/vec_only
-   make clean && make
-   ```
-2. Profile the operator with the tuning tool. `--aic-metrics` is omitted, so all metrics default to enabled; `--output` is omitted and defaults to `./`:
-   ```
-   msprof op --application=./add.fatbin
-   ```
-3. A folder whose name starts with `OPPROF_` appears in the current directory. It contains one csv per enabled `--aic-metrics` switch plus the operator base data `OpBasicInfo.csv`; the csv files hold the operator's block-level performance data. (The data is collected after the operator has been warmed up.)
-   ```
-   OPPROF_2024xxxx_XXXXXX
-   ├── dump
-   ├── OpBasicInfo.csv
-   ├── ArithmeticUtilization.csv
-   ├── ... (the enabled aic-metrics)
-   └── ResourceConflictRatio.csv
-   ```
-4. For more details on the csv metrics, see the operator development tools manual.
-
-### Simulator tuning
-Simulator tuning with msprof needs an executable operator that runs on the simulator, which requires two small changes to the build options in the ```Makefile``` under ```./sample/normal_sample/vec_only```:
-+ Simulator dependency:
-  ```
-  LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-  change to:
-  LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -L${ASCEND_HOME_PATH}/tools/simulator/${SOC_VERSION}/lib/ -lruntime_camodel -lascendcl -lstdc++ # adds the libruntime_camodel dependency path
-  ```
-  Query SOC_VERSION with the npu-smi info command and read the Chip Name field: for a Chip Name of xxxyy, the value to configure is Ascendxxxyy (use the lower-case spelling ascendxxxyy where the sample paths use it).
-+ Enhanced debug information:
-  ```
-  COMPILER_FLAG := -xcce -O2 -std=c++17
-  change to:
-  COMPILER_FLAG := -xcce -O2 -std=c++17 -g
-  ```
-
-Starting from the build stage:
-
-1. Build the simulator operator:
-   ```
-   cd ./sample/normal_sample/vec_only
-   make clean && make
-   ```
-2. Add the runtime library path:
-   ```
-   # SOC_VERSION is the NPU name, as reported by the npu-smi info command.
-   export LD_LIBRARY_PATH=${ASCEND_HOME_PATH}/tools/simulator/${SOC_VERSION}/lib/:$LD_LIBRARY_PATH
-   ```
-3. Run the tuning tool in simulator mode to collect simulated performance data. `--output` is omitted and defaults to `./`:
-   ```
-   msprof op simulator --application=./add.fatbin
-   ```
-4. A folder named OPPROF_<time>_<random string> is generated in the current directory, structured as follows:
-   ```
-   OPPROF_20231023120542_FQMZMGOHUYVUZEXP
-   ├── dump # raw performance data; safe to ignore
-   └── simulation # analysis results for the simulated performance data
-       ├── core0.veccore0 # one folder per operator block sub-core; the vec sample uses 8
-       │   ├── core0.veccore1_code_exe.csv # time spent per code line
-       │   ├── core0.veccore1_instr_exe.csv # detailed instruction information
-       │   └── trace.json # pipeline trace of this block sub-core
-       ├── ...
-       ├── api # operator hotspot folder; drag all files inside into Ascend Compute
-       │   ├── api.json # code hotspot mapping
-       │   └── .cpp # the operator kernel source
-       ├── visualize_data.bin # operator visualization file; load with Ascend Insight
-       └── trace.json # pipeline trace across all cores
-   ```
-5. For more metric details, see the operator development tools manual.
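-
-The matmul samples (cube_only and mix) fill in `TCubeTiling` by hand, and several of its fields are linked by invariants that are easy to break while experimenting. The sketch below is illustrative and not part of any shipped sample; it checks those invariants on the host, instantiated with the cube_only values:
-```
-#include <cassert>
-#include <cstdint>
-
-// Validate the TCubeTiling invariants quoted in the samples' comments,
-// using the cube_only configuration.
-void CheckTilingInvariants()
-{
-    uint32_t M = 512, N = 1024, K = 512;
-    uint32_t singleCoreM = 128, singleCoreN = 256, singleCoreK = 512;
-    uint32_t baseM = 128, baseN = 256, baseK = 64;
-    uint32_t depthA1 = 8, depthB1 = 8;
-    uint32_t shareL1Size = 384 * 1024, shareL0CSize = 128 * 256 * 4;
-    // usedCoreNum = (M/singleCoreM) * (N/singleCoreN) * (K/singleCoreK) = 4 * 4 * 1 = 16
-    uint32_t usedCoreNum = (M / singleCoreM) * (N / singleCoreN) * (K / singleCoreK);
-    assert(usedCoreNum == 16);
-    // The cached A1 + B1 tiles must fit in shareL1Size:
-    // (baseM*baseK*depthA1 + baseN*baseK*depthB1) * sizeof(half)
-    // = (128*64*8 + 256*64*8) * 2 = 393216 = 384 * 1024
-    assert((baseM * baseK * depthA1 + baseN * baseK * depthB1) * 2 <= shareL1Size);
-    // One [baseM, baseN] float accumulator must fit in shareL0CSize.
-    assert(baseM * baseN * sizeof(float) <= shareL0CSize);
-}
-```
-The same arithmetic with the mix sample's numbers, (256\*64\*8 + 128\*64\*2) \* 2 = 294912, confirms its shareL1Size setting.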
diff --git a/sample/build/build.sh b/sample/build/build.sh
deleted file mode 100644
index bbb2915b0..000000000
--- a/sample/build/build.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-# Depends on ASCEND_HOME_PATH being provided by the environment
-
-CUR_DIR=$(dirname $(readlink -f $0))
-TOP_DIR=$(readlink -f $CUR_DIR/../)
-
-# add
-cd ${TOP_DIR}/normal_sample/vec_only
-make
-mv *.fatbin ${TOP_DIR}/build
-
-# matmul
-cd ${TOP_DIR}/normal_sample/cube_only
-make
-mv *.fatbin ${TOP_DIR}/build
-
-# matmul_leakyrelu
-cd ${TOP_DIR}/normal_sample/mix
-make
-mv *.fatbin ${TOP_DIR}/build
-
-# illegal_read_and_write
-cd ${TOP_DIR}/sanitizer_sample/memcheck/illegal_read_and_write
-make
-mv *.fatbin ${TOP_DIR}/build
-
-# out_of_bound
-cd ${TOP_DIR}/sanitizer_sample/memcheck/out_of_bound
-make
-mv *.fatbin ${TOP_DIR}/build
-
-# illegal align sample for sanitizer
-cd ${TOP_DIR}/sanitizer_sample/memcheck/illegal_align
-make
-mv *.fatbin ${TOP_DIR}/build
-
-cd ${TOP_DIR}/sanitizer_sample/Racecheck
-make
-mv *.fatbin ${TOP_DIR}/build
\ No newline at end of file
diff --git a/sample/normal_sample/.keep b/sample/normal_sample/.keep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/sample/normal_sample/cube_only/Makefile b/sample/normal_sample/cube_only/Makefile
deleted file mode 100644
index c730678e4..000000000
--- a/sample/normal_sample/cube_only/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-# Location of the CANN toolkit; distilled from ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # see CMAKE_C_COMPILER in device_config.cmake
-COMPILER_FLAG := -xcce -O2 -std=c++17
-LINK_FLAG := --cce-fatobj-link
-DAV_FLAG := --cce-aicore-arch=dav-c220-cube
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # simplified from the device_intf.cmake configuration
-TILING_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw
-HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include
-LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-
-all: build
-
-build: matmul_kernel.o main.o matmul.fatbin
-
-matmul_kernel.o: matmul_kernel.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^
-
-main.o: main.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(HOST_INC_FLAG) $(TILING_INC_FLAG) -o $@ -c $^
-
-matmul.fatbin: matmul_kernel.o main.o
-	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ $(LINK_LIBS)
-
-.PHONY: clean
-clean:
-	rm *.o *.fatbin
\ No newline at end of file
diff --git a/sample/normal_sample/cube_only/main.cpp b/sample/normal_sample/cube_only/main.cpp
deleted file mode 100644
index 2e151c2ff..000000000
--- a/sample/normal_sample/cube_only/main.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
- */ - -#include -#include -#include "kernel_tiling/kernel_tiling.h" -#include "acl/acl.h" - -extern void matmul_custom_do( - uint32_t coreDim, void *l2ctrl, void *stream, uint8_t *param1, uint8_t *param2, uint8_t *param3, uint8_t *param4); - -#define ACL_ERROR_NONE 0 - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - printf("%s: %d aclError %d\n", __FILE__, __LINE__, __ret); \ - } \ - } while (0) - -void printTensor(float *ptr, size_t size) -{ - size_t colNum = 8; - for (size_t i = 0; i < size / colNum / sizeof(float); i++) { - for (size_t j = 0; j < colNum; j++) { - printf("%5.2f ", ptr[colNum * i + j]); - } - printf("\n"); - } -} - -void fillValue(aclFloat16 *addr, size_t size) -{ - aclFloat16 val = aclFloatToFloat16(4.0f); - for (size_t i = 0; i < size / sizeof(aclFloat16); i++) { - addr[i] = val; - } -} - -void printAclFloat16(aclFloat16 *addr) -{ - for (int i = 0; i < 16; i++) { - printf("%f ", aclFloat16ToFloat(addr[i])); - } -} - - -void MakeTiling(uint32_t *addr, size_t size) -{ - // TCubeTiling该结构体在kernel_tiling/kernel_tiling.h中的结构体定义 - // tiling_api.h中本身定义的结构与kernel_tiling.h相近,通过GetTiling实现映射 - // TCubeTiling定义的可读性较好,可以直接理解,但使用tiling_api可以直接使能部分默认值 - // 考虑到工具本身需要体现对应用的细粒度控制,所以直接使用kernel_tiling.h中的结构 - TCubeTiling *tiling = (TCubeTiling *)addr; - // 此处计算使用的核数 - tiling->usedCoreNum = 16; // (M/singleCoreM)*(N/singleCoreN)*(K/singleCoreK)=4*4*1=16 - // 对于 xa 是[M, Ka]矩阵, xb 是[Kb, N]矩阵,此处数据需要与外部格式保持一致 - // 参考 AscendC算子开发文档 - // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devguide/opdevg/ascendcopdevg/atlas_ascendc_10_0060.html - // 中对 数据分块(Tiling) 部分的介绍 - tiling->M = 512; // - tiling->N = 1024; // - tiling->Ka = 512; // Ka和Kb一般一样,只有pad的时候存在不一致,比如[1, 62]*[64, 2],这里64就是有pad的 - tiling->Kb = 512; // - tiling->isBias = 0; // 是否有bias - // 多核切分的tiling参数,用于度量单个核上处理的数据大小 - // xa在M轴上切分,分成多个singleCoreM;单核处理singleCoreM * singleCoreK大小数据 - // xb在N轴上切分,分成多个singleCoreN;单核处理singleCoreK * singleCoreN - // 由于输入在M和N轴上切分了,输出singleCoreM * singleCoreN - tiling->singleCoreM = 128; - tiling->singleCoreN = 256; - tiling->singleCoreK = 512; // 不建议对k进行切分,会导致累加,引起不确定计算 - // 核内切分的tiling参数,用于单个核内的最小计算单位 - tiling->baseM = 128; - tiling->baseN = 256; - tiling->baseK = 64; - tiling->stepM = 1; - tiling->stepN = 1; - tiling->stepKa = 8; - tiling->stepKb = 8; - // A1+B1的缓存数据需要小于shareL1Size大小 - tiling->depthA1 = 8; // 矩阵[baseM, baseK]的缓存数量 - tiling->depthB1 = 8; // 矩阵[basek, baseN]的缓存数量 - // 其他参数 - tiling->iterateOrder = 0; // 控制迭代的方向:0代表先M轴再N轴,1代表先N轴再M轴 - tiling->shareL1Size = 384 * 1024; // 如存在多个matmul时,可以单独控制每个使用空间 - // 不小于 (baseM * baseK * depthA1 + baseN * baseK * depthB1) * sizeof(half) - tiling->shareL0CSize = 128 * 256 * 4; // 如存在多个matmul时,可以单独控制每个使用空间 - // 不小于 baseM * baseN * sizeof(float) - // 下列是bmm中使用的batch参数,如果需要实现bmm,该结构体中还有其他tiling参数 - tiling->batchM = 1; // 对于普通matmul,默认1 - tiling->batchN = 1; // 对于普通matmul,默认1 - tiling->singleBatchM = 1; - tiling->singleBatchN = 1; - // 下面的db参数用于控制ping-pong - tiling->dbL0A = 2; - tiling->dbL0B = 2; - tiling->dbL0C = 1; -} - -// y = matmul(xa, xb) -int32_t main(int32_t argc, char *argv[]) -{ - size_t xaSize = 512 * 1024 * sizeof(aclFloat16); - size_t xbSize = 512 * 1024 * sizeof(aclFloat16); - size_t ySize = 512 * 1024 * sizeof(float); - size_t tilingSize = sizeof(TCubeTiling); - uint32_t blockDim = 8; - - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream 
stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - - aclFloat16 *xaHost; - CHECK_ACL(aclrtMallocHost((void **)(&xaHost), xaSize)); - fillValue(xaHost, xaSize); - - aclFloat16 *xbHost; - CHECK_ACL(aclrtMallocHost((void **)(&xbHost), xbSize)); - fillValue(xbHost, xbSize); - - uint32_t *tilingHost; - CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingSize)); - MakeTiling(tilingHost, tilingSize); - - // 将host的输入同步到device - uint8_t *xaDevice; - uint8_t *xbDevice; - uint8_t *tilingDevice; - CHECK_ACL(aclrtMalloc((void **)&xaDevice, xaSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(xaDevice, xaSize, xaHost, xaSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 xa - CHECK_ACL(aclrtMalloc((void **)&xbDevice, xbSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(xbDevice, xbSize, xbHost, xbSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 xb - CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, tilingHost, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 tiling - - uint8_t *yDevice; - CHECK_ACL(aclrtMalloc((void **)&yDevice, ySize, ACL_MEM_MALLOC_HUGE_FIRST)); // 准备 输出 - - matmul_custom_do(blockDim, nullptr, stream, xaDevice, xbDevice, yDevice, tilingDevice); - CHECK_ACL(aclrtSynchronizeStream(stream)); - - // 将device的输出同步到host - float *yHost; - CHECK_ACL(aclrtMallocHost((void **)(&yHost), ySize)); - CHECK_ACL(aclrtMemcpy(yHost, ySize, yDevice, ySize, ACL_MEMCPY_DEVICE_TO_HOST)); - printTensor(yHost, 4 * 8 * 4); - - // 释放资源 - CHECK_ACL(aclrtFree(xaDevice)); - CHECK_ACL(aclrtFree(xbDevice)); - CHECK_ACL(aclrtFree(tilingDevice)); - CHECK_ACL(aclrtFree(yDevice)); - - CHECK_ACL(aclrtFreeHost(xaHost)); - CHECK_ACL(aclrtFreeHost(xbHost)); - CHECK_ACL(aclrtFreeHost(tilingHost)); - CHECK_ACL(aclrtFreeHost(yHost)); - - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); - return 0; -} \ No newline at end of file diff --git a/sample/normal_sample/cube_only/matmul_kernel.cpp b/sample/normal_sample/cube_only/matmul_kernel.cpp deleted file mode 100644 index 9ed629c51..000000000 --- a/sample/normal_sample/cube_only/matmul_kernel.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. - */ - -#include "kernel_operator.h" -#include "lib/matrix/matmul/matmul.h" -using namespace AscendC; -using namespace matmul; - -__aicore__ inline void CalcGMOffset( - int blockIdx, int usedCoreNum, TCubeTiling ¶m, int &offsetA, int &offsetB, int &offsetC) -{ - ASSERT(blockIdx < usedCoreNum); - uint32_t mIterSize = Ceil(param.M, param.singleCoreM); - ASSERT(mIterSize != 0); - uint32_t mCoreIndx = blockIdx % mIterSize; - uint32_t nCoreIndx = blockIdx / mIterSize; - - offsetA = mCoreIndx * param.Ka * param.singleCoreM; - offsetB = nCoreIndx * param.singleCoreN; - offsetC = mCoreIndx * param.N * param.singleCoreM + nCoreIndx * param.singleCoreN; - - // tail M - int gmUseM = param.M - mCoreIndx * param.singleCoreM; - param.singleCoreM = gmUseM < param.singleCoreM ? gmUseM : param.singleCoreM; - - // tail N - int gmUseN = param.N - nCoreIndx * param.singleCoreN; - param.singleCoreN = gmUseN < param.singleCoreN ? gmUseN : param.singleCoreN; - - // tail K - int gmUseK = param.Ka; - param.singleCoreK = gmUseK < param.singleCoreK ? 
gmUseK : param.singleCoreK; -} - -__aicore__ inline void CopyTiling(TCubeTiling *tiling, GM_ADDR tilingGM) -{ - uint32_t *ptr = reinterpret_cast(tiling); - auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); - - for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { - *ptr = *(tiling32 + i); - } - return; -} - -extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR tilingGm) -{ - // cube core cases, ignore vector core - if (g_coreType == AIV) { - return; - } - using A_T = half; - using B_T = half; - using C_T = float; - using BiasT = float; - - TPipe que; - TCubeTiling tiling; - CopyTiling(&tiling, tilingGm); - - if (GetBlockIdx() >= tiling.usedCoreNum) { - return; - } - - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ A_T *>(a), tiling.M * tiling.Ka); - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ B_T *>(b), tiling.Kb * tiling.N); - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ C_T *>(c), tiling.M * tiling.N); - - int offsetA = 0; - int offsetB = 0; - int offsetC = 0; - CalcGMOffset(GetBlockIdx(), tiling.usedCoreNum, tiling, offsetA, offsetB, offsetC); - auto gmA = aGlobal[offsetA]; - auto gmB = bGlobal[offsetB]; - auto gmC = cGlobal[offsetC]; - - typedef MatmulType aType; - typedef MatmulType bType; - typedef MatmulType cType; - typedef MatmulType biasType; - MatmulImpl mm; - mm.SetSubBlockIdx(0); - mm.Init(&tiling, &que); - - mm.SetTensorA(gmA); - mm.SetTensorB(gmB); - mm.IterateAll(gmC); -} - -// call of kernel function -void matmul_custom_do( - uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *tilingGm) -{ - matmul_custom<<>>(a, b, c, tilingGm); -} \ No newline at end of file diff --git a/sample/normal_sample/mix/Makefile b/sample/normal_sample/mix/Makefile deleted file mode 100644 index 8f162255b..000000000 --- a/sample/normal_sample/mix/Makefile +++ /dev/null @@ -1,29 +0,0 @@ -# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 -ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest - -COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 -COMPILER_FLAG := -xcce -O2 -std=c++17 -LINK_FLAG := --cce-fatobj-link -DAV_FLAG := --cce-aicore-arch=dav-c220 -ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 -TILING_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include -LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++ -lprofapi -lmmpa -lascendalog -lregister -lerror_manager -lc_sec -LINK_STATIC_LIBS := ${ASCEND_HOME_PATH}/lib64/libascendc_runtime.a - -all: build - -build: matmul_leakyrelu_kernel.o main.o matmul_leakyrelu.fatbin - -matmul_leakyrelu_kernel.o: matmul_leakyrelu_kernel.cpp - $(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^ - -main.o: main.cpp - $(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(HOST_INC_FLAG) $(TILING_INC_FLAG) -o $@ -c $^ - -matmul_leakyrelu.fatbin: matmul_leakyrelu_kernel.o main.o - $(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ $(LINK_LIBS) $(LINK_STATIC_LIBS) - -.PHONY: clean -clean: - rm *.o *.fatbin \ No newline at end of file diff --git a/sample/normal_sample/mix/main.cpp b/sample/normal_sample/mix/main.cpp deleted 
file mode 100644 index 91ba79932..000000000 --- a/sample/normal_sample/mix/main.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. - */ - -#include -#include -#include "kernel_tiling/kernel_tiling.h" // tiling结构体的依赖 -#include "acl/acl.h" - -extern void matmul_leakyrelu_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *ffts_addr, uint8_t *a, - uint8_t *b, uint8_t *bias, uint8_t *c, uint8_t *workspace, uint8_t *tilingGm); - -// 下面接口是libascendc_runtime.a中定义 -extern "C" uint32_t GetAscendCoreSyncAddr(void **addr); - -#define ACL_ERROR_NONE 0 - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - printf("%s: %d aclError %d\n", __FILE__, __LINE__, __ret); \ - } \ - } while (0) - -void printTensor(float *ptr, size_t size) -{ - size_t colNum = 8; - for (size_t i = 0; i < size / colNum / sizeof(float); i++) { - for (size_t j = 0; j < colNum; j++) { - printf("%5.2f ", ptr[colNum * i + j]); - } - printf("\n"); - } -} - -void fillValue(aclFloat16 *addr, size_t size, float value) -{ - aclFloat16 val = aclFloatToFloat16(value); - for (size_t i = 0; i < size / sizeof(aclFloat16); i++) { - addr[i] = val; - } -} - -void fillFloatValue(float *addr, size_t size, float value) -{ - for (size_t i = 0; i < size / sizeof(float); i++) { - addr[i] = value; - } -} - -void MakeTiling(int32_t *addr, size_t size) -{ - assert(sizeof(TCubeTiling) <= size); - // TCubeTiling该结构体在kernel_tiling/kernel_tiling.h中的结构体定义 - // tiling_api.h中本身定义的结构与kernel_tiling.h相近,通过GetTiling实现映射 - // TCubeTiling定义的可读性较好,可以直接理解,但使用tiling_api可以直接使能部分默认值 - // 考虑到工具本身需要体现对应用的细粒度控制,所以直接使用kernel_tiling.h中的结构 - TCubeTiling *tiling = (TCubeTiling *)addr; - // 此处计算使用的核数 - tiling->usedCoreNum = 2; // (M/singleCoreM)*(N/singleCoreN)*(K/singleCoreK)=2*1*1=2 - // 对于 xa 是[M, Ka]矩阵, xb 是[Kb, N]矩阵,此处数据需要与外部格式保持一致 - // 参考 AscendC算子开发文档 - // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devguide/opdevg/ascendcopdevg/atlas_ascendc_10_0060.html - // 中对 数据分块(Tiling) 部分的介绍 - tiling->M = 1024; - tiling->N = 640; - tiling->Ka = 256; // Ka和Kb一般一样,只有pad的时候存在不一致,比如[1, 62]*[64, 2],这里64就是有pad的 - tiling->Kb = 256; - tiling->isBias = 1; - // 多核切分的tiling参数,用于度量单个核上处理的数据大小 - // xa在M轴上切分,分成多个singleCoreM;单核处理singleCoreM * singleCoreK大小数据 - // xb在N轴上切分,分成多个singleCoreN;单核处理singleCoreK * singleCoreN - // 由于输入在M和N轴上切分了,输出singleCoreM * singleCoreN - tiling->singleCoreM = 512; - tiling->singleCoreN = 640; - tiling->singleCoreK = 256; // 不建议对k进行切分,会导致累加,引起不确定计算 - // 核内切分的tiling参数,用于单个核内的最小计算单位 - tiling->baseM = 256; - tiling->baseN = 128; - tiling->baseK = 64; - tiling->stepM = 1; - tiling->stepN = 1; - tiling->stepKa = 4; - tiling->stepKb = 1; - // A1+B1的缓存数据需要小于等于shareL1Size大小 - tiling->depthA1 = 8; // 矩阵[baseM, baseK]的缓存数量 - tiling->depthB1 = 2; // 矩阵[basek, baseN]的缓存数量 - // 其他参数 - tiling->iterateOrder = 0; // 控制迭代的方向:0代表先M轴再N轴,1代表先N轴再M轴 - tiling->shareL1Size = 294912; // 如存在多个matmul时,可以单独控制每个使用空间 295424 - // 不小于(baseM*baseK*depthA1 + baseK+baseN*depthB1)*sizeof(half) = 294912 - tiling->shareL0CSize = 256 * 128 * 4; // 如存在多个matmul时,可以单独控制每个使用空间 - // 不小于baseM*baseN*sizeof(float) - // 下列是bmm中使用的batch参数,如果需要实现bmm,该结构体中还有其他tiling参数 - tiling->batchM = 1; // 对于普通matmul,默认1 - tiling->batchN = 1; // 对于普通matmul,默认1 - tiling->singleBatchM = 1; - tiling->singleBatchN = 1; - // 下面的db参数用于控制ping-pong - tiling->dbL0A = 2; - tiling->dbL0B = 2; - tiling->dbL0C = 1; -} - -int32_t main(int32_t argc, char *argv[]) -{ - size_t xaSize = 1024 * 256 
* sizeof(aclFloat16); - size_t xbSize = 256 * 640 * sizeof(aclFloat16); - size_t biasSize = 640 * sizeof(float); - size_t ySize = 1024 * 640 * sizeof(float); - size_t workspaceSize = 16 * 1024 * 1024 * sizeof(float); // AscendC::GetUserWorkspace中预留空间 - size_t tilingSize = 96 * sizeof(uint32_t); - uint32_t blockDim = 1; - - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - - aclFloat16 *xaHost; - CHECK_ACL(aclrtMallocHost((void **)(&xaHost), xaSize)); - fillValue(xaHost, xaSize, 4.0f); - - aclFloat16 *xbHost; - CHECK_ACL(aclrtMallocHost((void **)(&xbHost), xbSize)); - fillValue(xbHost, xbSize, 4.0f); - - float *biasHost; - CHECK_ACL(aclrtMallocHost((void **)(&biasHost), biasSize)); - fillFloatValue(biasHost, biasSize, 0.0f); - - float *workspaceHost; - CHECK_ACL(aclrtMallocHost((void **)(&workspaceHost), workspaceSize)); - fillFloatValue(workspaceHost, workspaceSize, 0.0f); - - int32_t *tilingHost; - CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingSize)); - MakeTiling(tilingHost, tilingSize); - - // 将host的输入同步到device - uint8_t *xaDevice; - uint8_t *xbDevice; - uint8_t *biasDevice; - uint8_t *tilingDevice; - uint8_t *workspaceDevice; - CHECK_ACL(aclrtMalloc((void **)&xaDevice, xaSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(xaDevice, xaSize, xaHost, xaSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 xa - CHECK_ACL(aclrtMalloc((void **)&xbDevice, xbSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(xbDevice, xbSize, xbHost, xbSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 xb - CHECK_ACL(aclrtMalloc((void **)&biasDevice, biasSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(biasDevice, biasSize, biasHost, biasSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 bias - CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, tilingHost, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 tiling - CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy( - workspaceDevice, workspaceSize, workspaceHost, workspaceSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备workspace - - uint8_t *yDevice; - CHECK_ACL(aclrtMalloc((void **)&yDevice, ySize, ACL_MEM_MALLOC_HUGE_FIRST)); // 准备 输出 - - void *addr; - (void)GetAscendCoreSyncAddr(&addr); - matmul_leakyrelu_custom_do(blockDim, - nullptr, - stream, - (uint8_t *)addr, - xaDevice, - xbDevice, - biasDevice, - yDevice, - workspaceDevice, - tilingDevice); - CHECK_ACL(aclrtSynchronizeStream(stream)); - - // 将device的输出同步到host - float *yHost; - CHECK_ACL(aclrtMallocHost((void **)(&yHost), ySize)); - CHECK_ACL(aclrtMemcpy(yHost, ySize, yDevice, ySize, ACL_MEMCPY_DEVICE_TO_HOST)); - printTensor(yHost, 4 * 8 * 4); - - // 释放资源 - CHECK_ACL(aclrtFree(xaDevice)); - CHECK_ACL(aclrtFree(xbDevice)); - CHECK_ACL(aclrtFree(tilingDevice)); - CHECK_ACL(aclrtFree(yDevice)); - - CHECK_ACL(aclrtFreeHost(xaHost)); - CHECK_ACL(aclrtFreeHost(xbHost)); - CHECK_ACL(aclrtFreeHost(tilingHost)); - CHECK_ACL(aclrtFreeHost(yHost)); - - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); - return 0; -} \ No newline at end of file diff --git a/sample/normal_sample/mix/matmul_leakyrelu_kernel.cpp b/sample/normal_sample/mix/matmul_leakyrelu_kernel.cpp deleted file mode 
100644 index 378a8a172..000000000 --- a/sample/normal_sample/mix/matmul_leakyrelu_kernel.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. - */ - -#include "kernel_operator.h" -#include "lib/matmul_intf.h" - -using namespace AscendC; -using namespace matmul; - -template -class MatmulLeakyKernel { -public: - __aicore__ inline MatmulLeakyKernel(){}; - __aicore__ inline void Init( - GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling, TPipe *pipe); - __aicore__ inline void Process(TPipe *pipe); - - __aicore__ inline void MatmulCompute(); - __aicore__ inline void LeakyReluCompute(); - __aicore__ inline void CopyOut(uint32_t count); - __aicore__ inline void CalcOffset(int32_t blockIdx, int32_t usedCoreNum, const TCubeTiling &tiling, - int32_t &offsetA, int32_t &offsetB, int32_t &offsetC, int32_t &offsetBias); - - Matmul, MatmulType, - MatmulType, MatmulType> - matmulObj; - - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - GlobalTensor biasGlobal; - LocalTensor reluOutLocal; - TCubeTiling tiling; - TQue reluOutQueue_; -}; - -template -__aicore__ inline void MatmulLeakyKernel::Init( - GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGM, TPipe *pipe) -{ - auto tempTilingGM = (__gm__ uint32_t *)tilingGM; - auto tempTiling = (uint32_t *)&tiling; - for (int32_t i = 0; i < sizeof(TCubeTiling) / sizeof(int32_t); ++i, ++tempTilingGM, ++tempTiling) { - *tempTiling = *tempTilingGM; - } - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType *>(a), tiling.M * tiling.Ka); - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType *>(b), tiling.Kb * tiling.N); - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType *>(c), tiling.M * tiling.N); - biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType *>(bias), tiling.N); - - int32_t offsetA, offsetB, offsetC, offsetBias; - CalcOffset(GetBlockIdx(), tiling.usedCoreNum, tiling, offsetA, offsetB, offsetC, offsetBias); - aGlobal = aGlobal[offsetA]; - bGlobal = bGlobal[offsetB]; - cGlobal = cGlobal[offsetC]; - biasGlobal = biasGlobal[offsetBias]; - pipe->InitBuffer(reluOutQueue_, 1, tiling.baseM * tiling.baseN * sizeof(cType)); - SetSysWorkspace(workspace); - if (GetSysWorkSpacePtr() == nullptr) { - return; - } -} - -template -__aicore__ inline void MatmulLeakyKernel::Process(TPipe *pipe) -{ - uint32_t computeRound = 0; - -#ifdef CUSTOM_ASCEND310P - TBuf<> tmpMMFormatUb; - LocalTensor mmformatUb; - pipe->InitBuffer(tmpMMFormatUb, tiling.baseM * tiling.baseN * sizeof(cType)); - mmformatUb = tmpMMFormatUb.Get(tiling.baseM * tiling.baseN * sizeof(cType)); - matmulObj.SetLocalWorkspace(mmformatUb); -#endif - matmulObj.SetTensorA(aGlobal); - matmulObj.SetTensorB(bGlobal); - matmulObj.SetBias(biasGlobal); - while (matmulObj.template Iterate()) { - MatmulCompute(); - LeakyReluCompute(); - CopyOut(computeRound); - computeRound++; - } - matmulObj.End(); -} - -template -__aicore__ inline void MatmulLeakyKernel::MatmulCompute() -{ - reluOutLocal = reluOutQueue_.AllocTensor(); - matmulObj.template GetTensorC(reluOutLocal, false, true); -} - -template -__aicore__ inline void MatmulLeakyKernel::LeakyReluCompute() -{ - LeakyRelu(reluOutLocal, reluOutLocal, (cType)0.001, tiling.baseM * tiling.baseN); - reluOutQueue_.EnQue(reluOutLocal); -} - -template -__aicore__ inline void MatmulLeakyKernel::CopyOut(uint32_t count) -{ - reluOutQueue_.DeQue(); - const uint32_t roundM = tiling.singleCoreM / tiling.baseM; - const uint32_t 
roundN = tiling.singleCoreN / tiling.baseN; - uint32_t startOffset = (count % roundM * tiling.baseM * tiling.N + count / roundM * tiling.baseN); - DataCopyParams copyParam = {(uint16_t)tiling.baseM, - (uint16_t)(tiling.baseN * sizeof(cType) / DEFAULT_C0_SIZE), - 0, - (uint16_t)((tiling.N - tiling.baseN) * sizeof(cType) / DEFAULT_C0_SIZE)}; - DataCopy(cGlobal[startOffset], reluOutLocal, copyParam); - reluOutQueue_.FreeTensor(reluOutLocal); -} - -template -__aicore__ inline void MatmulLeakyKernel::CalcOffset(int32_t blockIdx, - int32_t usedCoreNum, const TCubeTiling &tiling, int32_t &offsetA, int32_t &offsetB, int32_t &offsetC, - int32_t &offsetBias) -{ - auto mSingleBlocks = Ceil(tiling.M, tiling.singleCoreM); - auto mCoreIndx = blockIdx % mSingleBlocks; - auto nCoreIndx = blockIdx / mSingleBlocks; - - offsetA = mCoreIndx * tiling.Ka * tiling.singleCoreM; - offsetB = nCoreIndx * tiling.singleCoreN; - offsetC = mCoreIndx * tiling.N * tiling.singleCoreM + nCoreIndx * tiling.singleCoreN; - offsetBias = nCoreIndx * tiling.singleCoreN; -} - -__aicore__ inline void matmul_leakyrelu_custom_inner( - GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) -{ - MatmulLeakyKernel matmulLeakyKernel; - TPipe pipe; - matmulLeakyKernel.Init(a, b, bias, c, workspace, tiling, &pipe); - REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulLeakyKernel.matmulObj, &matmulLeakyKernel.tiling); - matmulLeakyKernel.Process(&pipe); -} - -// 基于CANN sample仓中的MatMulLeakyReluCustomSample进行改造 -// https://gitee.com/ascend/samples/tree/master/operator/ -// 如下内容是编译生成的build/auto_gen/ascendc_kernels_npu目录下 -// auto_gen_matmul_leakyrelu_custom.cpp中代码的简化 -// 具体简化内容: -// 1. 移除了cpu部分的代码 -// 2. 由于本身不再需要修改核函数入口,故而不需要__global__宏的替代 -// 3. 由于明确有workspace和tiling,故而移除了这些宏中的内容 -__aicore__ inline GM_ADDR kfc_init(GM_ADDR ffts_addr, GM_ADDR workspace) -{ - GM_ADDR workspace_param; - GM_ADDR workspace_usr; - workspace_param = workspace; - if (workspace_param == nullptr) { - return workspace; - } - set_ffts_base_addr((uint64_t)ffts_addr); - AscendC::SetSysWorkspaceForce(workspace_param); - workspace_usr = AscendC::GetUserWorkspace(workspace_param); -#if defined(REGIST_MATMUL_OBJ) - if constexpr (g_coreType == AscendC::AIC) { - matmul::clearWorkspace(workspace_param); - } - if constexpr (g_coreType == AscendC::AIV) { -#ifdef MIX_N - FftsCrossCoreSync(PIPE_MTE3, 0x501); - WaitEvent(0x5); -#endif - } -#endif - return workspace_usr; -} - -extern "C" __global__ __aicore__ void matmul_leakyrelu_custom( - GM_ADDR ffts_addr, GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) -{ - workspace = kfc_init(ffts_addr, workspace); - matmul_leakyrelu_custom_inner(a, b, bias, c, workspace, tiling); -} - -// call of kernel function -void matmul_leakyrelu_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *ffts_addr, uint8_t *a, - uint8_t *b, uint8_t *bias, uint8_t *c, uint8_t *workspace, uint8_t *tilingGm) -{ - matmul_leakyrelu_custom<<>>(ffts_addr, a, b, bias, c, workspace, tilingGm); -} diff --git a/sample/normal_sample/vec_only/Makefile b/sample/normal_sample/vec_only/Makefile deleted file mode 100644 index 00e70c995..000000000 --- a/sample/normal_sample/vec_only/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 -ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest - -COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 -COMPILER_FLAG := -xcce -O2 
-std=c++17 -LINK_FLAG := --cce-fatobj-link -DAV_FLAG := --cce-aicore-arch=dav-c220-vec -ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 -HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include -LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++ - -all: build - -build: add_kernel.o main.o add.fatbin - -add_kernel.o: add_kernel.cpp - $(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^ - -main.o: main.cpp - $(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(HOST_INC_FLAG) -o $@ -c $^ - -add.fatbin: add_kernel.o main.o - $(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ $(LINK_LIBS) - -.PHONY: clean -clean: - rm *.o add.fatbin \ No newline at end of file diff --git a/sample/normal_sample/vec_only/add_kernel.cpp b/sample/normal_sample/vec_only/add_kernel.cpp deleted file mode 100644 index 444be23e5..000000000 --- a/sample/normal_sample/vec_only/add_kernel.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. - * - * Function : z = x + y - * This sample is a very basic sample that implements vector add on Ascend plaform. - * In this sample: - * Length of x / y / z is 8*2048. - * Num of vector core used in sample is 8. - * Length for each core to compute is 2048. - * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. - * - */ -#include "kernel_operator.h" -using namespace AscendC; -constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data -constexpr int32_t USE_CORE_NUM = 8; // num of core used -constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core -constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue -constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer - -class KernelAdd { -public: - __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) - { - // get start index for current core, core parallel - xGm.SetGlobalBuffer((__gm__ int16_t*)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - yGm.SetGlobalBuffer((__gm__ int16_t*)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - zGm.SetGlobalBuffer((__gm__ int16_t*)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - // pipe alloc memory to queue, the unit is Bytes - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); - } - __aicore__ inline void Process() - { - // loop count need to be doubled, due to double buffer - constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; - // tiling strategy, pipeline parallel - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); - } - } -private: - __aicore__ inline void CopyIn(int32_t progress) - { - // alloc tensor from queue memory - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = inQueueY.AllocTensor(); - // copy progress_th tile from global tensor to local tensor - DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); - DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); - // enque input tensors to VECIN queue - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ 
inline void Compute(int32_t progress) - { - // deque input tensors from VECIN queue - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = inQueueY.DeQue(); - LocalTensor zLocal = outQueueZ.AllocTensor(); - // call Add instr for computation - Add(zLocal, xLocal, yLocal, TILE_LENGTH); - // enque the output tensor to VECOUT queue - outQueueZ.EnQue(zLocal); - // free input tensors for reuse - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - } - __aicore__ inline void CopyOut(int32_t progress) - { - // deque output tensor from VECOUT queue - LocalTensor zLocal = outQueueZ.DeQue(); - // copy progress_th tile from local tensor to global tensor - DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); - // free output tensor for reuse - outQueueZ.FreeTensor(zLocal); - } -private: - TPipe pipe; - // create queues for input, in this case depth is equal to buffer num - TQue inQueueX, inQueueY; - // create queue for output, in this case depth is equal to buffer num - TQue outQueueZ; - GlobalTensor xGm, yGm, zGm; -}; -// implementation of kernel function -extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) -{ - KernelAdd op; - op.Init(x, y, z); - op.Process(); -} - -// call of kernel function -void add_custom_do(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* x, uint8_t* y, uint8_t* z) -{ - add_custom<<>>(x, y, z); -} \ No newline at end of file diff --git a/sample/normal_sample/vec_only/main.cpp b/sample/normal_sample/vec_only/main.cpp deleted file mode 100644 index 45865cea5..000000000 --- a/sample/normal_sample/vec_only/main.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. - */ - -#include -#include "acl/acl.h" - -#define ACL_ERROR_NONE 0 - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - printf("%s: %d aclError %d\n", __FILE__, __LINE__, __ret); \ - } \ - } while (0) - -void prepareTensor(int16_t *ptr, size_t size) -{ - for (size_t i = 0; i < size / sizeof(int16_t); i++) { - ptr[i] = i; - } -} - -void printTensor(int16_t *ptr, size_t size) -{ - size_t colNum = 8; - for (size_t i = 0; i < size / colNum / sizeof(int16_t); i++) { - for (size_t j = 0; j < colNum; j++) { - printf("%hu ", ptr[colNum * i + j]); - } - printf("\n"); - } -} - -extern void add_custom_do(uint32_t coreDim, void *l2ctrl, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); -int32_t main(int32_t argc, char *argv[]) -{ - size_t inputByteSize = 8 * 2048 * sizeof(int16_t); - size_t outputByteSize = 8 * 2048 * sizeof(int16_t); - uint32_t blockDim = 8; - // AscendCL初始化 - CHECK_ACL(aclInit(nullptr)); - // 运行管理资源申请 - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - // 分配Host内存 - int16_t *xHost = nullptr; - int16_t *yHost = nullptr; - int16_t *zHost = nullptr; - CHECK_ACL(aclrtMallocHost((void**)(&xHost), inputByteSize)); - CHECK_ACL(aclrtMallocHost((void**)(&yHost), inputByteSize)); - CHECK_ACL(aclrtMallocHost((void**)(&zHost), outputByteSize)); - // 分配Device内存 - uint8_t *xDevice = nullptr; - uint8_t *yDevice = nullptr; - uint8_t *zDevice = nullptr; - CHECK_ACL(aclrtMalloc((void**)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void**)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void**)&zDevice, outputByteSize, 
ACL_MEM_MALLOC_HUGE_FIRST));
-    // Initialize host memory
-    prepareTensor(xHost, inputByteSize);
-    prepareTensor(yHost, inputByteSize);
-    CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
-    CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
-    // Launch the kernel through the <<<>>> launch syntax, which add_custom_do wraps
-    add_custom_do(blockDim, nullptr, stream, xDevice, yDevice, zDevice);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-    // Copy the result from the device back to the host
-    CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
-    printTensor(zHost, 2 * 8 * 2); // print 2 rows
-    // Release the allocated resources
-    CHECK_ACL(aclrtFree(xDevice));
-    CHECK_ACL(aclrtFree(yDevice));
-    CHECK_ACL(aclrtFree(zDevice));
-    CHECK_ACL(aclrtFreeHost(xHost));
-    CHECK_ACL(aclrtFreeHost(yHost));
-    CHECK_ACL(aclrtFreeHost(zHost));
-    // Deinitialize AscendCL
-    CHECK_ACL(aclrtDestroyStream(stream));
-    CHECK_ACL(aclrtDestroyContext(context));
-    CHECK_ACL(aclrtResetDevice(deviceId));
-    CHECK_ACL(aclFinalize());
-    return 0;
-}
\ No newline at end of file
diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md
deleted file mode 100644
index a2b1ba635..000000000
--- a/sample/pytorch_adapter/README.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# Adapting the custom operator to the PyTorch framework
-
-## Introduction
-Ascend offers many ways to plug operators into frameworks. This directory shows the simplest one; each subdirectory is an independent, ready-to-use project.
-
-## Dependencies
-As with operator integration for PyTorch elsewhere in the industry, framework adaptation requires a correct PyTorch version on the device (plus a matching torch_npu version).
-
-PyTorch can be installed via pip; for torch_npu releases see [here](https://gitee.com/ascend/pytorch/releases) and pick the torch_npu build that matches your PyTorch.
-
-## Project layout
-The overall directory structure:
-```
-- pytorch_adapter
- |- jit_compile # just-in-time compilation flavor
- |- add_adapter.cpp # adapts the operator to PyTorch through the operator shared-library interface
- |- add_kernel.cpp # the Ascend operator implementation, exposing a host-side shared-library interface
- |- main.py # Python entry point tying everything together
- |- Makefile # build script producing the host-side shared library for the Ascend operator
- |- with_setuptools # wheel-package flavor
- |- add_adapter.cpp
- |- add_kernel.cpp
- |- Makefile
- |- setup.py # setuptools entry point; compiles and packages the wheel
- |- test.py # entry point for testing the wheel's functionality
-```
-
-## Usage
-
-### jit_compile project
-Run the following; the Python module is generated on the fly during execution and used directly:
-```
-python main.py
-```
-
-### setuptools project
-The with_setuptools project builds an installable wheel package, which is convenient for multi-machine deployment.
-
-
-1. Build the package (setuptools supports several modes, e.g. build and install; only one is shown here):
-```
-python setup.py bdist_wheel # builds the wheel under the dist directory
-```
-
-2. Install the package from the ```dist``` directory with pip.
-
-3. Run the test script:
-```
-python test.py
-```
-
-## Other notes
-1. These samples use static tiling. With dynamic tiling, the adapter (add_adapter.cpp) would analyze the tensor shapes and choose a suitable tiling. (That step is mandatory somewhere in the flow; it just lives in different places. In aclnn it happens inside the interface implementation, and here add_custom_do itself could be wrapped so the tiling is chosen internally.) A sketch of such a shape check follows below.
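-2. As a concrete illustration of that shape analysis (hypothetical; not part of the shipped adapter), a defensive NPU implementation could reject shapes the static tiling cannot handle before launching. `my_add_impl_npu` is the NPU forward defined in add_adapter.cpp; the wrapper name is illustrative:
-```
-at::Tensor my_add_impl_npu_checked(const at::Tensor &self, const at::Tensor &other)
-{
-    // Guard the static 8*2048 float16 tiling before handing off to the kernel.
-    TORCH_CHECK(self.sizes() == other.sizes(), "my_add: inputs must share one shape");
-    TORCH_CHECK(self.scalar_type() == at::kHalf, "my_add: kernel only handles float16");
-    TORCH_CHECK(self.numel() == 8 * 2048, "my_add: static tiling expects 8*2048 elements");
-    return my_add_impl_npu(self, other);
-}
-```
-Shapes that fail these checks would otherwise silently produce wrong results, since the kernel always processes exactly 8*2048 elements.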
\ No newline at end of file
diff --git a/sample/pytorch_adapter/jit_compile/Makefile b/sample/pytorch_adapter/jit_compile/Makefile
deleted file mode 100644
index ec9115f37..000000000
--- a/sample/pytorch_adapter/jit_compile/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Location of the CANN toolkit; distilled from ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # see CMAKE_C_COMPILER in device_config.cmake
-COMPILER_FLAG := -xcce -O2 -std=c++17
-DYNAMIC_LIB_FLAG := -fPIC -shared
-DAV_FLAG := --cce-aicore-arch=dav-c220-vec
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # simplified from the device_intf.cmake configuration
-
-all: build
-
-build: libcustom_kernels.so
-
-# to extend this later, append additional kernel .cpp files here
-libcustom_kernels.so: add_kernel.cpp
-	$(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^
-
-.PHONY: clean
-clean:
-	rm *.so
\ No newline at end of file
diff --git a/sample/pytorch_adapter/jit_compile/add_adapter.cpp b/sample/pytorch_adapter/jit_compile/add_adapter.cpp
deleted file mode 100644
index 6c65e60ec..000000000
--- a/sample/pytorch_adapter/jit_compile/add_adapter.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <torch/extension.h>
-#include "torch_npu/csrc/core/npu/NPUStream.h"
-#include "torch_npu/csrc/framework/OpCommand.h"
-
-using torch::autograd::AutogradContext;
-using torch::autograd::Function;
-using tensor_list = std::vector<at::Tensor>;
-using namespace at;
-
-extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
-
-// Forward implementation registered for the NPU device
-at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other)
-{
-    // Create the output storage
-    at::Tensor result = at::Tensor(self);
-    // Translate the PyTorch structures into types and layouts CANN understands:
-    // 1. (Important) Analyze the tensor shapes to pick a suitable tiling. (This operator
-    //    hard-codes its tiling for simplicity, so results are only correct for that shape.)
-    // 2. Convert dtypes and formats -- nothing to convert here, used as-is.
-    auto stream = c10_npu::getCurrentNPUStream().stream(false);
-    auto x = self.storage().data();
-    auto y = other.storage().data();
-    auto z = result.storage().data();
-
-    uint32_t blockDim = 8;
-    auto callback = [stream, blockDim, x, y, z]() -> int {
-        add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z);
-        return 0; // the kernel's result status could be surfaced here; not implemented yet
-    };
-    // Submit the operator
-    at_npu::native::OpCommand cmd;
-    cmd.Name("my_add").SetCustomHandler(callback).Run();
-    return result;
-}
-
-// Backward implementation registered for the NPU device
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_npu(const at::Tensor &self)
-{
-    at::Tensor result = at::Tensor(self); // create the output storage
-
-    return {result, result};
-}
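-
-// For z = x + y, dz/dx and dz/dy are both the identity, so the backward returns
-// the incoming output gradient once per input. Note that at::Tensor(self) shares
-// storage with its argument instead of copying, which is enough for this sample.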
-
-// Forward implementation registered for the Meta device
-at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other)
-{
-    return empty_like(self);
-}
-
-// Backward implementation registered for the Meta device
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_meta(const at::Tensor &self)
-{
-    auto result = empty_like(self);
-    return std::make_tuple(result, result);
-}
-
-// Look up whichever per-device implementation is registered on this op
-at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other)
-{
-    static auto op =
-        torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed<decltype(my_add_impl)>();
-    return op.call(self, other);
-}
-// Look up whichever per-device implementation is registered on this op
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl(const at::Tensor &self)
-{
-    static auto op = torch::Dispatcher::singleton()
-                         .findSchemaOrThrow("myaten::my_add_backward", "")
-                         .typed<decltype(my_add_backward_impl)>();
-    return op.call(self);
-}
-
-// Register the my_add and my_add_backward schemas in the myaten namespace
-TORCH_LIBRARY(myaten, m)
-{
-    m.def("my_add(Tensor self, Tensor other) -> Tensor");
-    m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)");
-}
-
-// Bind forward and backward together by deriving from torch::autograd::Function
-class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
-public:
-    static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other)
-    {
-        at::AutoDispatchBelowADInplaceOrView guard;
-        return my_add_impl(self, other);
-    }
-
-    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
-    {
-        auto grad_output = grad_outputs[0];
-        auto result = my_add_backward_impl(grad_output);
-        return {std::get<0>(result), std::get<1>(result)};
-    }
-};
-
-at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other)
-{
-    return MyAddFunction::apply(self, other);
-}
-
-// Bind the NPU autograd implementation to the op
-// On PyTorch versions below 2.1, AutogradPrivateUse1 must be replaced with AutogradXLA
-TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m)
-{
-    m.impl("my_add", &my_add_impl_autograd);
-}
-
-// Register forward and backward for the NPU device
-// PyTorch 2.1+ names the NPU device PrivateUse1; below 2.1 it was XLA, so replace
-// PrivateUse1 with XLA on those versions
-TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m)
-{
-    m.impl("my_add", &my_add_impl_npu);
-    m.impl("my_add_backward", &my_add_backward_impl_npu);
-}
-
-// Register forward and backward for the Meta device
-TORCH_LIBRARY_IMPL(myaten, Meta, m)
-{
-    m.impl("my_add", &my_add_impl_meta);
-    m.impl("my_add_backward", &my_add_backward_impl_meta);
-}
-
-// Bind the C++ entry point to Python via pybind
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("my_add", &my_add_impl_autograd, "x + y");
-}
diff --git a/sample/pytorch_adapter/jit_compile/add_kernel.cpp b/sample/pytorch_adapter/jit_compile/add_kernel.cpp
deleted file mode 100644
index 9aa62e093..000000000
--- a/sample/pytorch_adapter/jit_compile/add_kernel.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
- *
- * Function : z = x + y
- * This sample is a very basic sample that implements vector add on the Ascend platform.
- * In this sample:
- * Length of x / y / z is 8*2048.
- * Num of vector core used in sample is 8. - * Length for each core to compute is 2048. - * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. - * - */ -#include "kernel_operator.h" -using namespace AscendC; -constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data -constexpr int32_t USE_CORE_NUM = 8; // num of core used -constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core -constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue -constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer - -class KernelAdd { -public: - __aicore__ inline KernelAdd() - {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) - { - // get start index for current core, core parallel - xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - // pipe alloc memory to queue, the unit is Bytes - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - } - __aicore__ inline void Process() - { - // loop count need to be doubled, due to double buffer - constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; - // tiling strategy, pipeline parallel - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); - } - } - -private: - __aicore__ inline void CopyIn(int32_t progress) - { - // alloc tensor from queue memory - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = inQueueY.AllocTensor(); - // copy progress_th tile from global tensor to local tensor - DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); - DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); - // enque input tensors to VECIN queue - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ inline void Compute(int32_t progress) - { - // deque input tensors from VECIN queue - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = inQueueY.DeQue(); - LocalTensor zLocal = outQueueZ.AllocTensor(); - // call Add instr for computation - Add(zLocal, xLocal, yLocal, TILE_LENGTH); - // enque the output tensor to VECOUT queue - outQueueZ.EnQue(zLocal); - // free input tensors for reuse - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - } - __aicore__ inline void CopyOut(int32_t progress) - { - // deque output tensor from VECOUT queue - LocalTensor zLocal = outQueueZ.DeQue(); - // copy progress_th tile from local tensor to global tensor - DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); - // free output tensor for reuse - outQueueZ.FreeTensor(zLocal); - } - -private: - TPipe pipe; - // create queues for input, in this case depth is equal to buffer num - TQue inQueueX, inQueueY; - // create queue for output, in this case depth is equal to buffer num - TQue outQueueZ; - GlobalTensor xGm, yGm, zGm; -}; -// implementation of kernel function -extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) -{ - KernelAdd op; - op.Init(x, y, z); - op.Process(); -} - -// 包裹核函数,使得普通编译器能认识这个符号 -extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) -{ - 
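-    // Launch configuration: blockDim selects how many AI vector cores run the
-    // kernel (8 here, matching USE_CORE_NUM above); stream is the ACL stream
-    // the launch is queued on.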
add_custom<<>>(x, y, z); -} \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_compile/main.py b/sample/pytorch_adapter/jit_compile/main.py deleted file mode 100644 index 847a51f1c..000000000 --- a/sample/pytorch_adapter/jit_compile/main.py +++ /dev/null @@ -1,70 +0,0 @@ -import os -import subprocess -import torch -import torch_npu -import torch.utils.cpp_extension -from torch_npu.testing.testcase import TestCase, run_tests - -PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) -CUR_PATH = os.path.abspath(os.path.dirname(__file__)) - - -def compile_kernels(): - # 由于pytorch中没有昇腾device编译的扩展,所以此处人工加make - subprocess.run("make") - - -def compile_host(): - extra_ldflags = [] - extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib") - extra_ldflags.append("-ltorch_npu") - extra_ldflags.append(f"-L{CUR_PATH}/") - extra_ldflags.append("-lcustom_kernels") - extra_include_paths = [] - extra_include_paths.append("./") - extra_include_paths.append(os.path.join( - PYTORCH_NPU_INSTALL_PATH, "include")) - extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join( - PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc")) - - module = torch.utils.cpp_extension.load( - name="jit_extension", - sources=[ - "add_adapter.cpp" - ], - extra_include_paths=extra_include_paths, - extra_ldflags=extra_ldflags, - verbose=True) - return module - - -class TestCustomAdd(TestCase): - def test_add(self): - module = compile_host() - # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 - # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 - length = [8, 2048] - x = torch.rand(length, device='cpu', dtype=torch.float16) - y = torch.rand(length, device='cpu', dtype=torch.float16) - - x_npu = x.npu() - y_npu = y.npu() - x_npu.requires_grad = True - y_npu.requires_grad = True - output = module.my_add(x_npu, y_npu) - # 反向能力验证 - output.backward(output) - - x.requires_grad = True - y.requires_grad = True - cpuout = torch.add(x, y) - cpuout.backward(cpuout) - - self.assertRtolEqual(output, cpuout) - self.assertRtolEqual(x_npu.grad, x.grad) - self.assertRtolEqual(y_npu.grad, y.grad) - - -if __name__ == '__main__': - compile_kernels() - run_tests() diff --git a/sample/pytorch_adapter/with_setuptools/Makefile b/sample/pytorch_adapter/with_setuptools/Makefile deleted file mode 100644 index ec9115f37..000000000 --- a/sample/pytorch_adapter/with_setuptools/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 -ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest - -COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 -COMPILER_FLAG := -xcce -O2 -std=c++17 -DYNAMIC_LIB_FLAG := -fPIC -shared -DAV_FLAG := --cce-aicore-arch=dav-c220-vec -ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 - -all: build - -build: libcustom_kernels.so - -# 后续如果要扩展,把多个kernel的cpp都加到后面 -libcustom_kernels.so: add_kernel.cpp - $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ - -.PHONY: clean -clean: - rm *.so \ No newline at end of file diff --git a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp deleted file mode 100644 index 6c65e60ec..000000000 --- 
diff --git a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
deleted file mode 100644
index 6c65e60ec..000000000
--- a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <torch/extension.h>
-#include "torch_npu/csrc/core/npu/NPUStream.h"
-#include "torch_npu/csrc/framework/OpCommand.h"
-
-using torch::autograd::AutogradContext;
-using torch::autograd::Function;
-using tensor_list = std::vector<at::Tensor>;
-using namespace at;
-
-extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
-
-// forward implementation registered for the NPU device
-at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other)
-{
-    // allocate the output tensor
-    at::Tensor result = at::Tensor(self);
-    // Translate the PyTorch structures into data types and layouts CANN understands:
-    // 1. (Important) Pick a suitable tiling from the tensor shape. To keep this sample simple
-    //    the tiling is fixed, so the computation is only correct for the supported shape.
-    // 2. Convert data type and format -- no format handling is needed here, use as-is.
-    auto stream = c10_npu::getCurrentNPUStream().stream(false);
-    auto x = self.storage().data();
-    auto y = other.storage().data();
-    auto z = result.storage().data();
-
-    uint32_t blockDim = 8;
-    auto callback = [stream, blockDim, x, y, z]() -> int {
-        add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z);
-        return 0; // the kernel's execution status could be surfaced here; not implemented yet
-    };
-    // dispatch the operator
-    at_npu::native::OpCommand cmd;
-    cmd.Name("my_add").SetCustomHandler(callback).Run();
-    return result;
-}
-
-// backward implementation registered for the NPU device
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_npu(const at::Tensor &self)
-{
-    at::Tensor result = at::Tensor(self); // allocate the output tensor
-
-    return {result, result};
-}
-
-// forward implementation registered for the Meta device
-at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other)
-{
-    return empty_like(self);
-}
-
-// backward implementation registered for the Meta device
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_meta(const at::Tensor &self)
-{
-    auto result = empty_like(self);
-    return std::make_tuple(result, result);
-}
-
-// look up the per-device implementations registered on this op
-at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other)
-{
-    static auto op =
-        torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed<decltype(my_add_impl)>();
-    return op.call(self, other);
-}
-// look up the per-device implementations registered on this op
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl(const at::Tensor &self)
-{
-    static auto op = torch::Dispatcher::singleton()
-                         .findSchemaOrThrow("myaten::my_add_backward", "")
-                         .typed<decltype(my_add_backward_impl)>();
-    return op.call(self);
-}
-
-// register the my_add and my_add_backward schemas in the myaten namespace
-TORCH_LIBRARY(myaten, m)
-{
-    m.def("my_add(Tensor self, Tensor other) -> Tensor");
-    m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)");
-}
-
-// bind forward and backward together by deriving from torch::autograd::Function
-class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
-public:
-    static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other)
-    {
-        at::AutoDispatchBelowADInplaceOrView guard;
-        return my_add_impl(self, other);
-    }
-
-    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
-    {
-        auto grad_output = grad_outputs[0];
-        auto result = my_add_backward_impl(grad_output);
-        return {std::get<0>(result), std::get<1>(result)};
-    }
-};
-
-at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other)
-{
-    return MyAddFunction::apply(self, other);
-}
-
-// bind the autograd implementation of the op for NPU
-// on PyTorch versions below 2.1, AutogradPrivateUse1 has to be changed to AutogradXLA
-TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m)
-{
-    m.impl("my_add", &my_add_impl_autograd);
-}
-
-// Register the forward and backward implementations for the NPU device.
-// PyTorch 2.1 and later use the device name PrivateUse1 for NPU; versions below 2.1
-// use XLA, so change PrivateUse1 to XLA on those versions.
-TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m)
-{
-    m.impl("my_add", &my_add_impl_npu);
-    m.impl("my_add_backward", &my_add_backward_impl_npu);
-}
-
-// register the forward and backward implementations for the Meta device
-TORCH_LIBRARY_IMPL(myaten, Meta, m)
-{
-    m.impl("my_add", &my_add_impl_meta);
-    m.impl("my_add_backward", &my_add_backward_impl_meta);
-}
-
-// bind the C++ interface to Python via pybind
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("my_add", &my_add_impl_autograd, "x + y");
-}
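Because my_add is defined as a dispatcher schema rather than a plain pybind function, additional backends can be attached without touching the NPU path. The following is a hedged sketch, not part of the sample: it registers an eager CPU implementation for the same `myaten::my_add` schema, which can serve as a golden reference while debugging the NPU kernel. `my_add_impl_cpu` is a name invented here, and the sketch assumes it is compiled into the same extension that declares `TORCH_LIBRARY(myaten, ...)`.

```cpp
// Hypothetical CPU reference for the myaten::my_add schema defined above.
#include <torch/library.h>
#include <ATen/ATen.h>

at::Tensor my_add_impl_cpu(const at::Tensor &self, const at::Tensor &other)
{
    return self + other; // eager CPU add as the golden reference
}

// Attach the CPU implementation to the existing schema; the dispatcher
// then routes CPU tensors here and NPU tensors to my_add_impl_npu.
TORCH_LIBRARY_IMPL(myaten, CPU, m)
{
    m.impl("my_add", &my_add_impl_cpu);
}
```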
m.impl("my_add_backward", &my_add_backward_impl_meta); -} - -// 通过pybind将c++接口和python接口绑定 -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("my_add", &my_add_impl_autograd, "x + y"); -} diff --git a/sample/pytorch_adapter/with_setuptools/add_kernel.cpp b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp deleted file mode 100644 index 9aa62e093..000000000 --- a/sample/pytorch_adapter/with_setuptools/add_kernel.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. - * - * Function : z = x + y - * This sample is a very basic sample that implements vector add on Ascend plaform. - * In this sample: - * Length of x / y / z is 8*2048. - * Num of vector core used in sample is 8. - * Length for each core to compute is 2048. - * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. - * - */ -#include "kernel_operator.h" -using namespace AscendC; -constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data -constexpr int32_t USE_CORE_NUM = 8; // num of core used -constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core -constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue -constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer - -class KernelAdd { -public: - __aicore__ inline KernelAdd() - {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) - { - // get start index for current core, core parallel - xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - // pipe alloc memory to queue, the unit is Bytes - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - } - __aicore__ inline void Process() - { - // loop count need to be doubled, due to double buffer - constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; - // tiling strategy, pipeline parallel - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); - } - } - -private: - __aicore__ inline void CopyIn(int32_t progress) - { - // alloc tensor from queue memory - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = inQueueY.AllocTensor(); - // copy progress_th tile from global tensor to local tensor - DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); - DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); - // enque input tensors to VECIN queue - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ inline void Compute(int32_t progress) - { - // deque input tensors from VECIN queue - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = inQueueY.DeQue(); - LocalTensor zLocal = outQueueZ.AllocTensor(); - // call Add instr for computation - Add(zLocal, xLocal, yLocal, TILE_LENGTH); - // enque the output tensor to VECOUT queue - outQueueZ.EnQue(zLocal); - // free input tensors for reuse - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - } - __aicore__ inline void CopyOut(int32_t progress) - { - // deque output tensor from VECOUT queue - LocalTensor zLocal = outQueueZ.DeQue(); - // copy progress_th tile from local tensor to 
diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py
deleted file mode 100644
index 92ab1d3c7..000000000
--- a/sample/pytorch_adapter/with_setuptools/setup.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import os
-import subprocess
-import torch
-import torch_npu
-from setuptools import setup, find_packages
-from torch.utils.cpp_extension import BuildExtension
-from torch_npu.utils.cpp_extension import NpuExtension
-
-PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
-CUR_PATH = os.path.abspath(os.path.dirname(__file__))
-
-
-def compile_kernels():
-    # PyTorch has no extension mechanism for compiling Ascend device code, so run make manually here
-    subprocess.run("make")
-    return "libcustom_kernels.so"  # the library name produced by make
-
-
-def compile_adapter():
-    ext = NpuExtension(
-        name="ascend_custom_kernels_lib",  # name of the library to import
-        # add any other .cpp files that take part in the build here
-        sources=[f"{CUR_PATH}/add_adapter.cpp"],
-        extra_compile_args=[
-            '-I' + os.path.join(PYTORCH_NPU_INSTALL_PATH, "include", "third_party", "acl", "inc"),
-        ],
-        library_dirs=[f"{CUR_PATH}"],  # library search paths needed at build time, like g++'s -L option
-        libraries=["custom_kernels"],  # libraries linked at build time, like the -l option
-    )
-    return [ext]
-
-
-if __name__ == "__main__":
-    # build the library containing the kernels, delivered as a .so
-    kernel_so = compile_kernels()
-
-    # build the PyTorch adapter-layer library so the framework can integrate it
-    exts = compile_adapter()
-
-    # package everything into a wheel
-    setup(
-        name="ascend_custom_kernels",  # package name
-        version='1.0',
-        keywords='ascend_custom_kernels',
-        ext_modules=exts,
-        packages=find_packages(),
-        cmdclass={"build_ext": BuildExtension},
-        data_files=[(".", [kernel_so])],
-        include_package_data=True,
-    )
diff --git a/sample/pytorch_adapter/with_setuptools/test.py b/sample/pytorch_adapter/with_setuptools/test.py
deleted file mode 100644
index 896eef2c0..000000000
--- a/sample/pytorch_adapter/with_setuptools/test.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import torch
-import torch_npu
-import ascend_custom_kernels_lib
-from torch_npu.testing.testcase import TestCase, run_tests
-
-
-class TestCustomAdd(TestCase):
-    def test_add(self):
-        # The kernel currently uses static tiling, so the shape here has to match.
-        # add is elementwise and the kernel is written for 8*2048 (see the kernel source),
-        # so anything up to that size should work.
-        length = [8, 2048]
-        x = torch.rand(length, device='cpu', dtype=torch.float16)
-        y = torch.rand(length, device='cpu', dtype=torch.float16)
-
-        x_npu = x.npu()
-        y_npu = y.npu()
-        x_npu.requires_grad = True
-        y_npu.requires_grad = True
-        output = ascend_custom_kernels_lib.my_add(x_npu, y_npu)
-        # verify the backward path
-        output.backward(output)
-
-        x.requires_grad = True
-        y.requires_grad = True
-        cpuout = torch.add(x, y)
-        cpuout.backward(cpuout)
-
-        self.assertRtolEqual(output, cpuout)
-        self.assertRtolEqual(x_npu.grad, x.grad)
-        self.assertRtolEqual(y_npu.grad, y.grad)
-
-
-if __name__ == "__main__":
-    run_tests()
diff --git a/sample/sanitizer_sample/Racecheck/Makefile b/sample/sanitizer_sample/Racecheck/Makefile
deleted file mode 100644
index ac230761b..000000000
--- a/sample/sanitizer_sample/Racecheck/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := ${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin/ccec
-HOST_COMPILER := g++
-COMPILER_FLAG := -xcce -O2 -std=c++17 --cce-enable-sanitizer -g
-HOST_COMPILER_FLAG := -O2 -std=c++17
-LINK_FLAG := --cce-fatobj-link --cce-enable-sanitizer
-DAV_FLAG := --cce-aicore-arch=dav-c220-vec
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include
-HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include
-LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-
-all: build
-
-build: raw_error_kernel.o main.o raw_error.fatbin
-
-raw_error_kernel.o: raw_error_kernel.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^
-
-main.o: main.cpp
-	$(HOST_COMPILER) $(HOST_COMPILER_FLAG) $(HOST_INC_FLAG) -o $@ -c $^
-
-raw_error.fatbin: raw_error_kernel.o main.o
-	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ ${LINK_LIBS}
-
-.PHONY: clean
-clean:
-	rm *.o raw_error.fatbin
diff --git a/sample/sanitizer_sample/Racecheck/main.cpp b/sample/sanitizer_sample/Racecheck/main.cpp
deleted file mode 100644
index cd96d8e08..000000000
--- a/sample/sanitizer_sample/Racecheck/main.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <iostream>
-#include "acl/acl.h"
-
-#define ACL_ERROR_NONE 0
-
-#define CHECK_ACL(x)                                                                        \
-    do {                                                                                    \
-        aclError __ret = x;                                                                 \
-        if (__ret != ACL_ERROR_NONE) {                                                      \
-            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
-        }                                                                                   \
-    } while (0);
-
-extern "C" void raw_error_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gmInput, uint8_t *gmOutput);
-
-int main(void)
-{
-    CHECK_ACL(aclInit(nullptr));
-    aclrtContext context;
-    int32_t deviceId = 0;
-    CHECK_ACL(aclrtSetDevice(deviceId));
-    CHECK_ACL(aclrtCreateContext(&context, deviceId));
-    aclrtStream stream = nullptr;
-    CHECK_ACL(aclrtCreateStream(&stream));
-
-    uint8_t *gmInput = nullptr;
-    uint8_t *gmOutput = nullptr;
-    CHECK_ACL(aclrtMalloc((void**)&gmInput, 256, ACL_MEM_MALLOC_HUGE_FIRST));
-    CHECK_ACL(aclrtMalloc((void**)&gmOutput, 256, ACL_MEM_MALLOC_HUGE_FIRST));
-
-    uint64_t blockDim = 1UL;
-    raw_error_kernel_do(blockDim, nullptr, stream, gmInput, gmOutput);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-
-    CHECK_ACL(aclrtFree(gmInput));
-    CHECK_ACL(aclrtFree(gmOutput));
-    CHECK_ACL(aclrtDestroyStream(stream));
-    CHECK_ACL(aclrtDestroyContext(context));
-    CHECK_ACL(aclrtResetDevice(deviceId));
-    CHECK_ACL(aclFinalize());
-    return 0;
-}
\ No newline at end of file
diff --git a/sample/sanitizer_sample/Racecheck/raw_error_kernel.cpp b/sample/sanitizer_sample/Racecheck/raw_error_kernel.cpp
deleted file mode 100644
index b1db85609..000000000
--- a/sample/sanitizer_sample/Racecheck/raw_error_kernel.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-#include "kernel_operator.h"
-#include "acl/acl.h"
-using namespace AscendC;
-
-constexpr int32_t BYTESIZE_EXAMPLE = 256;
-constexpr int32_t BUFFER_NUM = 1;
-constexpr int32_t NUM_DATA = BYTESIZE_EXAMPLE / sizeof(half);
-
-extern "C" __global__ __aicore__ void raw_error_kernel(__gm__ uint8_t *gmInput, __gm__ uint8_t *gmOutput) {
-    TPipe pipe;
-    TQue<QuePosition::VECIN, BUFFER_NUM> xQue;
-    GlobalTensor<half> xInGm, xOutGm;
-    pipe.InitBuffer(xQue, BUFFER_NUM, BYTESIZE_EXAMPLE);
-    LocalTensor<half> xLocal = xQue.AllocTensor<half>();
-    xInGm.SetGlobalBuffer((__gm__ half*)gmInput, NUM_DATA);
-    xOutGm.SetGlobalBuffer((__gm__ half*)gmOutput, NUM_DATA);
-    DataCopy(xLocal, xInGm, NUM_DATA);
-    // The DataCopy above writes the UB buffer and the DataCopy below reads it; with no
-    // synchronization in between, there is a write-then-read race on UB. The fix is to go
-    // through the queue: EnQue the tensor, then DeQue it before reading, as sketched here:
-    // xQue.EnQue(xLocal);
-    // LocalTensor<half> deQueLocal = xQue.DeQue<half>();
-    // DataCopy(xOutGm, deQueLocal, NUM_DATA);
-    DataCopy(xOutGm, xLocal, NUM_DATA);
-}
-
-extern "C" void raw_error_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gmInput, uint8_t *gmOutput)
-{
-    raw_error_kernel<<<blockDim, l2ctrl, stream>>>(gmInput, gmOutput);
-}
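The commented-out lines above sketch the fix. Written out in full, the race-free variant looks as follows; this is a sketch under the same build assumptions as the sample, `race_free_kernel` is a name invented here, and the trailing `FreeTensor` is added for completeness.

```cpp
// Race-free variant of raw_error_kernel: the EnQue/DeQue pair inserts the
// synchronization that orders the UB write before the UB read.
#include "kernel_operator.h"
using namespace AscendC;

constexpr int32_t BYTESIZE_EXAMPLE = 256;
constexpr int32_t BUFFER_NUM = 1;
constexpr int32_t NUM_DATA = BYTESIZE_EXAMPLE / sizeof(half);

extern "C" __global__ __aicore__ void race_free_kernel(__gm__ uint8_t *gmInput, __gm__ uint8_t *gmOutput) {
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> xQue;
    GlobalTensor<half> xInGm, xOutGm;
    pipe.InitBuffer(xQue, BUFFER_NUM, BYTESIZE_EXAMPLE);
    LocalTensor<half> xLocal = xQue.AllocTensor<half>();
    xInGm.SetGlobalBuffer((__gm__ half *)gmInput, NUM_DATA);
    xOutGm.SetGlobalBuffer((__gm__ half *)gmOutput, NUM_DATA);
    DataCopy(xLocal, xInGm, NUM_DATA);                  // write UB
    xQue.EnQue(xLocal);                                 // publish the buffer...
    LocalTensor<half> deQueLocal = xQue.DeQue<half>();  // ...and wait for the write to land
    DataCopy(xOutGm, deQueLocal, NUM_DATA);             // read UB safely
    xQue.FreeTensor(deQueLocal);                        // return the buffer to the queue
}
```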
diff --git a/sample/sanitizer_sample/memcheck/illegal_align/Makefile b/sample/sanitizer_sample/memcheck/illegal_align/Makefile
deleted file mode 100644
index dff315b5d..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_align/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := ${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin/ccec
-HOST_COMPILER := g++
-COMPILER_FLAG := -xcce -O2 -std=c++17 --cce-enable-sanitizer -g
-HOST_COMPILER_FLAG := -O2 -std=c++17
-LINK_FLAG := --cce-fatobj-link --cce-enable-sanitizer
-DAV_FLAG := --cce-aicore-arch=dav-c220-vec
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # simplified from the device_intf.cmake configuration
-HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include
-LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-
-all: build
-
-build: illegal_align_kernel.o main.o illegal_align.fatbin
-
-illegal_align_kernel.o: illegal_align_kernel.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^
-
-main.o: main.cpp
-	$(HOST_COMPILER) $(HOST_COMPILER_FLAG) $(HOST_INC_FLAG) -o $@ -c $^
-
-illegal_align.fatbin: illegal_align_kernel.o main.o
-	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ ${LINK_LIBS}
-
-.PHONY: clean
-clean:
-	rm *.o illegal_align.fatbin
diff --git a/sample/sanitizer_sample/memcheck/illegal_align/illegal_align_kernel.cpp b/sample/sanitizer_sample/memcheck/illegal_align/illegal_align_kernel.cpp
deleted file mode 100644
index 605744ed5..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_align/illegal_align_kernel.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "kernel_operator.h"
-#include "acl/acl.h"
-using namespace AscendC;
-
-constexpr int32_t BYTESIZE_EXAMPLE = 256;
-constexpr int32_t NUM_DATA = BYTESIZE_EXAMPLE / sizeof(half);
-
-extern "C" __global__ __aicore__ void illegal_align_kernel(__gm__ uint8_t *gm) {
-    TPipe pipe;
-    TBuf<TPosition::VECCALC> tbuf;
-    pipe.InitBuffer(tbuf, BYTESIZE_EXAMPLE);
-    LocalTensor<half> xLm = tbuf.Get<half>();
-
-    GlobalTensor<half> xGm;
-    xGm.SetGlobalBuffer((__gm__ half*)gm, NUM_DATA);
-
-    // the UB offset below is misaligned, so DataCopy raises an unaligned-access error while moving the data
-    DataCopy(xGm, xLm[3], NUM_DATA);
-    // correct usage: when indexing a LocalTensor, the address offset must be 32-byte aligned
-    DataCopy(xGm, xLm[32], NUM_DATA);
-}
-
-extern "C" void illegal_align_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm)
-{
-    illegal_align_kernel<<<blockDim, l2ctrl, stream>>>(gm);
-}
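For half elements of 2 bytes, a 32-byte-aligned offset means a multiple of 16 elements, which is why `xLm[3]` faults while `xLm[32]` is fine. The small standalone helper below is illustrative only, not from the sample; `uint16_t` stands in for half so it compiles with any host C++ compiler.

```cpp
// Illustrative helper: round an element index up to the next offset whose byte
// address satisfies the 32-byte alignment that DataCopy requires on UB accesses.
#include <cstdint>

template <typename T>
constexpr int32_t AlignedElemOffset(int32_t elemIdx)
{
    constexpr int32_t kAlign = 32;                        // UB alignment in bytes
    constexpr int32_t elemsPerAlign = kAlign / sizeof(T); // 16 elements for a 2-byte type
    return ((elemIdx + elemsPerAlign - 1) / elemsPerAlign) * elemsPerAlign;
}

static_assert(AlignedElemOffset<uint16_t>(3) == 16, "offset 3 rounds up to 16 elements (32 bytes)");
static_assert(AlignedElemOffset<uint16_t>(32) == 32, "32 elements = 64 bytes, already aligned");
```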
diff --git a/sample/sanitizer_sample/memcheck/illegal_align/main.cpp b/sample/sanitizer_sample/memcheck/illegal_align/main.cpp
deleted file mode 100644
index 7f8b53031..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_align/main.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <iostream>
-#include "acl/acl.h"
-
-#define ACL_ERROR_NONE 0
-
-#define CHECK_ACL(x)                                                                        \
-    do {                                                                                    \
-        aclError __ret = x;                                                                 \
-        if (__ret != ACL_ERROR_NONE) {                                                      \
-            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
-        }                                                                                   \
-    } while (0);
-
-extern "C" void illegal_align_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm);
-
-int main(void)
-{
-    CHECK_ACL(aclInit(nullptr));
-    aclrtContext context;
-    int32_t deviceId = 0;
-    CHECK_ACL(aclrtSetDevice(deviceId));
-    CHECK_ACL(aclrtCreateContext(&context, deviceId));
-    aclrtStream stream = nullptr;
-    CHECK_ACL(aclrtCreateStream(&stream));
-
-    uint8_t *gm = nullptr;
-    CHECK_ACL(aclrtMalloc((void**)&gm, 256, ACL_MEM_MALLOC_HUGE_FIRST));
-
-    uint64_t blockDim = 1UL;
-    illegal_align_kernel_do(blockDim, nullptr, stream, gm);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-
-    CHECK_ACL(aclrtFree(gm));
-    CHECK_ACL(aclrtDestroyStream(stream));
-    CHECK_ACL(aclrtDestroyContext(context));
-    CHECK_ACL(aclrtResetDevice(deviceId));
-    CHECK_ACL(aclFinalize());
-    return 0;
-}
\ No newline at end of file
diff --git a/sample/sanitizer_sample/memcheck/illegal_read_and_write/Makefile b/sample/sanitizer_sample/memcheck/illegal_read_and_write/Makefile
deleted file mode 100644
index 4d810639e..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_read_and_write/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := ${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin/ccec
-HOST_COMPILER := g++
-COMPILER_FLAG := -xcce -O2 -std=c++17 --cce-enable-sanitizer -g
-HOST_COMPILER_FLAG := -O2 -std=c++17
-LINK_FLAG := --cce-fatobj-link --cce-enable-sanitizer
-DAV_FLAG := --cce-aicore-arch=dav-c220-vec
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include
-HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include
-LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-
-all: build
-
-build: illegal_read_and_write_kernel.o main.o illegal_read_and_write.fatbin
-
-illegal_read_and_write_kernel.o: illegal_read_and_write_kernel.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^
-
-main.o: main.cpp
-	$(HOST_COMPILER) $(HOST_COMPILER_FLAG) $(HOST_INC_FLAG) -o $@ -c $^
-
-illegal_read_and_write.fatbin: illegal_read_and_write_kernel.o main.o
-	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ ${LINK_LIBS}
-
-.PHONY: clean
-clean:
-	rm *.o illegal_read_and_write.fatbin
diff --git a/sample/sanitizer_sample/memcheck/illegal_read_and_write/illegal_read_and_write_kernel.cpp b/sample/sanitizer_sample/memcheck/illegal_read_and_write/illegal_read_and_write_kernel.cpp
deleted file mode 100644
index 8a85df5cc..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_read_and_write/illegal_read_and_write_kernel.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "kernel_operator.h"
-#include "acl/acl.h"
-using namespace AscendC;
-
-constexpr int32_t BYTESIZE = 256;
-constexpr int32_t BYTESIZE_LARGE = 512;
-constexpr int32_t NUM_DATA = BYTESIZE / sizeof(half);
-constexpr int32_t NUM_DATA_LARGE = BYTESIZE_LARGE / sizeof(half);
-
-extern "C" __global__ __aicore__ void illegal_read_and_write_kernel(__gm__ uint8_t *gm)
-{
-    TPipe pipe;
-    TBuf<TPosition::VECCALC> xlm;
-    GlobalTensor<half> xGm;
-    pipe.InitBuffer(xlm, BYTESIZE_LARGE);
-    LocalTensor<half> xLm = xlm.Get<half>();
-    xGm.SetGlobalBuffer((__gm__ half *)gm, NUM_DATA);
-    DataCopy(xLm, xGm, NUM_DATA_LARGE);
-    DataCopy(xGm, xLm, NUM_DATA_LARGE);
-    // xGm above is given BYTESIZE bytes, but the two DataCopy calls move BYTESIZE_LARGE bytes.
-    // Since BYTESIZE_LARGE > BYTESIZE, both the read and the write run past the end of xGm.
-    // The correct calls are:
-    // DataCopy(xLm, xGm, NUM_DATA);
-    // DataCopy(xGm, xLm, NUM_DATA);
-}
-
-extern "C" void illegal_read_and_write_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm)
-{
-    illegal_read_and_write_kernel<<<blockDim, l2ctrl, stream>>>(gm);
-}
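The invariant this sample violates can be stated directly: an element count passed to DataCopy must fit both the GM buffer and the UB buffer. The standalone sketch below is illustrative only, not from the sample; it encodes the check with the sample's own sizes.

```cpp
// Illustrative bounds guard for a GM <-> UB copy, using the sample's constants.
#include <cstdint>

constexpr int32_t BYTESIZE = 256;
constexpr int32_t BYTESIZE_LARGE = 512;
constexpr int32_t NUM_DATA = BYTESIZE / 2;             // half is 2 bytes
constexpr int32_t NUM_DATA_LARGE = BYTESIZE_LARGE / 2;

constexpr bool CopyInBounds(int32_t copyElems, int32_t gmElems, int32_t ubElems)
{
    return copyElems <= gmElems && copyElems <= ubElems;
}

static_assert(!CopyInBounds(NUM_DATA_LARGE, NUM_DATA, NUM_DATA_LARGE),
              "the sample's copy of NUM_DATA_LARGE elements overruns the GM buffer");
static_assert(CopyInBounds(NUM_DATA, NUM_DATA, NUM_DATA_LARGE),
              "copying NUM_DATA elements, as in the commented fix, stays in bounds");
```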
diff --git a/sample/sanitizer_sample/memcheck/illegal_read_and_write/main.cpp b/sample/sanitizer_sample/memcheck/illegal_read_and_write/main.cpp
deleted file mode 100644
index 44138b3d9..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_read_and_write/main.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <iostream>
-#include "acl/acl.h"
-
-#define ACL_ERROR_NONE 0
-
-#define CHECK_ACL(x)                                                                        \
-    do {                                                                                    \
-        aclError __ret = x;                                                                 \
-        if (__ret != ACL_ERROR_NONE) {                                                      \
-            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
-        }                                                                                   \
-    } while (0);
-
-extern "C" void illegal_read_and_write_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm);
-
-int main(void)
-{
-    CHECK_ACL(aclInit(nullptr));
-    aclrtContext context;
-    int32_t deviceId = 0;
-    CHECK_ACL(aclrtSetDevice(deviceId));
-    CHECK_ACL(aclrtCreateContext(&context, deviceId));
-    aclrtStream stream = nullptr;
-    CHECK_ACL(aclrtCreateStream(&stream));
-
-    uint8_t *gm = nullptr;
-    CHECK_ACL(aclrtMalloc((void**)&gm, 256, ACL_MEM_MALLOC_HUGE_FIRST));
-
-    uint64_t blockDim = 1UL;
-    illegal_read_and_write_kernel_do(blockDim, nullptr, stream, gm);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-
-    CHECK_ACL(aclrtFree(gm));
-    CHECK_ACL(aclrtDestroyStream(stream));
-    CHECK_ACL(aclrtDestroyContext(context));
-    CHECK_ACL(aclrtResetDevice(deviceId));
-    CHECK_ACL(aclFinalize());
-    return 0;
-}
\ No newline at end of file
diff --git a/sample/sanitizer_sample/memcheck/out_of_bound/Makefile b/sample/sanitizer_sample/memcheck/out_of_bound/Makefile
deleted file mode 100644
index 22e06f70f..000000000
--- a/sample/sanitizer_sample/memcheck/out_of_bound/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := ${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin/ccec
-HOST_COMPILER := g++
-COMPILER_FLAG := -xcce -O2 -std=c++17 --cce-enable-sanitizer -g
-HOST_COMPILER_FLAG := -O2 -std=c++17
-LINK_FLAG := --cce-fatobj-link --cce-enable-sanitizer
-DAV_FLAG := --cce-aicore-arch=dav-c220-vec
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include
-HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include
-LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-
-all: build
-
-build: out_of_bound_kernel.o main.o out_of_bound.fatbin
-
-out_of_bound_kernel.o: out_of_bound_kernel.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^
-
-main.o: main.cpp
-	$(HOST_COMPILER) $(HOST_COMPILER_FLAG) $(HOST_INC_FLAG) -o $@ -c $^
-
-out_of_bound.fatbin: out_of_bound_kernel.o main.o
-	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ ${LINK_LIBS}
-
-.PHONY: clean
-clean:
-	rm *.o out_of_bound.fatbin
diff --git a/sample/sanitizer_sample/memcheck/out_of_bound/main.cpp b/sample/sanitizer_sample/memcheck/out_of_bound/main.cpp
deleted file mode 100644
index 53da5b54a..000000000
--- a/sample/sanitizer_sample/memcheck/out_of_bound/main.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <iostream>
-#include "acl/acl.h"
-
-#define ACL_ERROR_NONE 0
-
-#define CHECK_ACL(x)                                                                        \
-    do {                                                                                    \
-        aclError __ret = x;                                                                 \
-        if (__ret != ACL_ERROR_NONE) {                                                      \
-            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
-        }                                                                                   \
-    } while (0);
-
-extern "C" void out_of_bound_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm);
-
-int main(void)
-{
-    CHECK_ACL(aclInit(nullptr));
-    aclrtContext context;
-    int32_t deviceId = 0;
-    CHECK_ACL(aclrtSetDevice(deviceId));
-    CHECK_ACL(aclrtCreateContext(&context, deviceId));
-    aclrtStream stream = nullptr;
-    CHECK_ACL(aclrtCreateStream(&stream));
-
-    uint8_t *gm = nullptr;
-    CHECK_ACL(aclrtMalloc((void**)&gm, 512, ACL_MEM_MALLOC_HUGE_FIRST));
-
-    uint64_t blockDim = 10UL;
-    out_of_bound_kernel_do(blockDim, nullptr, stream, gm);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-
-    CHECK_ACL(aclrtFree(gm));
-    CHECK_ACL(aclrtDestroyStream(stream));
-    CHECK_ACL(aclrtDestroyContext(context));
-    CHECK_ACL(aclrtResetDevice(deviceId));
-    CHECK_ACL(aclFinalize());
-    return 0;
-}
\ No newline at end of file
diff --git a/sample/sanitizer_sample/memcheck/out_of_bound/out_of_bound_kernel.cpp b/sample/sanitizer_sample/memcheck/out_of_bound/out_of_bound_kernel.cpp
deleted file mode 100644
index cb54dc322..000000000
--- a/sample/sanitizer_sample/memcheck/out_of_bound/out_of_bound_kernel.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "kernel_operator.h"
-#include "acl/acl.h"
-using namespace AscendC;
-
-constexpr int32_t BYTESIZE = 512;
-constexpr int32_t NUM_DATA = 16;
-constexpr int32_t CORE_OFFSET = 14;
-constexpr int32_t LOOP_COUNT = 10;
-
-extern "C" __global__ __aicore__ void out_of_bound_kernel(__gm__ uint8_t *gm)
-{
-    TPipe pipe;
-    TBuf<TPosition::VECCALC> xlm;
-    GlobalTensor<half> xGm;
-    pipe.InitBuffer(xlm, BYTESIZE);
-    LocalTensor<half> xLm = xlm.Get<half>();
-    xGm.SetGlobalBuffer((__gm__ half *)gm + GetBlockIdx() * CORE_OFFSET, NUM_DATA);
-    // CORE_OFFSET < NUM_DATA here, so when the cores write to GM below, each core writes
-    // more elements than its offset stride and tramples the next core's region.
-    // The correct setup is:
-    // xGm.SetGlobalBuffer((__gm__ half *)gm + GetBlockIdx() * NUM_DATA, NUM_DATA);
-    DataCopy(xGm, xLm, NUM_DATA);
-}
-
-extern "C" void out_of_bound_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm)
-{
-    out_of_bound_kernel<<<blockDim, l2ctrl, stream>>>(gm);
-}
-- 
Gitee
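As a closing illustration, the overlap in the out_of_bound sample is pure index arithmetic: with a stride of CORE_OFFSET = 14 and NUM_DATA = 16 elements written per core, consecutive cores' write windows overlap by 2 elements. The host-side sketch below is not part of the sample; it simply counts the collisions under both strides.

```cpp
// Host-side model of the out_of_bound sample: each of BLOCK_DIM cores writes
// NUM_DATA elements starting at blockIdx * stride. With stride = CORE_OFFSET (14)
// the write windows overlap; with stride = NUM_DATA (16) they tile exactly.
#include <cstdint>
#include <cstdio>

constexpr int32_t NUM_DATA = 16;    // elements each core writes
constexpr int32_t CORE_OFFSET = 14; // buggy stride between cores
constexpr int32_t BLOCK_DIM = 10;   // blockDim used by the sample's main.cpp

int main()
{
    // Count how many cores write each element index under a given stride.
    auto collisions = [](int32_t stride) {
        int32_t hits[BLOCK_DIM * NUM_DATA] = {};
        int32_t count = 0;
        for (int32_t core = 0; core < BLOCK_DIM; ++core) {
            for (int32_t k = 0; k < NUM_DATA; ++k) {
                if (++hits[core * stride + k] > 1) {
                    ++count; // this element was already written by another core
                }
            }
        }
        return count;
    };
    std::printf("stride=14 collisions: %d\n", collisions(CORE_OFFSET)); // > 0: cores trample each other
    std::printf("stride=16 collisions: %d\n", collisions(NUM_DATA));    // 0: windows tile exactly
    return 0;
}
```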