From 6881fdb522a9f925f6ef4e455b497b3966efeace Mon Sep 17 00:00:00 2001 From: menfeifei <894242714@qq.com> Date: Tue, 2 Sep 2025 07:47:43 +0000 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20sample?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sample/README.md | 144 ------------- sample/build/build.sh | 38 ---- sample/normal_sample/.keep | 0 sample/normal_sample/cube_only/Makefile | 28 --- sample/normal_sample/cube_only/main.cpp | 171 --------------- .../normal_sample/cube_only/matmul_kernel.cpp | 100 --------- sample/normal_sample/mix/Makefile | 29 --- sample/normal_sample/mix/main.cpp | 201 ------------------ .../mix/matmul_leakyrelu_kernel.cpp | 188 ---------------- sample/normal_sample/vec_only/Makefile | 27 --- sample/normal_sample/vec_only/add_kernel.cpp | 103 --------- sample/normal_sample/vec_only/main.cpp | 89 -------- sample/pytorch_adapter/README.md | 53 ----- sample/pytorch_adapter/jit_compile/Makefile | 20 -- .../jit_compile/add_adapter.cpp | 128 ----------- .../jit_compile/add_kernel.cpp | 106 --------- sample/pytorch_adapter/jit_compile/main.py | 70 ------ .../pytorch_adapter/with_setuptools/Makefile | 20 -- .../with_setuptools/add_adapter.cpp | 128 ----------- .../with_setuptools/add_kernel.cpp | 106 --------- .../pytorch_adapter/with_setuptools/setup.py | 51 ----- .../pytorch_adapter/with_setuptools/test.py | 34 --- sample/sanitizer_sample/Racecheck/Makefile | 28 --- sample/sanitizer_sample/Racecheck/main.cpp | 42 ---- .../Racecheck/raw_error_kernel.cpp | 28 --- .../memcheck/illegal_align/Makefile | 28 --- .../illegal_align/illegal_align_kernel.cpp | 26 --- .../memcheck/illegal_align/main.cpp | 39 ---- .../memcheck/illegal_read_and_write/Makefile | 28 --- .../illegal_read_and_write_kernel.cpp | 29 --- .../memcheck/illegal_read_and_write/main.cpp | 39 ---- .../memcheck/out_of_bound/Makefile | 28 --- .../memcheck/out_of_bound/main.cpp | 39 ---- .../out_of_bound/out_of_bound_kernel.cpp | 27 --- 34 files changed, 2215 deletions(-) delete mode 100644 sample/README.md delete mode 100644 sample/build/build.sh delete mode 100644 sample/normal_sample/.keep delete mode 100644 sample/normal_sample/cube_only/Makefile delete mode 100644 sample/normal_sample/cube_only/main.cpp delete mode 100644 sample/normal_sample/cube_only/matmul_kernel.cpp delete mode 100644 sample/normal_sample/mix/Makefile delete mode 100644 sample/normal_sample/mix/main.cpp delete mode 100644 sample/normal_sample/mix/matmul_leakyrelu_kernel.cpp delete mode 100644 sample/normal_sample/vec_only/Makefile delete mode 100644 sample/normal_sample/vec_only/add_kernel.cpp delete mode 100644 sample/normal_sample/vec_only/main.cpp delete mode 100644 sample/pytorch_adapter/README.md delete mode 100644 sample/pytorch_adapter/jit_compile/Makefile delete mode 100644 sample/pytorch_adapter/jit_compile/add_adapter.cpp delete mode 100644 sample/pytorch_adapter/jit_compile/add_kernel.cpp delete mode 100644 sample/pytorch_adapter/jit_compile/main.py delete mode 100644 sample/pytorch_adapter/with_setuptools/Makefile delete mode 100644 sample/pytorch_adapter/with_setuptools/add_adapter.cpp delete mode 100644 sample/pytorch_adapter/with_setuptools/add_kernel.cpp delete mode 100644 sample/pytorch_adapter/with_setuptools/setup.py delete mode 100644 sample/pytorch_adapter/with_setuptools/test.py delete mode 100644 sample/sanitizer_sample/Racecheck/Makefile delete mode 100644 sample/sanitizer_sample/Racecheck/main.cpp delete mode 100644 
sample/sanitizer_sample/Racecheck/raw_error_kernel.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_align/Makefile
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_align/illegal_align_kernel.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_align/main.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_read_and_write/Makefile
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_read_and_write/illegal_read_and_write_kernel.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/illegal_read_and_write/main.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/out_of_bound/Makefile
 delete mode 100644 sample/sanitizer_sample/memcheck/out_of_bound/main.cpp
 delete mode 100644 sample/sanitizer_sample/memcheck/out_of_bound/out_of_bound_kernel.cpp

diff --git a/sample/README.md b/sample/README.md
deleted file mode 100644
index 15238cb9f..000000000
--- a/sample/README.md
+++ /dev/null
@@ -1,144 +0,0 @@
-# Sample library overview
-
-## Description
-This sample library mainly demonstrates the capabilities of the AscendC operator development tools, so the operator projects are deliberately stripped down and focus on showcasing the companion tooling.
-
-For production integration, prefer the AscendC sample projects shipped with the CANN package, e.g. the ascendc_kernel_cmake directory. The projects here are simplified from those and are intended for quick verification only.
-
-Note: within this sample directory, every leaf directory is a complete, standalone sample project. The projects may differ from one another because their dependencies differ.
-
-## Dependencies
-- For the hardware environment, see the [Ascend product configurations](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F80RC22%2Fquickstart%2Fquickstart%2Fquickstart_18_0002.html).
-- For the software environment, follow the [CANN Software Installation Guide](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fdocument%2Fdetail%2Fzh%2Fcanncommercial%2F80RC22%2Fsoftwareinst%2Finstg%2Finstg_0000.html%3FMode%3DPmIns%26OS%3DUbuntu%26Software%3DcannToolKit) to install the Ascend development or runtime environment, i.e. the toolkit package.
-
-Choose the versions of these dependencies that match your actual environment.
-
-### Version compatibility
-| Item | Requirement |
-|---|---|
-| CANN version | >= 8.0.RC1.alpha001 |
-| Hardware | Atlas 800T A2 training server |
-
-- AscendPyTorch 1.11.0 and later are supported. For the PyTorch/CANN and PyTorch/Python compatibility matrix, see [Ascend Extension for PyTorch](https://gitee.com/ascend/pytorch).
-- The firmware/driver version must be the one supported by the matching CANN release; obtain the firmware and driver for your product model and CANN version from the [Ascend community firmware & drivers](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fhardware%2Ffirmware-drivers%2Fcommunity%3Fproduct%3D2%26model%3D28%26cann%3D8.0.RC3.alpha003%26driver%3D1.0.25.alpha) page.
-
-## Directory layout
-The overall structure is as follows:
-```
-- sample
- |- build # builds and runs all samples (use as needed; its commands are a useful reference)
 |- normal_sample # minimal pure C/C++ AscendC single-operator projects, usable with msdebug and msprof
- |- cube_only # minimal AscendC single-operator project, AIC (cube) only
- |- mix # minimal AscendC single-operator project with a mixed cube/vector operator
- |- vec_only # minimal AscendC single-operator project, AIV (vector) only
- |- pytorch_adapter # minimal AscendC single-operator projects adapted to PyTorch, usable with msdebug and msprof
- |- jit_compile # JIT mode: compiled at run time
- |- with_setuptools # built into a wheel package and installed
- |- sanitizer_sample # defective samples, for use with the mssanitizer tool
- |- Racecheck # samples containing data races
- |- xx # other defective samples
-```
-
-If you are interested in adapting custom operators to the PyTorch framework, see [pytorch_adapter/README.md](./pytorch_adapter/README.md).
-
-
-## Operator debugging with msdebug
-Debugging on device with msdebug requires a few extra adjustments:
-1. Build stage: in the ```Makefile``` under ```sample/normal_sample/vec_only```, change the following:
-   + Enhance debug information and enlarge the stack:
-     ```
-     COMPILER_FLAG := -xcce -O2 -std=c++17
-     change to:
-     COMPILER_FLAG := -xcce -O0 -std=c++17 -g --cce-ignore-always-inline=true
-     ```
-
-2. Run stage:
-```
-msdebug ./*.fatbin
-```
-
-## Memory checking with sanitizer
-1. Build stage: add the ```--cce-enable-sanitizer -g``` options to the compile step and ```--cce-enable-sanitizer``` to the link step (the sample Makefiles already include both), then run:
-```
-make
-```
-
-2. Run stage:
-```
-mssanitizer ./*.fatbin # runs memcheck by default
-```
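-
-For orientation, each memcheck sample plants a single defect. The sketch below is illustrative only, modeled on `add_kernel.cpp` rather than taken from the shipped `out_of_bound_kernel.cpp` (whose body is not reproduced here); the loop deliberately runs one tile too far, so the final `DataCopy` touches global memory this core does not own:
-```
-#include "kernel_operator.h"
-using namespace AscendC;
-constexpr int32_t LEN = 2048;  // elements owned by one core
-constexpr int32_t TILE = 256;  // elements moved per DataCopy
-
-// Hypothetical out-of-bounds kernel for illustration.
-extern "C" __global__ __aicore__ void oob_read_sketch(GM_ADDR x, GM_ADDR z)
-{
-    GlobalTensor<int16_t> xGm, zGm;
-    xGm.SetGlobalBuffer((__gm__ int16_t *)x + LEN * GetBlockIdx(), LEN);
-    zGm.SetGlobalBuffer((__gm__ int16_t *)z + LEN * GetBlockIdx(), LEN);
-    TPipe pipe;
-    TQue<QuePosition::VECIN, 1> inQueue;
-    pipe.InitBuffer(inQueue, 1, TILE * sizeof(int16_t));
-    for (int32_t i = 0; i <= LEN / TILE; i++) { // '<=' runs one tile too far
-        LocalTensor<int16_t> xLocal = inQueue.AllocTensor<int16_t>();
-        DataCopy(xLocal, xGm[i * TILE], TILE);  // i == 8 reads [2048, 2304)
-        inQueue.EnQue(xLocal);
-        LocalTensor<int16_t> t = inQueue.DeQue<int16_t>();
-        DataCopy(zGm[i * TILE], t, TILE);       // and the write overruns too
-        inQueue.FreeTensor(t);
-    }
-}
-```
-Running `mssanitizer` on a binary built from such a kernel should report the illegal global-memory read and write, with line information when the ```-g``` flag above is present.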
-
-## Operator tuning
-The operator tuning tool supports both on-device and simulator-based tuning. The walkthrough below uses the vec_only operator as the running example.
-
-### On-device tuning
-1. Compile the operator from the unmodified sample code to obtain add.fatbin:
-   ```
-   cd ./sample/normal_sample/vec_only
-   make clean && make
-   ```
-2. Profile the operator with the tuning tool. `--aic-metrics` is omitted, so all metrics default to enabled; `--output` is omitted and defaults to `./`:
-   ```
-   msprof op --application=./add.fatbin
-   ```
-3. A folder whose name starts with `OPPROF_` appears in the current directory. It contains one csv per enabled `--aic-metrics` switch plus the operator base data `OpBasicInfo.csv`; the csv files hold the operator's block-level performance data. (The data is collected after the operator has been warmed up.)
-   ```
-   OPPROF_2024xxxx_XXXXXX
-   ├── dump
-   ├── OpBasicInfo.csv
-   ├── ArithmeticUtilization.csv
-   ├── ... (the enabled aic-metrics)
-   └── ResourceConflictRatio.csv
-   ```
-4. For more details on the csv metrics, see the operator development tools manual.
-
-### Simulator tuning
-Simulator tuning with msprof needs an executable operator that runs on the simulator, which requires two small changes to the build options in the ```Makefile``` under ```./sample/normal_sample/vec_only```:
-+ Simulator dependency:
-  ```
-  LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-  change to:
-  LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -L${ASCEND_HOME_PATH}/tools/simulator/${SOC_VERSION}/lib/ -lruntime_camodel -lascendcl -lstdc++ # adds the libruntime_camodel dependency path
-  ```
-  Query SOC_VERSION with the npu-smi info command and read the Chip Name field: for a Chip Name of xxxyy, the value to configure is Ascendxxxyy (use the lower-case spelling ascendxxxyy where the sample paths use it).
-+ Enhanced debug information:
-  ```
-  COMPILER_FLAG := -xcce -O2 -std=c++17
-  change to:
-  COMPILER_FLAG := -xcce -O2 -std=c++17 -g
-  ```
-
-Starting from the build stage:
-
-1. Build the simulator operator:
-   ```
-   cd ./sample/normal_sample/vec_only
-   make clean && make
-   ```
-2. Add the runtime library path:
-   ```
-   # SOC_VERSION is the NPU name, as reported by the npu-smi info command.
-   export LD_LIBRARY_PATH=${ASCEND_HOME_PATH}/tools/simulator/${SOC_VERSION}/lib/:$LD_LIBRARY_PATH
-   ```
-3. Run the tuning tool in simulator mode to collect simulated performance data. `--output` is omitted and defaults to `./`:
-   ```
-   msprof op simulator --application=./add.fatbin
-   ```
-4. A folder named OPPROF_<time>_<random string> is generated in the current directory, structured as follows:
-   ```
-   OPPROF_20231023120542_FQMZMGOHUYVUZEXP
-   ├── dump # raw performance data; safe to ignore
-   └── simulation # analysis results for the simulated performance data
-       ├── core0.veccore0 # one folder per operator block sub-core; the vec sample uses 8
-       │   ├── core0.veccore1_code_exe.csv # time spent per code line
-       │   ├── core0.veccore1_instr_exe.csv # detailed instruction information
-       │   └── trace.json # pipeline trace of this block sub-core
-       ├── ...
-       ├── api # operator hotspot folder; drag all files inside into Ascend Compute
-       │   ├── api.json # code hotspot mapping
-       │   └── .cpp # the operator kernel source
-       ├── visualize_data.bin # operator visualization file; load with Ascend Insight
-       └── trace.json # pipeline trace across all cores
-   ```
-5. For more metric details, see the operator development tools manual.
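-
-The matmul samples (cube_only and mix) fill in `TCubeTiling` by hand, and several of its fields are linked by invariants that are easy to break while experimenting. The sketch below is illustrative and not part of any shipped sample; it checks those invariants on the host, instantiated with the cube_only values:
-```
-#include <cassert>
-#include <cstdint>
-
-// Validate the TCubeTiling invariants quoted in the samples' comments,
-// using the cube_only configuration.
-void CheckTilingInvariants()
-{
-    uint32_t M = 512, N = 1024, K = 512;
-    uint32_t singleCoreM = 128, singleCoreN = 256, singleCoreK = 512;
-    uint32_t baseM = 128, baseN = 256, baseK = 64;
-    uint32_t depthA1 = 8, depthB1 = 8;
-    uint32_t shareL1Size = 384 * 1024, shareL0CSize = 128 * 256 * 4;
-    // usedCoreNum = (M/singleCoreM) * (N/singleCoreN) * (K/singleCoreK) = 4 * 4 * 1 = 16
-    uint32_t usedCoreNum = (M / singleCoreM) * (N / singleCoreN) * (K / singleCoreK);
-    assert(usedCoreNum == 16);
-    // The cached A1 + B1 tiles must fit in shareL1Size:
-    // (baseM*baseK*depthA1 + baseN*baseK*depthB1) * sizeof(half)
-    // = (128*64*8 + 256*64*8) * 2 = 393216 = 384 * 1024
-    assert((baseM * baseK * depthA1 + baseN * baseK * depthB1) * 2 <= shareL1Size);
-    // One [baseM, baseN] float accumulator must fit in shareL0CSize.
-    assert(baseM * baseN * sizeof(float) <= shareL0CSize);
-}
-```
-The same arithmetic with the mix sample's numbers, (256\*64\*8 + 128\*64\*2) \* 2 = 294912, confirms its shareL1Size setting.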
diff --git a/sample/build/build.sh b/sample/build/build.sh
deleted file mode 100644
index bbb2915b0..000000000
--- a/sample/build/build.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-# Depends on ASCEND_HOME_PATH being provided by the environment
-
-CUR_DIR=$(dirname $(readlink -f $0))
-TOP_DIR=$(readlink -f $CUR_DIR/../)
-
-# add
-cd ${TOP_DIR}/normal_sample/vec_only
-make
-mv *.fatbin ${TOP_DIR}/build
-
-# matmul
-cd ${TOP_DIR}/normal_sample/cube_only
-make
-mv *.fatbin ${TOP_DIR}/build
-
-# matmul_leakyrelu
-cd ${TOP_DIR}/normal_sample/mix
-make
-mv *.fatbin ${TOP_DIR}/build
-
-# illegal_read_and_write
-cd ${TOP_DIR}/sanitizer_sample/memcheck/illegal_read_and_write
-make
-mv *.fatbin ${TOP_DIR}/build
-
-# out_of_bound
-cd ${TOP_DIR}/sanitizer_sample/memcheck/out_of_bound
-make
-mv *.fatbin ${TOP_DIR}/build
-
-# illegal align sample for sanitizer
-cd ${TOP_DIR}/sanitizer_sample/memcheck/illegal_align
-make
-mv *.fatbin ${TOP_DIR}/build
-
-cd ${TOP_DIR}/sanitizer_sample/Racecheck
-make
-mv *.fatbin ${TOP_DIR}/build
\ No newline at end of file
diff --git a/sample/normal_sample/.keep b/sample/normal_sample/.keep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/sample/normal_sample/cube_only/Makefile b/sample/normal_sample/cube_only/Makefile
deleted file mode 100644
index c730678e4..000000000
--- a/sample/normal_sample/cube_only/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-# Location of the CANN toolkit; distilled from ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # see CMAKE_C_COMPILER in device_config.cmake
-COMPILER_FLAG := -xcce -O2 -std=c++17
-LINK_FLAG := --cce-fatobj-link
-DAV_FLAG := --cce-aicore-arch=dav-c220-cube
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # simplified from the device_intf.cmake configuration
-TILING_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw
-HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include
-LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-
-all: build
-
-build: matmul_kernel.o main.o matmul.fatbin
-
-matmul_kernel.o: matmul_kernel.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^
-
-main.o: main.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(HOST_INC_FLAG) $(TILING_INC_FLAG) -o $@ -c $^
-
-matmul.fatbin: matmul_kernel.o main.o
-	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ $(LINK_LIBS)
-
-.PHONY: clean
-clean:
-	rm *.o *.fatbin
\ No newline at end of file
diff --git a/sample/normal_sample/cube_only/main.cpp b/sample/normal_sample/cube_only/main.cpp
deleted file mode 100644
index 2e151c2ff..000000000
--- a/sample/normal_sample/cube_only/main.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
- */ - -#include -#include -#include "kernel_tiling/kernel_tiling.h" -#include "acl/acl.h" - -extern void matmul_custom_do( - uint32_t coreDim, void *l2ctrl, void *stream, uint8_t *param1, uint8_t *param2, uint8_t *param3, uint8_t *param4); - -#define ACL_ERROR_NONE 0 - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - printf("%s: %d aclError %d\n", __FILE__, __LINE__, __ret); \ - } \ - } while (0) - -void printTensor(float *ptr, size_t size) -{ - size_t colNum = 8; - for (size_t i = 0; i < size / colNum / sizeof(float); i++) { - for (size_t j = 0; j < colNum; j++) { - printf("%5.2f ", ptr[colNum * i + j]); - } - printf("\n"); - } -} - -void fillValue(aclFloat16 *addr, size_t size) -{ - aclFloat16 val = aclFloatToFloat16(4.0f); - for (size_t i = 0; i < size / sizeof(aclFloat16); i++) { - addr[i] = val; - } -} - -void printAclFloat16(aclFloat16 *addr) -{ - for (int i = 0; i < 16; i++) { - printf("%f ", aclFloat16ToFloat(addr[i])); - } -} - - -void MakeTiling(uint32_t *addr, size_t size) -{ - // TCubeTiling该结构体在kernel_tiling/kernel_tiling.h中的结构体定义 - // tiling_api.h中本身定义的结构与kernel_tiling.h相近,通过GetTiling实现映射 - // TCubeTiling定义的可读性较好,可以直接理解,但使用tiling_api可以直接使能部分默认值 - // 考虑到工具本身需要体现对应用的细粒度控制,所以直接使用kernel_tiling.h中的结构 - TCubeTiling *tiling = (TCubeTiling *)addr; - // 此处计算使用的核数 - tiling->usedCoreNum = 16; // (M/singleCoreM)*(N/singleCoreN)*(K/singleCoreK)=4*4*1=16 - // 对于 xa 是[M, Ka]矩阵, xb 是[Kb, N]矩阵,此处数据需要与外部格式保持一致 - // 参考 AscendC算子开发文档 - // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devguide/opdevg/ascendcopdevg/atlas_ascendc_10_0060.html - // 中对 数据分块(Tiling) 部分的介绍 - tiling->M = 512; // - tiling->N = 1024; // - tiling->Ka = 512; // Ka和Kb一般一样,只有pad的时候存在不一致,比如[1, 62]*[64, 2],这里64就是有pad的 - tiling->Kb = 512; // - tiling->isBias = 0; // 是否有bias - // 多核切分的tiling参数,用于度量单个核上处理的数据大小 - // xa在M轴上切分,分成多个singleCoreM;单核处理singleCoreM * singleCoreK大小数据 - // xb在N轴上切分,分成多个singleCoreN;单核处理singleCoreK * singleCoreN - // 由于输入在M和N轴上切分了,输出singleCoreM * singleCoreN - tiling->singleCoreM = 128; - tiling->singleCoreN = 256; - tiling->singleCoreK = 512; // 不建议对k进行切分,会导致累加,引起不确定计算 - // 核内切分的tiling参数,用于单个核内的最小计算单位 - tiling->baseM = 128; - tiling->baseN = 256; - tiling->baseK = 64; - tiling->stepM = 1; - tiling->stepN = 1; - tiling->stepKa = 8; - tiling->stepKb = 8; - // A1+B1的缓存数据需要小于shareL1Size大小 - tiling->depthA1 = 8; // 矩阵[baseM, baseK]的缓存数量 - tiling->depthB1 = 8; // 矩阵[basek, baseN]的缓存数量 - // 其他参数 - tiling->iterateOrder = 0; // 控制迭代的方向:0代表先M轴再N轴,1代表先N轴再M轴 - tiling->shareL1Size = 384 * 1024; // 如存在多个matmul时,可以单独控制每个使用空间 - // 不小于 (baseM * baseK * depthA1 + baseN * baseK * depthB1) * sizeof(half) - tiling->shareL0CSize = 128 * 256 * 4; // 如存在多个matmul时,可以单独控制每个使用空间 - // 不小于 baseM * baseN * sizeof(float) - // 下列是bmm中使用的batch参数,如果需要实现bmm,该结构体中还有其他tiling参数 - tiling->batchM = 1; // 对于普通matmul,默认1 - tiling->batchN = 1; // 对于普通matmul,默认1 - tiling->singleBatchM = 1; - tiling->singleBatchN = 1; - // 下面的db参数用于控制ping-pong - tiling->dbL0A = 2; - tiling->dbL0B = 2; - tiling->dbL0C = 1; -} - -// y = matmul(xa, xb) -int32_t main(int32_t argc, char *argv[]) -{ - size_t xaSize = 512 * 1024 * sizeof(aclFloat16); - size_t xbSize = 512 * 1024 * sizeof(aclFloat16); - size_t ySize = 512 * 1024 * sizeof(float); - size_t tilingSize = sizeof(TCubeTiling); - uint32_t blockDim = 8; - - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream 
stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - - aclFloat16 *xaHost; - CHECK_ACL(aclrtMallocHost((void **)(&xaHost), xaSize)); - fillValue(xaHost, xaSize); - - aclFloat16 *xbHost; - CHECK_ACL(aclrtMallocHost((void **)(&xbHost), xbSize)); - fillValue(xbHost, xbSize); - - uint32_t *tilingHost; - CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingSize)); - MakeTiling(tilingHost, tilingSize); - - // 将host的输入同步到device - uint8_t *xaDevice; - uint8_t *xbDevice; - uint8_t *tilingDevice; - CHECK_ACL(aclrtMalloc((void **)&xaDevice, xaSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(xaDevice, xaSize, xaHost, xaSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 xa - CHECK_ACL(aclrtMalloc((void **)&xbDevice, xbSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(xbDevice, xbSize, xbHost, xbSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 xb - CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, tilingHost, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 tiling - - uint8_t *yDevice; - CHECK_ACL(aclrtMalloc((void **)&yDevice, ySize, ACL_MEM_MALLOC_HUGE_FIRST)); // 准备 输出 - - matmul_custom_do(blockDim, nullptr, stream, xaDevice, xbDevice, yDevice, tilingDevice); - CHECK_ACL(aclrtSynchronizeStream(stream)); - - // 将device的输出同步到host - float *yHost; - CHECK_ACL(aclrtMallocHost((void **)(&yHost), ySize)); - CHECK_ACL(aclrtMemcpy(yHost, ySize, yDevice, ySize, ACL_MEMCPY_DEVICE_TO_HOST)); - printTensor(yHost, 4 * 8 * 4); - - // 释放资源 - CHECK_ACL(aclrtFree(xaDevice)); - CHECK_ACL(aclrtFree(xbDevice)); - CHECK_ACL(aclrtFree(tilingDevice)); - CHECK_ACL(aclrtFree(yDevice)); - - CHECK_ACL(aclrtFreeHost(xaHost)); - CHECK_ACL(aclrtFreeHost(xbHost)); - CHECK_ACL(aclrtFreeHost(tilingHost)); - CHECK_ACL(aclrtFreeHost(yHost)); - - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); - return 0; -} \ No newline at end of file diff --git a/sample/normal_sample/cube_only/matmul_kernel.cpp b/sample/normal_sample/cube_only/matmul_kernel.cpp deleted file mode 100644 index 9ed629c51..000000000 --- a/sample/normal_sample/cube_only/matmul_kernel.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. - */ - -#include "kernel_operator.h" -#include "lib/matrix/matmul/matmul.h" -using namespace AscendC; -using namespace matmul; - -__aicore__ inline void CalcGMOffset( - int blockIdx, int usedCoreNum, TCubeTiling ¶m, int &offsetA, int &offsetB, int &offsetC) -{ - ASSERT(blockIdx < usedCoreNum); - uint32_t mIterSize = Ceil(param.M, param.singleCoreM); - ASSERT(mIterSize != 0); - uint32_t mCoreIndx = blockIdx % mIterSize; - uint32_t nCoreIndx = blockIdx / mIterSize; - - offsetA = mCoreIndx * param.Ka * param.singleCoreM; - offsetB = nCoreIndx * param.singleCoreN; - offsetC = mCoreIndx * param.N * param.singleCoreM + nCoreIndx * param.singleCoreN; - - // tail M - int gmUseM = param.M - mCoreIndx * param.singleCoreM; - param.singleCoreM = gmUseM < param.singleCoreM ? gmUseM : param.singleCoreM; - - // tail N - int gmUseN = param.N - nCoreIndx * param.singleCoreN; - param.singleCoreN = gmUseN < param.singleCoreN ? gmUseN : param.singleCoreN; - - // tail K - int gmUseK = param.Ka; - param.singleCoreK = gmUseK < param.singleCoreK ? 
gmUseK : param.singleCoreK; -} - -__aicore__ inline void CopyTiling(TCubeTiling *tiling, GM_ADDR tilingGM) -{ - uint32_t *ptr = reinterpret_cast(tiling); - auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); - - for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { - *ptr = *(tiling32 + i); - } - return; -} - -extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR tilingGm) -{ - // cube core cases, ignore vector core - if (g_coreType == AIV) { - return; - } - using A_T = half; - using B_T = half; - using C_T = float; - using BiasT = float; - - TPipe que; - TCubeTiling tiling; - CopyTiling(&tiling, tilingGm); - - if (GetBlockIdx() >= tiling.usedCoreNum) { - return; - } - - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ A_T *>(a), tiling.M * tiling.Ka); - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ B_T *>(b), tiling.Kb * tiling.N); - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ C_T *>(c), tiling.M * tiling.N); - - int offsetA = 0; - int offsetB = 0; - int offsetC = 0; - CalcGMOffset(GetBlockIdx(), tiling.usedCoreNum, tiling, offsetA, offsetB, offsetC); - auto gmA = aGlobal[offsetA]; - auto gmB = bGlobal[offsetB]; - auto gmC = cGlobal[offsetC]; - - typedef MatmulType aType; - typedef MatmulType bType; - typedef MatmulType cType; - typedef MatmulType biasType; - MatmulImpl mm; - mm.SetSubBlockIdx(0); - mm.Init(&tiling, &que); - - mm.SetTensorA(gmA); - mm.SetTensorB(gmB); - mm.IterateAll(gmC); -} - -// call of kernel function -void matmul_custom_do( - uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *tilingGm) -{ - matmul_custom<<>>(a, b, c, tilingGm); -} \ No newline at end of file diff --git a/sample/normal_sample/mix/Makefile b/sample/normal_sample/mix/Makefile deleted file mode 100644 index 8f162255b..000000000 --- a/sample/normal_sample/mix/Makefile +++ /dev/null @@ -1,29 +0,0 @@ -# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 -ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest - -COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 -COMPILER_FLAG := -xcce -O2 -std=c++17 -LINK_FLAG := --cce-fatobj-link -DAV_FLAG := --cce-aicore-arch=dav-c220 -ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 -TILING_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include -LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++ -lprofapi -lmmpa -lascendalog -lregister -lerror_manager -lc_sec -LINK_STATIC_LIBS := ${ASCEND_HOME_PATH}/lib64/libascendc_runtime.a - -all: build - -build: matmul_leakyrelu_kernel.o main.o matmul_leakyrelu.fatbin - -matmul_leakyrelu_kernel.o: matmul_leakyrelu_kernel.cpp - $(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^ - -main.o: main.cpp - $(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(HOST_INC_FLAG) $(TILING_INC_FLAG) -o $@ -c $^ - -matmul_leakyrelu.fatbin: matmul_leakyrelu_kernel.o main.o - $(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ $(LINK_LIBS) $(LINK_STATIC_LIBS) - -.PHONY: clean -clean: - rm *.o *.fatbin \ No newline at end of file diff --git a/sample/normal_sample/mix/main.cpp b/sample/normal_sample/mix/main.cpp deleted 
file mode 100644 index 91ba79932..000000000 --- a/sample/normal_sample/mix/main.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. - */ - -#include -#include -#include "kernel_tiling/kernel_tiling.h" // tiling结构体的依赖 -#include "acl/acl.h" - -extern void matmul_leakyrelu_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *ffts_addr, uint8_t *a, - uint8_t *b, uint8_t *bias, uint8_t *c, uint8_t *workspace, uint8_t *tilingGm); - -// 下面接口是libascendc_runtime.a中定义 -extern "C" uint32_t GetAscendCoreSyncAddr(void **addr); - -#define ACL_ERROR_NONE 0 - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - printf("%s: %d aclError %d\n", __FILE__, __LINE__, __ret); \ - } \ - } while (0) - -void printTensor(float *ptr, size_t size) -{ - size_t colNum = 8; - for (size_t i = 0; i < size / colNum / sizeof(float); i++) { - for (size_t j = 0; j < colNum; j++) { - printf("%5.2f ", ptr[colNum * i + j]); - } - printf("\n"); - } -} - -void fillValue(aclFloat16 *addr, size_t size, float value) -{ - aclFloat16 val = aclFloatToFloat16(value); - for (size_t i = 0; i < size / sizeof(aclFloat16); i++) { - addr[i] = val; - } -} - -void fillFloatValue(float *addr, size_t size, float value) -{ - for (size_t i = 0; i < size / sizeof(float); i++) { - addr[i] = value; - } -} - -void MakeTiling(int32_t *addr, size_t size) -{ - assert(sizeof(TCubeTiling) <= size); - // TCubeTiling该结构体在kernel_tiling/kernel_tiling.h中的结构体定义 - // tiling_api.h中本身定义的结构与kernel_tiling.h相近,通过GetTiling实现映射 - // TCubeTiling定义的可读性较好,可以直接理解,但使用tiling_api可以直接使能部分默认值 - // 考虑到工具本身需要体现对应用的细粒度控制,所以直接使用kernel_tiling.h中的结构 - TCubeTiling *tiling = (TCubeTiling *)addr; - // 此处计算使用的核数 - tiling->usedCoreNum = 2; // (M/singleCoreM)*(N/singleCoreN)*(K/singleCoreK)=2*1*1=2 - // 对于 xa 是[M, Ka]矩阵, xb 是[Kb, N]矩阵,此处数据需要与外部格式保持一致 - // 参考 AscendC算子开发文档 - // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha001/devguide/opdevg/ascendcopdevg/atlas_ascendc_10_0060.html - // 中对 数据分块(Tiling) 部分的介绍 - tiling->M = 1024; - tiling->N = 640; - tiling->Ka = 256; // Ka和Kb一般一样,只有pad的时候存在不一致,比如[1, 62]*[64, 2],这里64就是有pad的 - tiling->Kb = 256; - tiling->isBias = 1; - // 多核切分的tiling参数,用于度量单个核上处理的数据大小 - // xa在M轴上切分,分成多个singleCoreM;单核处理singleCoreM * singleCoreK大小数据 - // xb在N轴上切分,分成多个singleCoreN;单核处理singleCoreK * singleCoreN - // 由于输入在M和N轴上切分了,输出singleCoreM * singleCoreN - tiling->singleCoreM = 512; - tiling->singleCoreN = 640; - tiling->singleCoreK = 256; // 不建议对k进行切分,会导致累加,引起不确定计算 - // 核内切分的tiling参数,用于单个核内的最小计算单位 - tiling->baseM = 256; - tiling->baseN = 128; - tiling->baseK = 64; - tiling->stepM = 1; - tiling->stepN = 1; - tiling->stepKa = 4; - tiling->stepKb = 1; - // A1+B1的缓存数据需要小于等于shareL1Size大小 - tiling->depthA1 = 8; // 矩阵[baseM, baseK]的缓存数量 - tiling->depthB1 = 2; // 矩阵[basek, baseN]的缓存数量 - // 其他参数 - tiling->iterateOrder = 0; // 控制迭代的方向:0代表先M轴再N轴,1代表先N轴再M轴 - tiling->shareL1Size = 294912; // 如存在多个matmul时,可以单独控制每个使用空间 295424 - // 不小于(baseM*baseK*depthA1 + baseK+baseN*depthB1)*sizeof(half) = 294912 - tiling->shareL0CSize = 256 * 128 * 4; // 如存在多个matmul时,可以单独控制每个使用空间 - // 不小于baseM*baseN*sizeof(float) - // 下列是bmm中使用的batch参数,如果需要实现bmm,该结构体中还有其他tiling参数 - tiling->batchM = 1; // 对于普通matmul,默认1 - tiling->batchN = 1; // 对于普通matmul,默认1 - tiling->singleBatchM = 1; - tiling->singleBatchN = 1; - // 下面的db参数用于控制ping-pong - tiling->dbL0A = 2; - tiling->dbL0B = 2; - tiling->dbL0C = 1; -} - -int32_t main(int32_t argc, char *argv[]) -{ - size_t xaSize = 1024 * 256 
* sizeof(aclFloat16); - size_t xbSize = 256 * 640 * sizeof(aclFloat16); - size_t biasSize = 640 * sizeof(float); - size_t ySize = 1024 * 640 * sizeof(float); - size_t workspaceSize = 16 * 1024 * 1024 * sizeof(float); // AscendC::GetUserWorkspace中预留空间 - size_t tilingSize = 96 * sizeof(uint32_t); - uint32_t blockDim = 1; - - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - - aclFloat16 *xaHost; - CHECK_ACL(aclrtMallocHost((void **)(&xaHost), xaSize)); - fillValue(xaHost, xaSize, 4.0f); - - aclFloat16 *xbHost; - CHECK_ACL(aclrtMallocHost((void **)(&xbHost), xbSize)); - fillValue(xbHost, xbSize, 4.0f); - - float *biasHost; - CHECK_ACL(aclrtMallocHost((void **)(&biasHost), biasSize)); - fillFloatValue(biasHost, biasSize, 0.0f); - - float *workspaceHost; - CHECK_ACL(aclrtMallocHost((void **)(&workspaceHost), workspaceSize)); - fillFloatValue(workspaceHost, workspaceSize, 0.0f); - - int32_t *tilingHost; - CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingSize)); - MakeTiling(tilingHost, tilingSize); - - // 将host的输入同步到device - uint8_t *xaDevice; - uint8_t *xbDevice; - uint8_t *biasDevice; - uint8_t *tilingDevice; - uint8_t *workspaceDevice; - CHECK_ACL(aclrtMalloc((void **)&xaDevice, xaSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(xaDevice, xaSize, xaHost, xaSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 xa - CHECK_ACL(aclrtMalloc((void **)&xbDevice, xbSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(xbDevice, xbSize, xbHost, xbSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 xb - CHECK_ACL(aclrtMalloc((void **)&biasDevice, biasSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(biasDevice, biasSize, biasHost, biasSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 bias - CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, tilingHost, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备 tiling - CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy( - workspaceDevice, workspaceSize, workspaceHost, workspaceSize, ACL_MEMCPY_HOST_TO_DEVICE)); // 准备workspace - - uint8_t *yDevice; - CHECK_ACL(aclrtMalloc((void **)&yDevice, ySize, ACL_MEM_MALLOC_HUGE_FIRST)); // 准备 输出 - - void *addr; - (void)GetAscendCoreSyncAddr(&addr); - matmul_leakyrelu_custom_do(blockDim, - nullptr, - stream, - (uint8_t *)addr, - xaDevice, - xbDevice, - biasDevice, - yDevice, - workspaceDevice, - tilingDevice); - CHECK_ACL(aclrtSynchronizeStream(stream)); - - // 将device的输出同步到host - float *yHost; - CHECK_ACL(aclrtMallocHost((void **)(&yHost), ySize)); - CHECK_ACL(aclrtMemcpy(yHost, ySize, yDevice, ySize, ACL_MEMCPY_DEVICE_TO_HOST)); - printTensor(yHost, 4 * 8 * 4); - - // 释放资源 - CHECK_ACL(aclrtFree(xaDevice)); - CHECK_ACL(aclrtFree(xbDevice)); - CHECK_ACL(aclrtFree(tilingDevice)); - CHECK_ACL(aclrtFree(yDevice)); - - CHECK_ACL(aclrtFreeHost(xaHost)); - CHECK_ACL(aclrtFreeHost(xbHost)); - CHECK_ACL(aclrtFreeHost(tilingHost)); - CHECK_ACL(aclrtFreeHost(yHost)); - - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); - return 0; -} \ No newline at end of file diff --git a/sample/normal_sample/mix/matmul_leakyrelu_kernel.cpp b/sample/normal_sample/mix/matmul_leakyrelu_kernel.cpp deleted file mode 
100644 index 378a8a172..000000000 --- a/sample/normal_sample/mix/matmul_leakyrelu_kernel.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. - */ - -#include "kernel_operator.h" -#include "lib/matmul_intf.h" - -using namespace AscendC; -using namespace matmul; - -template -class MatmulLeakyKernel { -public: - __aicore__ inline MatmulLeakyKernel(){}; - __aicore__ inline void Init( - GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling, TPipe *pipe); - __aicore__ inline void Process(TPipe *pipe); - - __aicore__ inline void MatmulCompute(); - __aicore__ inline void LeakyReluCompute(); - __aicore__ inline void CopyOut(uint32_t count); - __aicore__ inline void CalcOffset(int32_t blockIdx, int32_t usedCoreNum, const TCubeTiling &tiling, - int32_t &offsetA, int32_t &offsetB, int32_t &offsetC, int32_t &offsetBias); - - Matmul, MatmulType, - MatmulType, MatmulType> - matmulObj; - - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - GlobalTensor biasGlobal; - LocalTensor reluOutLocal; - TCubeTiling tiling; - TQue reluOutQueue_; -}; - -template -__aicore__ inline void MatmulLeakyKernel::Init( - GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGM, TPipe *pipe) -{ - auto tempTilingGM = (__gm__ uint32_t *)tilingGM; - auto tempTiling = (uint32_t *)&tiling; - for (int32_t i = 0; i < sizeof(TCubeTiling) / sizeof(int32_t); ++i, ++tempTilingGM, ++tempTiling) { - *tempTiling = *tempTilingGM; - } - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType *>(a), tiling.M * tiling.Ka); - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType *>(b), tiling.Kb * tiling.N); - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType *>(c), tiling.M * tiling.N); - biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType *>(bias), tiling.N); - - int32_t offsetA, offsetB, offsetC, offsetBias; - CalcOffset(GetBlockIdx(), tiling.usedCoreNum, tiling, offsetA, offsetB, offsetC, offsetBias); - aGlobal = aGlobal[offsetA]; - bGlobal = bGlobal[offsetB]; - cGlobal = cGlobal[offsetC]; - biasGlobal = biasGlobal[offsetBias]; - pipe->InitBuffer(reluOutQueue_, 1, tiling.baseM * tiling.baseN * sizeof(cType)); - SetSysWorkspace(workspace); - if (GetSysWorkSpacePtr() == nullptr) { - return; - } -} - -template -__aicore__ inline void MatmulLeakyKernel::Process(TPipe *pipe) -{ - uint32_t computeRound = 0; - -#ifdef CUSTOM_ASCEND310P - TBuf<> tmpMMFormatUb; - LocalTensor mmformatUb; - pipe->InitBuffer(tmpMMFormatUb, tiling.baseM * tiling.baseN * sizeof(cType)); - mmformatUb = tmpMMFormatUb.Get(tiling.baseM * tiling.baseN * sizeof(cType)); - matmulObj.SetLocalWorkspace(mmformatUb); -#endif - matmulObj.SetTensorA(aGlobal); - matmulObj.SetTensorB(bGlobal); - matmulObj.SetBias(biasGlobal); - while (matmulObj.template Iterate()) { - MatmulCompute(); - LeakyReluCompute(); - CopyOut(computeRound); - computeRound++; - } - matmulObj.End(); -} - -template -__aicore__ inline void MatmulLeakyKernel::MatmulCompute() -{ - reluOutLocal = reluOutQueue_.AllocTensor(); - matmulObj.template GetTensorC(reluOutLocal, false, true); -} - -template -__aicore__ inline void MatmulLeakyKernel::LeakyReluCompute() -{ - LeakyRelu(reluOutLocal, reluOutLocal, (cType)0.001, tiling.baseM * tiling.baseN); - reluOutQueue_.EnQue(reluOutLocal); -} - -template -__aicore__ inline void MatmulLeakyKernel::CopyOut(uint32_t count) -{ - reluOutQueue_.DeQue(); - const uint32_t roundM = tiling.singleCoreM / tiling.baseM; - const uint32_t 
roundN = tiling.singleCoreN / tiling.baseN; - uint32_t startOffset = (count % roundM * tiling.baseM * tiling.N + count / roundM * tiling.baseN); - DataCopyParams copyParam = {(uint16_t)tiling.baseM, - (uint16_t)(tiling.baseN * sizeof(cType) / DEFAULT_C0_SIZE), - 0, - (uint16_t)((tiling.N - tiling.baseN) * sizeof(cType) / DEFAULT_C0_SIZE)}; - DataCopy(cGlobal[startOffset], reluOutLocal, copyParam); - reluOutQueue_.FreeTensor(reluOutLocal); -} - -template -__aicore__ inline void MatmulLeakyKernel::CalcOffset(int32_t blockIdx, - int32_t usedCoreNum, const TCubeTiling &tiling, int32_t &offsetA, int32_t &offsetB, int32_t &offsetC, - int32_t &offsetBias) -{ - auto mSingleBlocks = Ceil(tiling.M, tiling.singleCoreM); - auto mCoreIndx = blockIdx % mSingleBlocks; - auto nCoreIndx = blockIdx / mSingleBlocks; - - offsetA = mCoreIndx * tiling.Ka * tiling.singleCoreM; - offsetB = nCoreIndx * tiling.singleCoreN; - offsetC = mCoreIndx * tiling.N * tiling.singleCoreM + nCoreIndx * tiling.singleCoreN; - offsetBias = nCoreIndx * tiling.singleCoreN; -} - -__aicore__ inline void matmul_leakyrelu_custom_inner( - GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) -{ - MatmulLeakyKernel matmulLeakyKernel; - TPipe pipe; - matmulLeakyKernel.Init(a, b, bias, c, workspace, tiling, &pipe); - REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulLeakyKernel.matmulObj, &matmulLeakyKernel.tiling); - matmulLeakyKernel.Process(&pipe); -} - -// 基于CANN sample仓中的MatMulLeakyReluCustomSample进行改造 -// https://gitee.com/ascend/samples/tree/master/operator/ -// 如下内容是编译生成的build/auto_gen/ascendc_kernels_npu目录下 -// auto_gen_matmul_leakyrelu_custom.cpp中代码的简化 -// 具体简化内容: -// 1. 移除了cpu部分的代码 -// 2. 由于本身不再需要修改核函数入口,故而不需要__global__宏的替代 -// 3. 由于明确有workspace和tiling,故而移除了这些宏中的内容 -__aicore__ inline GM_ADDR kfc_init(GM_ADDR ffts_addr, GM_ADDR workspace) -{ - GM_ADDR workspace_param; - GM_ADDR workspace_usr; - workspace_param = workspace; - if (workspace_param == nullptr) { - return workspace; - } - set_ffts_base_addr((uint64_t)ffts_addr); - AscendC::SetSysWorkspaceForce(workspace_param); - workspace_usr = AscendC::GetUserWorkspace(workspace_param); -#if defined(REGIST_MATMUL_OBJ) - if constexpr (g_coreType == AscendC::AIC) { - matmul::clearWorkspace(workspace_param); - } - if constexpr (g_coreType == AscendC::AIV) { -#ifdef MIX_N - FftsCrossCoreSync(PIPE_MTE3, 0x501); - WaitEvent(0x5); -#endif - } -#endif - return workspace_usr; -} - -extern "C" __global__ __aicore__ void matmul_leakyrelu_custom( - GM_ADDR ffts_addr, GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) -{ - workspace = kfc_init(ffts_addr, workspace); - matmul_leakyrelu_custom_inner(a, b, bias, c, workspace, tiling); -} - -// call of kernel function -void matmul_leakyrelu_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *ffts_addr, uint8_t *a, - uint8_t *b, uint8_t *bias, uint8_t *c, uint8_t *workspace, uint8_t *tilingGm) -{ - matmul_leakyrelu_custom<<>>(ffts_addr, a, b, bias, c, workspace, tilingGm); -} diff --git a/sample/normal_sample/vec_only/Makefile b/sample/normal_sample/vec_only/Makefile deleted file mode 100644 index 00e70c995..000000000 --- a/sample/normal_sample/vec_only/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 -ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest - -COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 -COMPILER_FLAG := -xcce -O2 
-std=c++17 -LINK_FLAG := --cce-fatobj-link -DAV_FLAG := --cce-aicore-arch=dav-c220-vec -ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 -HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include -LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++ - -all: build - -build: add_kernel.o main.o add.fatbin - -add_kernel.o: add_kernel.cpp - $(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^ - -main.o: main.cpp - $(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(HOST_INC_FLAG) -o $@ -c $^ - -add.fatbin: add_kernel.o main.o - $(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ $(LINK_LIBS) - -.PHONY: clean -clean: - rm *.o add.fatbin \ No newline at end of file diff --git a/sample/normal_sample/vec_only/add_kernel.cpp b/sample/normal_sample/vec_only/add_kernel.cpp deleted file mode 100644 index 444be23e5..000000000 --- a/sample/normal_sample/vec_only/add_kernel.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. - * - * Function : z = x + y - * This sample is a very basic sample that implements vector add on Ascend plaform. - * In this sample: - * Length of x / y / z is 8*2048. - * Num of vector core used in sample is 8. - * Length for each core to compute is 2048. - * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. - * - */ -#include "kernel_operator.h" -using namespace AscendC; -constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data -constexpr int32_t USE_CORE_NUM = 8; // num of core used -constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core -constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue -constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer - -class KernelAdd { -public: - __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) - { - // get start index for current core, core parallel - xGm.SetGlobalBuffer((__gm__ int16_t*)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - yGm.SetGlobalBuffer((__gm__ int16_t*)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - zGm.SetGlobalBuffer((__gm__ int16_t*)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - // pipe alloc memory to queue, the unit is Bytes - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(int16_t)); - } - __aicore__ inline void Process() - { - // loop count need to be doubled, due to double buffer - constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; - // tiling strategy, pipeline parallel - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); - } - } -private: - __aicore__ inline void CopyIn(int32_t progress) - { - // alloc tensor from queue memory - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = inQueueY.AllocTensor(); - // copy progress_th tile from global tensor to local tensor - DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); - DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); - // enque input tensors to VECIN queue - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ 
inline void Compute(int32_t progress) - { - // deque input tensors from VECIN queue - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = inQueueY.DeQue(); - LocalTensor zLocal = outQueueZ.AllocTensor(); - // call Add instr for computation - Add(zLocal, xLocal, yLocal, TILE_LENGTH); - // enque the output tensor to VECOUT queue - outQueueZ.EnQue(zLocal); - // free input tensors for reuse - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - } - __aicore__ inline void CopyOut(int32_t progress) - { - // deque output tensor from VECOUT queue - LocalTensor zLocal = outQueueZ.DeQue(); - // copy progress_th tile from local tensor to global tensor - DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); - // free output tensor for reuse - outQueueZ.FreeTensor(zLocal); - } -private: - TPipe pipe; - // create queues for input, in this case depth is equal to buffer num - TQue inQueueX, inQueueY; - // create queue for output, in this case depth is equal to buffer num - TQue outQueueZ; - GlobalTensor xGm, yGm, zGm; -}; -// implementation of kernel function -extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) -{ - KernelAdd op; - op.Init(x, y, z); - op.Process(); -} - -// call of kernel function -void add_custom_do(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* x, uint8_t* y, uint8_t* z) -{ - add_custom<<>>(x, y, z); -} \ No newline at end of file diff --git a/sample/normal_sample/vec_only/main.cpp b/sample/normal_sample/vec_only/main.cpp deleted file mode 100644 index 45865cea5..000000000 --- a/sample/normal_sample/vec_only/main.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. - */ - -#include -#include "acl/acl.h" - -#define ACL_ERROR_NONE 0 - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - printf("%s: %d aclError %d\n", __FILE__, __LINE__, __ret); \ - } \ - } while (0) - -void prepareTensor(int16_t *ptr, size_t size) -{ - for (size_t i = 0; i < size / sizeof(int16_t); i++) { - ptr[i] = i; - } -} - -void printTensor(int16_t *ptr, size_t size) -{ - size_t colNum = 8; - for (size_t i = 0; i < size / colNum / sizeof(int16_t); i++) { - for (size_t j = 0; j < colNum; j++) { - printf("%hu ", ptr[colNum * i + j]); - } - printf("\n"); - } -} - -extern void add_custom_do(uint32_t coreDim, void *l2ctrl, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); -int32_t main(int32_t argc, char *argv[]) -{ - size_t inputByteSize = 8 * 2048 * sizeof(int16_t); - size_t outputByteSize = 8 * 2048 * sizeof(int16_t); - uint32_t blockDim = 8; - // AscendCL初始化 - CHECK_ACL(aclInit(nullptr)); - // 运行管理资源申请 - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - // 分配Host内存 - int16_t *xHost = nullptr; - int16_t *yHost = nullptr; - int16_t *zHost = nullptr; - CHECK_ACL(aclrtMallocHost((void**)(&xHost), inputByteSize)); - CHECK_ACL(aclrtMallocHost((void**)(&yHost), inputByteSize)); - CHECK_ACL(aclrtMallocHost((void**)(&zHost), outputByteSize)); - // 分配Device内存 - uint8_t *xDevice = nullptr; - uint8_t *yDevice = nullptr; - uint8_t *zDevice = nullptr; - CHECK_ACL(aclrtMalloc((void**)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void**)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void**)&zDevice, outputByteSize, 
ACL_MEM_MALLOC_HUGE_FIRST));
-    // Initialize host memory
-    prepareTensor(xHost, inputByteSize);
-    prepareTensor(yHost, inputByteSize);
-    CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
-    CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
-    // Launch the kernel through the <<<>>> launch syntax, which add_custom_do wraps
-    add_custom_do(blockDim, nullptr, stream, xDevice, yDevice, zDevice);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-    // Copy the result from the device back to the host
-    CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
-    printTensor(zHost, 2 * 8 * 2); // print 2 rows
-    // Release the allocated resources
-    CHECK_ACL(aclrtFree(xDevice));
-    CHECK_ACL(aclrtFree(yDevice));
-    CHECK_ACL(aclrtFree(zDevice));
-    CHECK_ACL(aclrtFreeHost(xHost));
-    CHECK_ACL(aclrtFreeHost(yHost));
-    CHECK_ACL(aclrtFreeHost(zHost));
-    // Deinitialize AscendCL
-    CHECK_ACL(aclrtDestroyStream(stream));
-    CHECK_ACL(aclrtDestroyContext(context));
-    CHECK_ACL(aclrtResetDevice(deviceId));
-    CHECK_ACL(aclFinalize());
-    return 0;
-}
\ No newline at end of file
diff --git a/sample/pytorch_adapter/README.md b/sample/pytorch_adapter/README.md
deleted file mode 100644
index a2b1ba635..000000000
--- a/sample/pytorch_adapter/README.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# Adapting the custom operator to the PyTorch framework
-
-## Introduction
-Ascend offers many ways to plug operators into frameworks. This directory shows the simplest one; each subdirectory is an independent, ready-to-use project.
-
-## Dependencies
-As with operator integration for PyTorch elsewhere in the industry, framework adaptation requires a correct PyTorch version on the device (plus a matching torch_npu version).
-
-PyTorch can be installed via pip; for torch_npu releases see [here](https://gitee.com/ascend/pytorch/releases) and pick the torch_npu build that matches your PyTorch.
-
-## Project layout
-The overall directory structure:
-```
-- pytorch_adapter
- |- jit_compile # just-in-time compilation flavor
- |- add_adapter.cpp # adapts the operator to PyTorch through the operator shared-library interface
- |- add_kernel.cpp # the Ascend operator implementation, exposing a host-side shared-library interface
- |- main.py # Python entry point tying everything together
- |- Makefile # build script producing the host-side shared library for the Ascend operator
- |- with_setuptools # wheel-package flavor
- |- add_adapter.cpp
- |- add_kernel.cpp
- |- Makefile
- |- setup.py # setuptools entry point; compiles and packages the wheel
- |- test.py # entry point for testing the wheel's functionality
-```
-
-## Usage
-
-### jit_compile project
-Run the following; the Python module is generated on the fly during execution and used directly:
-```
-python main.py
-```
-
-### setuptools project
-The with_setuptools project builds an installable wheel package, which is convenient for multi-machine deployment.
-
-
-1. Build the package (setuptools supports several modes, e.g. build and install; only one is shown here):
-```
-python setup.py bdist_wheel # builds the wheel under the dist directory
-```
-
-2. Install the package from the ```dist``` directory with pip.
-
-3. Run the test script:
-```
-python test.py
-```
-
-## Other notes
-1. These samples use static tiling. With dynamic tiling, the adapter (add_adapter.cpp) would analyze the tensor shapes and choose a suitable tiling. (That step is mandatory somewhere in the flow; it just lives in different places. In aclnn it happens inside the interface implementation, and here add_custom_do itself could be wrapped so the tiling is chosen internally.) A sketch of such a shape check follows below.
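-2. As a concrete illustration of that shape analysis (hypothetical; not part of the shipped adapter), a defensive NPU implementation could reject shapes the static tiling cannot handle before launching. `my_add_impl_npu` is the NPU forward defined in add_adapter.cpp; the wrapper name is illustrative:
-```
-at::Tensor my_add_impl_npu_checked(const at::Tensor &self, const at::Tensor &other)
-{
-    // Guard the static 8*2048 float16 tiling before handing off to the kernel.
-    TORCH_CHECK(self.sizes() == other.sizes(), "my_add: inputs must share one shape");
-    TORCH_CHECK(self.scalar_type() == at::kHalf, "my_add: kernel only handles float16");
-    TORCH_CHECK(self.numel() == 8 * 2048, "my_add: static tiling expects 8*2048 elements");
-    return my_add_impl_npu(self, other);
-}
-```
-Shapes that fail these checks would otherwise silently produce wrong results, since the kernel always processes exactly 8*2048 elements.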
\ No newline at end of file
diff --git a/sample/pytorch_adapter/jit_compile/Makefile b/sample/pytorch_adapter/jit_compile/Makefile
deleted file mode 100644
index ec9115f37..000000000
--- a/sample/pytorch_adapter/jit_compile/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Location of the CANN toolkit; distilled from ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # see CMAKE_C_COMPILER in device_config.cmake
-COMPILER_FLAG := -xcce -O2 -std=c++17
-DYNAMIC_LIB_FLAG := -fPIC -shared
-DAV_FLAG := --cce-aicore-arch=dav-c220-vec
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # simplified from the device_intf.cmake configuration
-
-all: build
-
-build: libcustom_kernels.so
-
-# to extend this later, append additional kernel .cpp files here
-libcustom_kernels.so: add_kernel.cpp
-	$(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^
-
-.PHONY: clean
-clean:
-	rm *.so
\ No newline at end of file
diff --git a/sample/pytorch_adapter/jit_compile/add_adapter.cpp b/sample/pytorch_adapter/jit_compile/add_adapter.cpp
deleted file mode 100644
index 6c65e60ec..000000000
--- a/sample/pytorch_adapter/jit_compile/add_adapter.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <torch/extension.h>
-#include "torch_npu/csrc/core/npu/NPUStream.h"
-#include "torch_npu/csrc/framework/OpCommand.h"
-
-using torch::autograd::AutogradContext;
-using torch::autograd::Function;
-using tensor_list = std::vector<at::Tensor>;
-using namespace at;
-
-extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
-
-// Forward implementation registered for the NPU device
-at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other)
-{
-    // Create the output storage
-    at::Tensor result = at::Tensor(self);
-    // Translate the PyTorch structures into types and layouts CANN understands:
-    // 1. (Important) Analyze the tensor shapes to pick a suitable tiling. (This operator
-    //    hard-codes its tiling for simplicity, so results are only correct for that shape.)
-    // 2. Convert dtypes and formats -- nothing to convert here, used as-is.
-    auto stream = c10_npu::getCurrentNPUStream().stream(false);
-    auto x = self.storage().data();
-    auto y = other.storage().data();
-    auto z = result.storage().data();
-
-    uint32_t blockDim = 8;
-    auto callback = [stream, blockDim, x, y, z]() -> int {
-        add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z);
-        return 0; // the kernel's result status could be surfaced here; not implemented yet
-    };
-    // Submit the operator
-    at_npu::native::OpCommand cmd;
-    cmd.Name("my_add").SetCustomHandler(callback).Run();
-    return result;
-}
-
-// Backward implementation registered for the NPU device
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_npu(const at::Tensor &self)
-{
-    at::Tensor result = at::Tensor(self); // create the output storage
-
-    return {result, result};
-}
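-
-// For z = x + y, dz/dx and dz/dy are both the identity, so the backward returns
-// the incoming output gradient once per input. Note that at::Tensor(self) shares
-// storage with its argument instead of copying, which is enough for this sample.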
-
-// Forward implementation registered for the Meta device
-at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other)
-{
-    return empty_like(self);
-}
-
-// Backward implementation registered for the Meta device
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_meta(const at::Tensor &self)
-{
-    auto result = empty_like(self);
-    return std::make_tuple(result, result);
-}
-
-// Look up whichever per-device implementation is registered on this op
-at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other)
-{
-    static auto op =
-        torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed<decltype(my_add_impl)>();
-    return op.call(self, other);
-}
-// Look up whichever per-device implementation is registered on this op
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl(const at::Tensor &self)
-{
-    static auto op = torch::Dispatcher::singleton()
-                         .findSchemaOrThrow("myaten::my_add_backward", "")
-                         .typed<decltype(my_add_backward_impl)>();
-    return op.call(self);
-}
-
-// Register the my_add and my_add_backward schemas in the myaten namespace
-TORCH_LIBRARY(myaten, m)
-{
-    m.def("my_add(Tensor self, Tensor other) -> Tensor");
-    m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)");
-}
-
-// Bind forward and backward together by deriving from torch::autograd::Function
-class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
-public:
-    static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other)
-    {
-        at::AutoDispatchBelowADInplaceOrView guard;
-        return my_add_impl(self, other);
-    }
-
-    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
-    {
-        auto grad_output = grad_outputs[0];
-        auto result = my_add_backward_impl(grad_output);
-        return {std::get<0>(result), std::get<1>(result)};
-    }
-};
-
-at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other)
-{
-    return MyAddFunction::apply(self, other);
-}
-
-// Bind the NPU autograd implementation to the op
-// On PyTorch versions below 2.1, AutogradPrivateUse1 must be replaced with AutogradXLA
-TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m)
-{
-    m.impl("my_add", &my_add_impl_autograd);
-}
-
-// Register forward and backward for the NPU device
-// PyTorch 2.1+ names the NPU device PrivateUse1; below 2.1 it was XLA, so replace
-// PrivateUse1 with XLA on those versions
-TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m)
-{
-    m.impl("my_add", &my_add_impl_npu);
-    m.impl("my_add_backward", &my_add_backward_impl_npu);
-}
-
-// Register forward and backward for the Meta device
-TORCH_LIBRARY_IMPL(myaten, Meta, m)
-{
-    m.impl("my_add", &my_add_impl_meta);
-    m.impl("my_add_backward", &my_add_backward_impl_meta);
-}
-
-// Bind the C++ entry point to Python via pybind
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("my_add", &my_add_impl_autograd, "x + y");
-}
diff --git a/sample/pytorch_adapter/jit_compile/add_kernel.cpp b/sample/pytorch_adapter/jit_compile/add_kernel.cpp
deleted file mode 100644
index 9aa62e093..000000000
--- a/sample/pytorch_adapter/jit_compile/add_kernel.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
- *
- * Function : z = x + y
- * This sample is a very basic sample that implements vector add on the Ascend platform.
- * In this sample:
- * Length of x / y / z is 8*2048.
- * Num of vector core used in sample is 8. - * Length for each core to compute is 2048. - * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. - * - */ -#include "kernel_operator.h" -using namespace AscendC; -constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data -constexpr int32_t USE_CORE_NUM = 8; // num of core used -constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core -constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue -constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer - -class KernelAdd { -public: - __aicore__ inline KernelAdd() - {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) - { - // get start index for current core, core parallel - xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - // pipe alloc memory to queue, the unit is Bytes - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - } - __aicore__ inline void Process() - { - // loop count need to be doubled, due to double buffer - constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; - // tiling strategy, pipeline parallel - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); - } - } - -private: - __aicore__ inline void CopyIn(int32_t progress) - { - // alloc tensor from queue memory - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = inQueueY.AllocTensor(); - // copy progress_th tile from global tensor to local tensor - DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); - DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); - // enque input tensors to VECIN queue - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ inline void Compute(int32_t progress) - { - // deque input tensors from VECIN queue - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = inQueueY.DeQue(); - LocalTensor zLocal = outQueueZ.AllocTensor(); - // call Add instr for computation - Add(zLocal, xLocal, yLocal, TILE_LENGTH); - // enque the output tensor to VECOUT queue - outQueueZ.EnQue(zLocal); - // free input tensors for reuse - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - } - __aicore__ inline void CopyOut(int32_t progress) - { - // deque output tensor from VECOUT queue - LocalTensor zLocal = outQueueZ.DeQue(); - // copy progress_th tile from local tensor to global tensor - DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); - // free output tensor for reuse - outQueueZ.FreeTensor(zLocal); - } - -private: - TPipe pipe; - // create queues for input, in this case depth is equal to buffer num - TQue inQueueX, inQueueY; - // create queue for output, in this case depth is equal to buffer num - TQue outQueueZ; - GlobalTensor xGm, yGm, zGm; -}; -// implementation of kernel function -extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) -{ - KernelAdd op; - op.Init(x, y, z); - op.Process(); -} - -// 包裹核函数,使得普通编译器能认识这个符号 -extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) -{ - 
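-    // Launch configuration: blockDim selects how many AI vector cores run the
-    // kernel (8 here, matching USE_CORE_NUM above); stream is the ACL stream
-    // the launch is queued on.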
add_custom<<>>(x, y, z); -} \ No newline at end of file diff --git a/sample/pytorch_adapter/jit_compile/main.py b/sample/pytorch_adapter/jit_compile/main.py deleted file mode 100644 index 847a51f1c..000000000 --- a/sample/pytorch_adapter/jit_compile/main.py +++ /dev/null @@ -1,70 +0,0 @@ -import os -import subprocess -import torch -import torch_npu -import torch.utils.cpp_extension -from torch_npu.testing.testcase import TestCase, run_tests - -PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__)) -CUR_PATH = os.path.abspath(os.path.dirname(__file__)) - - -def compile_kernels(): - # 由于pytorch中没有昇腾device编译的扩展,所以此处人工加make - subprocess.run("make") - - -def compile_host(): - extra_ldflags = [] - extra_ldflags.append(f"-L{PYTORCH_NPU_INSTALL_PATH}/lib") - extra_ldflags.append("-ltorch_npu") - extra_ldflags.append(f"-L{CUR_PATH}/") - extra_ldflags.append("-lcustom_kernels") - extra_include_paths = [] - extra_include_paths.append("./") - extra_include_paths.append(os.path.join( - PYTORCH_NPU_INSTALL_PATH, "include")) - extra_include_paths.append(os.path.join(os.path.join(os.path.join(os.path.join( - PYTORCH_NPU_INSTALL_PATH, "include"), "third_party"), "acl"), "inc")) - - module = torch.utils.cpp_extension.load( - name="jit_extension", - sources=[ - "add_adapter.cpp" - ], - extra_include_paths=extra_include_paths, - extra_ldflags=extra_ldflags, - verbose=True) - return module - - -class TestCustomAdd(TestCase): - def test_add(self): - module = compile_host() - # 由于kernel现在是静态tiling,所以此处尺寸需要匹配 - # 因为add是elementwise的,现有算子支持8*2048(详见kernel实现),所以,小于这个应该都可以 - length = [8, 2048] - x = torch.rand(length, device='cpu', dtype=torch.float16) - y = torch.rand(length, device='cpu', dtype=torch.float16) - - x_npu = x.npu() - y_npu = y.npu() - x_npu.requires_grad = True - y_npu.requires_grad = True - output = module.my_add(x_npu, y_npu) - # 反向能力验证 - output.backward(output) - - x.requires_grad = True - y.requires_grad = True - cpuout = torch.add(x, y) - cpuout.backward(cpuout) - - self.assertRtolEqual(output, cpuout) - self.assertRtolEqual(x_npu.grad, x.grad) - self.assertRtolEqual(y_npu.grad, y.grad) - - -if __name__ == '__main__': - compile_kernels() - run_tests() diff --git a/sample/pytorch_adapter/with_setuptools/Makefile b/sample/pytorch_adapter/with_setuptools/Makefile deleted file mode 100644 index ec9115f37..000000000 --- a/sample/pytorch_adapter/with_setuptools/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Location of the CANN, 主要基于${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake中内容简化 -ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest - -COMPILER := $(ASCEND_HOME_PATH)/compiler/ccec_compiler/bin/ccec # 参考device_config.cmake中CMAKE_C_COMPILER配置 -COMPILER_FLAG := -xcce -O2 -std=c++17 -DYNAMIC_LIB_FLAG := -fPIC -shared -DAV_FLAG := --cce-aicore-arch=dav-c220-vec -ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # 参考device_intf.cmake的配置简化 - -all: build - -build: libcustom_kernels.so - -# 后续如果要扩展,把多个kernel的cpp都加到后面 -libcustom_kernels.so: add_kernel.cpp - $(COMPILER) $(DYNAMIC_LIB_FLAG) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ $^ - -.PHONY: clean -clean: - rm *.so \ No newline at end of file diff --git a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp deleted file mode 100644 index 6c65e60ec..000000000 --- 
diff --git a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp b/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
deleted file mode 100644
index 6c65e60ec..000000000
--- a/sample/pytorch_adapter/with_setuptools/add_adapter.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <torch/extension.h>
-#include "torch_npu/csrc/core/npu/NPUStream.h"
-#include "torch_npu/csrc/framework/OpCommand.h"
-
-using torch::autograd::AutogradContext;
-using torch::autograd::Function;
-using tensor_list = std::vector<at::Tensor>;
-using namespace at;
-
-extern "C" void add_custom_do(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z);
-
-// forward implementation registered for the NPU device
-at::Tensor my_add_impl_npu(const at::Tensor &self, const at::Tensor &other)
-{
-    // allocate the output tensor
-    at::Tensor result = at::Tensor(self);
-    // Translate the PyTorch structures into data types and layouts CANN understands:
-    // 1. (Important) Pick a suitable tiling from the tensor shape. To keep this sample simple
-    //    the tiling is fixed, so the computation is only correct for the supported shape.
-    // 2. Convert data type and format -- no format handling is needed here, use as-is.
-    auto stream = c10_npu::getCurrentNPUStream().stream(false);
-    auto x = self.storage().data();
-    auto y = other.storage().data();
-    auto z = result.storage().data();
-
-    uint32_t blockDim = 8;
-    auto callback = [stream, blockDim, x, y, z]() -> int {
-        add_custom_do(blockDim, stream, (uint8_t *)x, (uint8_t *)y, (uint8_t *)z);
-        return 0; // the kernel's execution status could be surfaced here; not implemented yet
-    };
-    // dispatch the operator
-    at_npu::native::OpCommand cmd;
-    cmd.Name("my_add").SetCustomHandler(callback).Run();
-    return result;
-}
-
-// backward implementation registered for the NPU device
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_npu(const at::Tensor &self)
-{
-    at::Tensor result = at::Tensor(self); // allocate the output tensor
-
-    return {result, result};
-}
-
-// forward implementation registered for the Meta device
-at::Tensor my_add_impl_meta(const at::Tensor &self, const at::Tensor &other)
-{
-    return empty_like(self);
-}
-
-// backward implementation registered for the Meta device
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl_meta(const at::Tensor &self)
-{
-    auto result = empty_like(self);
-    return std::make_tuple(result, result);
-}
-
-// look up the per-device implementations registered on this op
-at::Tensor my_add_impl(const at::Tensor &self, const at::Tensor &other)
-{
-    static auto op =
-        torch::Dispatcher::singleton().findSchemaOrThrow("myaten::my_add", "").typed<decltype(my_add_impl)>();
-    return op.call(self, other);
-}
-// look up the per-device implementations registered on this op
-std::tuple<at::Tensor, at::Tensor> my_add_backward_impl(const at::Tensor &self)
-{
-    static auto op = torch::Dispatcher::singleton()
-                         .findSchemaOrThrow("myaten::my_add_backward", "")
-                         .typed<decltype(my_add_backward_impl)>();
-    return op.call(self);
-}
-
-// register the my_add and my_add_backward schemas in the myaten namespace
-TORCH_LIBRARY(myaten, m)
-{
-    m.def("my_add(Tensor self, Tensor other) -> Tensor");
-    m.def("my_add_backward(Tensor self) -> (Tensor, Tensor)");
-}
-
-// bind forward and backward together by deriving from torch::autograd::Function
-class MyAddFunction : public torch::autograd::Function<MyAddFunction> {
-public:
-    static at::Tensor forward(AutogradContext *ctx, at::Tensor self, at::Tensor other)
-    {
-        at::AutoDispatchBelowADInplaceOrView guard;
-        return my_add_impl(self, other);
-    }
-
-    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
-    {
-        auto grad_output = grad_outputs[0];
-        auto result = my_add_backward_impl(grad_output);
-        return {std::get<0>(result), std::get<1>(result)};
-    }
-};
-
-at::Tensor my_add_impl_autograd(const at::Tensor &self, const at::Tensor &other)
-{
-    return MyAddFunction::apply(self, other);
-}
-
-// bind the autograd implementation of the op for NPU
-// on PyTorch versions below 2.1, AutogradPrivateUse1 has to be changed to AutogradXLA
-TORCH_LIBRARY_IMPL(myaten, AutogradPrivateUse1, m)
-{
-    m.impl("my_add", &my_add_impl_autograd);
-}
-
-// Register the forward and backward implementations for the NPU device.
-// PyTorch 2.1 and later use the device name PrivateUse1 for NPU; versions below 2.1
-// use XLA, so change PrivateUse1 to XLA on those versions.
-TORCH_LIBRARY_IMPL(myaten, PrivateUse1, m)
-{
-    m.impl("my_add", &my_add_impl_npu);
-    m.impl("my_add_backward", &my_add_backward_impl_npu);
-}
-
-// register the forward and backward implementations for the Meta device
-TORCH_LIBRARY_IMPL(myaten, Meta, m)
-{
-    m.impl("my_add", &my_add_impl_meta);
-    m.impl("my_add_backward", &my_add_backward_impl_meta);
-}
-
-// bind the C++ interface to Python via pybind
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("my_add", &my_add_impl_autograd, "x + y");
-}
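Because my_add is defined as a dispatcher schema rather than a plain pybind function, additional backends can be attached without touching the NPU path. The following is a hedged sketch, not part of the sample: it registers an eager CPU implementation for the same `myaten::my_add` schema, which can serve as a golden reference while debugging the NPU kernel. `my_add_impl_cpu` is a name invented here, and the sketch assumes it is compiled into the same extension that declares `TORCH_LIBRARY(myaten, ...)`.

```cpp
// Hypothetical CPU reference for the myaten::my_add schema defined above.
#include <torch/library.h>
#include <ATen/ATen.h>

at::Tensor my_add_impl_cpu(const at::Tensor &self, const at::Tensor &other)
{
    return self + other; // eager CPU add as the golden reference
}

// Attach the CPU implementation to the existing schema; the dispatcher
// then routes CPU tensors here and NPU tensors to my_add_impl_npu.
TORCH_LIBRARY_IMPL(myaten, CPU, m)
{
    m.impl("my_add", &my_add_impl_cpu);
}
```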
m.impl("my_add_backward", &my_add_backward_impl_meta); -} - -// 通过pybind将c++接口和python接口绑定 -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("my_add", &my_add_impl_autograd, "x + y"); -} diff --git a/sample/pytorch_adapter/with_setuptools/add_kernel.cpp b/sample/pytorch_adapter/with_setuptools/add_kernel.cpp deleted file mode 100644 index 9aa62e093..000000000 --- a/sample/pytorch_adapter/with_setuptools/add_kernel.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. - * - * Function : z = x + y - * This sample is a very basic sample that implements vector add on Ascend plaform. - * In this sample: - * Length of x / y / z is 8*2048. - * Num of vector core used in sample is 8. - * Length for each core to compute is 2048. - * Tiles for each core is 8 which means we add 2048/8=256 elements in one loop. - * - */ -#include "kernel_operator.h" -using namespace AscendC; -constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data -constexpr int32_t USE_CORE_NUM = 8; // num of core used -constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core -constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core -constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue -constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // seperate to 2 parts, due to double buffer - -class KernelAdd { -public: - __aicore__ inline KernelAdd() - {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) - { - // get start index for current core, core parallel - xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * GetBlockIdx(), BLOCK_LENGTH); - // pipe alloc memory to queue, the unit is Bytes - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); - } - __aicore__ inline void Process() - { - // loop count need to be doubled, due to double buffer - constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM; - // tiling strategy, pipeline parallel - for (int32_t i = 0; i < loopCount; i++) { - CopyIn(i); - Compute(i); - CopyOut(i); - } - } - -private: - __aicore__ inline void CopyIn(int32_t progress) - { - // alloc tensor from queue memory - LocalTensor xLocal = inQueueX.AllocTensor(); - LocalTensor yLocal = inQueueY.AllocTensor(); - // copy progress_th tile from global tensor to local tensor - DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); - DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); - // enque input tensors to VECIN queue - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ inline void Compute(int32_t progress) - { - // deque input tensors from VECIN queue - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = inQueueY.DeQue(); - LocalTensor zLocal = outQueueZ.AllocTensor(); - // call Add instr for computation - Add(zLocal, xLocal, yLocal, TILE_LENGTH); - // enque the output tensor to VECOUT queue - outQueueZ.EnQue(zLocal); - // free input tensors for reuse - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - } - __aicore__ inline void CopyOut(int32_t progress) - { - // deque output tensor from VECOUT queue - LocalTensor zLocal = outQueueZ.DeQue(); - // copy progress_th tile from local tensor to 
diff --git a/sample/pytorch_adapter/with_setuptools/setup.py b/sample/pytorch_adapter/with_setuptools/setup.py
deleted file mode 100644
index 92ab1d3c7..000000000
--- a/sample/pytorch_adapter/with_setuptools/setup.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import os
-import subprocess
-import torch
-import torch_npu
-from setuptools import setup, find_packages
-from torch.utils.cpp_extension import BuildExtension
-from torch_npu.utils.cpp_extension import NpuExtension
-
-PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.abspath(torch_npu.__file__))
-CUR_PATH = os.path.abspath(os.path.dirname(__file__))
-
-
-def compile_kernels():
-    # PyTorch has no extension mechanism for compiling Ascend device code, so run make manually here
-    subprocess.run("make")
-    return "libcustom_kernels.so"  # the library name produced by make
-
-
-def compile_adapter():
-    ext = NpuExtension(
-        name="ascend_custom_kernels_lib",  # name of the library to import
-        # add any other .cpp files that take part in the build here
-        sources=[f"{CUR_PATH}/add_adapter.cpp"],
-        extra_compile_args=[
-            '-I' + os.path.join(PYTORCH_NPU_INSTALL_PATH, "include", "third_party", "acl", "inc"),
-        ],
-        library_dirs=[f"{CUR_PATH}"],  # library search paths needed at build time, like g++'s -L option
-        libraries=["custom_kernels"],  # libraries linked at build time, like the -l option
-    )
-    return [ext]
-
-
-if __name__ == "__main__":
-    # build the library containing the kernels, delivered as a .so
-    kernel_so = compile_kernels()
-
-    # build the PyTorch adapter-layer library so the framework can integrate it
-    exts = compile_adapter()
-
-    # package everything into a wheel
-    setup(
-        name="ascend_custom_kernels",  # package name
-        version='1.0',
-        keywords='ascend_custom_kernels',
-        ext_modules=exts,
-        packages=find_packages(),
-        cmdclass={"build_ext": BuildExtension},
-        data_files=[(".", [kernel_so])],
-        include_package_data=True,
-    )
diff --git a/sample/pytorch_adapter/with_setuptools/test.py b/sample/pytorch_adapter/with_setuptools/test.py
deleted file mode 100644
index 896eef2c0..000000000
--- a/sample/pytorch_adapter/with_setuptools/test.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import torch
-import torch_npu
-import ascend_custom_kernels_lib
-from torch_npu.testing.testcase import TestCase, run_tests
-
-
-class TestCustomAdd(TestCase):
-    def test_add(self):
-        # The kernel currently uses static tiling, so the shape here has to match.
-        # add is elementwise and the kernel is written for 8*2048 (see the kernel source),
-        # so anything up to that size should work.
-        length = [8, 2048]
-        x = torch.rand(length, device='cpu', dtype=torch.float16)
-        y = torch.rand(length, device='cpu', dtype=torch.float16)
-
-        x_npu = x.npu()
-        y_npu = y.npu()
-        x_npu.requires_grad = True
-        y_npu.requires_grad = True
-        output = ascend_custom_kernels_lib.my_add(x_npu, y_npu)
-        # verify the backward path
-        output.backward(output)
-
-        x.requires_grad = True
-        y.requires_grad = True
-        cpuout = torch.add(x, y)
-        cpuout.backward(cpuout)
-
-        self.assertRtolEqual(output, cpuout)
-        self.assertRtolEqual(x_npu.grad, x.grad)
-        self.assertRtolEqual(y_npu.grad, y.grad)
-
-
-if __name__ == "__main__":
-    run_tests()
diff --git a/sample/sanitizer_sample/Racecheck/Makefile b/sample/sanitizer_sample/Racecheck/Makefile
deleted file mode 100644
index ac230761b..000000000
--- a/sample/sanitizer_sample/Racecheck/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := ${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin/ccec
-HOST_COMPILER := g++
-COMPILER_FLAG := -xcce -O2 -std=c++17 --cce-enable-sanitizer -g
-HOST_COMPILER_FLAG := -O2 -std=c++17
-LINK_FLAG := --cce-fatobj-link --cce-enable-sanitizer
-DAV_FLAG := --cce-aicore-arch=dav-c220-vec
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include
-HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include
-LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-
-all: build
-
-build: raw_error_kernel.o main.o raw_error.fatbin
-
-raw_error_kernel.o: raw_error_kernel.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^
-
-main.o: main.cpp
-	$(HOST_COMPILER) $(HOST_COMPILER_FLAG) $(HOST_INC_FLAG) -o $@ -c $^
-
-raw_error.fatbin: raw_error_kernel.o main.o
-	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ ${LINK_LIBS}
-
-.PHONY: clean
-clean:
-	rm *.o raw_error.fatbin
diff --git a/sample/sanitizer_sample/Racecheck/main.cpp b/sample/sanitizer_sample/Racecheck/main.cpp
deleted file mode 100644
index cd96d8e08..000000000
--- a/sample/sanitizer_sample/Racecheck/main.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <iostream>
-#include "acl/acl.h"
-
-#define ACL_ERROR_NONE 0
-
-#define CHECK_ACL(x)                                                                        \
-    do {                                                                                    \
-        aclError __ret = x;                                                                 \
-        if (__ret != ACL_ERROR_NONE) {                                                      \
-            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
-        }                                                                                   \
-    } while (0);
-
-extern "C" void raw_error_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gmInput, uint8_t *gmOutput);
-
-int main(void)
-{
-    CHECK_ACL(aclInit(nullptr));
-    aclrtContext context;
-    int32_t deviceId = 0;
-    CHECK_ACL(aclrtSetDevice(deviceId));
-    CHECK_ACL(aclrtCreateContext(&context, deviceId));
-    aclrtStream stream = nullptr;
-    CHECK_ACL(aclrtCreateStream(&stream));
-
-    uint8_t *gmInput = nullptr;
-    uint8_t *gmOutput = nullptr;
-    CHECK_ACL(aclrtMalloc((void**)&gmInput, 256, ACL_MEM_MALLOC_HUGE_FIRST));
-    CHECK_ACL(aclrtMalloc((void**)&gmOutput, 256, ACL_MEM_MALLOC_HUGE_FIRST));
-
-    uint64_t blockDim = 1UL;
-    raw_error_kernel_do(blockDim, nullptr, stream, gmInput, gmOutput);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-
-    CHECK_ACL(aclrtFree(gmInput));
-    CHECK_ACL(aclrtFree(gmOutput));
-    CHECK_ACL(aclrtDestroyStream(stream));
-    CHECK_ACL(aclrtDestroyContext(context));
-    CHECK_ACL(aclrtResetDevice(deviceId));
-    CHECK_ACL(aclFinalize());
-    return 0;
-}
\ No newline at end of file
diff --git a/sample/sanitizer_sample/Racecheck/raw_error_kernel.cpp b/sample/sanitizer_sample/Racecheck/raw_error_kernel.cpp
deleted file mode 100644
index b1db85609..000000000
--- a/sample/sanitizer_sample/Racecheck/raw_error_kernel.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-#include "kernel_operator.h"
-#include "acl/acl.h"
-using namespace AscendC;
-
-constexpr int32_t BYTESIZE_EXAMPLE = 256;
-constexpr int32_t BUFFER_NUM = 1;
-constexpr int32_t NUM_DATA = BYTESIZE_EXAMPLE / sizeof(half);
-
-extern "C" __global__ __aicore__ void raw_error_kernel(__gm__ uint8_t *gmInput, __gm__ uint8_t *gmOutput) {
-    TPipe pipe;
-    TQue<QuePosition::VECIN, BUFFER_NUM> xQue;
-    GlobalTensor<half> xInGm, xOutGm;
-    pipe.InitBuffer(xQue, BUFFER_NUM, BYTESIZE_EXAMPLE);
-    LocalTensor<half> xLocal = xQue.AllocTensor<half>();
-    xInGm.SetGlobalBuffer((__gm__ half*)gmInput, NUM_DATA);
-    xOutGm.SetGlobalBuffer((__gm__ half*)gmOutput, NUM_DATA);
-    DataCopy(xLocal, xInGm, NUM_DATA);
-    // The DataCopy above writes the UB buffer and the DataCopy below reads it; with no
-    // synchronization in between, there is a write-then-read race on UB. The fix is to go
-    // through the queue: EnQue the tensor, then DeQue it before reading, as sketched here:
-    // xQue.EnQue(xLocal);
-    // LocalTensor<half> deQueLocal = xQue.DeQue<half>();
-    // DataCopy(xOutGm, deQueLocal, NUM_DATA);
-    DataCopy(xOutGm, xLocal, NUM_DATA);
-}
-
-extern "C" void raw_error_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gmInput, uint8_t *gmOutput)
-{
-    raw_error_kernel<<<blockDim, l2ctrl, stream>>>(gmInput, gmOutput);
-}
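The commented-out lines above sketch the fix. Written out in full, the race-free variant looks as follows; this is a sketch under the same build assumptions as the sample, `race_free_kernel` is a name invented here, and the trailing `FreeTensor` is added for completeness.

```cpp
// Race-free variant of raw_error_kernel: the EnQue/DeQue pair inserts the
// synchronization that orders the UB write before the UB read.
#include "kernel_operator.h"
using namespace AscendC;

constexpr int32_t BYTESIZE_EXAMPLE = 256;
constexpr int32_t BUFFER_NUM = 1;
constexpr int32_t NUM_DATA = BYTESIZE_EXAMPLE / sizeof(half);

extern "C" __global__ __aicore__ void race_free_kernel(__gm__ uint8_t *gmInput, __gm__ uint8_t *gmOutput) {
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> xQue;
    GlobalTensor<half> xInGm, xOutGm;
    pipe.InitBuffer(xQue, BUFFER_NUM, BYTESIZE_EXAMPLE);
    LocalTensor<half> xLocal = xQue.AllocTensor<half>();
    xInGm.SetGlobalBuffer((__gm__ half *)gmInput, NUM_DATA);
    xOutGm.SetGlobalBuffer((__gm__ half *)gmOutput, NUM_DATA);
    DataCopy(xLocal, xInGm, NUM_DATA);                  // write UB
    xQue.EnQue(xLocal);                                 // publish the buffer...
    LocalTensor<half> deQueLocal = xQue.DeQue<half>();  // ...and wait for the write to land
    DataCopy(xOutGm, deQueLocal, NUM_DATA);             // read UB safely
    xQue.FreeTensor(deQueLocal);                        // return the buffer to the queue
}
```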
diff --git a/sample/sanitizer_sample/memcheck/illegal_align/Makefile b/sample/sanitizer_sample/memcheck/illegal_align/Makefile
deleted file mode 100644
index dff315b5d..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_align/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := ${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin/ccec
-HOST_COMPILER := g++
-COMPILER_FLAG := -xcce -O2 -std=c++17 --cce-enable-sanitizer -g
-HOST_COMPILER_FLAG := -O2 -std=c++17
-LINK_FLAG := --cce-fatobj-link --cce-enable-sanitizer
-DAV_FLAG := --cce-aicore-arch=dav-c220-vec
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include # simplified from the device_intf.cmake configuration
-HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include
-LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-
-all: build
-
-build: illegal_align_kernel.o main.o illegal_align.fatbin
-
-illegal_align_kernel.o: illegal_align_kernel.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^
-
-main.o: main.cpp
-	$(HOST_COMPILER) $(HOST_COMPILER_FLAG) $(HOST_INC_FLAG) -o $@ -c $^
-
-illegal_align.fatbin: illegal_align_kernel.o main.o
-	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ ${LINK_LIBS}
-
-.PHONY: clean
-clean:
-	rm *.o illegal_align.fatbin
diff --git a/sample/sanitizer_sample/memcheck/illegal_align/illegal_align_kernel.cpp b/sample/sanitizer_sample/memcheck/illegal_align/illegal_align_kernel.cpp
deleted file mode 100644
index 605744ed5..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_align/illegal_align_kernel.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "kernel_operator.h"
-#include "acl/acl.h"
-using namespace AscendC;
-
-constexpr int32_t BYTESIZE_EXAMPLE = 256;
-constexpr int32_t NUM_DATA = BYTESIZE_EXAMPLE / sizeof(half);
-
-extern "C" __global__ __aicore__ void illegal_align_kernel(__gm__ uint8_t *gm) {
-    TPipe pipe;
-    TBuf<TPosition::VECCALC> tbuf;
-    pipe.InitBuffer(tbuf, BYTESIZE_EXAMPLE);
-    LocalTensor<half> xLm = tbuf.Get<half>();
-
-    GlobalTensor<half> xGm;
-    xGm.SetGlobalBuffer((__gm__ half*)gm, NUM_DATA);
-
-    // the UB offset below is misaligned, so DataCopy raises an unaligned-access error while moving the data
-    DataCopy(xGm, xLm[3], NUM_DATA);
-    // correct usage: when indexing a LocalTensor, the address offset must be 32-byte aligned
-    DataCopy(xGm, xLm[32], NUM_DATA);
-}
-
-extern "C" void illegal_align_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm)
-{
-    illegal_align_kernel<<<blockDim, l2ctrl, stream>>>(gm);
-}
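For half elements of 2 bytes, a 32-byte-aligned offset means a multiple of 16 elements, which is why `xLm[3]` faults while `xLm[32]` is fine. The small standalone helper below is illustrative only, not from the sample; `uint16_t` stands in for half so it compiles with any host C++ compiler.

```cpp
// Illustrative helper: round an element index up to the next offset whose byte
// address satisfies the 32-byte alignment that DataCopy requires on UB accesses.
#include <cstdint>

template <typename T>
constexpr int32_t AlignedElemOffset(int32_t elemIdx)
{
    constexpr int32_t kAlign = 32;                        // UB alignment in bytes
    constexpr int32_t elemsPerAlign = kAlign / sizeof(T); // 16 elements for a 2-byte type
    return ((elemIdx + elemsPerAlign - 1) / elemsPerAlign) * elemsPerAlign;
}

static_assert(AlignedElemOffset<uint16_t>(3) == 16, "offset 3 rounds up to 16 elements (32 bytes)");
static_assert(AlignedElemOffset<uint16_t>(32) == 32, "32 elements = 64 bytes, already aligned");
```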
diff --git a/sample/sanitizer_sample/memcheck/illegal_align/main.cpp b/sample/sanitizer_sample/memcheck/illegal_align/main.cpp
deleted file mode 100644
index 7f8b53031..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_align/main.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <iostream>
-#include "acl/acl.h"
-
-#define ACL_ERROR_NONE 0
-
-#define CHECK_ACL(x)                                                                        \
-    do {                                                                                    \
-        aclError __ret = x;                                                                 \
-        if (__ret != ACL_ERROR_NONE) {                                                      \
-            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
-        }                                                                                   \
-    } while (0);
-
-extern "C" void illegal_align_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm);
-
-int main(void)
-{
-    CHECK_ACL(aclInit(nullptr));
-    aclrtContext context;
-    int32_t deviceId = 0;
-    CHECK_ACL(aclrtSetDevice(deviceId));
-    CHECK_ACL(aclrtCreateContext(&context, deviceId));
-    aclrtStream stream = nullptr;
-    CHECK_ACL(aclrtCreateStream(&stream));
-
-    uint8_t *gm = nullptr;
-    CHECK_ACL(aclrtMalloc((void**)&gm, 256, ACL_MEM_MALLOC_HUGE_FIRST));
-
-    uint64_t blockDim = 1UL;
-    illegal_align_kernel_do(blockDim, nullptr, stream, gm);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-
-    CHECK_ACL(aclrtFree(gm));
-    CHECK_ACL(aclrtDestroyStream(stream));
-    CHECK_ACL(aclrtDestroyContext(context));
-    CHECK_ACL(aclrtResetDevice(deviceId));
-    CHECK_ACL(aclFinalize());
-    return 0;
-}
\ No newline at end of file
diff --git a/sample/sanitizer_sample/memcheck/illegal_read_and_write/Makefile b/sample/sanitizer_sample/memcheck/illegal_read_and_write/Makefile
deleted file mode 100644
index 4d810639e..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_read_and_write/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := ${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin/ccec
-HOST_COMPILER := g++
-COMPILER_FLAG := -xcce -O2 -std=c++17 --cce-enable-sanitizer -g
-HOST_COMPILER_FLAG := -O2 -std=c++17
-LINK_FLAG := --cce-fatobj-link --cce-enable-sanitizer
-DAV_FLAG := --cce-aicore-arch=dav-c220-vec
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include
-HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include
-LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-
-all: build
-
-build: illegal_read_and_write_kernel.o main.o illegal_read_and_write.fatbin
-
-illegal_read_and_write_kernel.o: illegal_read_and_write_kernel.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^
-
-main.o: main.cpp
-	$(HOST_COMPILER) $(HOST_COMPILER_FLAG) $(HOST_INC_FLAG) -o $@ -c $^
-
-illegal_read_and_write.fatbin: illegal_read_and_write_kernel.o main.o
-	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ ${LINK_LIBS}
-
-.PHONY: clean
-clean:
-	rm *.o illegal_read_and_write.fatbin
diff --git a/sample/sanitizer_sample/memcheck/illegal_read_and_write/illegal_read_and_write_kernel.cpp b/sample/sanitizer_sample/memcheck/illegal_read_and_write/illegal_read_and_write_kernel.cpp
deleted file mode 100644
index 8a85df5cc..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_read_and_write/illegal_read_and_write_kernel.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "kernel_operator.h"
-#include "acl/acl.h"
-using namespace AscendC;
-
-constexpr int32_t BYTESIZE = 256;
-constexpr int32_t BYTESIZE_LARGE = 512;
-constexpr int32_t NUM_DATA = BYTESIZE / sizeof(half);
-constexpr int32_t NUM_DATA_LARGE = BYTESIZE_LARGE / sizeof(half);
-
-extern "C" __global__ __aicore__ void illegal_read_and_write_kernel(__gm__ uint8_t *gm)
-{
-    TPipe pipe;
-    TBuf<TPosition::VECCALC> xlm;
-    GlobalTensor<half> xGm;
-    pipe.InitBuffer(xlm, BYTESIZE_LARGE);
-    LocalTensor<half> xLm = xlm.Get<half>();
-    xGm.SetGlobalBuffer((__gm__ half *)gm, NUM_DATA);
-    DataCopy(xLm, xGm, NUM_DATA_LARGE);
-    DataCopy(xGm, xLm, NUM_DATA_LARGE);
-    // xGm above is given BYTESIZE bytes, but the two DataCopy calls move BYTESIZE_LARGE bytes.
-    // Since BYTESIZE_LARGE > BYTESIZE, both the read and the write run past the end of xGm.
-    // The correct calls are:
-    // DataCopy(xLm, xGm, NUM_DATA);
-    // DataCopy(xGm, xLm, NUM_DATA);
-}
-
-extern "C" void illegal_read_and_write_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm)
-{
-    illegal_read_and_write_kernel<<<blockDim, l2ctrl, stream>>>(gm);
-}
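The invariant this sample violates can be stated directly: an element count passed to DataCopy must fit both the GM buffer and the UB buffer. The standalone sketch below is illustrative only, not from the sample; it encodes the check with the sample's own sizes.

```cpp
// Illustrative bounds guard for a GM <-> UB copy, using the sample's constants.
#include <cstdint>

constexpr int32_t BYTESIZE = 256;
constexpr int32_t BYTESIZE_LARGE = 512;
constexpr int32_t NUM_DATA = BYTESIZE / 2;             // half is 2 bytes
constexpr int32_t NUM_DATA_LARGE = BYTESIZE_LARGE / 2;

constexpr bool CopyInBounds(int32_t copyElems, int32_t gmElems, int32_t ubElems)
{
    return copyElems <= gmElems && copyElems <= ubElems;
}

static_assert(!CopyInBounds(NUM_DATA_LARGE, NUM_DATA, NUM_DATA_LARGE),
              "the sample's copy of NUM_DATA_LARGE elements overruns the GM buffer");
static_assert(CopyInBounds(NUM_DATA, NUM_DATA, NUM_DATA_LARGE),
              "copying NUM_DATA elements, as in the commented fix, stays in bounds");
```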
diff --git a/sample/sanitizer_sample/memcheck/illegal_read_and_write/main.cpp b/sample/sanitizer_sample/memcheck/illegal_read_and_write/main.cpp
deleted file mode 100644
index 44138b3d9..000000000
--- a/sample/sanitizer_sample/memcheck/illegal_read_and_write/main.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <iostream>
-#include "acl/acl.h"
-
-#define ACL_ERROR_NONE 0
-
-#define CHECK_ACL(x)                                                                        \
-    do {                                                                                    \
-        aclError __ret = x;                                                                 \
-        if (__ret != ACL_ERROR_NONE) {                                                      \
-            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
-        }                                                                                   \
-    } while (0);
-
-extern "C" void illegal_read_and_write_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm);
-
-int main(void)
-{
-    CHECK_ACL(aclInit(nullptr));
-    aclrtContext context;
-    int32_t deviceId = 0;
-    CHECK_ACL(aclrtSetDevice(deviceId));
-    CHECK_ACL(aclrtCreateContext(&context, deviceId));
-    aclrtStream stream = nullptr;
-    CHECK_ACL(aclrtCreateStream(&stream));
-
-    uint8_t *gm = nullptr;
-    CHECK_ACL(aclrtMalloc((void**)&gm, 256, ACL_MEM_MALLOC_HUGE_FIRST));
-
-    uint64_t blockDim = 1UL;
-    illegal_read_and_write_kernel_do(blockDim, nullptr, stream, gm);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-
-    CHECK_ACL(aclrtFree(gm));
-    CHECK_ACL(aclrtDestroyStream(stream));
-    CHECK_ACL(aclrtDestroyContext(context));
-    CHECK_ACL(aclrtResetDevice(deviceId));
-    CHECK_ACL(aclFinalize());
-    return 0;
-}
\ No newline at end of file
diff --git a/sample/sanitizer_sample/memcheck/out_of_bound/Makefile b/sample/sanitizer_sample/memcheck/out_of_bound/Makefile
deleted file mode 100644
index 22e06f70f..000000000
--- a/sample/sanitizer_sample/memcheck/out_of_bound/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-ASCEND_HOME_PATH ?= /usr/local/Ascend/ascend-toolkit/latest
-
-COMPILER := ${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin/ccec
-HOST_COMPILER := g++
-COMPILER_FLAG := -xcce -O2 -std=c++17 --cce-enable-sanitizer -g
-HOST_COMPILER_FLAG := -O2 -std=c++17
-LINK_FLAG := --cce-fatobj-link --cce-enable-sanitizer
-DAV_FLAG := --cce-aicore-arch=dav-c220-vec
-ASCENDC_INC_FLAG := -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/impl -I${ASCEND_HOME_PATH}/compiler/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/include
-HOST_INC_FLAG := -I${ASCEND_HOME_PATH}/include
-LINK_LIBS := -L${ASCEND_HOME_PATH}/lib64 -lruntime -lascendcl -lstdc++
-
-all: build
-
-build: out_of_bound_kernel.o main.o out_of_bound.fatbin
-
-out_of_bound_kernel.o: out_of_bound_kernel.cpp
-	$(COMPILER) $(COMPILER_FLAG) $(DAV_FLAG) $(ASCENDC_INC_FLAG) -o $@ -c $^
-
-main.o: main.cpp
-	$(HOST_COMPILER) $(HOST_COMPILER_FLAG) $(HOST_INC_FLAG) -o $@ -c $^
-
-out_of_bound.fatbin: out_of_bound_kernel.o main.o
-	$(COMPILER) $(LINK_FLAG) $(DAV_FLAG) -o $@ $^ ${LINK_LIBS}
-
-.PHONY: clean
-clean:
-	rm *.o out_of_bound.fatbin
diff --git a/sample/sanitizer_sample/memcheck/out_of_bound/main.cpp b/sample/sanitizer_sample/memcheck/out_of_bound/main.cpp
deleted file mode 100644
index 53da5b54a..000000000
--- a/sample/sanitizer_sample/memcheck/out_of_bound/main.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <iostream>
-#include "acl/acl.h"
-
-#define ACL_ERROR_NONE 0
-
-#define CHECK_ACL(x)                                                                        \
-    do {                                                                                    \
-        aclError __ret = x;                                                                 \
-        if (__ret != ACL_ERROR_NONE) {                                                      \
-            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
-        }                                                                                   \
-    } while (0);
-
-extern "C" void out_of_bound_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm);
-
-int main(void)
-{
-    CHECK_ACL(aclInit(nullptr));
-    aclrtContext context;
-    int32_t deviceId = 0;
-    CHECK_ACL(aclrtSetDevice(deviceId));
-    CHECK_ACL(aclrtCreateContext(&context, deviceId));
-    aclrtStream stream = nullptr;
-    CHECK_ACL(aclrtCreateStream(&stream));
-
-    uint8_t *gm = nullptr;
-    CHECK_ACL(aclrtMalloc((void**)&gm, 512, ACL_MEM_MALLOC_HUGE_FIRST));
-
-    uint64_t blockDim = 10UL;
-    out_of_bound_kernel_do(blockDim, nullptr, stream, gm);
-    CHECK_ACL(aclrtSynchronizeStream(stream));
-
-    CHECK_ACL(aclrtFree(gm));
-    CHECK_ACL(aclrtDestroyStream(stream));
-    CHECK_ACL(aclrtDestroyContext(context));
-    CHECK_ACL(aclrtResetDevice(deviceId));
-    CHECK_ACL(aclFinalize());
-    return 0;
-}
\ No newline at end of file
diff --git a/sample/sanitizer_sample/memcheck/out_of_bound/out_of_bound_kernel.cpp b/sample/sanitizer_sample/memcheck/out_of_bound/out_of_bound_kernel.cpp
deleted file mode 100644
index cb54dc322..000000000
--- a/sample/sanitizer_sample/memcheck/out_of_bound/out_of_bound_kernel.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "kernel_operator.h"
-#include "acl/acl.h"
-using namespace AscendC;
-
-constexpr int32_t BYTESIZE = 512;
-constexpr int32_t NUM_DATA = 16;
-constexpr int32_t CORE_OFFSET = 14;
-constexpr int32_t LOOP_COUNT = 10;
-
-extern "C" __global__ __aicore__ void out_of_bound_kernel(__gm__ uint8_t *gm)
-{
-    TPipe pipe;
-    TBuf<TPosition::VECCALC> xlm;
-    GlobalTensor<half> xGm;
-    pipe.InitBuffer(xlm, BYTESIZE);
-    LocalTensor<half> xLm = xlm.Get<half>();
-    xGm.SetGlobalBuffer((__gm__ half *)gm + GetBlockIdx() * CORE_OFFSET, NUM_DATA);
-    // CORE_OFFSET < NUM_DATA here, so when the cores write to GM below, each core writes
-    // more elements than its offset stride and tramples the next core's region.
-    // The correct setup is:
-    // xGm.SetGlobalBuffer((__gm__ half *)gm + GetBlockIdx() * NUM_DATA, NUM_DATA);
-    DataCopy(xGm, xLm, NUM_DATA);
-}
-
-extern "C" void out_of_bound_kernel_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *gm)
-{
-    out_of_bound_kernel<<<blockDim, l2ctrl, stream>>>(gm);
-}
-- 
Gitee
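As a closing illustration, the overlap in the out_of_bound sample is pure index arithmetic: with a stride of CORE_OFFSET = 14 and NUM_DATA = 16 elements written per core, consecutive cores' write windows overlap by 2 elements. The host-side sketch below is not part of the sample; it simply counts the collisions under both strides.

```cpp
// Host-side model of the out_of_bound sample: each of BLOCK_DIM cores writes
// NUM_DATA elements starting at blockIdx * stride. With stride = CORE_OFFSET (14)
// the write windows overlap; with stride = NUM_DATA (16) they tile exactly.
#include <cstdint>
#include <cstdio>

constexpr int32_t NUM_DATA = 16;    // elements each core writes
constexpr int32_t CORE_OFFSET = 14; // buggy stride between cores
constexpr int32_t BLOCK_DIM = 10;   // blockDim used by the sample's main.cpp

int main()
{
    // Count how many cores write each element index under a given stride.
    auto collisions = [](int32_t stride) {
        int32_t hits[BLOCK_DIM * NUM_DATA] = {};
        int32_t count = 0;
        for (int32_t core = 0; core < BLOCK_DIM; ++core) {
            for (int32_t k = 0; k < NUM_DATA; ++k) {
                if (++hits[core * stride + k] > 1) {
                    ++count; // this element was already written by another core
                }
            }
        }
        return count;
    };
    std::printf("stride=14 collisions: %d\n", collisions(CORE_OFFSET)); // > 0: cores trample each other
    std::printf("stride=16 collisions: %d\n", collisions(NUM_DATA));    // 0: windows tile exactly
    return 0;
}
```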