diff --git a/LICENSE b/LICENSE index 52c5e5f..d583a3a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2026 InfiniTensor +Copyright (c) 2025 InfiniTensor Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index 883d452..85db20d 100644 --- a/Makefile +++ b/Makefile @@ -1,59 +1,24 @@ # ********************************************************************* # Learning-CUDA Makefile # Targets: -# make : Build + run tests (default, non-verbose) -# make build : Only compile (no run) -# make run : Run tests (after build, non-verbose) -# make run VERBOSE=true : Run tests with verbose output -# make clean : Delete temporary files +# make : Build + run tests (default, non-verbose) +# make build : Only compile (no run) +# make run : Run tests (after build, non-verbose) +# make run VERBOSE=true : Run tests with verbose output +# make clean : Delete temporary files # ********************************************************************* # ------------------------------- # Configuration # ------------------------------- -PLATFORM ?= nvidia -PLATFORM_DEFINE ?= -DPLATFORM_NVIDIA -STUDENT_SUFFIX := cu -CFLAGS := -std=c++17 -O0 -EXTRA_LIBS := - -# Compiler & Tester object selection based on PLATFORM -ifeq ($(PLATFORM),nvidia) - CC := nvcc - TEST_OBJ := tester/tester_nv.o - PLATFORM_DEFINE := -DPLATFORM_NVIDIA -else ifeq ($(PLATFORM),iluvatar) - CC := clang++ - CFLAGS := -std=c++17 -O3 - TEST_OBJ := tester/tester_iluvatar.o - PLATFORM_DEFINE := -DPLATFORM_ILUVATAR - EXTRA_LIBS := -lcudart -I/usr/local/corex/include -L/usr/local/corex/lib64 -fPIC -else ifeq ($(PLATFORM),moore) - CC := mcc - CFLAGS := -std=c++11 -O3 - TEST_OBJ := tester/tester_moore.o - STUDENT_SUFFIX := mu - PLATFORM_DEFINE := -DPLATFORM_MOORE - EXTRA_LIBS := -I/usr/local/musa/include -L/usr/lib/gcc/x86_64-linux-gnu/11/ -L/usr/local/musa/lib -lmusart -else ifeq ($(PLATFORM),metax) - CC := mxcc - TEST_OBJ := tester/tester_metax.o - STUDENT_SUFFIX := maca - PLATFORM_DEFINE := -DPLATFORM_METAX -else - $(error Unsupported PLATFORM '$(PLATFORM)' (expected: nvidia, iluvatar, moore, metax)) -endif - -# Executable name -TARGET := test_kernels -# Kernel implementation -STUDENT_SRC := src/kernels.$(STUDENT_SUFFIX) -# Compiled student object (auto-generated) -STUDENT_OBJ := $(addsuffix .o,$(basename $(STUDENT_SRC))) -# Tester's actual verbose argument (e.g., --verbose, -v) -TEST_VERBOSE_FLAG := --verbose -# User-provided verbose mode (true/false; default: false) -VERBOSE := +CC := nvcc # CUDA compiler +CFLAGS := -std=c++17 -O0 # Compile flags +TARGET := test_kernels # Executable name +STUDENT_SRC := src/kernels.cu # Kernel implementation +STUDENT_OBJ := $(STUDENT_SRC:.cu=.o) # Compiled student object (auto-generated) +TEST_OBJ := tester/tester.o # Pre-compiled test object +TEST_VERBOSE_FLAG := --verbose # Tester's actual verbose argument (e.g., --verbose, -v) +VERBOSE := # User-provided verbose mode (true/false; default: false) # ------------------------------- # Process User Input (VERBOSE → Tester Flag) @@ -63,7 +28,7 @@ VERBOSE := VERBOSE_ARG := $(if $(filter true True TRUE, $(VERBOSE)), $(TEST_VERBOSE_FLAG), ) # ------------------------------- -# Phony Targets +# Phony Targets (No Files Generated) # ------------------------------- .PHONY: all build run clean @@ -75,7 +40,7 @@ build: $(TARGET) # Run target: Execute tests (supports `VERBOSE=true` for verbose output) run: $(TARGET) - @echo 
"=== Running tests (output from $(STUDENT_OBJ)) ===" + @echo "=== Running tests (output from tester.o) ===" @# Show verbose mode status (friendly for users) @if [ -n "$(VERBOSE_ARG)" ]; then \ echo "=== Verbose mode: Enabled (using '$(TEST_VERBOSE_FLAG)') ==="; \ @@ -95,9 +60,9 @@ clean: # Generate executable: Link kernel code (kernels.o) with test logic (tester.o) $(TARGET): $(STUDENT_OBJ) $(TEST_OBJ) @echo "=== Linking executable (student code + test logic) ===" - $(CC) $(CFLAGS) $(PLATFORM_DEFINE) -o $@ $^ $(EXTRA_LIBS) + $(CC) $(CFLAGS) -o $@ $^ # Generate src object: Compile kernels.cu (triggers template instantiation) $(STUDENT_OBJ): $(STUDENT_SRC) - @echo "=== Compiling student code ($(STUDENT_SRC)) ===" - $(CC) $(CFLAGS) $(PLATFORM_DEFINE) -c $< -o $@ + @echo "=== Compiling student code (src/kernels.cu) ===" + $(CC) $(CFLAGS) -c $< -o $@ diff --git a/README.md b/README.md index 8bc2d9a..410aed4 100644 --- a/README.md +++ b/README.md @@ -1,81 +1,43 @@ # Learning-CUDA -本项目为 2025 年冬季 InfiniTensor 大模型与人工智能系统训练营 CUDA 方向专业阶段的作业与项目系统。 +本项目为 2025 年夏季 InfiniTensor 大模型与人工智能系统训练营 CUDA 方向专业阶段的作业系统。 ## 📁 项目结构 ```text learning-CUDA/ ├── Makefile -├── LICENSE ├── README ├── src -│ ├── kernels.cu -│ ├── kernels.maca -| └── kernels.mu +│ └── kernels.cu └── tester - ├── tester_iluvatar.o - ├── tester_metax.o - ├── tester_moore.o - ├── tester_nv.o + ├── tester.o └── utils.h ``` ## 环境配置 -### > 英伟达(NVIDIA) +如果你使用的是训练营提供的服务器,则该步骤可直接跳过。 -- 如果你使用的是训练营所提供的服务器,遵照英伟达算力文档中的步骤配置好环境即可。 - -- 如果为本地或其他环境,请确保系统已安装以下工具: - - 1. **CUDA Toolkit**(版本11.0及以上): - - 验证安装:运行`nvcc --version`。 - - 安装:从[NVIDIA CUDA Toolkit下载页](https://developer.nvidia.com/cuda-downloads)获取。 - 2. **GNU Make**: - - 验证安装:运行`make --version`(大多数Linux/macOS已预装)。 - 3. **C++ 版本**: - - 本次作业在英伟达上默认需支持 C++17 - -### > 天数智芯(Iluvatar CoreX) - -- 如果你使用的是训练营所提供的服务器,遵照天数 BI-100 算力文档中的步骤配置好环境即可。 - -- 对于非训练营所提供的天数算力,请配置标准的天数 GPU 开放环境。本次作业在天数上默认需支持 C++17,且**本次作业的配置不保证能在所有其他天数环境上无修改直接运行**。 - -### > 沐曦集成电路(Metax) - -- 如果你使用的是训练营所提供的服务器,遵照沐曦 (C500) 算力文档中的步骤配置好环境即可。 - -- 对于非训练营所提供的沐曦算力,请配置标准的沐曦 GPU 开放环境。本次作业在沐曦上默认需支持 C++17,且**本次作业的配置不保证能在所有其他沐曦环境上无修改直接运行**。 - -### > 摩尔线程(Moore Threads) - -- 如果你使用的是训练营所提供的服务器,请先遵照摩尔 (S5000) 算力文档中的步骤配置环境。 - - 在此基础上,请确保在 `.bashrc` 中添加了以下环境变量: - - ```bash - export MUSA_ROOT=/usr/local/musa - export PATH="$MUSA_ROOT/bin:$PATH" - export LD_LIBRARY_PATH="$MUSA_ROOT/lib:$LD_LIBRARY_PATH" - export CPLUS_INCLUDE_PATH=/usr/include/c++/11:/usr/include/x86_64-linux-gnu/c++/11 - ``` - -- 对于非训练营所提供的摩尔算力,请配置标准的摩尔 GPU 开放环境。本次作业在摩尔上默认需支持 C++11,且**本次作业的配置不保证能在所有其他摩尔环境上无修改直接运行**。 +请确保系统已安装以下工具: +1. **CUDA Toolkit**(版本11.0及以上): + - 验证安装:运行`nvcc --version`。 + - 安装:从[NVIDIA CUDA Toolkit下载页](https://developer.nvidia.com/cuda-downloads)获取。 +2. **GNU Make**: + - 验证安装:运行`make --version`(大多数Linux/macOS已预装)。 ## 🧠 作业 作业一共有两题。需实现 `src/kernels.cu` 中给定的 **2 个 CUDA 函数** 。 -1. **trace** +1. **kthLargest** -实现 CUDA 的 trace 函数。给定一个逻辑上 2D 的输入矩阵,返回该矩阵的迹。该函数需支持 `int` 和 `float` 两种类型的输入。具体边界处理和一些条件可见文件中的注释。 +实现 CUDA 的 kthLargest 函数。给定一个连续的输入数组和非负数 k,返回该数组中第 k 大的数。该函数需支持 int 和 float 两种类型的输入。具体边界处理和一些条件可见文件中的注释。 2. 
**flashAttention** -实现 Flash Attention 算子。需支持 causal masking 和 GQA。具体行为与 [torch.nn.functional.scaled_dot_product_attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 保持一致。接口未提供的参数所代表的功能无需支持和实现。具体参数要求请参考文件中的注释。该函数需支持 `float` 和 `half` 两种类型。 +实现 Flash Attention 算子。需支持 causal masking 和 GQA。具体行为与 [torch.nn.functional.scaled_dot_product_attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 保持一致。接口未提供的参数所代表的功能无需支持和实现。具体参数要求请参考文件中的注释。该函数需支持 float。 ### 注意事项 @@ -86,7 +48,7 @@ learning-CUDA/ 5. 需进行**适当**的代码注释解释重要部分; ### 提交方式 -在网站 [InfiniTensor 开源社区](https://www.infinitensor.com/camp/winter2025/homework) 上提交 GitHub 链接,以最新提交为准。 +在网站 [InfiniTensor 开源社区](https://beta.infinitensor.com/camp/summer2025) 上提交 GitHub 链接,以最新提交为准。 ## 🛠️ 编译与运行 @@ -94,46 +56,15 @@ learning-CUDA/ ### 构建与运行指令 -使用 `Makefile` 简化构建流程,以下命令需在**项目根目录**(即 `Makefile` 所在的目录)执行: +使用`Makefile`简化构建流程,以下命令需在**项目根目录**(即 `Makefile` 所在的目录)执行: -#### 1. 默认:构建并运行测试(非 verbose 模式) +#### 默认:构建并运行测试(非 verbose 模式) -- 直接在命令行使用 `make` 指令编译代码并执行测试,输出简洁结果。 +直接在命令行使用 `make` 指令编译代码并执行测试,输出简洁结果。 -#### 2. 构建并运行测试(verbose 模式) - -- 直接在命令行使用 `make VERBOSE=true` 指令编译代码并执行测试,输出包括执行时间等更多信息在内的结果。 - -#### 3. 选择性测试算子 - -如果只想调试/测试某个算子,可以通过设置环境变量的方式来实现。比如只想测试第一题的 trace,则可以: - -1. 如果只是临时跳过,可以直接在命令行使用 `SKIP_ATTENTION=1 make` 编译代码并**跳过第二题,只测试第一题**。 - -2. 如果较长时间都想跳过,可以一开始使用一次 `export SKIP_ATTENTION=1`, 随后在同一个命令行中照常使用 `make` 相关命令。如果想撤销,则可以使用 `export SKIP_ATTENTION=0` 或 `unset SKIP_ATTENTION`。 - - -#### 4. 选择编译平台 - -可以通过在命令行使用 `make PLATFORM=` 指令来指定编译平台。**默认的编译平台为英伟达平台**,即如果不指定 `PLATFORM` 直接 `make`,则是编译英伟达平台。具体平台选项: - -1. 编译并在英伟达平台运行:`make` 或 `make PLATFORM=nvidia`; - -2. 编译并在天数平台运行:`make PLATFORM=iluvatar`; - -3. 编译并在沐曦平台运行:`make PLATFORM=metax`; - -4. 编译并在摩尔平台运行:`make PLATFORM=moore`; - - -**以上提及的编译选项与环境变量均可根据需求组合。例如:`SKIP_TRACE=1 make PLATFORM=nvidia VERBOSE=true`** - - -#### 环境变量: -1. `SKIP_TRACE`: 跳过第一题的 trace 测试。 - -2. `SKIP_ATTENTION`: 跳过第二题的 Flash Attention 测试。 +#### 构建并运行测试(verbose 模式) +直接在命令行使用 `make VERBOSE=true` 指令编译代码并执行测试,输出包括执行时间在内的结果。 ## 📊 评分规则 @@ -148,21 +79,10 @@ learning-CUDA/ 2. **性能加分** - 在正确性的基础上,会对各实现的性能进行排名; - 性能越优,获得的额外分数越多; - - **性能评判将在提供的服务器上进行**,因此请在服务器上进行性能评估。 - -3. **平台适配加分** - - 每道题在英伟达上测例正确的基础上,每多适配一个国产平台可以获得固定得分乘算系数(20%); - - 每个平台适配完成的标准为该题在该平台上可以通过全部测例。全部通过则获得该平台的 20% 加成,无法全部通过则无法获得加成(0%); - - 题目分开计算,即只有在该平台上通过全部测例的题目可以获得该题目部分的乘算系数加成; - - 国产平台不进行性能测试,故不参与前面第二点的性能得分计算(与性能加分正交)。但对国产平台做了针对性优化并有较明显成效者,根据导师意见可以额外加分。 - -4. **综合评判** - - 在前三点的基础上,还有一个整体独立的评分乘区,乘算系数可正增长(`>1`),也可折损(`[0,1)`); - - **正增长** 的主要获得方式为第三点提到的国产平台性能优化额外分(上限30%) - - **折损** 主要来源于较差的代码质量(代码是否有较好的整理和结构、是否有合适的注释、是否有统一的命名风格等)、编译与运行问题(比如缺少必要的头文件导致的无法直接运行等)以及未符合前面 **注意事项** 中提及的要求。 + - 性能评判将在提供的服务器上进行,因此请在服务器上进行性能评估。 -5. **最终成绩** - - 总体得分由「通过的测试用例数量」、「性能排名加分」、「平台适配加分」以及「综合评判」共同决定。 +3. **最终成绩** + - 总体得分由「通过的测试用例数量」与「性能排名加分」共同决定。 - 各测试用例的分数相加,形成最终成绩。 ## 📬 有疑问? diff --git a/src/kernels.cu b/src/kernels.cu index 370e66b..efc1bc8 100644 --- a/src/kernels.cu +++ b/src/kernels.cu @@ -1,27 +1,121 @@ #include -#include #include "../tester/utils.h" +#include +#include +#include +#include +#include +#include +#include "flash_attention.cuh" +#include +#include +#include +#include /** - * @brief Computes the trace of a matrix. - * - * The trace of a matrix is defined as the sum of its diagonal elements. - * This function expects a flattened row-major matrix stored in a - * std::vector. 
If the matrix is not square, the trace will sum up - * elements along the main diagonal up to the smaller of rows or cols. - * - * @tparam T The numeric type of matrix elements (e.g., float, int). - * @param h_input A flattened matrix of size rows * cols. - * @param rows Number of rows in the matrix. - * @param cols Number of columns in the matrix. - * @return The trace (sum of diagonal values) of the matrix. + * @brief Find the k-th largest element in a vector using CUDA. + * + * @tparam T Type of elements in the input vector (should support `int` and `float`). + * @param h_input Host-side input vector. + * @param k 1-based index of the element to find (e.g., `k=1` returns the largest element). + * @return T The k-th largest element in `h_input`. + + * @note Must use CUDA kernels for all compute-intensive steps; no significant CPU allowed. + * @note Library functions that can directly complete a significant part of the work are NOT allowed. + * @note For invalid cases, return T(-100). + * @note Handles device memory management (allocate/copy/free) internally. Errors should be thrown. */ template -T trace(const std::vector& h_input, size_t rows, size_t cols) { - // TODO: Implement the trace function - return T(-1); +__device__ void partition3(T* data, int low, int high, int& left_eq, int& right_eq) { + T pivot = data[high]; + int i = low; + int lt = low; // data[low..lt-1] > pivot + int gt = high; // data[gt+1..high] < pivot + while (i <= gt) { + if (data[i] > pivot) { // 大于pivot + T tmp = data[lt]; + data[lt] = data[i]; + data[i] = tmp; + lt++; i++; + } else if (data[i] < pivot) { // 小于pivot + T tmp = data[i]; + data[i] = data[gt]; + data[gt] = tmp; + gt--; + } else { // 等于pivot + i++; + } + } + left_eq = lt; + right_eq = gt; +} + + +template +__device__ int quickSelect3(T* data, int low, int high, int k) { + while (low <= high) { + int left_eq, right_eq; + partition3(data, low, high, left_eq, right_eq); + if (k >= left_eq && k <= right_eq) { + return k; // 找到第k大元素,k在等于pivot的范围内 + } else if (k < left_eq) { + high = left_eq - 1; // 在左边找更大的元素 + } else { + low = right_eq + 1; // 在右边找更小的元素 + } + } + return -1; // 没找到,理论上不该出现 +} + + +template +__global__ void quickSelectKernel(T* data, int low, int high, int k, int* result_idx) { + if (threadIdx.x == 0 && blockIdx.x == 0) { // 只用一个线程执行 + int idx = quickSelect3(data, low, high, k); + *result_idx = idx; + } +} + +template +T kthLargest(const std::vector& h_input, size_t k) { + int n = h_input.size(); + if (k <= 0 || k > n) { + return T(-100); + } + + T* d_data; + cudaMalloc(&d_data, n * sizeof(T)); + cudaMemcpy(d_data, h_input.data(), n * sizeof(T), cudaMemcpyHostToDevice); + + int* d_result_idx; + int h_result_idx = -1; + cudaMalloc(&d_result_idx, sizeof(int)); + cudaMemcpy(d_result_idx, &h_result_idx, sizeof(int), cudaMemcpyHostToDevice); + + int kIndex = k - 1; + + quickSelectKernel<<<1, 1>>>(d_data, 0, n - 1, kIndex, d_result_idx); + cudaDeviceSynchronize(); + + cudaMemcpy(&h_result_idx, d_result_idx, sizeof(int), cudaMemcpyDeviceToHost); + + T result; + if (h_result_idx >= 0) { + cudaMemcpy(&result, d_data + h_result_idx, sizeof(T), cudaMemcpyDeviceToHost); + } else { + result = T(-100); + } + + cudaFree(d_data); + cudaFree(d_result_idx); + return result; } +// 用partition和quickSelect函数实现并行化的快速选择算法 +// 将数据从主机内存复制到设备内存,然后在GPU上计算第k大的元素 +// 最后将结果从设备内存复制回主机内存,并释放设备内存 +// 使用CUDA编程需要配置一个NVIDIA GPU环境和CUDA编译器 +// CUDA编程通常针对GPU计算密集型任务。 /** * @brief Computes flash attention for given query, key, and value tensors. 
@@ -39,22 +133,147 @@ T trace(const std::vector& h_input, size_t rows, size_t cols) { * @param[in] head_dim Dimension size of each attention head * @param[in] is_causal Whether to apply causal masking */ +// --------------------- +// Flash Attention Kernel +// --------------------- + +// 简单GPU端softmax函数 +__device__ void softmax_device(float* scores, int len) { + float max_val = scores[0]; + for (int i = 1; i < len; ++i) { + max_val = fmaxf(max_val, scores[i]); + } + + float sum = 0.0f; + for (int i = 0; i < len; ++i) { + scores[i] = expf(scores[i] - max_val); + sum += scores[i]; + } + float inv_sum = 1.0f / (sum + 1e-8f); + for (int i = 0; i < len; ++i) { + scores[i] *= inv_sum; + } +} + +// 线程块:一个batch一个query head负责一个block,线程负责head_dim +// 由于head_dim一般较大,这里使用threadIdx.x控制head_dim维度循环 +// blockIdx.x 控制 batch, blockIdx.y 控制 query_heads +__global__ void flashAttentionKernel(const float* __restrict__ q, + const float* __restrict__ k, + const float* __restrict__ v, + float* __restrict__ o, + int batch_size, int tgt_len, int src_len, + int query_heads, int kv_heads, int head_dim, + bool is_causal) { + int b = blockIdx.x; // batch index + int h = blockIdx.y; // query head index + + if (b >= batch_size || h >= query_heads) return; + + int kvh = h * kv_heads / query_heads; + if (kvh >= kv_heads) return; + + extern __shared__ float shared_mem[]; + float* scores = shared_mem; // size src_len + // probs可直接用scores覆盖,节省共享内存,这里不额外申请 + + for (int tq = 0; tq < tgt_len; ++tq) { + // 计算score + for (int sk = threadIdx.x; sk < src_len; sk += blockDim.x) { + if (is_causal && sk > tq) { + scores[sk] = -1e9f; + } else { + float dot = 0.f; + for (int d = 0; d < head_dim; ++d) { + size_t q_idx = ((size_t)b * tgt_len + tq) * query_heads * head_dim + h * head_dim + d; + size_t k_idx = ((size_t)b * src_len + sk) * kv_heads * head_dim + kvh * head_dim + d; + dot += q[q_idx] * k[k_idx]; + } + scores[sk] = dot / sqrtf((float)head_dim); + } + } + + __syncthreads(); + + // 共享内存中的scores已完全写入,使用单线程计算softmax + if (threadIdx.x == 0) { + softmax_device(scores, src_len); + } + + __syncthreads(); + + // 计算加权v + for (int d = threadIdx.x; d < head_dim; d += blockDim.x) { + float acc = 0.f; + for (int sk = 0; sk < src_len; ++sk) { + size_t v_idx = ((size_t)b * src_len + sk) * kv_heads * head_dim + kvh * head_dim + d; + acc += scores[sk] * v[v_idx]; + } + size_t o_idx = ((size_t)b * tgt_len + tq) * query_heads * head_dim + h * head_dim + d; + o[o_idx] = acc; + } + __syncthreads(); + } +} + +// --------------------- +// Host template function flashAttention +// --------------------- + template void flashAttention(const std::vector& h_q, const std::vector& h_k, const std::vector& h_v, std::vector& h_o, - int batch_size, int target_seq_len, int src_seq_len, - int query_heads, int kv_heads, int head_dim, bool is_causal) { -} + int batch_size, int target_seq_len, int src_seq_len, + int query_heads, int kv_heads, int head_dim, bool is_causal) { + static_assert(std::is_same::value, "Only float supported"); + + assert(h_q.size() == static_cast(batch_size * target_seq_len * query_heads * head_dim)); + assert(h_k.size() == static_cast(batch_size * src_seq_len * kv_heads * head_dim)); + assert(h_v.size() == static_cast(batch_size * src_seq_len * kv_heads * head_dim)); + + h_o.resize(batch_size * target_seq_len * query_heads * head_dim); + + float *d_q, *d_k, *d_v, *d_o; + size_t q_size = h_q.size() * sizeof(float); + size_t k_size = h_k.size() * sizeof(float); + size_t v_size = h_v.size() * sizeof(float); + size_t o_size = h_o.size() * 
sizeof(float); + + cudaMalloc(&d_q, q_size); + cudaMalloc(&d_k, k_size); + cudaMalloc(&d_v, v_size); + cudaMalloc(&d_o, o_size); + + cudaMemcpy(d_q, h_q.data(), q_size, cudaMemcpyHostToDevice); + cudaMemcpy(d_k, h_k.data(), k_size, cudaMemcpyHostToDevice); + cudaMemcpy(d_v, h_v.data(), v_size, cudaMemcpyHostToDevice); + dim3 grid(batch_size, query_heads); + int block_dim = 256; // 线程数,调优用 + size_t shared_mem_size = src_seq_len * sizeof(float); // 只用一段共享内存存储scores + + flashAttentionKernel<<>>( + d_q, d_k, d_v, d_o, + batch_size, target_seq_len, src_seq_len, + query_heads, kv_heads, head_dim, + is_causal + ); + + cudaDeviceSynchronize(); + + cudaMemcpy(h_o.data(), d_o, o_size, cudaMemcpyDeviceToHost); + + cudaFree(d_q); + cudaFree(d_k); + cudaFree(d_v); + cudaFree(d_o); +} // ********************************************************************* // Explicit Template Instantiations (REQUIRED FOR LINKING WITH TESTER.O) // DO NOT MODIFY THIS SECTION // ********************************************************************* -template int trace(const std::vector&, size_t, size_t); -template float trace(const std::vector&, size_t, size_t); +template int kthLargest(const std::vector&, size_t); +template float kthLargest(const std::vector&, size_t); template void flashAttention(const std::vector&, const std::vector&, const std::vector&, std::vector&, - int, int, int, int, int, int, bool); -template void flashAttention(const std::vector&, const std::vector&, - const std::vector&, std::vector&, - int, int, int, int, int, int, bool); + int, int, int, int, int, int, bool); \ No newline at end of file diff --git a/src/kernels.maca b/src/kernels.maca deleted file mode 100644 index 765e08d..0000000 --- a/src/kernels.maca +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include - -#include "../tester/utils.h" - -/** - * @brief Computes the trace of a matrix. - * - * The trace of a matrix is defined as the sum of its diagonal elements. - * This function expects a flattened row-major matrix stored in a - * std::vector. If the matrix is not square, the trace will sum up - * elements along the main diagonal up to the smaller of rows or cols. - * - * @tparam T The numeric type of matrix elements (e.g., float, int). - * @param h_input A flattened matrix of size rows * cols. - * @param rows Number of rows in the matrix. - * @param cols Number of columns in the matrix. - * @return The trace (sum of diagonal values) of the matrix. - */ -template -T trace(const std::vector& h_input, size_t rows, size_t cols) { - // TODO: Implement the trace function - return T(-1); -} - -/** - * @brief Computes flash attention for given query, key, and value tensors. 
- * - * @tparam T Data type (float) for input/output tensors - * @param[in] h_q Query tensor of shape [batch_size, tgt_seq_len, query_heads, head_dim] - * @param[in] h_k Key tensor of shape [batch_size, src_seq_len, kv_heads, head_dim] - * @param[in] h_v Value tensor of shape [batch_size, src_seq_len, kv_heads, head_dim] - * @param[out] h_o Output attention tensor of shape [batch_size, tgt_seq_len, query_heads, head_dim] - * @param[in] batch_size Batch dimension size - * @param[in] target_seq_len Target sequence length - * @param[in] src_seq_len Source sequence length - * @param[in] query_heads Number of query attention heads - * @param[in] kv_heads Number of key/value heads (supports grouped query attention) - * @param[in] head_dim Dimension size of each attention head - * @param[in] is_causal Whether to apply causal masking - */ -template -void flashAttention(const std::vector& h_q, const std::vector& h_k, - const std::vector& h_v, std::vector& h_o, - int batch_size, int target_seq_len, int src_seq_len, - int query_heads, int kv_heads, int head_dim, bool is_causal) { -} - -// ********************************************************************* -// Explicit Template Instantiations (REQUIRED FOR LINKING WITH TESTER.O) -// DO NOT MODIFY THIS SECTION -// ********************************************************************* -template int trace(const std::vector&, size_t, size_t); -template float trace(const std::vector&, size_t, size_t); -template void flashAttention(const std::vector&, const std::vector&, - const std::vector&, std::vector&, - int, int, int, int, int, int, bool); -template void flashAttention(const std::vector&, const std::vector&, - const std::vector&, std::vector&, - int, int, int, int, int, int, bool); diff --git a/src/kernels.mu b/src/kernels.mu deleted file mode 100644 index 1fb8777..0000000 --- a/src/kernels.mu +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include - -#include "../tester/utils.h" - -/** - * @brief Computes the trace of a matrix. - * - * The trace of a matrix is defined as the sum of its diagonal elements. - * This function expects a flattened row-major matrix stored in a - * std::vector. If the matrix is not square, the trace will sum up - * elements along the main diagonal up to the smaller of rows or cols. - * - * @tparam T The numeric type of matrix elements (e.g., float, int). - * @param h_input A flattened matrix of size rows * cols. - * @param rows Number of rows in the matrix. - * @param cols Number of columns in the matrix. - * @return The trace (sum of diagonal values) of the matrix. - */ -template -T trace(const std::vector& h_input, size_t rows, size_t cols) { - // TODO: Implement the trace function - return T(-1); -} - -/** - * @brief Computes flash attention for given query, key, and value tensors. 
- * - * @tparam T Data type (float) for input/output tensors - * @param[in] h_q Query tensor of shape [batch_size, tgt_seq_len, query_heads, head_dim] - * @param[in] h_k Key tensor of shape [batch_size, src_seq_len, kv_heads, head_dim] - * @param[in] h_v Value tensor of shape [batch_size, src_seq_len, kv_heads, head_dim] - * @param[out] h_o Output attention tensor of shape [batch_size, tgt_seq_len, query_heads, head_dim] - * @param[in] batch_size Batch dimension size - * @param[in] target_seq_len Target sequence length - * @param[in] src_seq_len Source sequence length - * @param[in] query_heads Number of query attention heads - * @param[in] kv_heads Number of key/value heads (supports grouped query attention) - * @param[in] head_dim Dimension size of each attention head - * @param[in] is_causal Whether to apply causal masking - */ -template -void flashAttention(const std::vector& h_q, const std::vector& h_k, - const std::vector& h_v, std::vector& h_o, - int batch_size, int target_seq_len, int src_seq_len, - int query_heads, int kv_heads, int head_dim, bool is_causal) { -} - -// ********************************************************************* -// Explicit Template Instantiations (REQUIRED FOR LINKING WITH TESTER.O) -// DO NOT MODIFY THIS SECTION -// ********************************************************************* -template int trace(const std::vector&, size_t, size_t); -template float trace(const std::vector&, size_t, size_t); -template void flashAttention(const std::vector&, const std::vector&, - const std::vector&, std::vector&, - int, int, int, int, int, int, bool); -template void flashAttention(const std::vector&, const std::vector&, - const std::vector&, std::vector&, - int, int, int, int, int, int, bool); diff --git a/src/kernels.o b/src/kernels.o new file mode 100644 index 0000000..ac36c39 Binary files /dev/null and b/src/kernels.o differ diff --git a/src/kernels.out b/src/kernels.out new file mode 100755 index 0000000..d13c31f Binary files /dev/null and b/src/kernels.out differ diff --git a/test_kernels b/test_kernels new file mode 100755 index 0000000..d13c31f Binary files /dev/null and b/test_kernels differ diff --git a/tester/tester.o b/tester/tester.o new file mode 100644 index 0000000..e6bb82c Binary files /dev/null and b/tester/tester.o differ diff --git a/tester/tester_iluvatar.o b/tester/tester_iluvatar.o deleted file mode 100644 index c430b0e..0000000 Binary files a/tester/tester_iluvatar.o and /dev/null differ diff --git a/tester/tester_metax.o b/tester/tester_metax.o deleted file mode 100644 index 494e6f7..0000000 Binary files a/tester/tester_metax.o and /dev/null differ diff --git a/tester/tester_moore.o b/tester/tester_moore.o deleted file mode 100644 index 0ccba85..0000000 Binary files a/tester/tester_moore.o and /dev/null differ diff --git a/tester/tester_nv.o b/tester/tester_nv.o deleted file mode 100644 index b279c38..0000000 Binary files a/tester/tester_nv.o and /dev/null differ diff --git a/tester/utils.h b/tester/utils.h index a2bd9c8..4c92eb5 100644 --- a/tester/utils.h +++ b/tester/utils.h @@ -1,35 +1,13 @@ #pragma once -#include - -#if defined(PLATFORM_NVIDIA) || defined(PLATFORM_ILUVATAR) #include -#define RUNTIME_ERR_TYPE cudaError_t -#define RUNTIME_SUCCESS_CODE cudaSuccess -#define RUNTIME_GET_ERROR_STR cudaGetErrorString - -#elif defined(PLATFORM_MOORE) -#include -#define RUNTIME_ERR_TYPE musaError_t -#define RUNTIME_SUCCESS_CODE musaSuccess -#define RUNTIME_GET_ERROR_STR musaGetErrorString - -#elif defined(PLATFORM_METAX) -#include -#define 
RUNTIME_ERR_TYPE mcError_t -#define RUNTIME_SUCCESS_CODE mcSuccess -#define RUNTIME_GET_ERROR_STR mcGetErrorString - -#else -#error "Unknown PLATFORM for RUNTIME_CHECK" -#endif -#define RUNTIME_CHECK(call) \ - do { \ - RUNTIME_ERR_TYPE err = call; \ - if (err != RUNTIME_SUCCESS_CODE) { \ - std::cerr << "Runtime error at " << __FILE__ << ":" << __LINE__ << " - " \ - << RUNTIME_GET_ERROR_STR(err) << "\n"; \ +#define CUDA_CHECK(call) \ + { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - " \ + << cudaGetErrorString(err) << "\n"; \ exit(EXIT_FAILURE); \ } \ - } while (0) + }
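
Note on the kthLargest change: the patch replaces the old trace stub with a GPU quickselect (partition3 / quickSelect3 launched from quickSelectKernel) that three-way-partitions around a pivot so duplicate pivot values resolve in a single pass, and returns T(-100) for out-of-range k. A minimal host-side reference of the same selection semantics, useful only for sanity-checking the GPU result on small inputs (the assignment forbids letting a library routine do the real work on the device), might look like the sketch below; the name kthLargestRef is hypothetical and not part of the patch.

```cpp
#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

// Hypothetical host-side reference: k-th largest via std::nth_element.
// For validating the GPU quickselect on small inputs only; the graded
// implementation must perform the selection on the device.
template <typename T>
T kthLargestRef(std::vector<T> v, size_t k) {
    if (k == 0 || k > v.size()) return T(-100);  // same invalid-case sentinel as the patch
    // A descending comparator places the k-th largest at index k-1.
    std::nth_element(v.begin(), v.begin() + (k - 1), v.end(), std::greater<T>());
    return v[k - 1];
}

int main() {
    std::vector<int> data{7, 3, 9, 9, 1, 5};
    assert(kthLargestRef(data, 1) == 9);    // largest
    assert(kthLargestRef(data, 2) == 9);    // duplicates count separately
    assert(kthLargestRef(data, 3) == 7);
    assert(kthLargestRef(data, 7) == -100); // k out of range
    return 0;
}
```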
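
Note on the flashAttentionKernel indexing: Q, K, V, and O are laid out row-major as [batch, seq_len, num_heads, head_dim], and each query head h reads the KV head h * kv_heads / query_heads, which is what makes GQA work. A small host-side helper that reproduces this addressing makes the offsets easier to audit; the helper names here are illustrative and not part of the patch.

```cpp
#include <cstddef>
#include <cstdio>

// Illustrative offset helper mirroring the kernel's addressing scheme.
// Layout is row-major [batch, seq_len, num_heads, head_dim].
inline size_t tensorOffset(size_t b, size_t s, size_t h, size_t d,
                           size_t seq_len, size_t num_heads, size_t head_dim) {
    return ((b * seq_len + s) * num_heads + h) * head_dim + d;
}

// GQA mapping used by the kernel: query head -> shared KV head.
inline int kvHeadForQueryHead(int h, int query_heads, int kv_heads) {
    return h * kv_heads / query_heads;  // integer division groups query heads evenly
}

int main() {
    // Example: 8 query heads sharing 2 KV heads -> heads 0..3 map to KV head 0, 4..7 to KV head 1.
    for (int h = 0; h < 8; ++h)
        std::printf("query head %d -> kv head %d\n", h, kvHeadForQueryHead(h, 8, 2));

    // Query element (b=1, tq=2, h=3, d=0) with tgt_len=4, query_heads=8, head_dim=16.
    std::printf("q offset = %zu\n", tensorOffset(1, 2, 3, 0, 4, 8, 16));
    return 0;
}
```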
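
Note on the flashAttention launch configuration: the kernel asks for src_seq_len * sizeof(float) bytes of dynamic shared memory per block to hold the scores buffer, so very long source sequences can exceed the per-block shared-memory limit and make the launch fail silently without error checking. A small host-side guard, sketched below under the assumption of the default device and no opt-in to a larger shared-memory carve-out, could verify the request before launching.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Sketch: check that the dynamic shared-memory request for the scores buffer
// fits the device's default per-block limit before launching the kernel.
bool sharedMemFits(int src_seq_len) {
    cudaDeviceProp prop{};
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) return false;
    size_t needed = static_cast<size_t>(src_seq_len) * sizeof(float);
    if (needed > prop.sharedMemPerBlock) {
        std::fprintf(stderr,
                     "scores buffer needs %zu bytes but the device allows %zu per block\n",
                     needed, prop.sharedMemPerBlock);
        return false;
    }
    return true;
}

int main() {
    // Example: a 4096-token source sequence needs 16 KiB of shared memory,
    // which fits the common 48 KiB default; much longer sequences may not.
    std::printf("fits: %d\n", sharedMemFits(4096));
    return 0;
}
```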
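
Note on the utils.h change: the removed multi-platform RUNTIME_CHECK wrapped its body in do { ... } while (0), while the new CUDA_CHECK uses a bare brace block. The bare block works in most call sites, but a trailing semicolon after it can break a brace-less if/else, which is why do-while(0) is the conventional macro idiom. A variant that keeps the new CUDA-only behaviour but restores that guard is sketched below; the macro name is hypothetical and the only assumed behaviour is "print the error and abort".

```cpp
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

// Sketch of a do-while(0) variant of the new CUDA_CHECK macro.
#define CUDA_CHECK_SKETCH(call)                                          \
    do {                                                                  \
        cudaError_t err = (call);                                         \
        if (err != cudaSuccess) {                                         \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__  \
                      << " - " << cudaGetErrorString(err) << "\n";        \
            std::exit(EXIT_FAILURE);                                      \
        }                                                                 \
    } while (0)

int main() {
    int device_count = 0;
    // The do-while(0) form stays well-behaved even in a brace-less if/else,
    // where a bare { ... }; expansion would not compile.
    if (device_count == 0)
        CUDA_CHECK_SKETCH(cudaGetDeviceCount(&device_count));
    else
        std::cout << "already queried\n";
    return 0;
}
```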