From 7cac138e63addbeb1abaf6947961e4244a64231b Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Tue, 21 Oct 2025 23:08:08 +0800
Subject: [PATCH 1/5] CUDA: support topk_moe with weight clamp

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 17 +++++------
 ggml/src/ggml-cuda/topk-moe.cu  | 50 +++++++++++++++++++++------------
 ggml/src/ggml-cuda/topk-moe.cuh |  3 +-
 tests/test-backend-ops.cpp     |  1 +
 4 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 6e7c5aedbc55a..b8121b2419adf 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2828,7 +2828,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     if (ops.size() == topk_moe_ops_with_norm.size() &&
         ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops_with_norm, { node_idx + 3, node_idx + 8 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
-        ggml_tensor * weights = cgraph->nodes[node_idx+8];
+        ggml_tensor * weights = cgraph->nodes[node_idx + 9];
 
         if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
             return true;
@@ -2838,7 +2838,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     if (ops.size() == topk_moe_ops.size() &&
         ggml_can_fuse_subgraph(cgraph, node_idx, topk_moe_ops, { node_idx + 3, node_idx + 4 })) {
         ggml_tensor * softmax = cgraph->nodes[node_idx];
-        ggml_tensor * weights = cgraph->nodes[node_idx+4];
+        ggml_tensor * weights = cgraph->nodes[node_idx + 4];
         if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
             return true;
         }
@@ -2945,17 +2945,18 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
         if (!disable_fusion) {
             if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
-                ggml_tensor * weights = cgraph->nodes[i+8];
-                ggml_tensor * selected_experts = cgraph->nodes[i+3];
+                ggml_tensor * weights          = cgraph->nodes[i + 9];
+                ggml_tensor * selected_experts = cgraph->nodes[i + 3];
+                ggml_tensor * clamp            = cgraph->nodes[i + 7];
                 ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true,
-                                      /*delayed softmax*/ false);
-                i += 8;
+                                      /*delayed softmax*/ false, clamp);
+                i += 9;
                 continue;
             }
 
             if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
-                ggml_tensor * weights = cgraph->nodes[i+4];
-                ggml_tensor * selected_experts = cgraph->nodes[i+3];
+                ggml_tensor * weights          = cgraph->nodes[i + 4];
+                ggml_tensor * selected_experts = cgraph->nodes[i + 3];
                 ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false,
                                       /*delayed softmax*/ false);
                 i += 4;
diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu
index e28c810ac5df7..325e2b3ddcaea 100644
--- a/ggml/src/ggml-cuda/topk-moe.cu
+++ b/ggml/src/ggml-cuda/topk-moe.cu
@@ -63,7 +63,8 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
                                                                   float *       weights,
                                                                   int32_t *     ids,
                                                                   const int     n_rows,
-                                                                  const int     n_expert_used) {
+                                                                  const int     n_expert_used,
+                                                                  const float   clamp_val) {
     const int row = blockIdx.x * blockDim.y + threadIdx.y;
     if (row >= n_rows) {
         return;
@@ -139,6 +140,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
 
     if constexpr (with_norm) {
         wt_sum = warp_reduce_sum(wt_sum);
+        wt_sum = max(wt_sum, clamp_val);
         const float inv_sum = 1.0f / wt_sum;
 
         for (int i = 0; i < experts_per_thread; i++) {
@@ -157,6 +159,10 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
             weights[idx] = output_weights[i];
         }
     }
+
+    if (!with_norm) {
+        GGML_UNUSED(clamp_val);
+    }
 }
 
 template <bool with_norm, bool delayed_softmax = false>
@@ -166,9 +172,9 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
                                  int32_t *   ids,
                                  const int   n_rows,
                                  const int   n_expert,
-                                 const int   n_expert_used) {
+                                 const int   n_expert_used,
+                                 const float clamp_val) {
     static_assert(!(with_norm && delayed_softmax), "delayed softmax is not supported with weight normalization");
-
     const int rows_per_block = 4;
     dim3      grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
     dim3      block_dims(WARP_SIZE, rows_per_block, 1);
@@ -177,43 +183,43 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
     switch (n_expert) {
         case 1:
             topk_moe_cuda<1, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
             break;
         case 2:
             topk_moe_cuda<2, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
             break;
         case 4:
             topk_moe_cuda<4, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
            break;
         case 8:
             topk_moe_cuda<8, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
             break;
         case 16:
             topk_moe_cuda<16, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
             break;
         case 32:
             topk_moe_cuda<32, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
             break;
         case 64:
             topk_moe_cuda<64, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
             break;
         case 128:
             topk_moe_cuda<128, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
             break;
         case 256:
             topk_moe_cuda<256, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
             break;
         case 512:
             topk_moe_cuda<512, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
+                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
             break;
         default:
             GGML_ASSERT(false && "fatal error");
@@ -226,7 +232,8 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
                            ggml_tensor * weights,
                            ggml_tensor * ids,
                            const bool    with_norm,
-                           const bool    delayed_softmax) {
+                           const bool    delayed_softmax,
+                           ggml_tensor * clamp) {
     GGML_ASSERT(logits->type == GGML_TYPE_F32);
     GGML_ASSERT(weights->type == GGML_TYPE_F32);
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -242,13 +249,19 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
 
     const int n_expert_used = weights->ne[1];
 
+    float clamp_val = 0.0f;
     if (with_norm) {
-        launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
+        if (clamp) {
+            clamp_val = ggml_get_op_params_f32(clamp, 0);
+        }
+        launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used, clamp_val);
     } else {
         if (delayed_softmax) {
-            launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
+            launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
+                                              clamp_val);
        } else {
-            launch_topk_moe_cuda<false, false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
+            launch_topk_moe_cuda<false, false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
+                                               clamp_val);
         }
     }
 }
@@ -285,7 +298,8 @@ bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tenso
 std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm, bool delayed_softmax) {
     static std::initializer_list<enum ggml_op> norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
                                                             GGML_OP_VIEW,     GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
-                                                            GGML_OP_SUM_ROWS, GGML_OP_DIV,      GGML_OP_RESHAPE };
+                                                            GGML_OP_SUM_ROWS, GGML_OP_CLAMP,    GGML_OP_DIV,
+                                                            GGML_OP_RESHAPE };
 
     static std::initializer_list<enum ggml_op> no_norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
                                                                GGML_OP_VIEW,     GGML_OP_GET_ROWS };
diff --git a/ggml/src/ggml-cuda/topk-moe.cuh b/ggml/src/ggml-cuda/topk-moe.cuh
index cc2fbfe9e6649..878880360784a 100644
--- a/ggml/src/ggml-cuda/topk-moe.cuh
+++ b/ggml/src/ggml-cuda/topk-moe.cuh
@@ -8,7 +8,8 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
                            ggml_tensor * weights,
                            ggml_tensor * ids,
                            const bool with_norm,
-                           const bool delayed_softmax = false);
+                           const bool delayed_softmax = false,
+                           ggml_tensor * weight_clamp = nullptr);
 
 bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights);
 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 991c62597962d..d3fae54401c87 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4712,6 +4712,7 @@ struct test_topk_moe: public test_case {
             out = ggml_reshape_2d(ctx, out, n_expert_used, n_tokens);
 
             ggml_tensor * weights_sum = ggml_sum_rows(ctx, out); // [1, n_tokens]
+            weights_sum = ggml_clamp(ctx, weights_sum, 6.103515625e-5, INFINITY);
             out = ggml_div(ctx, out, weights_sum); // [n_expert_used, n_tokens]
             out = ggml_reshape_3d(ctx, out, 1, n_expert_used, n_tokens);
         }
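Note on patch 1: the fusion now matches a 10-op subgraph in which a CLAMP sits between SUM_ROWS and DIV. For reference, this is the graph shape being targeted, built with the public ggml API roughly as test_topk_moe does in tests/test-backend-ops.cpp; a minimal sketch, where the helper name and standalone-function shape are illustrative and the 2^-14 floor is the value used in the test:

```cpp
#include <cmath>
#include "ggml.h"

// Sketch of the normalizing top-k routing subgraph matched by the fusion:
// SOFT_MAX, RESHAPE, ARGSORT, VIEW, GET_ROWS, RESHAPE, SUM_ROWS, CLAMP, DIV, RESHAPE
static ggml_tensor * topk_moe_norm_graph(ggml_context * ctx, ggml_tensor * logits,
                                         int n_expert, int n_expert_used, int n_tokens) {
    ggml_tensor * probs            = ggml_soft_max(ctx, logits);            // SOFT_MAX
    ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // ARGSORT + VIEW

    ggml_tensor * out = ggml_get_rows(ctx,                                  // GET_ROWS
        ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens),                 // RESHAPE
        selected_experts);
    out = ggml_reshape_2d(ctx, out, n_expert_used, n_tokens);               // RESHAPE

    ggml_tensor * weights_sum = ggml_sum_rows(ctx, out);                    // SUM_ROWS: [1, n_tokens]
    // new in this patch: keep the sum away from zero before normalizing
    weights_sum = ggml_clamp(ctx, weights_sum, 6.103515625e-5f, INFINITY);  // CLAMP (2^-14 floor)

    out = ggml_div(ctx, out, weights_sum);                                  // DIV
    return ggml_reshape_3d(ctx, out, 1, n_expert_used, n_tokens);           // RESHAPE
}
```

Inside the fused kernel the same clamp is applied directly to the warp-reduced weight sum (wt_sum = max(wt_sum, clamp_val)) before the division, so the fused and unfused paths agree.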
From 3d22bd959f82adf5ec9fc2ec7e808fbaa17cdf41 Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Wed, 22 Oct 2025 21:19:13 +0800
Subject: [PATCH 2/5] add assert that max_clamp == inf

---
 ggml/src/ggml-cuda/topk-moe.cu | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu
index 325e2b3ddcaea..e260e4fe7b272 100644
--- a/ggml/src/ggml-cuda/topk-moe.cu
+++ b/ggml/src/ggml-cuda/topk-moe.cu
@@ -2,6 +2,7 @@
 #include "ggml.h"
 #include "topk-moe.cuh"
 
+#include <cmath>
 #include <initializer_list>
 
 // Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
@@ -253,6 +254,8 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
     if (with_norm) {
         if (clamp) {
             clamp_val = ggml_get_op_params_f32(clamp, 0);
+            float max_val = ggml_get_op_params_f32(clamp, 1);
+            GGML_ASSERT(max_val == INFINITY);
         }
         launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used, clamp_val);
     } else {
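Note on patch 2: ggml stores the CLAMP bounds as float op params (slot 0 = min, slot 1 = max), and the fused kernel only ever applies the lower bound. The assert pins down the one case the kernel cannot express. A minimal sketch of the round-trip, with variable names illustrative:

```cpp
// CLAMP keeps its bounds in the node's op params:
ggml_tensor * clamped = ggml_clamp(ctx, weights_sum, 6.103515625e-5f, INFINITY);

const float min_val = ggml_get_op_params_f32(clamped, 0); // 6.103515625e-5f -> becomes clamp_val
const float max_val = ggml_get_op_params_f32(clamped, 1); // must be +inf: the kernel has no upper bound
GGML_ASSERT(max_val == INFINITY);
```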
From ad7409cb48011648d5af8cfe618a30dcdaef54cf Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Wed, 22 Oct 2025 21:55:37 +0800
Subject: [PATCH 3/5] instead of assert, check in should_fuse

---
 ggml/src/ggml-cuda/topk-moe.cu  | 16 +++++++++++++---
 ggml/src/ggml-cuda/topk-moe.cuh |  2 +-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu
index e260e4fe7b272..a2eb3d36ceeef 100644
--- a/ggml/src/ggml-cuda/topk-moe.cu
+++ b/ggml/src/ggml-cuda/topk-moe.cu
@@ -254,8 +254,6 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
     if (with_norm) {
         if (clamp) {
             clamp_val = ggml_get_op_params_f32(clamp, 0);
-            float max_val = ggml_get_op_params_f32(clamp, 1);
-            GGML_ASSERT(max_val == INFINITY);
         }
         launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used, clamp_val);
     } else {
@@ -269,7 +267,7 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
     }
 }
 
-bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights) {
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp) {
     float scale    = 1.0f;
     float max_bias = 0.0f;
 
@@ -295,6 +293,18 @@ bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tenso
         return false;
     }
 
+    if (clamp) {
+        if (clamp->op != GGML_OP_CLAMP) {
+            return false;
+        }
+        float max_val = ggml_get_op_params_f32(clamp, 1);
+
+        if (max_val != INFINITY) {
+            return false;
+        }
+    }
+
+
     return true;
 }
 
diff --git a/ggml/src/ggml-cuda/topk-moe.cuh b/ggml/src/ggml-cuda/topk-moe.cuh
index 878880360784a..2eff408b03058 100644
--- a/ggml/src/ggml-cuda/topk-moe.cuh
+++ b/ggml/src/ggml-cuda/topk-moe.cuh
@@ -11,6 +11,6 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
                            const bool delayed_softmax = false,
                            ggml_tensor * weight_clamp = nullptr);
 
-bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights);
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp = nullptr);
 
 std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);
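Note on patch 3: moving the max == INFINITY check into ggml_cuda_should_use_topk_moe means a graph whose clamp the kernel cannot reproduce simply fails the fusion gate and runs unfused, instead of asserting at launch time. A sketch of the caller-side shape, where the node offsets follow the fused norm layout and the snippet is illustrative only:

```cpp
// Hypothetical gate, mirroring ggml_cuda_can_fuse:
ggml_tensor * softmax = cgraph->nodes[node_idx];     // SOFT_MAX at the head of the pattern
ggml_tensor * weights = cgraph->nodes[node_idx + 9]; // final RESHAPE (normalized weights)
ggml_tensor * clamp   = cgraph->nodes[node_idx + 7]; // CLAMP on the weight sum

if (ggml_cuda_should_use_topk_moe(softmax, weights, clamp)) {
    // fuse: clamp is a GGML_OP_CLAMP whose max param is +inf
} else {
    // fall back to running the ops individually
}
```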
From e37748bb4385fe6f31f69f5bc96aa5e028af6ccd Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Thu, 23 Oct 2025 23:32:58 +0800
Subject: [PATCH 4/5] add assert that non-norm path should not contain a clamp

---
 ggml/src/ggml-cuda/topk-moe.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu
index a2eb3d36ceeef..19fcf8b90d56b 100644
--- a/ggml/src/ggml-cuda/topk-moe.cu
+++ b/ggml/src/ggml-cuda/topk-moe.cu
@@ -257,6 +257,7 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
         }
         launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used, clamp_val);
     } else {
+        GGML_ASSERT(clamp == nullptr);
         if (delayed_softmax) {
             launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
                                               clamp_val);
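Note on patch 4: the assert is justified by the matched op lists; only the normalizing sequence contains a CLAMP, so the non-norm launch path can never legitimately receive one. The node layout below is also where the i + 7 and i + 9 offsets in evaluate_and_capture_cuda_graph come from:

```cpp
// Fused subgraph layout, indices relative to the SOFT_MAX node i:
//   norm:     i+0 SOFT_MAX, i+1 RESHAPE, i+2 ARGSORT, i+3 VIEW (expert ids),
//             i+4 GET_ROWS, i+5 RESHAPE, i+6 SUM_ROWS, i+7 CLAMP (new),
//             i+8 DIV, i+9 RESHAPE (final weights)
//   non-norm: i+0 SOFT_MAX, i+1 RESHAPE, i+2 ARGSORT, i+3 VIEW (expert ids),
//             i+4 GET_ROWS (weights), so no CLAMP can ever be captured here
```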
From 74449e8e3f704d5b8e125e7f730d80c8c82c3997 Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Thu, 23 Oct 2025 23:41:06 +0800
Subject: [PATCH 5/5] use -inf as default val for clamp

---
 ggml/src/ggml-cuda/topk-moe.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu
index 19fcf8b90d56b..572379fcbf0e8 100644
--- a/ggml/src/ggml-cuda/topk-moe.cu
+++ b/ggml/src/ggml-cuda/topk-moe.cu
@@ -250,7 +250,7 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
 
     const int n_expert_used = weights->ne[1];
 
-    float clamp_val = 0.0f;
+    float clamp_val = -INFINITY;
     if (with_norm) {
         if (clamp) {
             clamp_val = ggml_get_op_params_f32(clamp, 0);
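Note on patch 5: with -INFINITY as the default, the kernel's wt_sum = max(wt_sum, clamp_val) is an unconditional no-op whenever no CLAMP node was captured, so clamped and unclamped graphs share one kernel body. (0.0f was already harmless for sums of softmax probabilities, which are positive, but -inf makes the identity hold for any input.) A standalone, illustrative check of the identity:

```cpp
#include <cassert>
#include <cmath>

int main() {
    // max(x, -inf) == x for every non-NaN float, including +/-inf and denormals,
    // so a default clamp_val of -INFINITY leaves the weight sum untouched.
    const float xs[] = { -1.0f, 0.0f, 6.103515625e-5f, 1.0f, INFINITY, -INFINITY };
    for (float x : xs) {
        assert(std::fmax(x, -INFINITY) == x);
    }
    return 0;
}
```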