From dddd9537ec70f3171637fa5b75b3aca8d74d244f Mon Sep 17 00:00:00 2001
From: Hang Qu
Date: Thu, 6 Nov 2025 22:09:21 -0800
Subject: [PATCH] Update embedding_forward_quantized_cpu_template.cpp to use
 initialized output memory instead of uninitialized (#5054)

Summary:
X-link: https://github.com/facebookresearch/FBGEMM/pull/2064

We observed that when the output tensor's memory is left uninitialized, the output may contain garbage values, because some of that memory is never written by the kernel. The proposed fix is a quick workaround; a more efficient solution would be to zero-fill only the untouched memory.

Reviewed By: sryap

Differential Revision: D85447298
---
 .../inference/embedding_forward_quantized_cpu_template.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp
index 1a2942071c..00a9b944c1 100644
--- a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp
+++ b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp
@@ -210,6 +210,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
     total_adjusted_D += T * kINT8QparamsBytes;
   }
   output = at::empty({B, total_adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)).pinned_memory(pinned_memory));
+  if (!output_is_int8 && !output_is_int4) {
+    output.fill_(0);
+  }
   {% else %}
   constexpr int kINT8QparamsBytes = 4; // no bag int8 output aligns with fbgemm weights storage size and layout
   constexpr int kINT4QparamsElems = 8; // scale + bias takes 4 bytes which are 8 int4 elements
@@ -220,6 +223,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
     adjusted_D += kINT4QparamsElems;
   }
   output = at::empty({total_L, adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)).pinned_memory(pinned_memory));
+  if (!output_is_int8 && !output_is_int4) {
+    output.fill_(0);
+  }
   {% endif %}
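
---
For reviewers unfamiliar with the failure mode, below is a minimal standalone sketch (not part of the patch) of why `at::empty` output can read back as garbage when a kernel does not write every element, and how a `fill_(0)` after allocation works around it. The `partially_write` helper and the shapes are hypothetical stand-ins for the embedding lookup; only the `at::empty` / `fill_(0)` usage mirrors the patched code.

```cpp
// Minimal sketch (assumed example, not from the patch): an at::empty buffer
// that is only partially written returns stale memory for untouched rows,
// while zero-filling it first makes those rows deterministic.
#include <ATen/ATen.h>
#include <iostream>

// Hypothetical stand-in for the embedding kernel: it writes only row 0,
// leaving row 1 untouched (e.g. an empty bag).
void partially_write(at::Tensor& out) {
  out[0].fill_(1.0f);
}

int main() {
  // Uninitialized allocation: row 1 holds whatever bytes were in the buffer.
  at::Tensor garbage_out = at::empty({2, 4}, at::kFloat);
  partially_write(garbage_out);

  // Workaround used by the patch: allocate, then fill_(0) before the kernel,
  // so untouched rows read back as zeros instead of stale memory.
  at::Tensor safe_out = at::empty({2, 4}, at::kFloat);
  safe_out.fill_(0);
  partially_write(safe_out);

  std::cout << "uninitialized row 1: " << garbage_out[1] << "\n";
  std::cout << "zero-filled row 1:   " << safe_out[1] << "\n";
  return 0;
}
```

Note that in the patch itself the fill is guarded by `!output_is_int8 && !output_is_int4`, so it is applied only for non-INT8/INT4 output dtypes.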