Draft
Changes from all commits (91 commits)
cd7dfea
Add gfx950 build support + fp16 fix + index type fix
avbokovoy Jul 29, 2025
602b7bf
Change int64_t to index_t as template parameters in load_raw_per_warp
avbokovoy Jul 29, 2025
a587e06
Implement llvm fp16 buffer load for gfx950
avbokovoy Jul 29, 2025
48a10bf
Fix c-style half to float cast
avbokovoy Aug 11, 2025
d4acaba
Patch 256 half stores
avbokovoy Aug 11, 2025
a6636f0
cta_per_row workgroup optim
shbiswas834 Aug 8, 2025
a15fb09
Added mi350 guards
shbiswas834 Aug 11, 2025
6af95e0
Fix index overflow in row load
shbiswas834 Aug 12, 2025
be5f1b8
cta_per_row workgroup reduce by 4 optim
shbiswas834 Aug 12, 2025
acef908
Fix mixed_D frontend to backend connection
avbokovoy Aug 13, 2025
33f4ad9
changed max_segment_length_per_cta to 4096
kudomcho Aug 15, 2025
aaf1966
added rocm guards and removed comment
shbiswas834 Aug 18, 2025
48e7f97
clean debug statements in Hip.cmake
liligwu Aug 20, 2025
750bee4
Merge pull request #121
shbiswas834 Aug 28, 2025
f0acbc3
Guard f16 llvm intrinsics with ROCm >=7.0
avbokovoy Sep 2, 2025
0ee2366
fix the bug in dimension 160 in ROCm optimization
liligwu Sep 18, 2025
e33120d
Cleanup optimized warp_per_row kernel
avbokovoy Aug 19, 2025
3447ef0
Add 320 embedding dim support for optimized warp_per_row kernel
avbokovoy Aug 20, 2025
a1361ab
changed the max length per warp and cta per row WG size
Sep 8, 2025
9c2fd1d
added DPP and changed max length per warp to 16k
kudomcho Sep 9, 2025
54690c9
guard max segment warp based on emb dim
kudomcho Sep 10, 2025
d666611
added guarding opt of max segment for the case batch size list=1
kudomcho Sep 10, 2025
df863d0
opt for grad_indice_weights kernel
Sep 18, 2025
e0bee9f
added store row per warp on emb 192 and added accuracy test functiona…
kudomcho Sep 23, 2025
ca82950
workgroup tuning and loop unrolled
shbiswas834 Sep 22, 2025
7ad444b
specialize
Hardcode84 Sep 19, 2025
970229b
explicitly link to tbb
liligwu Sep 24, 2025
539985c
added warpReduceAllSum with rocm guards
shbiswas834 Sep 25, 2025
e3d4773
revert unroll and wg tuning
shbiswas834 Oct 13, 2025
9505ffe
Minor update embedding_forward_split_kernel_template.cu
liligwu Oct 13, 2025
8709307
add tbb-devel to the install_build_tools ()
liligwu Oct 17, 2025
6a3d3cb
fix lint issues
liligwu Oct 21, 2025
6351c43
solve lint issues
liligwu Oct 21, 2025
1e9b3f3
applied jinja is_rocm onto optimizations for backward and forward par…
kudomcho Oct 22, 2025
46b9f80
Guard supported grad_t for optimized warp_per_row dispatch
avbokovoy Oct 23, 2025
ab5cf5d
Forward index_t to the optimizer
avbokovoy Oct 23, 2025
5164f6e
Guard f16 llvm intrinsics with ROCm >=7.0
avbokovoy Sep 2, 2025
cde00fc
Fix buffer offset for emb_dim == 160
avbokovoy Oct 23, 2025
5d73b9c
Remove sanity check
avbokovoy Oct 27, 2025
919db74
address the potential lint issues and revert the change in indices_ge…
liligwu Oct 27, 2025
3df3c91
address code style issue
liligwu Oct 27, 2025
6c3a362
Remove general load/store methods
avbokovoy Oct 24, 2025
8cb6838
Move weight type check to compile-time
avbokovoy Oct 24, 2025
ab6fa10
Switch to 256B stores for float type
avbokovoy Oct 27, 2025
c5a915d
removed guard rocm on mixed_D and refactored mixed_D var assignment
kudomcho Oct 28, 2025
570f148
Merge remote-tracking branch 'origin/abokovoi/mi350-remove-general-lo…
liligwu Oct 28, 2025
ca4701f
hack param
Bernard-Liu Nov 2, 2025
5bf0cf6
support opt code_gen
Bernard-Liu Oct 27, 2025
b72bdd8
support subwarp
yadaish Aug 6, 2025
6343a4f
update subwarp kernel
Bernard-Liu Oct 28, 2025
c386072
grad sum kernel unroll improvement
XingerZhu Oct 27, 2025
7bf6dd8
fix performance issue
yadaish Oct 29, 2025
fb7f0a8
fix vbe opt not imply
Bernard-Liu Nov 2, 2025
bec6a69
fix symbol bug & rm comment
Bernard-Liu Nov 3, 2025
9555b3b
Remove AVX compilation on aarch64 (#5065)
Nicoshev Oct 28, 2025
9d29ec1
add auto feature score collection to EC (#5030)
emlin Oct 29, 2025
e9e5fff
Add kineto tracing to bench:jagged_tensor (#5061)
gchalump Oct 29, 2025
678eaf7
Adding python api to support sync trigger evict (#4984)
EddyLXJ Oct 29, 2025
f1eb5b6
Adding KVZCHEvictionTBEConfig in FBGEMM (#5058)
EddyLXJ Oct 30, 2025
40a39cd
remove pt2 compliant xfails for jagged ops (#5068)
bdhirsh Oct 30, 2025
9eef031
log all table names in TBE
Oct 30, 2025
c5619f2
Add sync ops and update the method names to be more generic for futur…
tomlintbl Oct 31, 2025
962f013
Cutlass Qtile Size shrunk to 64 (#5072)
Aya-ZIbra Oct 31, 2025
8e60e43
Mapping utilities (#5073)
Alkaid-Benetnash Oct 31, 2025
7d494da
Fix build break (#5076)
Nicoshev Oct 31, 2025
9db5454
Free mem trigger with all2all for sync trigger eviction (#5062)
EddyLXJ Nov 1, 2025
a515b03
General adoption for Mtile = 64 (#5075)
Aya-ZIbra Nov 1, 2025
270edf4
Map hash_zch_identities to corresponding unique indices in TBE (#5077)
Nov 4, 2025
d79485e
Don't use 'not defined' in C++ preprocessing (#5025)
cyyever Nov 4, 2025
063214f
Remove Python 3.9 support (#5081)
q10 Nov 4, 2025
0baae82
group_index_select_or_add_2d_kernel forward pass optimization (#5080)
avbokovoy Nov 4, 2025
f1f2449
Fix OSError: [Errno 24] Too many open files in multi-copy benchmark (…
Nov 4, 2025
b48b0b7
Support eval mode for st publish (#5085)
EddyLXJ Nov 5, 2025
1a0eb0f
Fix test reliability with table order (#5087)
q10 Nov 5, 2025
9b996a2
Add NEON-based FloatOrHalfToFused8BitRowwiseQuantizedSBFloat (#5089)
Nicoshev Nov 5, 2025
a842d88
Inference test e2e [1/n] (#5091)
Nov 5, 2025
2dd8776
Merge VBE output [backend] reland (#5093)
spcyppt Nov 5, 2025
da6dfff
embedding forward optimization for MI350 (#5064)
JaxChen29 Nov 5, 2025
64ba2d9
Support larger lookup in permute (#5086)
kausv Nov 6, 2025
924082f
Deprecate tl.async_task from fbgemm (#5094)
dshi7 Nov 6, 2025
ef408b0
enable feature score auto collection in EBC (#5031)
emlin Nov 6, 2025
8bf19e4
workgroup tuning and loop unrolled
shbiswas834 Sep 22, 2025
90c029a
revert unroll and wg tuning
shbiswas834 Oct 13, 2025
ae17791
removed jinja is_rocm on total_L as USE_ROCM is already applied
kudomcho Nov 3, 2025
bcc4116
Change mixed_D default value to false
avbokovoy Nov 6, 2025
f624941
Make const work_group_size for CUDA
avbokovoy Nov 6, 2025
14cdfdb
Add jinja comments to grad_indice_weights kernel
avbokovoy Nov 6, 2025
4973c86
Remove redundant comment
avbokovoy Nov 6, 2025
68e45ff
Unify cuda and rocm loops
avbokovoy Nov 6, 2025
c9aceb3
workgroup tuning and loop unrolled
shbiswas834 Sep 22, 2025
1f82f3b
revert unroll and wg tuning
shbiswas834 Oct 13, 2025
1 change: 1 addition & 0 deletions .github/scripts/utils_build.bash
@@ -370,6 +370,7 @@ install_build_tools () {
patchelf \
rhash \
scikit-build \
tbb-devel \
tbb \
wheel \
xz \
3 changes: 3 additions & 0 deletions .github/workflows/_fbgemm_gpu_cuda_test.yml
@@ -132,6 +132,9 @@ jobs:
# clang-16: error: unknown argument: '-fno-tree-loop-vectorize'
run: . $PRELUDE; install_cxx_compiler $BUILD_ENV gcc

- name: Install Build Tools
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install CUDA
run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_cpu.yml
@@ -75,7 +75,7 @@ jobs:
{ arch: arm, instance: "linux.arm64.m7g.4xlarge" },
]
build-target: [ "default" ]
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
python-version: [ "3.10", "3.11", "3.12", "3.13" ]
compiler: [ "gcc", "clang" ]

steps:
@@ -149,7 +149,7 @@ jobs:
{ arch: arm, instance: "linux.arm64.m7g.4xlarge", timeout: 30 },
]
build-target: [ "default" ]
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
python-version: [ "3.10", "3.11", "3.12", "3.13" ]
compiler: [ "gcc", "clang" ]
needs: build_artifact

12 changes: 12 additions & 0 deletions cmake/modules/CppLibrary.cmake
@@ -168,6 +168,18 @@ function(cpp_library)
target_link_libraries(${lib_name} PUBLIC OpenMP::OpenMP_CXX)
endif()

if(NOT TARGET TBB::tbb)
find_package(TBB QUIET)
endif()
if(TBB_FOUND)
target_link_libraries(${lib_name} PUBLIC TBB::tbb)
else()
find_library(TBB_LIB NAMES tbb tbb12 HINTS $ENV{CONDA_PREFIX}/lib /usr/lib/x86_64-linux-gnu /usr/local/lib /lib/x86_64-linux-gnu)
if(TBB_LIB)
target_link_libraries(${lib_name} PUBLIC ${TBB_LIB})
endif()
endif()

# Add sanitizer options if needed
if(args_SANITIZER_OPTIONS)
target_link_options(${lib_name} PUBLIC
12 changes: 12 additions & 0 deletions cmake/modules/GpuCppLibrary.cmake
@@ -302,6 +302,18 @@ function(gpu_cpp_library)
list(APPEND library_dependencies ${NVML_LIB_PATH})
endif()

if(NOT TARGET TBB::tbb)
find_package(TBB QUIET)
endif()
if(TBB_FOUND)
list(APPEND library_dependencies TBB::tbb)
else()
find_library(TBB_LIB NAMES tbb tbb12 HINTS $ENV{CONDA_PREFIX}/lib /usr/lib/x86_64-linux-gnu /usr/local/lib /lib/x86_64-linux-gnu)
if(TBB_LIB)
list(APPEND library_dependencies ${TBB_LIB})
endif()
endif()

# Link against the external libraries as needed
target_link_libraries(${lib_name} PRIVATE ${library_dependencies})

61 changes: 44 additions & 17 deletions fbgemm_gpu/bench/jagged_tensor_benchmark.py
@@ -10,8 +10,11 @@

import functools
import logging
import os
import random
from contextlib import nullcontext
from dataclasses import dataclass
from typing import Callable

import click
import fbgemm_gpu
@@ -542,6 +545,17 @@ def ref(
@click.option("--has-weights", is_flag=True, default=False)
@click.option("--weight-type", type=str, default="float")
@click.option("--use-selected-lengths-sum", is_flag=True, default=False)
@click.option(
"--export-trace",
is_flag=True,
default=False,
help="Enable export of trace for profiling. Default is False.",
)
@click.option(
"--trace-url",
type=str,
default="keyed_jagged_index_select_dim1_{phase}_trace_{ospid}.json",
)
def keyed_jagged_index_select_dim1(
num_batches: int,
max_seq_length: int,
@@ -551,6 +565,8 @@ def keyed_jagged_index_select_dim1(
has_weights: bool,
weight_type: str,
use_selected_lengths_sum: bool,
export_trace: bool,
trace_url: str,
) -> None:
jagged_tensor_types = {
"float": torch.float,
@@ -622,20 +638,28 @@ def keyed_jagged_index_select_dim1(
if is_float:
values.requires_grad = True

time, output = benchmark_torch_function(
torch.ops.fbgemm.keyed_jagged_index_select_dim1,
(
values,
lengths,
offsets,
indices,
input_batch_size,
weights,
selected_lengths_sum,
),
iters=1000,
)
output = output[0]
def _kineto_trace_handler(p: profile, phase: str) -> None:
p.export_chrome_trace(trace_url.format(phase=phase, ospid=os.getpid()))

# pyre-ignore[3]
def context_factory(on_trace_ready: Callable[[profile], None]):
return profile(on_trace_ready=on_trace_ready) if export_trace else nullcontext()

with context_factory(lambda p: _kineto_trace_handler(p, "fwd")):
time, output = benchmark_torch_function(
torch.ops.fbgemm.keyed_jagged_index_select_dim1,
(
values,
lengths,
offsets,
indices,
input_batch_size,
weights,
selected_lengths_sum,
),
iters=1000,
)
output = output[0]

# Prepare inputs for the reference run
ref_inputs = []
@@ -687,9 +711,12 @@ def keyed_jagged_index_select_dim1_ref(
return

grad = torch.rand_like(output)
time, _ = benchmark_torch_function(
functools.partial(output.backward, retain_graph=True), (grad,), iters=1000
)

with context_factory(lambda p: _kineto_trace_handler(p, "bwd")):
time, _ = benchmark_torch_function(
functools.partial(output.backward, retain_graph=True), (grad,), iters=1000
)

time_ref, _ = benchmark_torch_function(
functools.partial(output_ref.backward, retain_graph=True), (grad,), iters=1000
)
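For reference, here is a self-contained sketch of the optional Kineto tracing pattern this diff adds to the benchmark. The flag and handler names mirror the new Click options above; the toy workload at the end is illustrative only, standing in for the benchmarked fbgemm op.

import os
from contextlib import nullcontext
from typing import Callable

import torch
from torch.profiler import profile

export_trace = True  # corresponds to --export-trace
trace_url = "keyed_jagged_index_select_dim1_{phase}_trace_{ospid}.json"  # --trace-url

def _kineto_trace_handler(p: profile, phase: str) -> None:
    # Write one Chrome trace per phase (fwd/bwd), tagged with the process id.
    p.export_chrome_trace(trace_url.format(phase=phase, ospid=os.getpid()))

def context_factory(on_trace_ready: Callable[[profile], None]):
    # Profile only when tracing was requested; otherwise run the benchmark unwrapped.
    return profile(on_trace_ready=on_trace_ready) if export_trace else nullcontext()

with context_factory(lambda p: _kineto_trace_handler(p, "fwd")):
    # Stand-in workload; in the benchmark this wraps benchmark_torch_function(...).
    _ = torch.ones(1024) + torch.ones(1024)

Assuming the existing Click entry point, the same behavior should be reachable from the command line by passing --export-trace and, optionally, a custom --trace-url pattern.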
@@ -1506,4 +1506,4 @@ def context_factory(on_trace_ready: Callable[[profile], None]):


if __name__ == "__main__":
cli()
cli()
2 changes: 0 additions & 2 deletions fbgemm_gpu/cmake/tbe_sources.py
@@ -176,7 +176,6 @@
"_nobag" if nobag else "",
)
for nobag in [
True,
False,
]
for weighted in (
@@ -495,7 +494,6 @@
"_nobag" if nobag else "",
)
for nobag in [
True,
False,
]
for weighted in (
11 changes: 8 additions & 3 deletions fbgemm_gpu/codegen/genscript/generate_backward_split.py
@@ -52,7 +52,11 @@ def render_backward_templates(
return

weighted_options = [True, False]
nobag_options = [True, False] if (not is_gwd) else [False]
nobag_options = (
[True, False]
if (not (is_gwd or kwargs.get("is_hip_optimized_backward")))
else [False]
)
vbe_options = [True, False] if (kwargs.get("has_vbe_support")) else [False]
ssd_options = [True, False] if kwargs.get("has_ssd_support") else [False]
template = CodeTemplate.load(template_filepath)
@@ -327,8 +331,7 @@ def generate_backward_indices() -> None:

@staticmethod
def generate_rocm_backward_split(**kwargs: Any) -> None:
# Generate backward device kernels based on weighted (True/False), VBE
# (True/False), no bag (True/False)
# Generate backward device kernels based on weighted (True/False)
template_filepath = (
"training/backward/rocm/embedding_backward_split_device_kernel_template.hip"
)
@@ -343,6 +346,7 @@ def generate_rocm_backward_split(**kwargs: Any) -> None:
"has_ssd_support": False,
"dense": False,
"gen_once": False,
"is_hip_optimized_backward": True,
},
)

@@ -422,6 +426,7 @@ def generate() -> None:
"lxu_cache_locations", # 3
"uvm_cache_stats", # 4
"prev_iter_dev", # 5
"vbe_output_offsets", # 6
],
"aux_int": [
"iter", # 0
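To make the effect of the new is_hip_optimized_backward flag concrete, below is a hedged, standalone reconstruction of the option matrix computed in render_backward_templates. Only the weighted/nobag/vbe/ssd selection logic is taken from the diff; enumerating their Cartesian product is an assumption about how the template variants are ultimately emitted.

from itertools import product

def backward_template_variants(
    is_gwd: bool = False,
    is_hip_optimized_backward: bool = False,
    has_vbe_support: bool = False,
    has_ssd_support: bool = False,
):
    weighted_options = [True, False]
    # Mirrors the diff: the ROCm-optimized backward path, like GWD, skips nobag.
    nobag_options = (
        [True, False]
        if not (is_gwd or is_hip_optimized_backward)
        else [False]
    )
    vbe_options = [True, False] if has_vbe_support else [False]
    ssd_options = [True, False] if has_ssd_support else [False]
    return list(product(weighted_options, nobag_options, vbe_options, ssd_options))

# The ROCm path now emits only nobag=False variants, matching the removal of
# the nobag=True entries from tbe_sources.py earlier in this PR.
print(len(backward_template_variants()))                                # 4
print(len(backward_template_variants(is_hip_optimized_backward=True)))  # 2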
4 changes: 1 addition & 3 deletions fbgemm_gpu/codegen/genscript/optimizer_args.py
@@ -73,9 +73,7 @@ class OptimizerArgsSetItem:
"row_counter_dev": "(q!)",
"row_counter_uvm": "(r!)",
"optim_tensor": "(s!)",
"delta_weights_host": "(t!)",
"delta_weights_dev": "(u!)",
"delta_weights_uvm": "(v!)",
"vbe_output": "(t!)",
}

######################################################################
36 changes: 36 additions & 0 deletions fbgemm_gpu/codegen/genscript/optimizers.py
@@ -197,6 +197,9 @@ def rowwise_adagrad() -> Dict[str, Any]:

at::acc_type<cache_t, true> multiplier = 0.0;
at::acc_type<cache_t, true> correction = 0.0;
"""
split_precomputation_preload = split_precomputation
split_precomputation += """
if (threadIdx.x == 0) {
auto new_sum_square_grads = g_avg_square;

@@ -228,6 +231,38 @@
multiplier = SHFL_SYNC(multiplier, 0);
correction = SHFL_SYNC(correction, 0);
"""
split_precomputation_preload += """
if (threadIdx.x == 0) {
auto new_sum_square_grads = g_avg_square;

// Update the optimizer state. Use optimizer state offloading only if
// SSD and if enabled by the user
if (enable_optimizer_offloading) {
// Fetch the pointer to the optimizer state along the cache row
auto* optimizer = weight_row_template.template optimizer_state_ptr<OptimizerState>();
new_sum_square_grads += optimizer->momentum;
optimizer->momentum = new_sum_square_grads;

} else {
new_sum_square_grads += momentum1_val;
momentum1[idx] = new_sum_square_grads;
}

multiplier = learning_rate / (sqrtf(new_sum_square_grads) + eps);
if (weight_decay_mode == 1) {
// L2 regularization
correction = 1.0 - multiplier * weight_decay;
} else if (weight_decay_mode == 2 || weight_decay_mode == 5) {
// Decoupled weight decay
correction = 1.0 - learning_rate * weight_decay;
} else {
// default value
correction = 1.0;
}
}
multiplier = SHFL_SYNC(multiplier, 0);
correction = SHFL_SYNC(correction, 0);
"""
split_weight_update_cpu = """
at::acc_type<grad_t, true> g_local_sum_square = 0.0;
for (int64_t d = 0; d < D; ++d) {
@@ -275,6 +310,7 @@ def rowwise_adagrad() -> Dict[str, Any]:
},
),
"split_precomputation": split_precomputation,
"split_precomputation_preload": split_precomputation_preload,
"split_weight_update": split_weight_update,
"split_post_update": split_post_update,
"split_weight_update_cpu": split_weight_update_cpu,
@@ -172,7 +172,7 @@ Tensor split_embedding_codegen_lookup_dense_function(
c10::SymInt /* max_B = -1 */,
c10::SymInt /* max_B_feature_rank = -1 */,
c10::SymInt /* vbe_output_size = -1 */,
bool /* mixed_D = true */) {
bool /* mixed_D = false */) {
return SplitLookupFunction_Dense_Op::apply(
host_weights,
weights_offsets,
Expand Down