From e218cfc7e514885524dd4a0da2ab7fb592d04d53 Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Wed, 5 Nov 2025 10:37:57 +0900
Subject: [PATCH 1/9] wip

---
 .github/scripts/generate_ci_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py
index b842433c8f..0d6cc47101 100644
--- a/.github/scripts/generate_ci_matrix.py
+++ b/.github/scripts/generate_ci_matrix.py
@@ -307,7 +307,7 @@ def cuda_versions(self) -> List[str]:
             return ["12.6.3", "12.8.1", "13.0.2"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-            return ["12.6.3", "12.8.1"]
+            return ["12.6.3", "12.8.1", "13.0.2"]
 
     def rocm_versions(self) -> List[str]:
         if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:

From d0b731d5f903f83b9f73029f6f3889ec6abc89da Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Wed, 5 Nov 2025 10:38:15 +0900
Subject: [PATCH 2/9] wip

---
 .github/scripts/generate_ci_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py
index 0d6cc47101..9c8736b0eb 100644
--- a/.github/scripts/generate_ci_matrix.py
+++ b/.github/scripts/generate_ci_matrix.py
@@ -307,7 +307,7 @@ def cuda_versions(self) -> List[str]:
             return ["12.6.3", "12.8.1", "13.0.2"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-            return ["12.6.3", "12.8.1", "13.0.2"]
+            return ["13.0.2"]
 
     def rocm_versions(self) -> List[str]:
         if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:

From d79c8cdfeac9c5a9de5320d8d8bb6c55c0dcaec6 Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Wed, 5 Nov 2025 10:38:28 +0900
Subject: [PATCH 3/9] wip

---
 .github/scripts/generate_ci_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py
index 9c8736b0eb..0d6cc47101 100644
--- a/.github/scripts/generate_ci_matrix.py
+++ b/.github/scripts/generate_ci_matrix.py
@@ -307,7 +307,7 @@ def cuda_versions(self) -> List[str]:
             return ["12.6.3", "12.8.1", "13.0.2"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-            return ["13.0.2"]
+            return ["12.6.3", "12.8.1", "13.0.2"]
 
     def rocm_versions(self) -> List[str]:
         if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:

From 005da03befef4953bc16b103c372603cbf7abab4 Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Wed, 5 Nov 2025 14:31:30 +0900
Subject: [PATCH 4/9] wip

---
 cmake/modules/GpuCppLibrary.cmake | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake
index 51c30df750..40cd9986ed 100644
--- a/cmake/modules/GpuCppLibrary.cmake
+++ b/cmake/modules/GpuCppLibrary.cmake
@@ -87,15 +87,31 @@ function(prepare_target_sources)
             list(APPEND ${args_PREFIX}_sources_cu ${args_CUDA_SPECIFIC_SRCS})
         endif()
 
-        # Set source properties
-        set_source_files_properties(${${args_PREFIX}_sources_cu}
-            PROPERTIES COMPILE_OPTIONS
-            "${args_NVCC_FLAGS}")
-
+        # Set include directories
         set_source_files_properties(${${args_PREFIX}_sources_cu}
             PROPERTIES INCLUDE_DIRECTORIES
             "${args_INCLUDE_DIRS}")
 
+        # Starting with CUDA 13.0, nvcc changed the default visibility of
+        # __global__ functions to `hidden`, which causes symbol lookup errors
+        # during linking.  This can be worked around by setting -cudart=shared
+        # and --device-entity-has-hidden-visibility=false.
+        #
+        # https://developer.nvidia.com/blog/cuda-c-compiler-updates-impacting-elf-visibility-and-linkage/
+        if( (FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CUDA) AND
+            (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") )
+            set(_nvcc_flags ${args_NVCC_FLAGS}
+                -cudart=shared
+                --device-entity-has-hidden-visibility=false)
+        else()
+            set(_nvcc_flags ${args_NVCC_FLAGS})
+        endif()
+
+        # Set compilation flags
+        set_source_files_properties(${${args_PREFIX}_sources_cu}
+            PROPERTIES COMPILE_OPTIONS
+            "${_nvcc_flags}")
+
         # Append to the full sources list
         list(APPEND ${args_PREFIX}_sources_combined ${${args_PREFIX}_sources_cu})
     endif()

From d363b3212f6c2ccb31226e82cbaec950d4ba3930 Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Wed, 5 Nov 2025 14:56:39 +0900
Subject: [PATCH 5/9] wip

---
 fbgemm_gpu/src/memory_utils/memory_utils.cu       | 15 +++++++++++----
 .../src/sparse_ops/sparse_async_batched_cumsum.cu |  6 +++++-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu
index e7cba2bffe..c59ac736af 100644
--- a/fbgemm_gpu/src/memory_utils/memory_utils.cu
+++ b/fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -144,6 +144,13 @@ std::tuple<void*, size_t> adjust_to_page_boundaries(void* ptr, size_t size) {
   return std::make_tuple((void*)raw_ptr_adjusted, (size_t)size_adjusted);
 }
 
+cudaMemLocation new_mem_location_from_device(const int device_id) {
+  cudaMemLocation deviceLoc;
+  deviceLoc.type = cudaMemLocationTypeDevice;
+  deviceLoc.id = device_id;
+  return deviceLoc;
+}
+
 } // namespace
 
 Tensor new_managed_tensor(
@@ -158,11 +165,11 @@ Tensor new_managed_tensor(
 
   // Set preferred memory location to host memory
   AT_CUDA_CHECK(cudaMemAdvise(
-      ptr, size_bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
+      ptr, size_bytes, cudaMemAdviseSetPreferredLocation, new_mem_location_from_device(cudaCpuDeviceId)));
   // User hints with "accessed by": GPU will establish direct mapping of data
   // in CPU memory, no page faults will be generated
   AT_CUDA_CHECK(cudaMemAdvise(
-      ptr, size_bytes, cudaMemAdviseSetAccessedBy, at::cuda::current_device()));
+      ptr, size_bytes, cudaMemAdviseSetAccessedBy, new_mem_location_from_device(at::cuda::current_device())));
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   // Work around fork issue - see uvm_mem_advice_dont_fork for details
@@ -353,7 +360,7 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) {
       ptr,
       size_bytes,
       static_cast<enum cudaMemoryAdvise>(cuda_memory_advise),
-      hint_device));
+      new_mem_location_from_device(hint_device)));
   return;
 }
 
@@ -379,7 +386,7 @@ void uvm_cuda_mem_prefetch_async(
 
   auto stream = at::cuda::getCurrentCUDAStream();
 
-  AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, prefetch_device, stream));
+  AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, new_mem_location_from_device(prefetch_device), stream));
 
   return;
 }
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu b/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu
index ac9b81cbcb..a281237cf1 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu
@@ -72,7 +72,11 @@ __global__ __launch_bounds__(kMaxThreads) void _batched_complete_cumsum_kernel(
       data = (val_t)values[blockIdx.x][i];
     }
     BlockScan(temp_storage).InclusiveSum(data, data, prefix_op);
-    cub::CTA_SYNC();
+#if CUDA_VERSION >= 13000
+  __syncthreads();
+#else
+  cub::CTA_SYNC();
+#endif
     if (i < len) {
       out[blockIdx.x][i + 1] = data;
     }

From fbe9dc00fd9ca7fc62bdb59fd231b9ca75d9ad98 Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Wed, 5 Nov 2025 15:31:13 +0900
Subject: [PATCH 6/9] wip

---
 fbgemm_gpu/src/memory_utils/memory_utils.cu | 33 ++++++++++++++++++---
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu
index c59ac736af..98647f5a68 100644
--- a/fbgemm_gpu/src/memory_utils/memory_utils.cu
+++ b/fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -165,11 +165,25 @@ Tensor new_managed_tensor(
 
   // Set preferred memory location to host memory
   AT_CUDA_CHECK(cudaMemAdvise(
-      ptr, size_bytes, cudaMemAdviseSetPreferredLocation, new_mem_location_from_device(cudaCpuDeviceId)));
+      ptr, size_bytes, cudaMemAdviseSetPreferredLocation,
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(cudaCpuDeviceId)
+#else
+      cudaCpuDeviceId
+#endif
+    ));
+
   // User hints with "accessed by": GPU will establish direct mapping of data
   // in CPU memory, no page faults will be generated
   AT_CUDA_CHECK(cudaMemAdvise(
-      ptr, size_bytes, cudaMemAdviseSetAccessedBy, new_mem_location_from_device(at::cuda::current_device())));
+      ptr, size_bytes, cudaMemAdviseSetAccessedBy,
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(at::cuda::current_device())
+#else
+      at::cuda::current_device()
+#endif
+    ));
+
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 
   // Work around fork issue - see uvm_mem_advice_dont_fork for details
@@ -360,7 +374,12 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) {
       ptr,
       size_bytes,
       static_cast<enum cudaMemoryAdvise>(cuda_memory_advise),
-      new_mem_location_from_device(hint_device)));
+#if CUDA_VERSION >= 13000
+      new_mem_location_from_device(hint_device)
+#else
+      hint_device
+#endif
+  ));
   return;
 }
 
@@ -386,7 +405,13 @@ void uvm_cuda_mem_prefetch_async(
 
   auto stream = at::cuda::getCurrentCUDAStream();
 
-  AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, new_mem_location_from_device(prefetch_device), stream));
+  AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes,
+#if CUDA_VERSION >= 13000
+    new_mem_location_from_device(prefetch_device), 0,
+#else
+    prefetch_device,
+#endif
+    stream));
 
   return;
 }

From 16f3c09b28dcca099cdabc6bcd26225b8ca079f7 Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Wed, 5 Nov 2025 17:06:24 +0900
Subject: [PATCH 7/9] wip

---
 .../training/forward/embedding_forward_split_template.cu        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
index 6574bda45e..b90ad76b34 100644
--- a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
@@ -57,7 +57,7 @@ template <
     typename index_t,
     size_t kThreadGroupSize
     >
-__launch_bounds__(kForwardMaxThreads) __global__ void
+__launch_bounds__(kForwardMaxThreads) __global__ __attribute__((visibility("default"))) void
 {%- if is_index_select %}
 batch_index_select_dim0_codegen_forward_small_kernel(
 {%- else %}

From b6741a4ad2faf50cf640c70432b04d8777bc4489 Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Wed, 5 Nov 2025 17:44:04 +0900
Subject: [PATCH 8/9] wip

---
 cmake/modules/GpuCppLibrary.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake
index 40cd9986ed..3985a633d3 100644
--- a/cmake/modules/GpuCppLibrary.cmake
+++ b/cmake/modules/GpuCppLibrary.cmake
@@ -102,6 +102,7 @@ function(prepare_target_sources)
             (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") )
             set(_nvcc_flags ${args_NVCC_FLAGS}
                 -cudart=shared
+                -static-global-template-stub=false
                 --device-entity-has-hidden-visibility=false)
         else()
             set(_nvcc_flags ${args_NVCC_FLAGS})

From a0e05f91efff36ec30f686d6a1ca1a827ff8b36b Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Thu, 6 Nov 2025 08:17:40 +0900
Subject: [PATCH 9/9] wip

---
 .github/scripts/fbgemm_gpu_build.bash | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
index 96305c056a..aa2dd81517 100644
--- a/.github/scripts/fbgemm_gpu_build.bash
+++ b/.github/scripts/fbgemm_gpu_build.bash
@@ -304,8 +304,10 @@ __configure_fbgemm_gpu_build_cuda () {
         local arch_list="7.5;8.0"
       fi
 
-    elif  [[ $cuda_version_nvcc == *"V13.0"* ]] ||
-          [[ $cuda_version_nvcc == *"V12.9"* ]] ||
+    elif  [[ $cuda_version_nvcc == *"V13.0"* ]]; then
+      local arch_list="8.0;9.0a;10.0a;12.0a"
+
+    elif  [[ $cuda_version_nvcc == *"V12.9"* ]] ||
           [[ $cuda_version_nvcc == *"V12.8"* ]]; then
       local arch_list="7.5;8.0;9.0a;10.0a;12.0a"