From e218cfc7e514885524dd4a0da2ab7fb592d04d53 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 5 Nov 2025 10:37:57 +0900 Subject: [PATCH 1/9] wip --- .github/scripts/generate_ci_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py index b842433c8f..0d6cc47101 100644 --- a/.github/scripts/generate_ci_matrix.py +++ b/.github/scripts/generate_ci_matrix.py @@ -307,7 +307,7 @@ def cuda_versions(self) -> List[str]: return ["12.6.3", "12.8.1", "13.0.2"] else: # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138 - return ["12.6.3", "12.8.1"] + return ["12.6.3", "12.8.1", "13.0.2"] def rocm_versions(self) -> List[str]: if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH: From d0b731d5f903f83b9f73029f6f3889ec6abc89da Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 5 Nov 2025 10:38:15 +0900 Subject: [PATCH 2/9] wip --- .github/scripts/generate_ci_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py index 0d6cc47101..9c8736b0eb 100644 --- a/.github/scripts/generate_ci_matrix.py +++ b/.github/scripts/generate_ci_matrix.py @@ -307,7 +307,7 @@ def cuda_versions(self) -> List[str]: return ["12.6.3", "12.8.1", "13.0.2"] else: # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138 - return ["12.6.3", "12.8.1", "13.0.2"] + return ["13.0.2"] def rocm_versions(self) -> List[str]: if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH: From d79c8cdfeac9c5a9de5320d8d8bb6c55c0dcaec6 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 5 Nov 2025 10:38:28 +0900 Subject: [PATCH 3/9] wip --- .github/scripts/generate_ci_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py index 9c8736b0eb..0d6cc47101 100644 --- a/.github/scripts/generate_ci_matrix.py +++ b/.github/scripts/generate_ci_matrix.py @@ -307,7 +307,7 @@ def cuda_versions(self) -> List[str]: return ["12.6.3", "12.8.1", "13.0.2"] else: # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138 - return ["13.0.2"] + return ["12.6.3", "12.8.1", "13.0.2"] def rocm_versions(self) -> List[str]: if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH: From 005da03befef4953bc16b103c372603cbf7abab4 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 5 Nov 2025 14:31:30 +0900 Subject: [PATCH 4/9] wip --- cmake/modules/GpuCppLibrary.cmake | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake index 51c30df750..40cd9986ed 100644 --- a/cmake/modules/GpuCppLibrary.cmake +++ b/cmake/modules/GpuCppLibrary.cmake @@ -87,15 +87,31 @@ function(prepare_target_sources) list(APPEND ${args_PREFIX}_sources_cu ${args_CUDA_SPECIFIC_SRCS}) endif() - # Set source properties - set_source_files_properties(${${args_PREFIX}_sources_cu} - PROPERTIES COMPILE_OPTIONS - "${args_NVCC_FLAGS}") - + # Set include directories set_source_files_properties(${${args_PREFIX}_sources_cu} PROPERTIES INCLUDE_DIRECTORIES "${args_INCLUDE_DIRS}") + # Starting with CUDA 13.0, nvcc changed the default visibility of + # __global__ functions to `hidden`, which causes symbol lookup errors + # during linking. This can be worked around by setting -cudart=shared + # and --device-entity-has-hidden-visibility=false. + # + # https://developer.nvidia.com/blog/cuda-c-compiler-updates-impacting-elf-visibility-and-linkage/ + if( (FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CUDA) AND + (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") ) + set(_nvcc_flags ${args_NVCC_FLAGS} + -cudart=shared + --device-entity-has-hidden-visibility=false) + else() + set(_nvcc_flags ${args_NVCC_FLAGS}) + endif() + + # Set compilation flags + set_source_files_properties(${${args_PREFIX}_sources_cu} + PROPERTIES COMPILE_OPTIONS + "${_nvcc_flags}") + # Append to the full sources list list(APPEND ${args_PREFIX}_sources_combined ${${args_PREFIX}_sources_cu}) endif() From d363b3212f6c2ccb31226e82cbaec950d4ba3930 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 5 Nov 2025 14:56:39 +0900 Subject: [PATCH 5/9] wip --- fbgemm_gpu/src/memory_utils/memory_utils.cu | 15 +++++++++++---- .../src/sparse_ops/sparse_async_batched_cumsum.cu | 6 +++++- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu index e7cba2bffe..c59ac736af 100644 --- a/fbgemm_gpu/src/memory_utils/memory_utils.cu +++ b/fbgemm_gpu/src/memory_utils/memory_utils.cu @@ -144,6 +144,13 @@ std::tuple adjust_to_page_boundaries(void* ptr, size_t size) { return std::make_tuple((void*)raw_ptr_adjusted, (size_t)size_adjusted); } +cudaMemLocation new_mem_location_from_device(const int device_id) { + cudaMemLocation deviceLoc; + deviceLoc.type = cudaMemLocationTypeDevice; + deviceLoc.id = device_id; + return deviceLoc; +} + } // namespace Tensor new_managed_tensor( @@ -158,11 +165,11 @@ Tensor new_managed_tensor( // Set preferred memory location to host memory AT_CUDA_CHECK(cudaMemAdvise( - ptr, size_bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId)); + ptr, size_bytes, cudaMemAdviseSetPreferredLocation, new_mem_location_from_device(cudaCpuDeviceId))); // User hints with "accessed by": GPU will establish direct mapping of data // in CPU memory, no page faults will be generated AT_CUDA_CHECK(cudaMemAdvise( - ptr, size_bytes, cudaMemAdviseSetAccessedBy, at::cuda::current_device())); + ptr, size_bytes, cudaMemAdviseSetAccessedBy, new_mem_location_from_device(at::cuda::current_device()))); C10_CUDA_KERNEL_LAUNCH_CHECK(); // Work around fork issue - see uvm_mem_advice_dont_fork for details @@ -353,7 +360,7 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) { ptr, size_bytes, static_cast(cuda_memory_advise), - hint_device)); + new_mem_location_from_device(hint_device))); return; } @@ -379,7 +386,7 @@ void uvm_cuda_mem_prefetch_async( auto stream = at::cuda::getCurrentCUDAStream(); - AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, prefetch_device, stream)); + AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, new_mem_location_from_device(prefetch_device), stream)); return; } diff --git a/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu b/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu index ac9b81cbcb..a281237cf1 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu @@ -72,7 +72,11 @@ __global__ __launch_bounds__(kMaxThreads) void _batched_complete_cumsum_kernel( data = (val_t)values[blockIdx.x][i]; } BlockScan(temp_storage).InclusiveSum(data, data, prefix_op); - cub::CTA_SYNC(); +#if CUDA_VERSION >= 13000 + __syncthreads(); +#else + cub::CTA_SYNC(); +#endif if (i < len) { out[blockIdx.x][i + 1] = data; } From fbe9dc00fd9ca7fc62bdb59fd231b9ca75d9ad98 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 5 Nov 2025 15:31:13 +0900 Subject: [PATCH 6/9] wip --- fbgemm_gpu/src/memory_utils/memory_utils.cu | 33 ++++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu index c59ac736af..98647f5a68 100644 --- a/fbgemm_gpu/src/memory_utils/memory_utils.cu +++ b/fbgemm_gpu/src/memory_utils/memory_utils.cu @@ -165,11 +165,25 @@ Tensor new_managed_tensor( // Set preferred memory location to host memory AT_CUDA_CHECK(cudaMemAdvise( - ptr, size_bytes, cudaMemAdviseSetPreferredLocation, new_mem_location_from_device(cudaCpuDeviceId))); + ptr, size_bytes, cudaMemAdviseSetPreferredLocation, +#if CUDA_VERSION >= 13000 + new_mem_location_from_device(cudaCpuDeviceId) +#else + cudaCpuDeviceId +#endif + )); + // User hints with "accessed by": GPU will establish direct mapping of data // in CPU memory, no page faults will be generated AT_CUDA_CHECK(cudaMemAdvise( - ptr, size_bytes, cudaMemAdviseSetAccessedBy, new_mem_location_from_device(at::cuda::current_device()))); + ptr, size_bytes, cudaMemAdviseSetAccessedBy, +#if CUDA_VERSION >= 13000 + new_mem_location_from_device(at::cuda::current_device()) +#else + at::cuda::current_device() +#endif + )); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Work around fork issue - see uvm_mem_advice_dont_fork for details @@ -360,7 +374,12 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) { ptr, size_bytes, static_cast(cuda_memory_advise), - new_mem_location_from_device(hint_device))); +#if CUDA_VERSION >= 13000 + new_mem_location_from_device(hint_device) +#else + hint_device +#endif + )); return; } @@ -386,7 +405,13 @@ void uvm_cuda_mem_prefetch_async( auto stream = at::cuda::getCurrentCUDAStream(); - AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, new_mem_location_from_device(prefetch_device), stream)); + AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, +#if CUDA_VERSION >= 13000 + new_mem_location_from_device(prefetch_device), 0, +#else + prefetch_device, +#endif + stream)); return; } From 16f3c09b28dcca099cdabc6bcd26225b8ca079f7 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 5 Nov 2025 17:06:24 +0900 Subject: [PATCH 7/9] wip --- .../training/forward/embedding_forward_split_template.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu index 6574bda45e..b90ad76b34 100644 --- a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu +++ b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu @@ -57,7 +57,7 @@ template < typename index_t, size_t kThreadGroupSize > -__launch_bounds__(kForwardMaxThreads) __global__ void +__launch_bounds__(kForwardMaxThreads) __global__ __attribute__((visibility("default"))) void {%- if is_index_select %} batch_index_select_dim0_codegen_forward_small_kernel( {%- else %} From b6741a4ad2faf50cf640c70432b04d8777bc4489 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 5 Nov 2025 17:44:04 +0900 Subject: [PATCH 8/9] wip --- cmake/modules/GpuCppLibrary.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake index 40cd9986ed..3985a633d3 100644 --- a/cmake/modules/GpuCppLibrary.cmake +++ b/cmake/modules/GpuCppLibrary.cmake @@ -102,6 +102,7 @@ function(prepare_target_sources) (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") ) set(_nvcc_flags ${args_NVCC_FLAGS} -cudart=shared + -static-global-template-stub=false --device-entity-has-hidden-visibility=false) else() set(_nvcc_flags ${args_NVCC_FLAGS}) From a0e05f91efff36ec30f686d6a1ca1a827ff8b36b Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 6 Nov 2025 08:17:40 +0900 Subject: [PATCH 9/9] wip --- .github/scripts/fbgemm_gpu_build.bash | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 96305c056a..aa2dd81517 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -304,8 +304,10 @@ __configure_fbgemm_gpu_build_cuda () { local arch_list="7.5;8.0" fi - elif [[ $cuda_version_nvcc == *"V13.0"* ]] || - [[ $cuda_version_nvcc == *"V12.9"* ]] || + elif [[ $cuda_version_nvcc == *"V13.0"* ]]; then + local arch_list="8.0;9.0a;10.0a;12.0a" + + elif [[ $cuda_version_nvcc == *"V12.9"* ]] || [[ $cuda_version_nvcc == *"V12.8"* ]]; then local arch_list="7.5;8.0;9.0a;10.0a;12.0a"