@@ -39,4 +39,112 @@ DLL_PUBLIC Tensor linearize_cache_indices_meta(
   return at::empty_like(indices, indices.options().dtype(at::kLong));
 }
 
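+// Returns a tuple of:
+//   (1) the sorted unique indices, padded to the length of the input,
+//   (2) a 1-element int32 tensor holding the number of unique indices,
+//   (3) optionally, per-unique-index counts, padded to the input length, and
+//   (4) optionally, the int32 positions of the input indices after sorting.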
+DLL_PUBLIC
+std::tuple<Tensor, Tensor, std::optional<Tensor>, std::optional<Tensor>>
+get_unique_indices_cpu_impl(
+    const Tensor& linear_indices,
+    const int64_t /*max_indices*/,
+    const bool compute_count,
+    const bool compute_inverse_indices) {
+  TORCH_CHECK(linear_indices.dim() == 1, "linear_indices must be 1D");
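+  // Length, count, and position outputs are int32, so the number of input
+  // indices must fit in int32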
+  TORCH_CHECK(linear_indices.numel() < std::numeric_limits<int32_t>::max());
+
+  const int32_t N = linear_indices.numel();
+
+  // Handle empty input
+  if (N == 0) {
+    return std::make_tuple(
+        at::empty_like(linear_indices),
+        at::zeros({1}, linear_indices.options().dtype(at::kInt)),
+        compute_count ? std::optional<Tensor>(at::arange(
+                            {0}, linear_indices.options().dtype(at::kInt)))
+                      : std::optional<Tensor>(),
+        compute_inverse_indices
+            ? std::optional<Tensor>(
+                  at::empty({0}, linear_indices.options().dtype(at::kInt)))
+            : std::optional<Tensor>());
+  }
+
+  // Use at::unique_dim to get the sorted unique indices
+  Tensor unique_indices;
+  Tensor inverse_indices;
+  Tensor counts;
+
+  if (compute_count || compute_inverse_indices) {
+    std::tie(unique_indices, inverse_indices, counts) = at::unique_dim(
+        linear_indices,
+        /*dim=*/0,
+        /*sorted=*/true,
+        /*return_inverse=*/true,
+        /*return_counts=*/true);
+  } else {
+    unique_indices = std::get<0>(at::unique_dim(
+        linear_indices,
+        /*dim=*/0,
+        /*sorted=*/true,
+        /*return_inverse=*/false,
+        /*return_counts=*/false));
+  }
+
+  // Prepare output tensors
+  const int32_t num_unique = unique_indices.numel();
+  auto unique_indices_length =
+      at::ones({1}, linear_indices.options().dtype(at::kInt)) * num_unique;
+
+  // Pad unique_indices to the same size as the input
+  auto unique_indices_output = at::empty_like(linear_indices);
+  unique_indices_output.slice(0, 0, num_unique).copy_(unique_indices);
+
+  std::optional<Tensor> unique_indices_count = std::nullopt;
+  std::optional<Tensor> linear_index_positions_sorted;
+
+  if (compute_count) {
+    // Pad counts to the same size as the input
+    unique_indices_count =
+        at::empty({N}, linear_indices.options().dtype(at::kInt));
+    unique_indices_count->slice(0, 0, num_unique).copy_(counts.to(at::kInt));
+  }
+
+  if (compute_inverse_indices) {
+    // Sort linear_indices and get the sort indices
+    auto sorted_indices_and_positions =
+        at::sort(linear_indices, /*dim=*/0, /*descending=*/false);
+    auto sort_indices = std::get<1>(sorted_indices_and_positions);
+
+    // Convert to int32 to match GPU output dtype
+    linear_index_positions_sorted = sort_indices.to(at::kInt);
+  }
+
+  return std::make_tuple(
+      unique_indices_output,
+      unique_indices_length,
+      unique_indices_count,
+      linear_index_positions_sorted);
+}
+
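+// Wrapper that returns only (unique indices, length, optional counts) and
+// drops the inverse/sorted-position output.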
+DLL_PUBLIC
+std::tuple<Tensor, Tensor, std::optional<Tensor>> get_unique_indices_cpu(
+    const Tensor& linear_indices,
+    const int64_t max_indices,
+    const bool compute_count) {
+  const auto ret = get_unique_indices_cpu_impl(
+      linear_indices,
+      max_indices,
+      compute_count,
+      /*compute_inverse_indices=*/false);
+
+  return {std::get<0>(ret), std::get<1>(ret), std::get<2>(ret)};
+}
+
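+// Wrapper that can additionally return the positions of the input indices
+// after sorting when compute_inverse_indices is true.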
+DLL_PUBLIC
+std::tuple<Tensor, Tensor, std::optional<Tensor>, std::optional<Tensor>>
+get_unique_indices_with_inverse_cpu(
+    const Tensor& linear_indices,
+    const int64_t max_indices,
+    const bool compute_count,
+    const bool compute_inverse_indices) {
+  return get_unique_indices_cpu_impl(
+      linear_indices, max_indices, compute_count, compute_inverse_indices);
+}
+
 } // namespace fbgemm_gpu