Skip to content

Commit e6346cf

Browse files
gchalump authored and facebook-github-bot committed
Add get_unique_indices on CPU (#5096)
Summary: X-link: facebookresearch/FBGEMM#2103 Add `get_unique_indices` on CPU Add test to compare `get_unique_indices` from CPU with GPU Differential Revision: D85736286
1 parent 99b6fd1 commit e6346cf

File tree

4 files changed

+490
-0
lines changed

4 files changed

+490
-0
lines changed

fbgemm_gpu/src/split_embeddings_cache/common.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,4 +120,23 @@ Tensor direct_mapped_lxu_cache_lookup_cpu(
120120
bool gather_cache_stats,
121121
std::optional<Tensor> uvm_cache_stats);
122122

123+
// Core CPU implementation shared by the two public entry points below.
// Returns (unique_indices padded to input size, unique_indices_length,
// optional per-unique counts, optional stably-sorted input positions).
std::tuple<Tensor, Tensor, std::optional<Tensor>, std::optional<Tensor>>
get_unique_indices_cpu_impl(
    const Tensor& linear_indices,
    const int64_t max_indices,
    const bool compute_count,
    const bool compute_inverse_indices);

// CPU dispatch target for "get_unique_indices" (no inverse positions).
std::tuple<Tensor, Tensor, std::optional<Tensor>> get_unique_indices_cpu(
    const Tensor& linear_indices,
    const int64_t max_indices,
    const bool compute_count);

// CPU dispatch target for "get_unique_indices_with_inverse".
std::tuple<Tensor, Tensor, std::optional<Tensor>, std::optional<Tensor>>
get_unique_indices_with_inverse_cpu(
    const Tensor& linear_indices,
    const int64_t max_indices,
    const bool compute_count,
    const bool compute_inverse_indices);
141+
123142
} // namespace fbgemm_gpu

fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cpp

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,141 @@ DLL_PUBLIC Tensor linearize_cache_indices_meta(
3939
return at::empty_like(indices, indices.options().dtype(at::kLong));
4040
}
4141

42+
/**
43+
* CPU implementation for computing unique indices from a 1D tensor of linear
44+
* indices.
45+
*
46+
* This function processes a tensor of linear indices and returns the unique
47+
* values along with optional metadata (counts and inverse mapping). The
48+
* implementation uses stable sorting to ensure deterministic ordering of
49+
* duplicate values, matching the reference Python implementation.
50+
*
51+
* @param linear_indices 1D input tensor containing linear indices to process.
52+
* Must be 1D and have fewer than INT32_MAX elements.
53+
* @param max_indices Maximum number of unique indices expected (currently
54+
* unused, present to match GPU interface).
55+
* @param compute_count If true, computes and returns the count of each unique
56+
* index in the output.
57+
* @param compute_inverse_indices If true, computes the original positions of
58+
* elements in sorted order using stable sort.
59+
*
60+
* @return A tuple containing:
61+
* - unique_indices_output: Tensor containing unique indices, padded to
62+
* match input size (first `num_unique` elements are valid)
63+
* - unique_indices_length: Scalar tensor (size 1) with count of unique
64+
* indices
65+
* - unique_indices_count: Optional tensor (if compute_count=true) with
66+
* occurrence count for each unique index, padded to match input size
67+
* - linear_index_positions_sorted: Optional tensor (if
68+
* compute_inverse_indices=true) containing original positions in sorted
69+
* order (uses stable sort to preserve order for duplicates), converted
70+
* to int32
71+
*
72+
*/
73+
/**
 * CPU implementation for computing unique indices from a 1D tensor of linear
 * indices.
 *
 * Returns the sorted unique values along with optional metadata (counts and
 * stably-sorted input positions). Stable sorting keeps the ordering of
 * duplicate values deterministic, matching the reference implementation.
 *
 * @param linear_indices 1D input tensor of linear indices. Must be 1D with
 *                       fewer than INT32_MAX elements.
 * @param max_indices Unused on CPU; present to match the GPU interface.
 * @param compute_count If true, also return the occurrence count of each
 *                      unique index.
 * @param compute_inverse_indices If true, also return the original positions
 *                                of the elements in stably-sorted order.
 *
 * @return A tuple of:
 *         - unique_indices_output: unique indices, padded to the input size
 *           (only the first `num_unique` elements are valid)
 *         - unique_indices_length: size-1 int tensor holding `num_unique`
 *         - unique_indices_count: optional int tensor (compute_count=true),
 *           counts per unique index, padded to the input size
 *         - linear_index_positions_sorted: optional int32 tensor
 *           (compute_inverse_indices=true), original positions in
 *           stably-sorted order
 */
DLL_PUBLIC
std::tuple<Tensor, Tensor, std::optional<Tensor>, std::optional<Tensor>>
get_unique_indices_cpu_impl(
    const Tensor& linear_indices,
    const int64_t /*max_indices*/,
    const bool compute_count,
    const bool compute_inverse_indices) {
  TORCH_CHECK(linear_indices.dim() == 1, "linear_indices must be 1D");
  TORCH_CHECK(linear_indices.numel() < std::numeric_limits<int32_t>::max());

  const int32_t N = linear_indices.numel();
  const auto int_options = linear_indices.options().dtype(at::kInt);

  // Handle empty input: every output is empty; the length tensor holds 0.
  if (N == 0) {
    return std::make_tuple(
        at::empty_like(linear_indices),
        at::zeros({1}, int_options),
        compute_count ? std::optional<Tensor>(at::empty({0}, int_options))
                      : std::optional<Tensor>(),
        compute_inverse_indices
            ? std::optional<Tensor>(at::empty({0}, int_options))
            : std::optional<Tensor>());
  }

  // Sorted unique values; counts are materialized only when requested.
  // The inverse mapping of unique_dim is never consumed (the inverse
  // positions below come from a stable argsort), so it is not requested.
  Tensor unique_indices;
  Tensor counts;
  if (compute_count) {
    std::tie(unique_indices, std::ignore, counts) = at::unique_dim(
        linear_indices,
        /*dim=*/0,
        /*sorted=*/true,
        /*return_inverse=*/false,
        /*return_counts=*/true);
  } else {
    unique_indices = std::get<0>(at::unique_dim(
        linear_indices,
        /*dim=*/0,
        /*sorted=*/true,
        /*return_inverse=*/false,
        /*return_counts=*/false));
  }

  const int32_t num_unique = unique_indices.numel();
  auto unique_indices_length = at::full({1}, num_unique, int_options);

  // Pad the unique values out to the input size; only the first
  // `num_unique` entries are meaningful (the tail is uninitialized,
  // matching the GPU output contract).
  auto unique_indices_output = at::empty_like(linear_indices);
  unique_indices_output.slice(0, 0, num_unique).copy_(unique_indices);

  std::optional<Tensor> unique_indices_count = std::nullopt;
  std::optional<Tensor> linear_index_positions_sorted;

  if (compute_count) {
    // Counts are padded to the input size like the unique values.
    unique_indices_count = at::empty({N}, int_options);
    unique_indices_count->slice(0, 0, num_unique).copy_(counts.to(at::kInt));
  }

  if (compute_inverse_indices) {
    // Stable sort preserves the relative order of duplicate indices so the
    // positions output is deterministic.
    auto sort_positions = at::argsort(
        linear_indices, /*stable=*/true, /*dim=*/0, /*descending=*/false);
    linear_index_positions_sorted = sort_positions.to(at::kInt);
  }

  return std::make_tuple(
      unique_indices_output,
      unique_indices_length,
      unique_indices_count,
      linear_index_positions_sorted);
}
153+
154+
// CPU dispatch target for "get_unique_indices": runs the shared impl with
// inverse-position computation disabled and drops the (empty) fourth output.
DLL_PUBLIC
std::tuple<Tensor, Tensor, std::optional<Tensor>> get_unique_indices_cpu(
    const Tensor& linear_indices,
    const int64_t max_indices,
    const bool compute_count) {
  auto [unique_out, length_out, count_out, positions_unused] =
      get_unique_indices_cpu_impl(
          linear_indices,
          max_indices,
          compute_count,
          /*compute_inverse_indices=*/false);
  (void)positions_unused; // never populated when inverse computation is off
  return {
      std::move(unique_out), std::move(length_out), std::move(count_out)};
}
167+
168+
// CPU dispatch target for "get_unique_indices_with_inverse": thin forwarding
// wrapper that exposes all four outputs of the shared implementation.
DLL_PUBLIC
std::tuple<Tensor, Tensor, std::optional<Tensor>, std::optional<Tensor>>
get_unique_indices_with_inverse_cpu(
    const Tensor& linear_indices,
    const int64_t max_indices,
    const bool compute_count,
    const bool compute_inverse_indices) {
  return get_unique_indices_cpu_impl(
      linear_indices,
      max_indices,
      compute_count,
      compute_inverse_indices);
}
178+
42179
} // namespace fbgemm_gpu

fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
6969
DISPATCH_TO_CPU("lxu_cache_lookup", lxu_cache_lookup_cpu);
7070
DISPATCH_TO_CPU(
7171
"direct_mapped_lxu_cache_lookup", direct_mapped_lxu_cache_lookup_cpu);
72+
DISPATCH_TO_CPU("get_unique_indices", get_unique_indices_cpu);
73+
DISPATCH_TO_CPU(
74+
"get_unique_indices_with_inverse", get_unique_indices_with_inverse_cpu);
7275

7376
DISPATCH_TO_META("linearize_cache_indices", linearize_cache_indices_meta);
7477
DISPATCH_TO_META("lxu_cache_lookup", lxu_cache_lookup_meta);

0 commit comments

Comments
 (0)