From ad86aa8176dae52d0f817adce914178b6b5028f5 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Wed, 21 May 2025 16:29:23 +0100 Subject: [PATCH 01/22] [MLIR][OpenMP] Remove Generic-SPMD early detection This patch removes logic from MLIR to attempt identifying Generic kernels that could be executed in SPMD mode. This optimization is done by the OpenMPOpt pass for Clang and is only required here to circumvent missing support for the new DeviceRTL APIs used in MLIR to LLVM IR translation that Clang doesn't currently use (e.g. `kmpc_distribute_static_loop`). Removing checks in MLIR avoids duplicating the logic that should be centralized in the OpenMPOpt pass. Additionally, offloading kernels currently compiled through the OpenMP dialect fail to run parallel regions properly when in Generic mode. By disabling early detection, this issue becomes apparent for a range of kernels where this was masked by having them run in SPMD mode. Update TargetRegionFlags to mirror OMPTgtExecModeFlags --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 +- .../mlir/Dialect/OpenMP/OpenMPEnums.td | 36 ++++------ mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 12 ++-- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 69 +++++++++---------- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 44 +++++------- .../LLVMIR/openmp-target-generic-spmd.mlir | 2 +- 6 files changed, 74 insertions(+), 92 deletions(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 10b2608d95a9c..dd4c636ce1b0c 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6811,7 +6811,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit( Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags); Constant *UseGenericStateMachineVal = ConstantInt::getSigned( - Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD); + Int8, Attrs.ExecFlags != 
omp::OMP_TGT_EXEC_MODE_SPMD && + Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP); Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true); Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0); diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td index ada3a3edd8a30..309135f0c729c 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td @@ -281,31 +281,21 @@ def ScheduleModifier : OpenMP_I32EnumAttr< def ScheduleModifierAttr : OpenMP_EnumAttr; //===----------------------------------------------------------------------===// -// target_region_flags enum. +// target_exec_mode enum. //===----------------------------------------------------------------------===// -def TargetRegionFlagsNone : I32BitEnumAttrCaseNone<"none">; -def TargetRegionFlagsGeneric : I32BitEnumAttrCaseBit<"generic", 0>; -def TargetRegionFlagsSpmd : I32BitEnumAttrCaseBit<"spmd", 1>; -def TargetRegionFlagsTripCount : I32BitEnumAttrCaseBit<"trip_count", 2>; -def TargetRegionFlagsNoLoop : I32BitEnumAttrCaseBit<"no_loop", 3>; - -def TargetRegionFlags : OpenMP_BitEnumAttr< - "TargetRegionFlags", - "These flags describe properties of the target kernel. " - "TargetRegionFlagsGeneric - denotes generic kernel. " - "TargetRegionFlagsSpmd - denotes SPMD kernel. " - "TargetRegionFlagsNoLoop - denotes kernel where " - "num_teams * num_threads >= loop_trip_count. It allows the conversion " - "of loops into sequential code by ensuring that each team/thread " - "executes at most one iteration. 
" - "TargetRegionFlagsTripCount - checks if the loop trip count should be " - "calculated.", [ - TargetRegionFlagsNone, - TargetRegionFlagsGeneric, - TargetRegionFlagsSpmd, - TargetRegionFlagsTripCount, - TargetRegionFlagsNoLoop +def TargetExecModeBare : I32EnumAttrCase<"bare", 0>; +def TargetExecModeGeneric : I32EnumAttrCase<"generic", 1>; +def TargetExecModeSpmd : I32EnumAttrCase<"spmd", 2>; +def TargetExecModeSpmdNoLoop : I32EnumAttrCase<"no_loop", 3>; + +def TargetExecMode : OpenMP_I32EnumAttr< + "TargetExecMode", + "target execution mode, mirroring the `OMPTgtExecModeFlags` LLVM enum", [ + TargetExecModeBare, + TargetExecModeGeneric, + TargetExecModeSpmd, + TargetExecModeSpmdNoLoop, ]>; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 377f1febf6b8f..ec7d99b86cc8b 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1509,13 +1509,17 @@ def TargetOp : OpenMP_Op<"target", traits = [ /// operations, the top level one will be the one captured. Operation *getInnermostCapturedOmpOp(); - /// Infers the kernel type (Generic, SPMD or Generic-SPMD) based on the - /// contents of the target region. + /// Infers the kernel type (Bare, Generic or SPMD) based on the contents of + /// the target region. /// /// \param capturedOp result of a still valid (no modifications made to any /// nested operations) previous call to `getInnermostCapturedOmpOp()`. - static ::mlir::omp::TargetRegionFlags - getKernelExecFlags(Operation *capturedOp); + /// \param hostEvalTripCount output argument to store whether this kernel + /// wraps a loop whose bounds must be evaluated on the host prior to + /// launching it. 
+ static ::mlir::omp::TargetExecMode + getKernelExecFlags(Operation *capturedOp, + bool *hostEvalTripCount = nullptr); }] # clausesExtraClassDeclaration; let assemblyFormat = clausesAssemblyFormat # [{ diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 172f21ff1779e..1ea34e6c898ab 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -2234,8 +2234,9 @@ LogicalResult TargetOp::verifyRegions() { return emitError("target containing multiple 'omp.teams' nested ops"); // Check that host_eval values are only used in legal ways. + bool hostEvalTripCount; Operation *capturedOp = getInnermostCapturedOmpOp(); - TargetRegionFlags execFlags = getKernelExecFlags(capturedOp); + TargetExecMode execMode = getKernelExecFlags(capturedOp, &hostEvalTripCount); for (Value hostEvalArg : cast(getOperation()).getHostEvalBlockArgs()) { for (Operation *user : hostEvalArg.getUsers()) { @@ -2250,7 +2251,7 @@ LogicalResult TargetOp::verifyRegions() { "and 'thread_limit' in 'omp.teams'"; } if (auto parallelOp = dyn_cast(user)) { - if (bitEnumContainsAny(execFlags, TargetRegionFlags::spmd) && + if (execMode == TargetExecMode::spmd && parallelOp->isAncestor(capturedOp) && hostEvalArg == parallelOp.getNumThreads()) continue; @@ -2260,8 +2261,7 @@ LogicalResult TargetOp::verifyRegions() { "'omp.parallel' when representing target SPMD"; } if (auto loopNestOp = dyn_cast(user)) { - if (bitEnumContainsAny(execFlags, TargetRegionFlags::trip_count) && - loopNestOp.getOperation() == capturedOp && + if (hostEvalTripCount && loopNestOp.getOperation() == capturedOp && (llvm::is_contained(loopNestOp.getLoopLowerBounds(), hostEvalArg) || llvm::is_contained(loopNestOp.getLoopUpperBounds(), hostEvalArg) || llvm::is_contained(loopNestOp.getLoopSteps(), hostEvalArg))) @@ -2387,7 +2387,9 @@ static bool canPromoteToNoLoop(Operation *capturedOp, TeamsOp teamsOp, 
ompFlags.getAssumeThreadsOversubscription(); } -TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { +TargetExecMode TargetOp::getKernelExecFlags(Operation *capturedOp, + bool *hostEvalTripCount) { + // TODO: Support detection of bare kernel mode. // A non-null captured op is only valid if it resides inside of a TargetOp // and is the result of calling getInnermostCapturedOmpOp() on it. TargetOp targetOp = @@ -2396,9 +2398,12 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { (targetOp && targetOp.getInnermostCapturedOmpOp() == capturedOp)) && "unexpected captured op"); + if (hostEvalTripCount) + *hostEvalTripCount = false; + // If it's not capturing a loop, it's a default target region. if (!isa_and_present(capturedOp)) - return TargetRegionFlags::generic; + return TargetExecMode::generic; // Get the innermost non-simd loop wrapper. SmallVector loopWrappers; @@ -2411,31 +2416,32 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { auto numWrappers = std::distance(innermostWrapper, loopWrappers.end()); if (numWrappers != 1 && numWrappers != 2) - return TargetRegionFlags::generic; + return TargetExecMode::generic; // Detect target-teams-distribute-parallel-wsloop[-simd]. 
if (numWrappers == 2) { WsloopOp *wsloopOp = dyn_cast(innermostWrapper); if (!wsloopOp) - return TargetRegionFlags::generic; + return TargetExecMode::generic; innermostWrapper = std::next(innermostWrapper); if (!isa(innermostWrapper)) - return TargetRegionFlags::generic; + return TargetExecMode::generic; Operation *parallelOp = (*innermostWrapper)->getParentOp(); if (!isa_and_present(parallelOp)) - return TargetRegionFlags::generic; + return TargetExecMode::generic; TeamsOp teamsOp = dyn_cast(parallelOp->getParentOp()); if (!teamsOp) - return TargetRegionFlags::generic; + return TargetExecMode::generic; if (teamsOp->getParentOp() == targetOp.getOperation()) { - TargetRegionFlags result = - TargetRegionFlags::spmd | TargetRegionFlags::trip_count; + TargetExecMode result = TargetExecMode::spmd; if (canPromoteToNoLoop(capturedOp, teamsOp, wsloopOp)) - result = result | TargetRegionFlags::no_loop; + result = TargetExecMode::no_loop; + if (hostEvalTripCount) + *hostEvalTripCount = true; return result; } } @@ -2443,43 +2449,30 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { else if (isa(innermostWrapper)) { Operation *teamsOp = (*innermostWrapper)->getParentOp(); if (!isa_and_present(teamsOp)) - return TargetRegionFlags::generic; + return TargetExecMode::generic; if (teamsOp->getParentOp() != targetOp.getOperation()) - return TargetRegionFlags::generic; + return TargetExecMode::generic; + + if (hostEvalTripCount) + *hostEvalTripCount = true; if (isa(innermostWrapper)) - return TargetRegionFlags::spmd | TargetRegionFlags::trip_count; - - // Add spmd flag if there's a nested omp.parallel (generic-spmd case). - // - // TODO: This shouldn't have to be done here, as it is too easy to break. - // The openmp-opt pass should be updated to be able to promote kernels like - // this from "Generic" to "Generic-SPMD". 
However, the use of the - // `kmpc_distribute_static_loop` family of functions produced by the - // OMPIRBuilder for these kernels prevents that from working. - bool hasParallel = capturedOp - ->walk([](ParallelOp) { - return WalkResult::interrupt(); - }) - .wasInterrupted(); - - TargetRegionFlags result = - TargetRegionFlags::generic | TargetRegionFlags::trip_count; - - return hasParallel ? result | TargetRegionFlags::spmd : result; + return TargetExecMode::spmd; + + return TargetExecMode::generic; } // Detect target-parallel-wsloop[-simd]. else if (isa(innermostWrapper)) { Operation *parallelOp = (*innermostWrapper)->getParentOp(); if (!isa_and_present(parallelOp)) - return TargetRegionFlags::generic; + return TargetExecMode::generic; if (parallelOp->getParentOp() == targetOp.getOperation()) - return TargetRegionFlags::spmd; + return TargetExecMode::spmd; } - return TargetRegionFlags::generic; + return TargetExecMode::generic; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8344332c9063f..fc06676e4a109 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2596,13 +2596,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, // for every omp.wsloop nested inside a no-loop SPMD target region, even if // that loop is not the top-level SPMD one. 
if (loopOp == targetCapturedOp) { - omp::TargetRegionFlags kernelFlags = - targetOp.getKernelExecFlags(targetCapturedOp); - if (omp::bitEnumContainsAll(kernelFlags, - omp::TargetRegionFlags::spmd | - omp::TargetRegionFlags::no_loop) && - !omp::bitEnumContainsAny(kernelFlags, - omp::TargetRegionFlags::generic)) + if (targetOp.getKernelExecFlags(targetCapturedOp) == + omp::TargetExecMode::no_loop) noLoopMode = true; } } @@ -5797,23 +5792,21 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, } // Update kernel bounds structure for the `OpenMPIRBuilder` to use. - omp::TargetRegionFlags kernelFlags = targetOp.getKernelExecFlags(capturedOp); - assert( - omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic | - omp::TargetRegionFlags::spmd) && - "invalid kernel flags"); - attrs.ExecFlags = - omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic) - ? omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::spmd) - ? llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD - : llvm::omp::OMP_TGT_EXEC_MODE_GENERIC - : llvm::omp::OMP_TGT_EXEC_MODE_SPMD; - if (omp::bitEnumContainsAll(kernelFlags, - omp::TargetRegionFlags::spmd | - omp::TargetRegionFlags::no_loop) && - !omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic)) + omp::TargetExecMode execMode = targetOp.getKernelExecFlags(capturedOp); + switch (execMode) { + case omp::TargetExecMode::bare: + attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_BARE; + break; + case omp::TargetExecMode::generic: + attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_GENERIC; + break; + case omp::TargetExecMode::spmd: + attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_SPMD; + break; + case omp::TargetExecMode::no_loop: attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP; - + break; + } attrs.MinTeams = minTeamsVal; attrs.MaxTeams.front() = maxTeamsVal; attrs.MinThreads = 1; @@ -5863,8 +5856,9 @@ initTargetRuntimeAttrs(llvm::IRBuilderBase &builder, if (numThreads) attrs.MaxThreads = 
moduleTranslation.lookupValue(numThreads); - if (omp::bitEnumContainsAny(targetOp.getKernelExecFlags(capturedOp), - omp::TargetRegionFlags::trip_count)) { + bool hostEvalTripCount; + targetOp.getKernelExecFlags(capturedOp, &hostEvalTripCount); + if (hostEvalTripCount) { llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); attrs.LoopTripCount = nullptr; diff --git a/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir b/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir index 504d91b1f6198..6084a33fac8aa 100644 --- a/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir @@ -84,7 +84,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo } } -// DEVICE: @[[KERNEL_NAME:.*]]_exec_mode = weak protected constant i8 [[EXEC_MODE:3]] +// DEVICE: @[[KERNEL_NAME:.*]]_exec_mode = weak protected constant i8 [[EXEC_MODE:1]] // DEVICE: @llvm.compiler.used = appending global [1 x ptr] [ptr @[[KERNEL_NAME]]_exec_mode], section "llvm.metadata" // DEVICE: @[[KERNEL_NAME]]_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { // DEVICE-SAME: %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 [[EXEC_MODE]], {{.*}}}, From d3c02e9002b5362cf3eefbcc07cf6f354dd348a2 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 8 Jul 2025 12:27:08 +0100 Subject: [PATCH 02/22] [OpenMP][OMPIRBuilder] Add device shared memory allocation support This patch adds the `__kmpc_alloc_shared` and `__kmpc_free_shared` DeviceRTL functions to the list of those the OMPIRBuilder is able to create. 
--- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 23 ++++++++++++++++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 27 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index fb8563402528c..6ec4706a1bf91 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2935,6 +2935,29 @@ class OpenMPIRBuilder { LLVM_ABI CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name = ""); + /// Create a runtime call for kmpc_alloc_shared. + /// + /// \param Loc The insert and source location description. + /// \param VarType Type of variable to be allocated. + /// \param Name Name of call Instruction. + /// + /// \returns CallInst to the kmpc_alloc_shared call. + LLVM_ABI CallInst *createOMPAllocShared(const LocationDescription &Loc, + Type *VarType, + const Twine &Name = Twine("")); + + /// Create a runtime call for kmpc_free_shared. + /// + /// \param Loc The insert and source location description. + /// \param Addr Value obtained from the corresponding kmpc_alloc_shared call. + /// \param VarType Type of variable to be freed. + /// \param Name Name of call Instruction. + /// + /// \returns CallInst to the kmpc_free_shared call. + LLVM_ABI CallInst *createOMPFreeShared(const LocationDescription &Loc, + Value *Addr, Type *VarType, + const Twine &Name = Twine("")); + /// Create a runtime call for kmpc_threadprivate_cached /// /// \param Loc The insert and source location description. 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index dd4c636ce1b0c..81834e7133e93 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6693,6 +6693,33 @@ CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc, return Builder.CreateCall(Fn, Args, Name); } +CallInst *OpenMPIRBuilder::createOMPAllocShared(const LocationDescription &Loc, + Type *VarType, + const Twine &Name) { + IRBuilder<>::InsertPointGuard IPG(Builder); + updateToLocation(Loc); + + const DataLayout &DL = M.getDataLayout(); + Value *Args[] = {Builder.getInt64(DL.getTypeStoreSize(VarType))}; + Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc_shared); + CallInst *Call = Builder.CreateCall(Fn, Args, Name); + Call->addRetAttr( + Attribute::getWithAlignment(M.getContext(), DL.getPrefTypeAlign(Int64))); + return Call; +} + +CallInst *OpenMPIRBuilder::createOMPFreeShared(const LocationDescription &Loc, + Value *Addr, Type *VarType, + const Twine &Name) { + IRBuilder<>::InsertPointGuard IPG(Builder); + updateToLocation(Loc); + + Value *Args[] = { + Addr, Builder.getInt64(M.getDataLayout().getTypeStoreSize(VarType))}; + Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free_shared); + return Builder.CreateCall(Fn, Args, Name); +} + CallInst *OpenMPIRBuilder::createOMPInteropInit( const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, From 8d3d1907c624ed79198724095aa0e0c12e1ba291 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 27 Jun 2025 13:29:13 +0100 Subject: [PATCH 03/22] [MLIR][OpenMP] Support allocations of device shared memory This patch updates the allocation of some reduction and private variables within target regions to use device shared memory rather than private memory. This is a prerequisite to produce working Generic kernels containing parallel regions. 
In particular, the following situations result in the usage of device shared memory (only when compiling for the target device if they are placed inside of a target region representing a Generic kernel): - Reduction variables on `teams` constructs. - Private variables on `teams` and `distribute` constructs that are reduced or used inside of a `parallel` region. Currently, there is no support for delayed privatization on `teams` constructs, so private variables on these constructs won't currently be affected. When support is added, if it uses the existing `allocatePrivateVars` and `cleanupPrivateVars` functions, usage of device shared memory will be introduced automatically. --- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 226 +++++++++++++----- .../omptarget-device-shared-memory.mlir | 83 +++++++ 2 files changed, 250 insertions(+), 59 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index fc06676e4a109..0c1c4e8b8a05c 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1105,12 +1105,63 @@ struct DeferredStore { }; } // namespace +/// Check whether allocations for the given operation might potentially have to +/// be done in device shared memory. That means we're compiling for a offloading +/// target, the operation is an `omp::TargetOp` or nested inside of one and that +/// target region represents a Generic (non-SPMD) kernel. +/// +/// This represents a necessary but not sufficient set of conditions to use +/// device shared memory in place of regular allocas. For some variables, the +/// associated OpenMP construct or their uses might also need to be taken into +/// account. 
+static bool +mightAllocInDeviceSharedMemory(Operation &op, + const llvm::OpenMPIRBuilder &ompBuilder) { + if (!ompBuilder.Config.isTargetDevice()) + return false; + + auto targetOp = dyn_cast(op); + if (!targetOp) + targetOp = op.getParentOfType(); + + return targetOp && + targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) == + omp::TargetExecMode::generic; +} + +/// Check whether the entry block argument representing the private copy of a +/// variable in an OpenMP construct must be allocated in device shared memory, +/// based on what the uses of that copy are. +/// +/// This must only be called if a previous call to +/// \c mightAllocInDeviceSharedMemory has already returned \c true for the +/// operation that owns the specified block argument. +static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) { + Operation *parentOp = value.getOwner()->getParentOp(); + auto targetOp = dyn_cast(parentOp); + if (!targetOp) + targetOp = parentOp->getParentOfType(); + assert(targetOp && "expected a parent omp.target operation"); + + for (auto *user : value.getUsers()) { + if (auto parallelOp = dyn_cast(user)) { + if (llvm::is_contained(parallelOp.getReductionVars(), value)) + return true; + } else if (auto parallelOp = user->getParentOfType()) { + if (parentOp->isProperAncestor(parallelOp)) + return true; + } + } + + return false; +} + /// Allocate space for privatized reduction variables. 
/// `deferredStores` contains information to create store operations which needs /// to be inserted after all allocas template static LogicalResult -allocReductionVars(T loop, ArrayRef reductionArgs, +allocReductionVars(T op, ArrayRef reductionArgs, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, @@ -1122,10 +1173,14 @@ allocReductionVars(T loop, ArrayRef reductionArgs, llvm::IRBuilderBase::InsertPointGuard guard(builder); builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool useDeviceSharedMem = + isa(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder); + // delay creating stores until after all allocas - deferredStores.reserve(loop.getNumReductionVars()); + deferredStores.reserve(op.getNumReductionVars()); - for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) { + for (std::size_t i = 0; i < op.getNumReductionVars(); ++i) { Region &allocRegion = reductionDecls[i].getAllocRegion(); if (isByRefs[i]) { if (allocRegion.empty()) @@ -1134,7 +1189,7 @@ allocReductionVars(T loop, ArrayRef reductionArgs, SmallVector phis; if (failed(inlineConvertOmpRegions(allocRegion, "omp.reduction.alloc", builder, moduleTranslation, &phis))) - return loop.emitError( + return op.emitError( "failed to inline `alloc` region of `omp.declare_reduction`"); assert(phis.size() == 1 && "expected one allocation to be yielded"); @@ -1142,32 +1197,43 @@ allocReductionVars(T loop, ArrayRef reductionArgs, // Allocate reduction variable (which is a pointer to the real reduction // variable allocated in the inlined region) - llvm::Value *var = builder.CreateAlloca( - moduleTranslation.convertType(reductionDecls[i].getType())); - llvm::Type *ptrTy = builder.getPtrTy(); - llvm::Value *castVar = - builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); + llvm::Type *varTy = + 
moduleTranslation.convertType(reductionDecls[i].getType()); + llvm::Value *var; + if (useDeviceSharedMem) { + var = ompBuilder->createOMPAllocShared(builder, varTy); + } else { + var = builder.CreateAlloca(varTy); + var = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); + } + llvm::Value *castPhi = builder.CreatePointerBitCastOrAddrSpaceCast(phis[0], ptrTy); - deferredStores.emplace_back(castPhi, castVar); + deferredStores.emplace_back(castPhi, var); - privateReductionVariables[i] = castVar; + privateReductionVariables[i] = var; moduleTranslation.mapValue(reductionArgs[i], castPhi); - reductionVariableMap.try_emplace(loop.getReductionVars()[i], castPhi); + reductionVariableMap.try_emplace(op.getReductionVars()[i], castPhi); } else { assert(allocRegion.empty() && "allocaction is implicit for by-val reduction"); - llvm::Value *var = builder.CreateAlloca( - moduleTranslation.convertType(reductionDecls[i].getType())); + llvm::Type *ptrTy = builder.getPtrTy(); - llvm::Value *castVar = - builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); + llvm::Type *varTy = + moduleTranslation.convertType(reductionDecls[i].getType()); + llvm::Value *var; + if (useDeviceSharedMem) { + var = ompBuilder->createOMPAllocShared(builder, varTy); + } else { + var = builder.CreateAlloca(varTy); + var = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); + } - moduleTranslation.mapValue(reductionArgs[i], castVar); - privateReductionVariables[i] = castVar; - reductionVariableMap.try_emplace(loop.getReductionVars()[i], castVar); + moduleTranslation.mapValue(reductionArgs[i], var); + privateReductionVariables[i] = var; + reductionVariableMap.try_emplace(op.getReductionVars()[i], var); } } @@ -1229,6 +1295,10 @@ initReductionVars(OP op, ArrayRef reductionArgs, if (op.getNumReductionVars() == 0) return success(); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool useDeviceSharedMem = + isa(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder); + 
llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init"); auto allocaIP = llvm::IRBuilderBase::InsertPoint( latestAllocaBlock, latestAllocaBlock->getTerminator()->getIterator()); @@ -1243,8 +1313,12 @@ initReductionVars(OP op, ArrayRef reductionArgs, // TODO: remove after all users of by-ref are updated to use the alloc // region: Allocate reduction variable (which is a pointer to the real // reduciton variable allocated in the inlined region) - byRefVars[i] = builder.CreateAlloca( - moduleTranslation.convertType(reductionDecls[i].getType())); + llvm::Type *varTy = + moduleTranslation.convertType(reductionDecls[i].getType()); + if (useDeviceSharedMem) + byRefVars[i] = ompBuilder->createOMPAllocShared(builder, varTy); + else + byRefVars[i] = builder.CreateAlloca(varTy); } } @@ -1435,9 +1509,20 @@ static LogicalResult createReductionsAndCleanup( [](omp::DeclareReductionOp reductionDecl) { return &reductionDecl.getCleanupRegion(); }); - return inlineOmpRegionCleanup(reductionRegions, privateReductionVariables, - moduleTranslation, builder, - "omp.reduction.cleanup"); + LogicalResult result = inlineOmpRegionCleanup( + reductionRegions, privateReductionVariables, moduleTranslation, builder, + "omp.reduction.cleanup"); + + bool useDeviceSharedMem = + isa(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder); + if (useDeviceSharedMem) { + for (auto [var, reductionDecl] : + llvm::zip_equal(privateReductionVariables, reductionDecls)) + ompBuilder->createOMPFreeShared( + builder, var, moduleTranslation.convertType(reductionDecl.getType())); + } + + return result; } static ArrayRef getIsByRef(std::optional> attr) { @@ -1582,8 +1667,9 @@ initPrivateVars(llvm::IRBuilderBase &builder, /// Allocate and initialize delayed private variables. Returns the basic block /// which comes after all of these allocations. llvm::Value * for each of these /// private variables are populated in llvmPrivateVars. 
+template static llvm::Expected -allocatePrivateVars(llvm::IRBuilderBase &builder, +allocatePrivateVars(T op, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, PrivateVarsInfo &privateVarsInfo, const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, @@ -1606,6 +1692,10 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, llvm::DataLayout dataLayout = builder.GetInsertBlock()->getDataLayout(); llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool mightUseDeviceSharedMem = + isa(*op) && + mightAllocInDeviceSharedMemory(*op, *ompBuilder); unsigned int allocaAS = moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace(); unsigned int defaultAS = moduleTranslation.getLLVMModule() @@ -1618,11 +1708,17 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, llvm::Type *llvmAllocType = moduleTranslation.convertType(privDecl.getType()); builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); - llvm::Value *llvmPrivateVar = builder.CreateAlloca( - llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc"); - if (allocaAS != defaultAS) - llvmPrivateVar = builder.CreateAddrSpaceCast(llvmPrivateVar, - builder.getPtrTy(defaultAS)); + llvm::Value *llvmPrivateVar = nullptr; + if (mightUseDeviceSharedMem && + mustAllocPrivateVarInDeviceSharedMemory(blockArg)) { + llvmPrivateVar = ompBuilder->createOMPAllocShared(builder, llvmAllocType); + } else { + llvmPrivateVar = builder.CreateAlloca( + llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc"); + if (allocaAS != defaultAS) + llvmPrivateVar = builder.CreateAddrSpaceCast( + llvmPrivateVar, builder.getPtrTy(defaultAS)); + } privateVarsInfo.llvmVars.push_back(llvmPrivateVar); } @@ -1694,24 +1790,41 @@ static LogicalResult copyFirstPrivateVars( return success(); } +template static LogicalResult -cleanupPrivateVars(llvm::IRBuilderBase &builder, +cleanupPrivateVars(T op, llvm::IRBuilderBase &builder, 
LLVM::ModuleTranslation &moduleTranslation, Location loc, - SmallVectorImpl &llvmPrivateVars, - SmallVectorImpl &privateDecls) { + PrivateVarsInfo &privateVarsInfo) { // private variable deallocation SmallVector privateCleanupRegions; - llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions), + llvm::transform(privateVarsInfo.privatizers, + std::back_inserter(privateCleanupRegions), [](omp::PrivateClauseOp privatizer) { return &privatizer.getDeallocRegion(); }); - if (failed(inlineOmpRegionCleanup( - privateCleanupRegions, llvmPrivateVars, moduleTranslation, builder, - "omp.private.dealloc", /*shouldLoadCleanupRegionArg=*/false))) + if (failed(inlineOmpRegionCleanup(privateCleanupRegions, + privateVarsInfo.llvmVars, moduleTranslation, + builder, "omp.private.dealloc", + /*shouldLoadCleanupRegionArg=*/false))) return mlir::emitError(loc, "failed to inline `dealloc` region of an " "`omp.private` op in"); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool mightUseDeviceSharedMem = + isa(*op) && + mightAllocInDeviceSharedMemory(*op, *ompBuilder); + for (auto [privDecl, llvmPrivVar, blockArg] : + llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars, + privateVarsInfo.blockArgs)) { + if (mightUseDeviceSharedMem && + mustAllocPrivateVarInDeviceSharedMemory(blockArg)) { + ompBuilder->createOMPFreeShared( + builder, llvmPrivVar, + moduleTranslation.convertType(privDecl.getType())); + } + } + return success(); } @@ -2378,9 +2491,8 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator()); - if (failed(cleanupPrivateVars(builder, moduleTranslation, taskOp.getLoc(), - privateVarsInfo.llvmVars, - privateVarsInfo.privatizers))) + if (failed(cleanupPrivateVars(taskOp, builder, moduleTranslation, + taskOp.getLoc(), privateVarsInfo))) return llvm::make_error(); // Free heap allocated task context structure at the end of the task. 
@@ -2497,7 +2609,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, wsloopOp.getNumReductionVars()); llvm::Expected afterAllocas = allocatePrivateVars( - builder, moduleTranslation, privateVarsInfo, allocaIP); + wsloopOp, builder, moduleTranslation, privateVarsInfo, allocaIP); if (handleError(afterAllocas, opInst).failed()) return failure(); @@ -2639,9 +2751,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, /*isTeamsReduction=*/false))) return failure(); - return cleanupPrivateVars(builder, moduleTranslation, wsloopOp.getLoc(), - privateVarsInfo.llvmVars, - privateVarsInfo.privatizers); + return cleanupPrivateVars(wsloopOp, builder, moduleTranslation, + wsloopOp.getLoc(), privateVarsInfo); } /// Converts the OpenMP parallel operation to LLVM IR. @@ -2668,7 +2779,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) -> llvm::Error { llvm::Expected afterAllocas = allocatePrivateVars( - builder, moduleTranslation, privateVarsInfo, allocaIP); + opInst, builder, moduleTranslation, privateVarsInfo, allocaIP); if (handleError(afterAllocas, *opInst).failed()) return llvm::make_error(); @@ -2782,9 +2893,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, return llvm::createStringError( "failed to inline `cleanup` region of `omp.declare_reduction`"); - if (failed(cleanupPrivateVars(builder, moduleTranslation, opInst.getLoc(), - privateVarsInfo.llvmVars, - privateVarsInfo.privatizers))) + if (failed(cleanupPrivateVars(opInst, builder, moduleTranslation, + opInst.getLoc(), privateVarsInfo))) return llvm::make_error(); builder.restoreIP(oldIP); @@ -2856,7 +2966,7 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, findAllocaInsertPoint(builder, moduleTranslation); llvm::Expected afterAllocas = allocatePrivateVars( - builder, moduleTranslation, privateVarsInfo, allocaIP); + simdOp, builder, 
moduleTranslation, privateVarsInfo, allocaIP); if (handleError(afterAllocas, opInst).failed()) return failure(); @@ -2975,9 +3085,8 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, "omp.reduction.cleanup"))) return failure(); - return cleanupPrivateVars(builder, moduleTranslation, simdOp.getLoc(), - privateVarsInfo.llvmVars, - privateVarsInfo.privatizers); + return cleanupPrivateVars(simdOp, builder, moduleTranslation, simdOp.getLoc(), + privateVarsInfo); } /// Converts an OpenMP loop nest into LLVM IR using OpenMPIRBuilder. @@ -5236,8 +5345,8 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, builder.restoreIP(codeGenIP); PrivateVarsInfo privVarsInfo(distributeOp); - llvm::Expected afterAllocas = - allocatePrivateVars(builder, moduleTranslation, privVarsInfo, allocaIP); + llvm::Expected afterAllocas = allocatePrivateVars( + distributeOp, builder, moduleTranslation, privVarsInfo, allocaIP); if (handleError(afterAllocas, opInst).failed()) return llvm::make_error(); @@ -5290,9 +5399,8 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, return wsloopIP.takeError(); } - if (failed(cleanupPrivateVars(builder, moduleTranslation, - distributeOp.getLoc(), privVarsInfo.llvmVars, - privVarsInfo.privatizers))) + if (failed(cleanupPrivateVars(distributeOp, builder, moduleTranslation, + distributeOp.getLoc(), privVarsInfo))) return llvm::make_error(); return llvm::Error::success(); @@ -6036,8 +6144,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, PrivateVarsInfo privateVarsInfo(targetOp); llvm::Expected afterAllocas = - allocatePrivateVars(builder, moduleTranslation, privateVarsInfo, - allocaIP, &mappedPrivateVars); + allocatePrivateVars(targetOp, builder, moduleTranslation, + privateVarsInfo, allocaIP, &mappedPrivateVars); if (failed(handleError(afterAllocas, *targetOp))) return llvm::make_error(); diff --git a/mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir 
b/mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir new file mode 100644 index 0000000000000..9f57255d564b3 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir @@ -0,0 +1,83 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// This test checks that, when compiling for an offloading target, device shared +// memory will be used in place of allocas for certain private variables. + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + omp.private {type = private} @privatizer : i32 + omp.declare_reduction @reduction : i32 init { + ^bb0(%arg0: i32): + %0 = llvm.mlir.constant(0 : i32) : i32 + omp.yield(%0 : i32) + } combiner { + ^bb0(%arg0: i32, %arg1: i32): + %0 = llvm.add %arg0, %arg1 : i32 + omp.yield(%0 : i32) + } + llvm.func @main() { + %c0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %c0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + %3 = llvm.alloca %c0 x i32 {bindc_name = "y"} : (i64) -> !llvm.ptr<5> + %4 = llvm.addrspacecast %3 : !llvm.ptr<5> to !llvm.ptr + %5 = llvm.alloca %c0 x i32 {bindc_name = "z"} : (i64) -> !llvm.ptr<5> + %6 = llvm.addrspacecast %5 : !llvm.ptr<5> to !llvm.ptr + %7 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "x"} + %8 = omp.map.info var_ptr(%4 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "y"} + %9 = omp.map.info var_ptr(%6 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "z"} + omp.target map_entries(%7 -> %arg0, %8 -> %arg1, %9 -> %arg2 
: !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %11 = llvm.mlir.constant(10000 : i32) : i32 + %12 = llvm.mlir.constant(1 : i32) : i32 + omp.teams reduction(@reduction %arg0 -> %arg3 : !llvm.ptr) { + omp.distribute private(@privatizer %arg1 -> %arg4, @privatizer %arg2 -> %arg5 : !llvm.ptr, !llvm.ptr) { + omp.loop_nest (%arg6) : i32 = (%12) to (%11) inclusive step (%12) { + llvm.store %arg6, %arg4 : i32, !llvm.ptr + %13 = llvm.load %arg3 : !llvm.ptr -> i32 + %14 = llvm.add %13, %12 : i32 + llvm.store %14, %arg3 : i32, !llvm.ptr + omp.parallel reduction(@reduction %arg5 -> %arg7 : !llvm.ptr) { + %15 = llvm.load %arg4 : !llvm.ptr -> i32 + %16 = llvm.load %arg7 : !llvm.ptr -> i32 + %17 = llvm.add %15, %16 : i32 + llvm.store %17, %arg7 : i32, !llvm.ptr + omp.terminator + } + omp.yield + } + } + omp.terminator + } + omp.terminator + } + // CHECK: call i32 @__kmpc_target_init + // CHECK: call void @[[OUTLINED_TARGET:__omp_offloading_[A-Za-z0-9_.]*]] + + // CHECK: define internal void @[[OUTLINED_TARGET]] + // CHECK: %[[X_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + // CHECK: %[[GEP_X:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg + // CHECK-NEXT: store ptr %[[X_PRIV]], ptr addrspace(5) %[[GEP_X]] + // CHECK-NEXT: call void @[[OUTLINED_TEAMS:__omp_offloading_[A-Za-z0-9_.]*]](ptr %structArg.ascast) + + // CHECK: [[REDUCE_FINALIZE_BB:reduce\.finalize.*]]: + // CHECK-NEXT: %{{.*}} = call i32 @__kmpc_global_thread_num + // CHECK-NEXT: call void @__kmpc_barrier + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[X_PRIV]], i64 4) + + // CHECK: define internal void @[[OUTLINED_TEAMS]] + // CHECK: %[[Y_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + // CHECK: %[[Z_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + + // %[[GEP_Y:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg + // store ptr %[[Y_PRIV]], ptr addrspace(5) %[[GEP_Y]], align 8 + // %[[GEP_Z:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg + // store ptr 
%[[Z_PRIV]], ptr addrspace(5) %[[GEP_Z]], align 8 + + // CHECK: call void @__kmpc_free_shared(ptr %[[Y_PRIV]], i64 4) + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[Z_PRIV]], i64 4) + // CHECK-NEXT: br label %[[EXIT_BB:.*]] + + // CHECK: [[EXIT_BB]]: + // CHECK-NEXT: ret void + llvm.return + } +} From c3407910da2eb0d6e75a72691d75dc74dd101f1f Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 3 Jul 2025 16:47:51 +0100 Subject: [PATCH 04/22] [OpenMP][OMPIRBuilder] Use device shared memory for arg structures Argument structures are created when sections of the LLVM IR corresponding to an OpenMP construct are outlined into their own function. For this, stack allocations are used. This patch modifies this behavior when compiling for a target device and outlining `parallel`-related IR, so that it uses device shared memory instead of private stack space. This is needed in order for threads to have access to these arguments. Address intermittent ICE triggered from the `OpenMPIRBuilder::finalize` method due to an invalid builder insertion point Replace CodeExtractor callbacks with subclasses and simplify their creation based on OutlineInfo structures --- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 17 +- .../llvm/Transforms/Utils/CodeExtractor.h | 51 ++- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 340 ++++++++++++++---- llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 1 + llvm/lib/Transforms/IPO/IROutliner.cpp | 4 +- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 71 +++- .../Transforms/Utils/CodeExtractorTest.cpp | 3 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 16 +- .../LLVMIR/omptarget-parallel-llvm.mlir | 8 +- 9 files changed, 390 insertions(+), 121 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 6ec4706a1bf91..364a4950cb5b4 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -31,6 +31,7 @@ namespace llvm { class 
CanonicalLoopInfo; +class CodeExtractor; class ScanInfo; struct TargetRegionEntryInfo; class OffloadEntriesInfoManager; @@ -2276,17 +2277,27 @@ class OpenMPIRBuilder { BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB; SmallVector ExcludeArgsFromAggregate; + LLVM_ABI virtual ~OutlineInfo() = default; + /// Collect all blocks in between EntryBB and ExitBB in both the given /// vector and set. LLVM_ABI void collectBlocks(SmallPtrSetImpl &BlockSet, SmallVectorImpl &BlockVector); + /// Create a CodeExtractor instance based on the information stored in this + /// structure, the list of collected blocks from a previous call to + /// \c collectBlocks and a flag stating whether arguments must be passed in + /// address space 0. + LLVM_ABI virtual std::unique_ptr + createCodeExtractor(ArrayRef Blocks, + bool ArgsInZeroAddressSpace, Twine Suffix = Twine("")); + /// Return the function that contains the region to be outlined. Function *getFunction() const { return EntryBB->getParent(); } }; /// Collection of regions that need to be outlined during finalization. - SmallVector OutlineInfos; + SmallVector, 16> OutlineInfos; /// A collection of candidate target functions that's constant allocas will /// attempt to be raised on a call of finalize after all currently enqueued @@ -2301,7 +2312,9 @@ class OpenMPIRBuilder { std::forward_list ScanInfos; /// Add a new region that will be outlined later. - void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); } + void addOutlineInfo(std::unique_ptr &&OI) { + OutlineInfos.emplace_back(std::move(OI)); + } /// An ordered map of auto-generated variables to their unique names. 
/// It stores variables with the following names: 1) ".gomp_critical_user_" + diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h index 407eb50d2c7a3..b3bea96039172 100644 --- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h +++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h @@ -17,14 +17,15 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/Support/Compiler.h" #include namespace llvm { template class SmallPtrSetImpl; +class AddrSpaceCastInst; class AllocaInst; -class BasicBlock; class BlockFrequency; class BlockFrequencyInfo; class BranchProbabilityInfo; @@ -94,15 +95,23 @@ class CodeExtractorAnalysisCache { BranchProbabilityInfo *BPI; AssumptionCache *AC; - // A block outside of the extraction set where any intermediate - // allocations will be placed inside. If this is null, allocations - // will be placed in the entry block of the function. + /// A block outside of the extraction set where any intermediate + /// allocations will be placed inside. If this is null, allocations + /// will be placed in the entry block of the function. BasicBlock *AllocationBlock; - // If true, varargs functions can be extracted. + /// A block outside of the extraction set where deallocations for + /// intermediate allocations can be placed inside. Not used for + /// automatically deallocated memory (e.g. `alloca`), which is the default. + /// + /// If it is null and needed, the end of the replacement basic block will be + /// used to place deallocations. + BasicBlock *DeallocationBlock; + + /// If true, varargs functions can be extracted. bool AllowVarArgs; - // Bits of intermediate state computed at various phases of extraction. + /// Bits of intermediate state computed at various phases of extraction. 
SetVector Blocks; /// Lists of blocks that are branched from the code region to be extracted, @@ -124,13 +133,13 @@ class CodeExtractorAnalysisCache { /// returns 1, etc. SmallVector ExtractedFuncRetVals; - // Suffix to use when creating extracted function (appended to the original - // function name + "."). If empty, the default is to use the entry block - // label, if non-empty, otherwise "extracted". + /// Suffix to use when creating extracted function (appended to the original + /// function name + "."). If empty, the default is to use the entry block + /// label, if non-empty, otherwise "extracted". std::string Suffix; - // If true, the outlined function has aggregate argument in zero address - // space. + /// If true, the outlined function has aggregate argument in zero address + /// space. bool ArgsInZeroAddressSpace; public: @@ -146,7 +155,9 @@ class CodeExtractorAnalysisCache { /// however code extractor won't validate whether extraction is legal. /// Any new allocations will be placed in the AllocationBlock, unless /// it is null, in which case it will be placed in the entry block of - /// the function from which the code is being extracted. + /// the function from which the code is being extracted. Explicit + /// deallocations for the aforementioned allocations will be placed in the + /// DeallocationBlock or the end of the replacement block, if needed. /// If ArgsInZeroAddressSpace param is set to true, then the aggregate /// param pointer of the outlined function is declared in zero address /// space. @@ -157,8 +168,11 @@ class CodeExtractorAnalysisCache { AssumptionCache *AC = nullptr, bool AllowVarArgs = false, bool AllowAlloca = false, BasicBlock *AllocationBlock = nullptr, + BasicBlock *DeallocationBlock = nullptr, std::string Suffix = "", bool ArgsInZeroAddressSpace = false); + LLVM_ABI virtual ~CodeExtractor() = default; + /// Perform the extraction, returning the new function. 
/// /// Returns zero when called on a CodeExtractor instance where isEligible @@ -243,6 +257,19 @@ class CodeExtractorAnalysisCache { /// region, passing it instead as a scalar. LLVM_ABI void excludeArgFromAggregate(Value *Arg); + protected: + /// Allocate an intermediate variable at the specified point. + LLVM_ABI virtual Instruction * + allocateVar(BasicBlock *BB, BasicBlock::iterator AllocIP, Type *VarType, + const Twine &Name = Twine(""), + AddrSpaceCastInst **CastedAlloc = nullptr); + + /// Deallocate a previously-allocated intermediate variable at the specified + /// point. + LLVM_ABI virtual Instruction *deallocateVar(BasicBlock *BB, + BasicBlock::iterator DeallocIP, + Value *Var, Type *VarType); + private: struct LifetimeMarkerInfo { bool SinkLifeStart = false; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 81834e7133e93..03e27e91a0f57 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -280,6 +280,38 @@ computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, return Result; } +/// Given a function, if it represents the entry point of a target kernel, this +/// returns the execution mode flags associated with that kernel. +static std::optional +getTargetKernelExecMode(Function &Kernel) { + CallInst *TargetInitCall = nullptr; + for (Instruction &Inst : Kernel.getEntryBlock()) { + if (auto *Call = dyn_cast(&Inst)) { + if (Call->getCalledFunction()->getName() == "__kmpc_target_init") { + TargetInitCall = Call; + break; + } + } + } + + if (!TargetInitCall) + return std::nullopt; + + // Get the kernel mode information from the global variable associated to the + // first argument to the call to __kmpc_target_init. Refer to + // createTargetInit() to see how this is initialized. 
+ Value *InitOperand = TargetInitCall->getArgOperand(0); + GlobalVariable *KernelEnv = nullptr; + if (auto *Cast = dyn_cast(InitOperand)) + KernelEnv = cast(Cast->getOperand(0)); + else + KernelEnv = cast(InitOperand); + auto *KernelEnvInit = cast(KernelEnv->getInitializer()); + auto *ConfigEnv = cast(KernelEnvInit->getOperand(0)); + auto *KernelMode = cast(ConfigEnv->getOperand(2)); + return static_cast(KernelMode->getZExtValue()); +} + /// Make \p Source branch to \p Target. /// /// Handles two situations: @@ -455,6 +487,88 @@ enum OpenMPOffloadingRequiresDirFlags { LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS) }; +class OMPCodeExtractor : public CodeExtractor { +public: + OMPCodeExtractor(OpenMPIRBuilder &OMPBuilder, ArrayRef BBs, + DominatorTree *DT = nullptr, bool AggregateArgs = false, + BlockFrequencyInfo *BFI = nullptr, + BranchProbabilityInfo *BPI = nullptr, + AssumptionCache *AC = nullptr, bool AllowVarArgs = false, + bool AllowAlloca = false, + BasicBlock *AllocationBlock = nullptr, + BasicBlock *DeallocationBlock = nullptr, + std::string Suffix = "", bool ArgsInZeroAddressSpace = false) + : CodeExtractor(BBs, DT, AggregateArgs, BFI, BPI, AC, AllowVarArgs, + AllowAlloca, AllocationBlock, DeallocationBlock, Suffix, + ArgsInZeroAddressSpace), + OMPBuilder(OMPBuilder) {} + + virtual ~OMPCodeExtractor() = default; + +protected: + OpenMPIRBuilder &OMPBuilder; +}; + +class DeviceSharedMemCodeExtractor : public OMPCodeExtractor { +public: + DeviceSharedMemCodeExtractor( + OpenMPIRBuilder &OMPBuilder, BasicBlock *AllocBlockOverride, + ArrayRef BBs, DominatorTree *DT = nullptr, + bool AggregateArgs = false, BlockFrequencyInfo *BFI = nullptr, + BranchProbabilityInfo *BPI = nullptr, AssumptionCache *AC = nullptr, + bool AllowVarArgs = false, bool AllowAlloca = false, + BasicBlock *AllocationBlock = nullptr, + BasicBlock *DeallocationBlock = nullptr, std::string Suffix = "", + bool ArgsInZeroAddressSpace = false) + : 
OMPCodeExtractor(OMPBuilder, BBs, DT, AggregateArgs, BFI, BPI, AC, + AllowVarArgs, AllowAlloca, AllocationBlock, + DeallocationBlock, Suffix, ArgsInZeroAddressSpace), + AllocBlockOverride(AllocBlockOverride) {} + virtual ~DeviceSharedMemCodeExtractor() = default; + +protected: + virtual Instruction * + allocateVar(BasicBlock *, BasicBlock::iterator, Type *VarType, + const Twine &Name = Twine(""), + AddrSpaceCastInst **CastedAlloc = nullptr) override { + // Ignore the CastedAlloc pointer, if requested, because shared memory + // should not be casted to address space 0 to be passed around. + return OMPBuilder.createOMPAllocShared( + OpenMPIRBuilder::InsertPointTy( + AllocBlockOverride, AllocBlockOverride->getFirstInsertionPt()), + VarType, Name); + } + + virtual Instruction *deallocateVar(BasicBlock *BB, + BasicBlock::iterator DeallocIP, Value *Var, + Type *VarType) override { + return OMPBuilder.createOMPFreeShared( + OpenMPIRBuilder::InsertPointTy(BB, DeallocIP), Var, VarType); + } + +private: + // TODO: Remove the need for this override and instead get the CodeExtractor + // to provide a valid insert point for explicit deallocations by correctly + // populating its DeallocationBlock. + BasicBlock *AllocBlockOverride; +}; + +/// Helper storing information about regions to outline using device shared +/// memory for intermediate allocations. 
+struct DeviceSharedMemOutlineInfo : public OpenMPIRBuilder::OutlineInfo { + OpenMPIRBuilder &OMPBuilder; + BasicBlock *AllocBlockOverride = nullptr; + + DeviceSharedMemOutlineInfo(OpenMPIRBuilder &OMPBuilder) + : OMPBuilder(OMPBuilder) {} + virtual ~DeviceSharedMemOutlineInfo() = default; + + virtual std::unique_ptr + createCodeExtractor(ArrayRef Blocks, + bool ArgsInZeroAddressSpace, + Twine Suffix = Twine("")) override; +}; + } // anonymous namespace OpenMPIRBuilderConfig::OpenMPIRBuilderConfig() @@ -734,20 +848,20 @@ static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, void OpenMPIRBuilder::finalize(Function *Fn) { SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; - SmallVector DeferredOutlines; - for (OutlineInfo &OI : OutlineInfos) { + SmallVector, 16> DeferredOutlines; + for (std::unique_ptr &OI : OutlineInfos) { // Skip functions that have not finalized yet; may happen with nested // function generation. - if (Fn && OI.getFunction() != Fn) { - DeferredOutlines.push_back(OI); + if (Fn && OI->getFunction() != Fn) { + DeferredOutlines.push_back(std::move(OI)); continue; } ParallelRegionBlockSet.clear(); Blocks.clear(); - OI.collectBlocks(ParallelRegionBlockSet, Blocks); + OI->collectBlocks(ParallelRegionBlockSet, Blocks); - Function *OuterFn = OI.getFunction(); + Function *OuterFn = OI->getFunction(); CodeExtractorAnalysisCache CEAC(*OuterFn); // If we generate code for the target device, we need to allocate // struct for aggregate params in the device default alloca address space. @@ -756,26 +870,19 @@ void OpenMPIRBuilder::finalize(Function *Fn) { // CodeExtractor generates correct code for extracted functions // which are used by OpenMP runtime. 
bool ArgsInZeroAddressSpace = Config.isTargetDevice(); - CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, - /* AggregateArgs */ true, - /* BlockFrequencyInfo */ nullptr, - /* BranchProbabilityInfo */ nullptr, - /* AssumptionCache */ nullptr, - /* AllowVarArgs */ true, - /* AllowAlloca */ true, - /* AllocaBlock*/ OI.OuterAllocaBB, - /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); + std::unique_ptr Extractor = + OI->createCodeExtractor(Blocks, ArgsInZeroAddressSpace, ".omp_par"); LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n"); - LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName() - << " Exit: " << OI.ExitBB->getName() << "\n"); - assert(Extractor.isEligible() && + LLVM_DEBUG(dbgs() << "Entry " << OI->EntryBB->getName() + << " Exit: " << OI->ExitBB->getName() << "\n"); + assert(Extractor->isEligible() && "Expected OpenMP outlining to be possible!"); - for (auto *V : OI.ExcludeArgsFromAggregate) - Extractor.excludeArgFromAggregate(V); + for (auto *V : OI->ExcludeArgsFromAggregate) + Extractor->excludeArgFromAggregate(V); - Function *OutlinedFn = Extractor.extractCodeRegion(CEAC); + Function *OutlinedFn = Extractor->extractCodeRegion(CEAC); if (Config.isGPU()) OutlinedFn->addFnAttr(Attribute::AlwaysInline); @@ -802,8 +909,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) { // made our own entry block after all. { BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock(); - assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB); - assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry); + assert(ArtificialEntry.getUniqueSuccessor() == OI->EntryBB); + assert(OI->EntryBB->getUniquePredecessor() == &ArtificialEntry); // Move instructions from the to-be-deleted ArtificialEntry to the entry // basic block of the parallel region. 
CodeExtractor generates // instructions to unwrap the aggregate argument and may sink @@ -819,24 +926,25 @@ void OpenMPIRBuilder::finalize(Function *Fn) { if (I.isTerminator()) { // Absorb any debug value that terminator may have - if (OI.EntryBB->getTerminator()) - OI.EntryBB->getTerminator()->adoptDbgRecords( + if (OI->EntryBB->getTerminator()) + OI->EntryBB->getTerminator()->adoptDbgRecords( &ArtificialEntry, I.getIterator(), false); continue; } - I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt()); + I.moveBeforePreserving(*OI->EntryBB, + OI->EntryBB->getFirstInsertionPt()); } - OI.EntryBB->moveBefore(&ArtificialEntry); + OI->EntryBB->moveBefore(&ArtificialEntry); ArtificialEntry.eraseFromParent(); } - assert(&OutlinedFn->getEntryBlock() == OI.EntryBB); + assert(&OutlinedFn->getEntryBlock() == OI->EntryBB); assert(OutlinedFn && OutlinedFn->hasNUses(1)); // Run a user callback, e.g. to add attributes. - if (OI.PostOutlineCB) - OI.PostOutlineCB(*OutlinedFn); + if (OI->PostOutlineCB) + OI->PostOutlineCB(*OutlinedFn); } // Remove work items that have been completed. @@ -1669,10 +1777,50 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); - OutlineInfo OI; + auto OI = [&]() -> std::unique_ptr { + if (Config.isTargetDevice()) { + std::optional ExecMode = + getTargetKernelExecMode(*OuterFn); + + // If OuterFn is not a Generic kernel, skip custom allocation. This causes + // the CodeExtractor to follow its default behavior. Otherwise, we need to + // use device shared memory to allocate argument structures. + if (ExecMode && *ExecMode & OMP_TGT_EXEC_MODE_GENERIC) { + auto Info = std::make_unique(*this); + + // Instead of using the insertion point provided by the CodeExtractor, + // here we need to use the block that eventually calls the outlined + // function for the `parallel` construct. 
+ // + // The reason is that the explicit deallocation call will be inserted + // within the outlined function, whereas the alloca insertion point + // might actually be located somewhere else in the caller. This becomes + // a problem when e.g. `parallel` is inside of a `distribute` construct, + // because the deallocation would be executed multiple times and the + // allocation just once (outside of the loop). + // + // TODO: Ideally, we'd want to do the allocation and deallocation + // outside of the `parallel` outlined function, hence using here the + // insertion point provided by the CodeExtractor. We can't do this at + // the moment because there is currently no way of passing an eligible + // insertion point for the explicit deallocation to the CodeExtractor, + // as that block is created (at least when nested inside of + // `distribute`) sometime after createParallel() completed, so it can't + // be stored in the OutlineInfo structure here. + // + // The current approach results in an explicit allocation and + // deallocation pair for each `distribute` loop iteration in that case, + // which is suboptimal. 
+ Info->AllocBlockOverride = EntryBB; + return Info; + } + } + return std::make_unique(); + }(); + if (Config.isTargetDevice()) { // Generate OpenMP target specific runtime call - OI.PostOutlineCB = [=, ToBeDeletedVec = + OI->PostOutlineCB = [=, ToBeDeletedVec = std::move(ToBeDeleted)](Function &OutlinedFn) { targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident, IfCondition, NumThreads, PrivTID, PrivTIDAddrAcast, @@ -1680,20 +1828,20 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( }; } else { // Generate OpenMP host runtime call - OI.PostOutlineCB = [=, ToBeDeletedVec = - std::move(ToBeDeleted)](Function &OutlinedFn) { + OI->PostOutlineCB = [=, ToBeDeletedVec = + std::move(ToBeDeleted)](Function &OutlinedFn) { hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition, PrivTID, PrivTIDAddrAcast, ToBeDeletedVec); }; } - OI.OuterAllocaBB = OuterAllocaBlock; - OI.EntryBB = PRegEntryBB; - OI.ExitBB = PRegExitBB; + OI->OuterAllocaBB = OuterAllocaBlock; + OI->EntryBB = PRegEntryBB; + OI->ExitBB = PRegExitBB; SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; - OI.collectBlocks(ParallelRegionBlockSet, Blocks); + OI->collectBlocks(ParallelRegionBlockSet, Blocks); CodeExtractorAnalysisCache CEAC(*OuterFn); CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, @@ -1704,6 +1852,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocationBlock */ OuterAllocaBlock, + /* DeallocationBlock */ nullptr, /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); // Find inputs to, outputs from the code region. 
@@ -1728,7 +1877,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( auto PrivHelper = [&](Value &V) -> Error { if (&V == TIDAddr || &V == ZeroAddr) { - OI.ExcludeArgsFromAggregate.push_back(&V); + OI->ExcludeArgsFromAggregate.push_back(&V); return Error::success(); } @@ -2005,19 +2154,19 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP)) return Err; - OutlineInfo OI; - OI.EntryBB = TaskAllocaBB; - OI.OuterAllocaBB = AllocaIP.getBlock(); - OI.ExitBB = TaskExitBB; + auto OI = std::make_unique(); + OI->EntryBB = TaskAllocaBB; + OI->OuterAllocaBB = AllocaIP.getBlock(); + OI->ExitBB = TaskExitBB; // Add the thread ID argument. SmallVector ToBeDeleted; - OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( + OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal( Builder, M, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false)); - OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies, - Mergeable, Priority, EventHandle, TaskAllocaBB, - ToBeDeleted](Function &OutlinedFn) mutable { + OI->PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies, + Mergeable, Priority, EventHandle, TaskAllocaBB, + ToBeDeleted](Function &OutlinedFn) mutable { // Replace the Stale CI by appropriate RTL function call. 
assert(OutlinedFn.hasOneUse() && "there must be a single user for the outlined function"); @@ -5115,19 +5264,19 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); - OutlineInfo OI; - OI.OuterAllocaBB = CLI->getPreheader(); + auto OI = std::make_unique(); + OI->OuterAllocaBB = CLI->getPreheader(); Function *OuterFn = CLI->getPreheader()->getParent(); // Instructions which need to be deleted at the end of code generation SmallVector ToBeDeleted; - OI.OuterAllocaBB = AllocaIP.getBlock(); + OI->OuterAllocaBB = AllocaIP.getBlock(); // Mark the body loop as region which needs to be extracted - OI.EntryBB = CLI->getBody(); - OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(), - "omp.prelatch", true); + OI->EntryBB = CLI->getBody(); + OI->ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(), + "omp.prelatch", true); // Prepare loop body for extraction Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()}); @@ -5147,7 +5296,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( // loop body region. 
SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; - OI.collectBlocks(ParallelRegionBlockSet, Blocks); + OI->collectBlocks(ParallelRegionBlockSet, Blocks); CodeExtractorAnalysisCache CEAC(*OuterFn); CodeExtractor Extractor(Blocks, @@ -5159,6 +5308,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocationBlock */ CLI->getPreheader(), + /* DeallocationBlock */ nullptr, /* Suffix */ ".omp_wsloop", /* AggrArgsIn0AddrSpace */ true); @@ -5183,15 +5333,15 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( } // Make sure that loop counter variable is not merged into loop body // function argument structure and it is passed as separate variable - OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad); + OI->ExcludeArgsFromAggregate.push_back(NewLoopCntLoad); // PostOutline CB is invoked when loop body function is outlined and // loop body is replaced by call to outlined function. We need to add // call to OpenMP device rtl inside loop preheader. OpenMP device rtl // function will handle loop control logic. // - OI.PostOutlineCB = [=, ToBeDeletedVec = - std::move(ToBeDeleted)](Function &OutlinedFn) { + OI->PostOutlineCB = [=, ToBeDeletedVec = + std::move(ToBeDeleted)](Function &OutlinedFn) { workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec, LoopType, NoLoop); }; @@ -8079,13 +8229,13 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( TargetTaskAllocaBB->begin()); InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin()); - OutlineInfo OI; - OI.EntryBB = TargetTaskAllocaBB; - OI.OuterAllocaBB = AllocaIP.getBlock(); + auto OI = std::make_unique(); + OI->EntryBB = TargetTaskAllocaBB; + OI->OuterAllocaBB = AllocaIP.getBlock(); // Add the thread ID argument. 
SmallVector ToBeDeleted; - OI.ExcludeArgsFromAggregate.push_back( + OI->ExcludeArgsFromAggregate.push_back( createFakeIntVal(Builder, M, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false)); @@ -8104,8 +8254,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // OI.ExitBlock is set to the single task body block and will get left out of // the outlining process. So, simply create a new empty block to which we // uncoditionally branch from where TaskBodyCB left off - OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont"); - emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(), + OI->ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont"); + emitBlock(OI->ExitBB, Builder.GetInsertBlock()->getParent(), /*IsFinished=*/true); SmallVector OffloadingArraysToPrivatize; @@ -8117,13 +8267,13 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( RTArgs.SizesArray}) { if (V && !isa(V)) { OffloadingArraysToPrivatize.push_back(V); - OI.ExcludeArgsFromAggregate.push_back(V); + OI->ExcludeArgsFromAggregate.push_back(V); } } } - OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask, - DeviceID, OffloadingArraysToPrivatize]( - Function &OutlinedFn) mutable { + OI->PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask, + DeviceID, OffloadingArraysToPrivatize]( + Function &OutlinedFn) mutable { assert(OutlinedFn.hasOneUse() && "there must be a single user for the outlined function"); @@ -10088,17 +10238,17 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, if (Error Err = BodyGenCB(AllocaIP, CodeGenIP)) return Err; - OutlineInfo OI; - OI.EntryBB = AllocaBB; - OI.ExitBB = ExitBB; - OI.OuterAllocaBB = &OuterAllocaBB; + auto OI = std::make_unique(); + OI->EntryBB = AllocaBB; + OI->ExitBB = ExitBB; + OI->OuterAllocaBB = &OuterAllocaBB; // Insert fake values for global tid and bound tid. 
SmallVector ToBeDeleted; InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin()); - OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( + OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal( Builder, M, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true)); - OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( + OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal( Builder, M, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true)); auto HostPostOutlineCB = [this, Ident, @@ -10138,7 +10288,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, }; if (!Config.isTargetDevice()) - OI.PostOutlineCB = HostPostOutlineCB; + OI->PostOutlineCB = HostPostOutlineCB; addOutlineInfo(std::move(OI)); @@ -10177,11 +10327,10 @@ OpenMPIRBuilder::createDistribute(const LocationDescription &Loc, // When using target we use different runtime functions which require a // callback. if (Config.isTargetDevice()) { - OutlineInfo OI; - OI.OuterAllocaBB = OuterAllocaIP.getBlock(); - OI.EntryBB = AllocaBB; - OI.ExitBB = ExitBB; - + auto OI = std::make_unique(); + OI->OuterAllocaBB = OuterAllocaIP.getBlock(); + OI->EntryBB = AllocaBB; + OI->ExitBB = ExitBB; addOutlineInfo(std::move(OI)); } Builder.SetInsertPoint(ExitBB, ExitBB->begin()); @@ -10243,6 +10392,39 @@ void OpenMPIRBuilder::OutlineInfo::collectBlocks( } } +std::unique_ptr +OpenMPIRBuilder::OutlineInfo::createCodeExtractor(ArrayRef Blocks, + bool ArgsInZeroAddressSpace, + Twine Suffix) { + return std::make_unique(Blocks, /* DominatorTree */ nullptr, + /* AggregateArgs */ true, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ true, + /* AllowAlloca */ true, + /* AllocationBlock*/ OuterAllocaBB, + /* DeallocationBlock */ nullptr, + /* Suffix */ Suffix.str(), + ArgsInZeroAddressSpace); +} + +std::unique_ptr DeviceSharedMemOutlineInfo::createCodeExtractor( + ArrayRef Blocks, bool ArgsInZeroAddressSpace, Twine Suffix) { + // TODO: 
Initialize the DeallocationBlock with a proper pair to OuterAllocaBB. + return std::make_unique( + OMPBuilder, AllocBlockOverride, Blocks, /* DominatorTree */ nullptr, + /* AggregateArgs */ true, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ true, + /* AllowAlloca */ true, + /* AllocationBlock*/ OuterAllocaBB, + /* DeallocationBlock */ ExitBB, + /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace); +} + void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index 3d8b7cbb59630..57809017a75a4 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -721,6 +721,7 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { SubRegion, &*DT, /* AggregateArgs */ false, /* BFI */ nullptr, /* BPI */ nullptr, AC, /* AllowVarArgs */ false, /* AllowAlloca */ false, /* AllocaBlock */ nullptr, + /* DeallocationBlock */ nullptr, /* Suffix */ "cold." 
+ std::to_string(OutlinedFunctionID)); if (CE.isEligible() && isSplittingBeneficial(CE, SubRegion, TTI) && diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index e3e31befdbfd2..e2bc1f3e86740 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -2829,7 +2829,7 @@ unsigned IROutliner::doOutline(Module &M) { OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, - false, nullptr, "outlined"); + false, nullptr, nullptr, "outlined"); findAddInputsOutputs(M, *OS, NotSame); if (!OS->IgnoreRegion) OutlinedRegions.push_back(OS); @@ -2940,7 +2940,7 @@ unsigned IROutliner::doOutline(Module &M) { OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, - false, nullptr, "outlined"); + false, nullptr, nullptr, "outlined"); bool FunctionOutlined = extractSection(*OS); if (FunctionOutlined) { unsigned StartIdx = OS->Candidate->getStartIdx(); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 40de78a1d6e31..d7b7abfd0391a 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -25,7 +25,6 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -264,11 +263,12 @@ CodeExtractor::CodeExtractor(ArrayRef BBs, DominatorTree *DT, bool AggregateArgs, BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI, AssumptionCache *AC, bool AllowVarArgs, bool AllowAlloca, - BasicBlock *AllocationBlock, std::string Suffix, + BasicBlock *AllocationBlock, + BasicBlock *DeallocationBlock, std::string Suffix, bool 
ArgsInZeroAddressSpace) : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI), BPI(BPI), AC(AC), AllocationBlock(AllocationBlock), - AllowVarArgs(AllowVarArgs), + DeallocationBlock(DeallocationBlock), AllowVarArgs(AllowVarArgs), Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)), Suffix(Suffix), ArgsInZeroAddressSpace(ArgsInZeroAddressSpace) {} @@ -444,6 +444,27 @@ CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) { return CommonExitBlock; } +Instruction *CodeExtractor::allocateVar(BasicBlock *BB, + BasicBlock::iterator AllocIP, + Type *VarType, const Twine &Name, + AddrSpaceCastInst **CastedAlloc) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + Instruction *Alloca = + new AllocaInst(VarType, DL.getAllocaAddrSpace(), nullptr, Name, AllocIP); + + if (CastedAlloc && ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) { + *CastedAlloc = new AddrSpaceCastInst( + Alloca, PointerType::get(BB->getContext(), 0), Name + ".ascast"); + (*CastedAlloc)->insertAfter(Alloca->getIterator()); + } + return Alloca; +} + +Instruction *CodeExtractor::deallocateVar(BasicBlock *, BasicBlock::iterator, + Value *, Type *) { + return nullptr; +} + // Find the pair of life time markers for address 'Addr' that are either // defined inside the outline region or can legally be shrinkwrapped into the // outline region. 
If there are not other untracked uses of the address, return @@ -1821,7 +1842,6 @@ CallInst *CodeExtractor::emitReplacerCall( std::vector &Reloads) { LLVMContext &Context = oldFunction->getContext(); Module *M = oldFunction->getParent(); - const DataLayout &DL = M->getDataLayout(); // This takes place of the original loop BasicBlock *codeReplacer = @@ -1852,25 +1872,22 @@ CallInst *CodeExtractor::emitReplacerCall( if (StructValues.contains(output)) continue; - AllocaInst *alloca = new AllocaInst( - output->getType(), DL.getAllocaAddrSpace(), nullptr, - output->getName() + ".loc", AllocaBlock->getFirstInsertionPt()); - params.push_back(alloca); - ReloadOutputs.push_back(alloca); + Value *OutAlloc = + allocateVar(AllocaBlock, AllocaBlock->getFirstInsertionPt(), + output->getType(), output->getName() + ".loc"); + params.push_back(OutAlloc); + ReloadOutputs.push_back(OutAlloc); } - AllocaInst *Struct = nullptr; + Instruction *Struct = nullptr; if (!StructValues.empty()) { - Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr, - "structArg", AllocaBlock->getFirstInsertionPt()); - if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) { - auto *StructSpaceCast = new AddrSpaceCastInst( - Struct, PointerType ::get(Context, 0), "structArg.ascast"); - StructSpaceCast->insertAfter(Struct->getIterator()); + AddrSpaceCastInst *StructSpaceCast = nullptr; + Struct = allocateVar(AllocaBlock, AllocaBlock->getFirstInsertionPt(), + StructArgTy, "structArg", &StructSpaceCast); + if (StructSpaceCast) params.push_back(StructSpaceCast); - } else { + else params.push_back(Struct); - } unsigned AggIdx = 0; for (Value *input : inputs) { @@ -2013,6 +2030,24 @@ CallInst *CodeExtractor::emitReplacerCall( insertLifetimeMarkersSurroundingCall(oldFunction->getParent(), LifetimesStart, {}, call); + // Deallocate intermediate variables if they need explicit deallocation. 
+ BasicBlock *DeallocBlock = codeReplacer; + BasicBlock::iterator DeallocIP = codeReplacer->end(); + if (DeallocationBlock) { + DeallocBlock = DeallocationBlock; + DeallocIP = DeallocationBlock->getFirstInsertionPt(); + } + + int Index = 0; + for (Value *Output : outputs) { + if (!StructValues.contains(Output)) + deallocateVar(DeallocBlock, DeallocIP, ReloadOutputs[Index++], + Output->getType()); + } + + if (Struct) + deallocateVar(DeallocBlock, DeallocIP, Struct, StructArgTy); + return call; } diff --git a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp index 9ea8de3da1e5b..6fd266a815dcf 100644 --- a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp +++ b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp @@ -711,7 +711,8 @@ TEST(CodeExtractor, OpenMPAggregateArgs) { /* AssumptionCache */ nullptr, /* AllowVarArgs */ true, /* AllowAlloca */ true, - /* AllocaBlock*/ &Func->getEntryBlock(), + /* AllocationBlock*/ &Func->getEntryBlock(), + /* DeallocationBlock */ nullptr, /* Suffix */ ".outlined", /* ArgsInZeroAddressSpace */ true); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 0c1c4e8b8a05c..09af46544441d 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -6389,6 +6389,7 @@ static void updateDebugInfoForDeclareTargetFunctions( static LogicalResult convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute, + llvm::OpenMPIRBuilder *ompBuilder, LLVM::ModuleTranslation &moduleTranslation) { // Amend omp.declare_target by deleting the IR of the outlined functions // created for target regions. 
They cannot be filtered out from MLIR earlier @@ -6411,8 +6412,14 @@ convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute, if (declareType == omp::DeclareTargetDeviceType::host) { llvmFunc->dropAllReferences(); llvmFunc->eraseFromParent(); - } else + + // Invalidate the builder's current insertion point, as it now points to + // a deleted block. + ompBuilder->Builder.ClearInsertionPoint(); + ompBuilder->Builder.SetCurrentDebugLocation(llvm::DebugLoc()); + } else { updateDebugInfoForDeclareTargetFunctions(llvmFunc, moduleTranslation); + } } return success(); } @@ -6567,9 +6574,12 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation( .Case("omp.declare_target", [&](Attribute attr) { if (auto declareTargetAttr = - dyn_cast(attr)) + dyn_cast(attr)) { + llvm::OpenMPIRBuilder *ompBuilder = + moduleTranslation.getOpenMPBuilder(); return convertDeclareTargetAttr(op, declareTargetAttr, - moduleTranslation); + ompBuilder, moduleTranslation); + } return failure(); }) .Case("omp.requires", diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir index 424e948fac750..e6b529a0d9da7 100644 --- a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir @@ -56,8 +56,6 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK-SAME: ptr %[[TMP:.*]], ptr %[[TMP0:.*]]) #{{[0-9]+}} { // CHECK: %[[TMP1:.*]] = alloca [1 x ptr], align 8, addrspace(5) // CHECK: %[[TMP2:.*]] = addrspacecast ptr addrspace(5) %[[TMP1]] to ptr -// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8, addrspace(5) -// CHECK: %[[STRUCTARG_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[STRUCTARG]] to ptr // CHECK: %[[TMP3:.*]] = alloca ptr, align 8, addrspace(5) // CHECK: %[[TMP4:.*]] = addrspacecast ptr addrspace(5) %[[TMP3]] to ptr // CHECK: store ptr %[[TMP0]], ptr %[[TMP4]], align 8 @@ -65,12 +63,14 @@ module 
attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: %[[EXEC_USER_CODE:.*]] = icmp eq i32 %[[TMP5]], -1 // CHECK: br i1 %[[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[WORKER_EXIT:.*]] // CHECK: %[[TMP6:.*]] = load ptr, ptr %[[TMP4]], align 8 +// CHECK: %[[STRUCTARG:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) // CHECK: %[[OMP_GLOBAL_THREAD_NUM:.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr)) -// CHECK: %[[GEP_:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG_ASCAST]], i32 0, i32 0 +// CHECK: %[[GEP_:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0 // CHECK: store ptr %[[TMP6]], ptr %[[GEP_]], align 8 // CHECK: %[[TMP7:.*]] = getelementptr inbounds [1 x ptr], ptr %[[TMP2]], i64 0, i64 0 -// CHECK: store ptr %[[STRUCTARG_ASCAST]], ptr %[[TMP7]], align 8 +// CHECK: store ptr %[[STRUCTARG]], ptr %[[TMP7]], align 8 // CHECK: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 1, i32 -1, i32 -1, ptr @[[FUNC1:.*]], ptr null, ptr %[[TMP2]], i64 1) +// CHECK: call void @__kmpc_free_shared(ptr %[[STRUCTARG]], i64 8) // CHECK: call void @__kmpc_target_deinit() // CHECK: define internal void @[[FUNC1]]( From 8c1f771b7c5c489c46ce0ef47ae7c32a65c01f53 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 4 Jul 2025 16:32:03 +0100 Subject: [PATCH 05/22] [OpenMP][OMPIRBuilder] Support parallel in Generic kernels This patch introduces codegen logic to produce a wrapper function argument for the `__kmpc_parallel_51` DeviceRTL function needed to handle arguments passed using device shared memory in Generic mode. 
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 100 ++++++++++++++++-- .../LLVMIR/omptarget-parallel-llvm.mlir | 25 ++++- 2 files changed, 116 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 03e27e91a0f57..f1e246c83f6ea 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1457,6 +1457,86 @@ Error OpenMPIRBuilder::emitCancelationCheckImpl( return Error::success(); } +/// Create wrapper function used to gather the outlined function's argument +/// structure from a shared buffer and to forward them to it when running in +/// Generic mode. +/// +/// The outlined function is expected to receive 2 integer arguments followed by +/// an optional pointer argument to an argument structure holding the rest. +static Function *createTargetParallelWrapper(OpenMPIRBuilder *OMPIRBuilder, + Function &OutlinedFn) { + size_t NumArgs = OutlinedFn.arg_size(); + assert((NumArgs == 2 || NumArgs == 3) && + "expected a 2-3 argument parallel outlined function"); + bool UseArgStruct = NumArgs == 3; + + IRBuilder<> &Builder = OMPIRBuilder->Builder; + IRBuilder<>::InsertPointGuard IPG(Builder); + auto *FnTy = FunctionType::get(Builder.getVoidTy(), + {Builder.getInt16Ty(), Builder.getInt32Ty()}, + /*isVarArg=*/false); + auto *WrapperFn = + Function::Create(FnTy, GlobalValue::InternalLinkage, + OutlinedFn.getName() + ".wrapper", OMPIRBuilder->M); + + WrapperFn->addParamAttr(0, Attribute::NoUndef); + WrapperFn->addParamAttr(0, Attribute::ZExt); + WrapperFn->addParamAttr(1, Attribute::NoUndef); + + BasicBlock *EntryBB = + BasicBlock::Create(OMPIRBuilder->M.getContext(), "entry", WrapperFn); + Builder.SetInsertPoint(EntryBB); + + // Allocation. 
+ Value *AddrAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), + /*ArraySize=*/nullptr, "addr"); + AddrAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast( + AddrAlloca, Builder.getPtrTy(/*AddrSpace=*/0), + AddrAlloca->getName() + ".ascast"); + + Value *ZeroAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), + /*ArraySize=*/nullptr, "zero"); + ZeroAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast( + ZeroAlloca, Builder.getPtrTy(/*AddrSpace=*/0), + ZeroAlloca->getName() + ".ascast"); + + Value *ArgsAlloca = nullptr; + if (UseArgStruct) { + ArgsAlloca = Builder.CreateAlloca(Builder.getPtrTy(), + /*ArraySize=*/nullptr, "global_args"); + ArgsAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast( + ArgsAlloca, Builder.getPtrTy(/*AddrSpace=*/0), + ArgsAlloca->getName() + ".ascast"); + } + + // Initialization. + Builder.CreateStore(WrapperFn->getArg(1), AddrAlloca); + Builder.CreateStore(Builder.getInt32(0), ZeroAlloca); + if (UseArgStruct) { + Builder.CreateCall( + OMPIRBuilder->getOrCreateRuntimeFunctionPtr( + llvm::omp::RuntimeFunction::OMPRTL___kmpc_get_shared_variables), + {ArgsAlloca}); + } + + SmallVector Args{AddrAlloca, ZeroAlloca}; + + // Load structArg from global_args. + if (UseArgStruct) { + Value *StructArg = Builder.CreateLoad(Builder.getPtrTy(), ArgsAlloca); + StructArg = Builder.CreateInBoundsGEP(Builder.getPtrTy(), StructArg, + {Builder.getInt64(0)}); + StructArg = Builder.CreateLoad(Builder.getPtrTy(), StructArg, "structArg"); + Args.push_back(StructArg); + } + + // Call the outlined function holding the parallel body. + Builder.CreateCall(&OutlinedFn, Args); + Builder.CreateRetVoid(); + + return WrapperFn; +} + // Callback used to create OpenMP runtime calls to support // omp parallel clause for the device. 
// We need to use this callback to replace call to the OutlinedFn in OuterFn @@ -1466,6 +1546,10 @@ static void targetParallelCallback( BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, Value *PrivTIDAddr, Value *ThreadID, const SmallVector &ToBeDeleted) { + assert(OutlinedFn.arg_size() >= 2 && + "Expected at least tid and bounded tid as arguments"); + unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2; + // Add some known attributes. IRBuilder<> &Builder = OMPIRBuilder->Builder; OutlinedFn.addParamAttr(0, Attribute::NoAlias); @@ -1474,17 +1558,12 @@ static void targetParallelCallback( OutlinedFn.addParamAttr(1, Attribute::NoUndef); OutlinedFn.addFnAttr(Attribute::NoUnwind); - assert(OutlinedFn.arg_size() >= 2 && - "Expected at least tid and bounded tid as arguments"); - unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2; - CallInst *CI = cast(OutlinedFn.user_back()); assert(CI && "Expected call instruction to outlined function"); CI->getParent()->setName("omp_parallel"); Builder.SetInsertPoint(CI); Type *PtrTy = OMPIRBuilder->VoidPtr; - Value *NullPtrValue = Constant::getNullValue(PtrTy); // Add alloca for kernel args OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP(); @@ -1510,6 +1589,15 @@ static void targetParallelCallback( IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32) : Builder.getInt32(1); + // If this is not a Generic kernel, we can skip generating the wrapper. + std::optional ExecMode = + getTargetKernelExecMode(*OuterFn); + Value *WrapperFn; + if (ExecMode && (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC)) + WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn); + else + WrapperFn = Constant::getNullValue(PtrTy); + // Build kmpc_parallel_51 call Value *Parallel51CallArgs[] = { /* identifier*/ Ident, @@ -1518,7 +1606,7 @@ static void targetParallelCallback( /* number of threads */ NumThreads ? 
NumThreads : Builder.getInt32(-1), /* Proc bind */ Builder.getInt32(-1), /* outlined function */ &OutlinedFn, - /* wrapper function */ NullPtrValue, + /* wrapper function */ WrapperFn, /* arguments of the outlined funciton*/ Args, /* number of arguments */ Builder.getInt64(NumCapturedVars)}; diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir index e6b529a0d9da7..7b157aeef4fe4 100644 --- a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir @@ -69,7 +69,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: store ptr %[[TMP6]], ptr %[[GEP_]], align 8 // CHECK: %[[TMP7:.*]] = getelementptr inbounds [1 x ptr], ptr %[[TMP2]], i64 0, i64 0 // CHECK: store ptr %[[STRUCTARG]], ptr %[[TMP7]], align 8 -// CHECK: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 1, i32 -1, i32 -1, ptr @[[FUNC1:.*]], ptr null, ptr %[[TMP2]], i64 1) +// CHECK: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 1, i32 -1, i32 -1, ptr @[[FUNC1:.*]], ptr @[[FUNC1_WRAPPER:.*]], ptr %[[TMP2]], i64 1) // CHECK: call void @__kmpc_free_shared(ptr %[[STRUCTARG]], i64 8) // CHECK: call void @__kmpc_target_deinit() @@ -84,7 +84,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: call void @__kmpc_parallel_51(ptr addrspacecast ( // CHECK-SAME: ptr addrspace(1) @[[NUM_THREADS_GLOB:[0-9]+]] to ptr), // CHECK-SAME: i32 [[NUM_THREADS_TMP0:%.*]], i32 1, i32 156, -// CHECK-SAME: i32 -1, ptr [[FUNC_NUM_THREADS1:@.*]], ptr null, ptr [[NUM_THREADS_TMP1:%.*]], i64 1) +// CHECK-SAME: i32 -1, ptr @[[FUNC_NUM_THREADS1:.*]], ptr @[[FUNC2_WRAPPER:.*]], ptr [[NUM_THREADS_TMP1:%.*]], i64 1) // One of the arguments of kmpc_parallel_51 function is responsible for handling 
if clause // of omp parallel construct for target region. If this argument is nonzero, @@ -105,4 +105,23 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: call void @__kmpc_parallel_51(ptr addrspacecast ( // CHECK-SAME: ptr addrspace(1) {{.*}} to ptr), // CHECK-SAME: i32 {{.*}}, i32 %[[IFCOND_TMP4]], i32 -1, -// CHECK-SAME: i32 -1, ptr {{.*}}, ptr null, ptr {{.*}}, i64 1) +// CHECK-SAME: i32 -1, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i64 1) + +// CHECK: define internal void @[[FUNC1_WRAPPER]](i16 noundef zeroext %{{.*}}, i32 noundef %[[ADDR:.*]]) +// CHECK: %[[ADDR_ALLOCA:.*]] = alloca i32, align 4, addrspace(5) +// CHECK: %[[ADDR_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[ADDR_ALLOCA]] to ptr +// CHECK: %[[ZERO_ALLOCA:.*]] = alloca i32, align 4, addrspace(5) +// CHECK: %[[ZERO_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[ZERO_ALLOCA]] to ptr +// CHECK: %[[ARGS_ALLOCA:.*]] = alloca ptr, align 8, addrspace(5) +// CHECK: %[[ARGS_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[ARGS_ALLOCA]] to ptr +// CHECK: store i32 %[[ADDR]], ptr %[[ADDR_ASCAST]] +// CHECK: store i32 0, ptr %[[ZERO_ASCAST]] +// CHECK: call void @__kmpc_get_shared_variables(ptr %[[ARGS_ASCAST]]) +// CHECK: %[[LOAD_ARGS:.*]] = load ptr, ptr %[[ARGS_ASCAST]], align 8 +// CHECK: %[[FIRST_ARG:.*]] = getelementptr inbounds ptr, ptr %[[LOAD_ARGS]], i64 0 +// CHECK: %[[STRUCTARG:.*]] = load ptr, ptr %[[FIRST_ARG]], align 8 +// CHECK: call void @[[FUNC1]](ptr %[[ADDR_ASCAST]], ptr %[[ZERO_ASCAST]], ptr %[[STRUCTARG]]) + +// CHECK: define internal void @[[FUNC2_WRAPPER]](i16 noundef zeroext %{{.*}}, i32 noundef %{{.*}}) +// CHECK-NOT: define +// CHECK: call void @[[FUNC_NUM_THREADS1]]({{.*}}) From d1929e363b6835f8c4697419888ce7442333ec25 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 25 Jul 2025 13:52:11 +0100 Subject: [PATCH 06/22] [OpenMPOpt] Make parallel regions reachable from new DeviceRTL loop functions This patch updates the OpenMP optimization 
pass to know about the new DeviceRTL functions for loop constructs. This change marks these functions as potentially containing parallel regions, which fixes a current bug with the state machine rewrite optimization. It previously failed to identify parallel regions located inside of the callbacks passed to these new DeviceRTL functions, causing the resulting code to skip executing these parallel regions. As a result, Generic kernels produced by Flang that contain parallel regions now work properly. One known related issue not fixed by this patch is that the presence of calls to these functions will prevent the SPMD-ization of Generic kernels by OpenMPOpt. Previously, this was due to assuming there was no parallel region. This is changed by this patch, but instead we now mark it temporarily as unsupported in an SPMD context. The reason is that, without additional changes, code intended for the main thread of the team located outside of the parallel region would not be guarded properly, resulting in race conditions and generally invalid behavior. --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 23 ++++ .../fortran/target-generic-loops.f90 | 130 ++++++++++++++++++ .../offloading/fortran/target-spmd-loops.f90 | 39 ++++++ 3 files changed, 192 insertions(+) create mode 100644 offload/test/offloading/fortran/target-generic-loops.f90 create mode 100644 offload/test/offloading/fortran/target-spmd-loops.f90 diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 8b1968343416e..8428620e47ff0 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5047,6 +5047,29 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_free_shared: // Return without setting a fixpoint, to be resolved in updateImpl. 
return; + case OMPRTL___kmpc_distribute_static_loop_4: + case OMPRTL___kmpc_distribute_static_loop_4u: + case OMPRTL___kmpc_distribute_static_loop_8: + case OMPRTL___kmpc_distribute_static_loop_8u: + case OMPRTL___kmpc_distribute_for_static_loop_4: + case OMPRTL___kmpc_distribute_for_static_loop_4u: + case OMPRTL___kmpc_distribute_for_static_loop_8: + case OMPRTL___kmpc_distribute_for_static_loop_8u: + case OMPRTL___kmpc_for_static_loop_4: + case OMPRTL___kmpc_for_static_loop_4u: + case OMPRTL___kmpc_for_static_loop_8: + case OMPRTL___kmpc_for_static_loop_8u: + // Parallel regions might be reached by these calls, as they take a + // callback argument potentially containing arbitrary user-provided + // code. + ReachedUnknownParallelRegions.insert(&CB); + // TODO: The presence of these calls on their own does not prevent a + // kernel from being SPMD-izable. We mark it as such because we need + // further changes in order to also consider the contents of the + // callbacks passed to them. + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + SPMDCompatibilityTracker.insert(&CB); + break; default: // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, // generally. However, they do not hide parallel regions. diff --git a/offload/test/offloading/fortran/target-generic-loops.f90 b/offload/test/offloading/fortran/target-generic-loops.f90 new file mode 100644 index 0000000000000..07bcbfd2c8752 --- /dev/null +++ b/offload/test/offloading/fortran/target-generic-loops.f90 @@ -0,0 +1,130 @@ +! Offloading test for generic target regions containing different kinds of +! loop constructs inside. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + integer :: i1, i2, n1, n2, counter + + n1 = 100 + n2 = 50 + + counter = 0 + !$omp target map(tofrom:counter) + !$omp teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! 
CHECK: 1 100 + print '(I2" "I0)', 1, counter + + counter = 0 + !$omp target map(tofrom:counter) + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 2 200 + print '(I2" "I0)', 2, counter + + counter = 0 + !$omp target map(tofrom:counter) + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp end target + + ! CHECK: 3 203 + print '(I2" "I0)', 3, counter + + counter = 0 + !$omp target map(tofrom: counter) + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp end target + + ! CHECK: 4 102 + print '(I2" "I0)', 4, counter + + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + end do + + ! CHECK: 5 5000 + print '(I2" "I0)', 5, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + end do + + ! CHECK: 6 5200 + print '(I2" "I0)', 6, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + end do + + ! 
CHECK: 7 10000 + print '(I2" "I0)', 7, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + end do + + ! CHECK: 8 10300 + print '(I2" "I0)', 8, counter +end program diff --git a/offload/test/offloading/fortran/target-spmd-loops.f90 b/offload/test/offloading/fortran/target-spmd-loops.f90 new file mode 100644 index 0000000000000..7407f0c0768cb --- /dev/null +++ b/offload/test/offloading/fortran/target-spmd-loops.f90 @@ -0,0 +1,39 @@ +! Offloading test for SPMD target regions containing different kinds of +! loop constructs inside. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + integer :: i1, n1, counter + + n1 = 100 + + counter = 0 + !$omp target parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + + ! CHECK: 1 100 + print '(I2" "I0)', 1, counter + + counter = 0 + !$omp target map(tofrom:counter) + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 2 100 + print '(I2" "I0)', 2, counter + + counter = 0 + !$omp target teams distribute parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + + ! CHECK: 3 100 + print '(I2" "I0)', 3, counter +end program From 1135d95b6d24180c2d9ea1256a4aa675e27a21f9 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 21 Aug 2025 13:18:49 +0100 Subject: [PATCH 07/22] [OMPIRBuilder] Add support for explicit deallocation points In this patch, some OMPIRBuilder codegen functions and callbacks are updated to work with arrays of deallocation insertion points.
The purpose of this is to enable the replacement of `alloca`s with other types of allocations that require explicit deallocations in a way that makes it possible for `CodeExtractor` instances created during OMPIRBuilder finalization to also use them. The OpenMP to LLVM IR MLIR translation pass is updated to properly store and forward deallocation points together with their matching allocation point to the OMPIRBuilder. Currently, only the `DeviceSharedMemCodeExtractor` uses this feature to get the `CodeExtractor` to use device shared memory for intermediate allocations when outlining a parallel region inside of a Generic kernel (code path that is only used by Flang via MLIR, currently). However, long term this might also be useful to refactor finalization of variables with destructors, potentially reducing the use of callbacks and simplifying privatization and reductions. Instead of a single deallocation point, lists of those are used. This is to cover cases where there are multiple exit blocks originating from a single entry. If an allocation needing explicit deallocation is placed in the entry block of such cases, it would need to be deallocated before each of the exits. 
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 4 +- clang/lib/CodeGen/CGStmtOpenMP.cpp | 77 ++-- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 99 +++--- .../llvm/Transforms/Utils/CodeExtractor.h | 24 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 299 ++++++++-------- llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 2 +- llvm/lib/Transforms/IPO/IROutliner.cpp | 4 +- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 11 +- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 37 +- .../Frontend/OpenMPIRBuilderTest.cpp | 328 +++++++++++------- .../Transforms/Utils/CodeExtractorTest.cpp | 2 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 233 ++++++++----- .../LLVMIR/omptarget-parallel-llvm.mlir | 18 +- .../LLVMIR/omptarget-region-device-llvm.mlir | 4 +- .../openmp-target-private-allocatable.mlir | 2 + 15 files changed, 630 insertions(+), 514 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index a044168205d67..7cbe23e21f6c1 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11354,8 +11354,8 @@ void CGOpenMPRuntime::emitTargetDataCalls( llvm::OpenMPIRBuilder::LocationDescription OmpLoc(CodeGenIP); llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPBuilder.createTargetData( - OmpLoc, AllocaIP, CodeGenIP, DeviceID, IfCondVal, Info, GenMapInfoCB, - CustomMapperCB, + OmpLoc, AllocaIP, CodeGenIP, /*DeallocIPs=*/{}, DeviceID, IfCondVal, + Info, GenMapInfoCB, CustomMapperCB, /*MapperFunc=*/nullptr, BodyCB, DeviceAddrCB, RTLoc)); CGF.Builder.restoreIP(AfterIP); } diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index dd9a4b3fa076a..9a5104dc70fb1 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -2238,10 +2238,10 @@ void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { const CapturedStmt *CS = S.getCapturedStmt(OMPD_parallel); const Stmt *ParallelRegionBodyStmt = CS->getCapturedStmt(); - auto 
BodyGenCB = [&, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [&, this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { OMPBuilderCBHelpers::EmitOMPOutlinedRegionBody( - *this, ParallelRegionBodyStmt, AllocaIP, CodeGenIP, "parallel"); + *this, ParallelRegionBodyStmt, AllocIP, CodeGenIP, "parallel"); return llvm::Error::success(); }; @@ -2249,9 +2249,10 @@ void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI); llvm::OpenMPIRBuilder::InsertPointTy AllocaIP( AllocaInsertPt->getParent(), AllocaInsertPt->getIterator()); - llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail( - OMPBuilder.createParallel(Builder, AllocaIP, BodyGenCB, PrivCB, FiniCB, - IfCond, NumThreads, ProcBind, S.hasCancel())); + llvm::OpenMPIRBuilder::InsertPointTy AfterIP = + cantFail(OMPBuilder.createParallel( + Builder, AllocaIP, /*DeallocIPs=*/{}, BodyGenCB, PrivCB, FiniCB, + IfCond, NumThreads, ProcBind, S.hasCancel())); Builder.restoreIP(AfterIP); return; } @@ -4936,21 +4937,23 @@ void CodeGenFunction::EmitOMPSectionsDirective(const OMPSectionsDirective &S) { llvm::SmallVector SectionCBVector; if (CS) { for (const Stmt *SubStmt : CS->children()) { - auto SectionCB = [this, SubStmt](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { - OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, SubStmt, AllocaIP, CodeGenIP, "section"); + auto SectionCB = [this, SubStmt](InsertPointTy AllocIP, + InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + OMPBuilderCBHelpers::EmitOMPInlinedRegionBody(*this, SubStmt, AllocIP, + CodeGenIP, "section"); return llvm::Error::success(); }; SectionCBVector.push_back(SectionCB); } } else { - auto SectionCB = [this, CapturedStmt](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { - OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, CapturedStmt, AllocaIP, CodeGenIP, "section"); - return llvm::Error::success(); - }; + 
auto SectionCB = + [this, CapturedStmt](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( + *this, CapturedStmt, AllocIP, CodeGenIP, "section"); + return llvm::Error::success(); + }; SectionCBVector.push_back(SectionCB); } @@ -5004,10 +5007,11 @@ void CodeGenFunction::EmitOMPSectionDirective(const OMPSectionDirective &S) { return llvm::Error::success(); }; - auto BodyGenCB = [SectionRegionBodyStmt, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [SectionRegionBodyStmt, + this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, SectionRegionBodyStmt, AllocaIP, CodeGenIP, "section"); + *this, SectionRegionBodyStmt, AllocIP, CodeGenIP, "section"); return llvm::Error::success(); }; @@ -5089,10 +5093,11 @@ void CodeGenFunction::EmitOMPMasterDirective(const OMPMasterDirective &S) { return llvm::Error::success(); }; - auto BodyGenCB = [MasterRegionBodyStmt, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [MasterRegionBodyStmt, + this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, MasterRegionBodyStmt, AllocaIP, CodeGenIP, "master"); + *this, MasterRegionBodyStmt, AllocIP, CodeGenIP, "master"); return llvm::Error::success(); }; @@ -5139,10 +5144,11 @@ void CodeGenFunction::EmitOMPMaskedDirective(const OMPMaskedDirective &S) { return llvm::Error::success(); }; - auto BodyGenCB = [MaskedRegionBodyStmt, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [MaskedRegionBodyStmt, + this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, MaskedRegionBodyStmt, AllocaIP, CodeGenIP, "masked"); + *this, MaskedRegionBodyStmt, AllocIP, CodeGenIP, "masked"); return llvm::Error::success(); }; @@ 
-5182,10 +5188,11 @@ void CodeGenFunction::EmitOMPCriticalDirective(const OMPCriticalDirective &S) { return llvm::Error::success(); }; - auto BodyGenCB = [CriticalRegionBodyStmt, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [CriticalRegionBodyStmt, + this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, CriticalRegionBodyStmt, AllocaIP, CodeGenIP, "critical"); + *this, CriticalRegionBodyStmt, AllocIP, CodeGenIP, "critical"); return llvm::Error::success(); }; @@ -6152,8 +6159,8 @@ void CodeGenFunction::EmitOMPTaskgroupDirective( InsertPointTy AllocaIP(AllocaInsertPt->getParent(), AllocaInsertPt->getIterator()); - auto BodyGenCB = [&, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [&, this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); EmitStmt(S.getInnermostCapturedStmt()->getCapturedStmt()); return llvm::Error::success(); @@ -6162,7 +6169,8 @@ void CodeGenFunction::EmitOMPTaskgroupDirective( if (!CapturedStmtInfo) CapturedStmtInfo = &CapStmtInfo; llvm::OpenMPIRBuilder::InsertPointTy AfterIP = - cantFail(OMPBuilder.createTaskgroup(Builder, AllocaIP, BodyGenCB)); + cantFail(OMPBuilder.createTaskgroup(Builder, AllocaIP, + /*DeallocIPs=*/{}, BodyGenCB)); Builder.restoreIP(AfterIP); return; } @@ -6879,8 +6887,9 @@ void CodeGenFunction::EmitOMPOrderedDirective(const OMPOrderedDirective &S) { return llvm::Error::success(); }; - auto BodyGenCB = [&S, C, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [&S, C, this](InsertPointTy AllocIP, + InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); const CapturedStmt *CS = S.getInnermostCapturedStmt(); @@ -6898,7 +6907,7 @@ void CodeGenFunction::EmitOMPOrderedDirective(const OMPOrderedDirective &S) { OutlinedFn, CapturedVars); } else { 
OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, CS->getCapturedStmt(), AllocaIP, CodeGenIP, "ordered"); + *this, CS->getCapturedStmt(), AllocIP, CodeGenIP, "ordered"); } return llvm::Error::success(); }; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 364a4950cb5b4..f5e890e7052f0 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -613,17 +613,19 @@ class OpenMPIRBuilder { /// such InsertPoints need to be preserved, it can split the block itself /// before calling the callback. /// - /// AllocaIP and CodeGenIP must not point to the same position. - /// - /// \param AllocaIP is the insertion point at which new alloca instructions - /// should be placed. The BasicBlock it is pointing to must - /// not be split. - /// \param CodeGenIP is the insertion point at which the body code should be - /// placed. - /// + /// AllocIP and CodeGenIP must not point to the same position. + /// + /// \param AllocIP is the insertion point at which new allocations should + /// be placed. The BasicBlock it is pointing to must not be + /// split. + /// \param CodeGenIP is the insertion point at which the body code should be + /// placed. + /// \param DeallocIPs is the list of insertion points where explicit + /// deallocations, if needed, should be placed. /// \return an error, if any were triggered during execution. using BodyGenCallbackTy = - function_ref; + function_ref DeallocIPs)>; // This is created primarily for sections construct as llvm::function_ref // (BodyGenCallbackTy) is not storable (as described in the comments of @@ -632,7 +634,8 @@ class OpenMPIRBuilder { /// /// \return an error, if any were triggered during execution. using StorableBodyGenCallbackTy = - std::function; + std::function DeallocIPs)>; /// Callback type for loop body code generation. 
/// @@ -726,7 +729,9 @@ class OpenMPIRBuilder { /// Generator for '#omp parallel' /// /// \param Loc The insert and source location description. - /// \param AllocaIP The insertion points to be used for alloca instructions. + /// \param AllocIP The insertion point to be used for allocations. + /// \param DeallocIPs The insertion points to be used for explicit + /// deallocations, if needed. /// \param BodyGenCB Callback that will generate the region code. /// \param PrivCB Callback to copy a given variable (think copy constructor). /// \param FiniCB Callback to finalize variable copies. @@ -737,10 +742,10 @@ class OpenMPIRBuilder { /// /// \returns The insertion position *after* the parallel. LLVM_ABI InsertPointOrErrorTy createParallel( - const LocationDescription &Loc, InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, - FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, - omp::ProcBindKind ProcBind, bool IsCancellable); + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB, + PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, + Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable); /// Generator for the control flow structure of an OpenMP canonical loop. /// @@ -1364,7 +1369,9 @@ class OpenMPIRBuilder { /// Generator for `#omp task` /// /// \param Loc The location where the task construct was encountered. - /// \param AllocaIP The insertion point to be used for alloca instructions. + /// \param AllocIP The insertion point to be used for allocations. + /// \param DeallocIPs The insertion points to be used for explicit + /// deallocations, if needed. /// \param BodyGenCB Callback that will generate the region code. /// \param Tied True if the task is tied, false if the task is untied. 
/// \param Final i1 value which is `true` if the task is final, `false` if the @@ -1380,21 +1387,23 @@ class OpenMPIRBuilder { /// \param Mergeable If the given task is `mergeable` /// \param priority `priority-value' specifies the execution order of the /// tasks that is generated by the construct - LLVM_ABI InsertPointOrErrorTy - createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB, bool Tied = true, - Value *Final = nullptr, Value *IfCondition = nullptr, - SmallVector Dependencies = {}, bool Mergeable = false, - Value *EventHandle = nullptr, Value *Priority = nullptr); + LLVM_ABI InsertPointOrErrorTy createTask( + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB, + bool Tied = true, Value *Final = nullptr, Value *IfCondition = nullptr, + SmallVector Dependencies = {}, bool Mergeable = false, + Value *EventHandle = nullptr, Value *Priority = nullptr); /// Generator for the taskgroup construct /// /// \param Loc The location where the taskgroup construct was encountered. - /// \param AllocaIP The insertion point to be used for alloca instructions. + /// \param AllocIP The insertion point to be used for allocations. + /// \param DeallocIPs The insertion point to be used for explicit deallocation + /// instructions, if needed. /// \param BodyGenCB Callback that will generate the region code. 
- LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, - InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB); + LLVM_ABI InsertPointOrErrorTy createTaskgroup( + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB); using FileIdentifierInfoCallbackTy = std::function()>; @@ -2274,7 +2283,8 @@ class OpenMPIRBuilder { struct OutlineInfo { using PostOutlineCBTy = std::function; PostOutlineCBTy PostOutlineCB; - BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB; + BasicBlock *EntryBB, *ExitBB, *OuterAllocBB; + SmallVector OuterDeallocBBs; SmallVector ExcludeArgsFromAggregate; LLVM_ABI virtual ~OutlineInfo() = default; @@ -2347,7 +2357,8 @@ class OpenMPIRBuilder { /// \return an error, if any were triggered during execution. LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, - InsertPointTy AllocaIP = {}); + InsertPointTy AllocIP = {}, + ArrayRef DeallocIPs = {}); /// Create the global variable holding the offload mappings information. LLVM_ABI GlobalVariable * @@ -2902,11 +2913,13 @@ class OpenMPIRBuilder { /// Generator for `#omp distribute` /// /// \param Loc The location where the distribute construct was encountered. - /// \param AllocaIP The insertion points to be used for alloca instructions. + /// \param AllocIP The insertion point to be used for allocations. + /// \param DeallocIPs The insertion points to be used for explicit + /// deallocations, if needed. /// \param BodyGenCB Callback that will generate the region code. 
- LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, - InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB); + LLVM_ABI InsertPointOrErrorTy createDistribute( + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB); /// Generate conditional branch and relevant BasicBlocks through which private /// threads copy the 'copyin' variables from Master copy to threadprivate @@ -3234,9 +3247,11 @@ class OpenMPIRBuilder { /// Generator for '#omp target data' /// /// \param Loc The location where the target data construct was encountered. - /// \param AllocaIP The insertion points to be used for alloca instructions. + /// \param AllocIP The insertion points to be used for allocations. /// \param CodeGenIP The insertion point at which the target directive code /// should be placed. + /// \param DeallocIPs The insertion points at which explicit deallocations + /// should be placed, if needed. /// \param IsBegin If true then emits begin mapper call otherwise emits /// end mapper call. /// \param DeviceID Stores the DeviceID from the device clause. @@ -3249,10 +3264,10 @@ class OpenMPIRBuilder { /// \param DeviceAddrCB Optional callback to generate code related to /// use_device_ptr and use_device_addr. 
LLVM_ABI InsertPointOrErrorTy createTargetData( - const LocationDescription &Loc, InsertPointTy AllocaIP, - InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, - TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, - CustomMapperCallbackTy CustomMapperCB, + const LocationDescription &Loc, InsertPointTy AllocIP, + InsertPointTy CodeGenIP, ArrayRef DeallocIPs, + Value *DeviceID, Value *IfCond, TargetDataInfo &Info, + GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc = nullptr, function_ref @@ -3261,7 +3276,8 @@ class OpenMPIRBuilder { Value *SrcLocInfo = nullptr); using TargetBodyGenCallbackTy = function_ref; + InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs)>; using TargetGenArgAccessorsCallbackTy = function_ref DeallocIPs, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h index b3bea96039172..7b1e3a759470f 100644 --- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h +++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h @@ -100,13 +100,13 @@ class CodeExtractorAnalysisCache { /// will be placed in the entry block of the function. BasicBlock *AllocationBlock; - /// A block outside of the extraction set where deallocations for - /// intermediate allocations can be placed inside. Not used for - /// automatically deallocated memory (e.g. `alloca`), which is the default. + /// A set of blocks outside of the extraction set where deallocations for + /// intermediate allocations should be placed. Not used for automatically + /// deallocated memory (e.g. `alloca`), which is the default. /// - /// If it is null and needed, the end of the replacement basic block will be - /// used to place deallocations. 
- BasicBlock *DeallocationBlock; + /// If it is empty and needed, the end of the replacement basic block will + /// be used to place deallocations. + SmallVector DeallocationBlocks; /// If true, varargs functions can be extracted. bool AllowVarArgs; @@ -156,11 +156,11 @@ class CodeExtractorAnalysisCache { /// Any new allocations will be placed in the AllocationBlock, unless /// it is null, in which case it will be placed in the entry block of /// the function from which the code is being extracted. Explicit - /// deallocations for the aforementioned allocations will be placed in the - /// DeallocationBlock or the end of the replacement block, if needed. - /// If ArgsInZeroAddressSpace param is set to true, then the aggregate - /// param pointer of the outlined function is declared in zero address - /// space. + /// deallocations for the aforementioned allocations will be placed, if + /// needed, in all blocks in DeallocationBlocks or the end of the + /// replacement block. If ArgsInZeroAddressSpace param is set to true, then + /// the aggregate param pointer of the outlined function is declared in zero + /// address space. 
LLVM_ABI CodeExtractor(ArrayRef BBs, DominatorTree *DT = nullptr, bool AggregateArgs = false, BlockFrequencyInfo *BFI = nullptr, @@ -168,7 +168,7 @@ class CodeExtractorAnalysisCache { AssumptionCache *AC = nullptr, bool AllowVarArgs = false, bool AllowAlloca = false, BasicBlock *AllocationBlock = nullptr, - BasicBlock *DeallocationBlock = nullptr, + ArrayRef DeallocationBlocks = {}, std::string Suffix = "", bool ArgsInZeroAddressSpace = false); LLVM_ABI virtual ~CodeExtractor() = default; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index f1e246c83f6ea..75295d45cb958 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -496,10 +496,10 @@ class OMPCodeExtractor : public CodeExtractor { AssumptionCache *AC = nullptr, bool AllowVarArgs = false, bool AllowAlloca = false, BasicBlock *AllocationBlock = nullptr, - BasicBlock *DeallocationBlock = nullptr, + ArrayRef DeallocationBlocks = {}, std::string Suffix = "", bool ArgsInZeroAddressSpace = false) : CodeExtractor(BBs, DT, AggregateArgs, BFI, BPI, AC, AllowVarArgs, - AllowAlloca, AllocationBlock, DeallocationBlock, Suffix, + AllowAlloca, AllocationBlock, DeallocationBlocks, Suffix, ArgsInZeroAddressSpace), OMPBuilder(OMPBuilder) {} @@ -511,32 +511,16 @@ class OMPCodeExtractor : public CodeExtractor { class DeviceSharedMemCodeExtractor : public OMPCodeExtractor { public: - DeviceSharedMemCodeExtractor( - OpenMPIRBuilder &OMPBuilder, BasicBlock *AllocBlockOverride, - ArrayRef BBs, DominatorTree *DT = nullptr, - bool AggregateArgs = false, BlockFrequencyInfo *BFI = nullptr, - BranchProbabilityInfo *BPI = nullptr, AssumptionCache *AC = nullptr, - bool AllowVarArgs = false, bool AllowAlloca = false, - BasicBlock *AllocationBlock = nullptr, - BasicBlock *DeallocationBlock = nullptr, std::string Suffix = "", - bool ArgsInZeroAddressSpace = false) - : OMPCodeExtractor(OMPBuilder, BBs, DT, AggregateArgs, BFI, BPI, AC, - 
AllowVarArgs, AllowAlloca, AllocationBlock, - DeallocationBlock, Suffix, ArgsInZeroAddressSpace), - AllocBlockOverride(AllocBlockOverride) {} + using OMPCodeExtractor::OMPCodeExtractor; virtual ~DeviceSharedMemCodeExtractor() = default; protected: virtual Instruction * - allocateVar(BasicBlock *, BasicBlock::iterator, Type *VarType, + allocateVar(BasicBlock *BB, BasicBlock::iterator AllocIP, Type *VarType, const Twine &Name = Twine(""), AddrSpaceCastInst **CastedAlloc = nullptr) override { - // Ignore the CastedAlloc pointer, if requested, because shared memory - // should not be casted to address space 0 to be passed around. return OMPBuilder.createOMPAllocShared( - OpenMPIRBuilder::InsertPointTy( - AllocBlockOverride, AllocBlockOverride->getFirstInsertionPt()), - VarType, Name); + OpenMPIRBuilder::InsertPointTy(BB, AllocIP), VarType, Name); } virtual Instruction *deallocateVar(BasicBlock *BB, @@ -545,19 +529,12 @@ class DeviceSharedMemCodeExtractor : public OMPCodeExtractor { return OMPBuilder.createOMPFreeShared( OpenMPIRBuilder::InsertPointTy(BB, DeallocIP), Var, VarType); } - -private: - // TODO: Remove the need for this override and instead get the CodeExtractor - // to provide a valid insert point for explicit deallocations by correctly - // populating its DeallocationBlock. - BasicBlock *AllocBlockOverride; }; /// Helper storing information about regions to outline using device shared /// memory for intermediate allocations. 
struct DeviceSharedMemOutlineInfo : public OpenMPIRBuilder::OutlineInfo { OpenMPIRBuilder &OMPBuilder; - BasicBlock *AllocBlockOverride = nullptr; DeviceSharedMemOutlineInfo(OpenMPIRBuilder &OMPBuilder) : OMPBuilder(OMPBuilder) {} @@ -1718,11 +1695,11 @@ hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, } OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( - const LocationDescription &Loc, InsertPointTy OuterAllocaIP, - BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, - FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, - omp::ProcBindKind ProcBind, bool IsCancellable) { - assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous"); + const LocationDescription &Loc, InsertPointTy OuterAllocIP, + ArrayRef OuterDeallocIPs, BodyGenCallbackTy BodyGenCB, + PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, + Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable) { + assert(!isConflictIP(Loc.IP, OuterAllocIP) && "IPs must not be ambiguous"); if (!updateToLocation(Loc)) return Loc.IP; @@ -1762,7 +1739,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( // Save the outer alloca block because the insertion iterator may get // invalidated and we still need this later. - BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock(); + BasicBlock *OuterAllocaBlock = OuterAllocIP.getBlock(); // Vector to remember instructions we used only during the modeling but which // we want to delete at the end. @@ -1860,7 +1837,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( // Let the caller create the body. 
assert(BodyGenCB && "Expected body generation callback!"); InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin()); - if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP)) + InsertPointTy DeallocIP(PRegExitBB, PRegExitBB->begin()); + if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP, DeallocIP)) return Err; LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); @@ -1873,35 +1851,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( // If OuterFn is not a Generic kernel, skip custom allocation. This causes // the CodeExtractor to follow its default behavior. Otherwise, we need to // use device shared memory to allocate argument structures. - if (ExecMode && *ExecMode & OMP_TGT_EXEC_MODE_GENERIC) { - auto Info = std::make_unique(*this); - - // Instead of using the insertion point provided by the CodeExtractor, - // here we need to use the block that eventually calls the outlined - // function for the `parallel` construct. - // - // The reason is that the explicit deallocation call will be inserted - // within the outlined function, whereas the alloca insertion point - // might actually be located somewhere else in the caller. This becomes - // a problem when e.g. `parallel` is inside of a `distribute` construct, - // because the deallocation would be executed multiple times and the - // allocation just once (outside of the loop). - // - // TODO: Ideally, we'd want to do the allocation and deallocation - // outside of the `parallel` outlined function, hence using here the - // insertion point provided by the CodeExtractor. We can't do this at - // the moment because there is currently no way of passing an eligible - // insertion point for the explicit deallocation to the CodeExtractor, - // as that block is created (at least when nested inside of - // `distribute`) sometime after createParallel() completed, so it can't - // be stored in the OutlineInfo structure here. 
- // - // The current approach results in an explicit allocation and - // deallocation pair for each `distribute` loop iteration in that case, - // which is suboptimal. - Info->AllocBlockOverride = EntryBB; - return Info; - } + if (ExecMode && *ExecMode & OMP_TGT_EXEC_MODE_GENERIC) + return std::make_unique(*this); } return std::make_unique(); }(); @@ -1923,9 +1874,12 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( }; } - OI->OuterAllocaBB = OuterAllocaBlock; + OI->OuterAllocBB = OuterAllocaBlock; OI->EntryBB = PRegEntryBB; OI->ExitBB = PRegExitBB; + OI->OuterDeallocBBs.reserve(OuterDeallocIPs.size()); + for (InsertPointTy DeallocIP : OuterDeallocIPs) + OI->OuterDeallocBBs.push_back(DeallocIP.getBlock()); SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; @@ -1940,7 +1894,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocationBlock */ OuterAllocaBlock, - /* DeallocationBlock */ nullptr, + /* DeallocationBlocks */ {}, /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); // Find inputs to, outputs from the code region. @@ -1986,7 +1940,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( IRBuilder<>::InsertPointGuard Guard(Builder); LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n"); - Builder.restoreIP(OuterAllocaIP); + Builder.restoreIP(OuterAllocIP); Value *Ptr = Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded"); @@ -2038,7 +1992,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( // Reset the outer alloca insertion point to the entry of the relevant block // in case it was invalidated. 
- OuterAllocaIP = IRBuilder<>::InsertPoint( + OuterAllocIP = IRBuilder<>::InsertPoint( OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt()); for (Value *Input : Inputs) { @@ -2204,10 +2158,10 @@ static Value *emitTaskDependencies( } OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( - const LocationDescription &Loc, InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition, - SmallVector Dependencies, bool Mergeable, Value *EventHandle, - Value *Priority) { + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB, bool Tied, + Value *Final, Value *IfCondition, SmallVector Dependencies, + bool Mergeable, Value *EventHandle, Value *Priority) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -2239,18 +2193,22 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( InsertPointTy TaskAllocaIP = InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin()); InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin()); - if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP)) + InsertPointTy TaskDeallocIP = InsertPointTy(TaskExitBB, TaskExitBB->begin()); + if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP, TaskDeallocIP)) return Err; auto OI = std::make_unique(); OI->EntryBB = TaskAllocaBB; - OI->OuterAllocaBB = AllocaIP.getBlock(); + OI->OuterAllocBB = AllocIP.getBlock(); OI->ExitBB = TaskExitBB; + OI->OuterDeallocBBs.reserve(DeallocIPs.size()); + for (InsertPointTy DeallocIP : DeallocIPs) + OI->OuterDeallocBBs.push_back(DeallocIP.getBlock()); // Add the thread ID argument. 
SmallVector ToBeDeleted; OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal( - Builder, M, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false)); + Builder, M, AllocIP, ToBeDeleted, TaskAllocaIP, "global.tid", false)); OI->PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies, Mergeable, Priority, EventHandle, TaskAllocaBB, @@ -2467,10 +2425,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( return Builder.saveIP(); } -OpenMPIRBuilder::InsertPointOrErrorTy -OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc, - InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB) { +OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskgroup( + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -2485,7 +2442,7 @@ OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc, Builder.CreateCall(TaskgroupFn, {Ident, ThreadID}); BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit"); - if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP())) + if (Error Err = BodyGenCB(AllocIP, Builder.saveIP(), DeallocIPs)) return Err; Builder.SetInsertPoint(TaskgroupExitBB); @@ -2554,8 +2511,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections( SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB); Builder.SetInsertPoint(CaseBB); BranchInst *CaseEndBr = Builder.CreateBr(Continue); - if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(), - CaseEndBr->getIterator()})) + if (Error Err = + SectionCB(InsertPointTy(), + {CaseEndBr->getParent(), CaseEndBr->getIterator()}, {})) return Err; CaseNumber++; } @@ -4413,8 +4371,8 @@ Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR( } // Allocate temporary buffer by master thread - auto BodyGenCB = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> Error { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy 
CodeGenIP, + ArrayRef DeallocIPs) -> Error { Builder.restoreIP(CodeGenIP); Value *AllocSpan = Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1)); @@ -4453,8 +4411,8 @@ Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR( Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR( ArrayRef ReductionInfos, ScanInfo *ScanRedInfo) { - auto BodyGenCB = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> Error { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { Builder.restoreIP(CodeGenIP); for (ReductionInfo RedInfo : ReductionInfos) { Value *PrivateVar = RedInfo.PrivateVariable; @@ -4505,8 +4463,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction( if (!updateToLocation(Loc)) return Loc.IP; - auto BodyGenCB = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> Error { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { Builder.restoreIP(CodeGenIP); Function *CurFn = Builder.GetInsertBlock()->getParent(); // for (int k = 0; k <= ceil(log2(n)); ++k) @@ -5353,13 +5311,13 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); auto OI = std::make_unique(); - OI->OuterAllocaBB = CLI->getPreheader(); + OI->OuterAllocBB = CLI->getPreheader(); Function *OuterFn = CLI->getPreheader()->getParent(); // Instructions which need to be deleted at the end of code generation SmallVector ToBeDeleted; - OI->OuterAllocaBB = AllocaIP.getBlock(); + OI->OuterAllocBB = AllocaIP.getBlock(); // Mark the body loop as region which needs to be extracted OI->EntryBB = CLI->getBody(); @@ -5396,7 +5354,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocationBlock */ CLI->getPreheader(), - /* DeallocationBlock */ nullptr, + /* DeallocationBlocks */ {}, /* Suffix */ ".omp_wsloop", /* 
AggrArgsIn0AddrSpace */ true); @@ -6753,8 +6711,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion( emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional); // generate body - if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(), - /* CodeGenIP */ Builder.saveIP())) + if (Error Err = + BodyGenCB(/* AllocIP */ InsertPointTy(), + /* CodeGenIP */ Builder.saveIP(), /* DeallocIPs */ {})) return Err; // emit exit call and do any needed finalization. @@ -7398,10 +7357,11 @@ Constant *OpenMPIRBuilder::registerTargetRegionFunction( } OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( - const LocationDescription &Loc, InsertPointTy AllocaIP, - InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, - TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, - CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc, + const LocationDescription &Loc, InsertPointTy AllocIP, + InsertPointTy CodeGenIP, ArrayRef DeallocIPs, + Value *DeviceID, Value *IfCond, TargetDataInfo &Info, + GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, + omp::RuntimeFunction *MapperFunc, function_ref BodyGenCB, @@ -7426,11 +7386,11 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( // Generate the code for the opening of the data environment. Capture all the // arguments of the runtime call by reference because they are used in the // closing of the region. 
- auto BeginThenGen = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> Error { + auto BeginThenGen = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { MapInfo = &GenMapInfoCB(Builder.saveIP()); if (Error Err = emitOffloadingArrays( - AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB, + AllocIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB, /*IsNonContiguous=*/true, DeviceAddrCB)) return Err; @@ -7484,7 +7444,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr, /*TargetTaskAllocaIP=*/{})); else - cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP, + cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocIP, /*Dependencies=*/{}, RTArgs, Info.HasNoWait)); } else { Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr( @@ -7515,8 +7475,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( // If we need device pointer privatization, we need to emit the body of the // region with no privatization in the 'else' branch of the conditional. // Otherwise, we don't have to do anything. - auto BeginElseGen = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> Error { + auto BeginElseGen = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { InsertPointOrErrorTy AfterIP = BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv); if (!AfterIP) @@ -7526,7 +7486,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( }; // Generate code for the closing of the data region. 
- auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto EndThenGen = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { TargetDataRTArgs RTArgs; Info.EmitDebug = !MapInfo->Names.empty(); emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true); @@ -7555,7 +7516,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( // We don't have to do anything to close the region if the if clause evaluates // to false. - auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto EndElseGen = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; @@ -7563,8 +7525,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( if (BodyGenCB) { Error Err = [&]() { if (IfCond) - return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP); - return BeginThenGen(AllocaIP, Builder.saveIP()); + return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocIP); + return BeginThenGen(AllocIP, Builder.saveIP(), DeallocIPs); }(); if (Err) @@ -7579,12 +7541,12 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( restoreIPandDebugLoc(Builder, *AfterIP); if (IfCond) - return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP); - return EndThenGen(AllocaIP, Builder.saveIP()); + return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocIP); + return EndThenGen(AllocIP, Builder.saveIP(), DeallocIPs); } if (IfCond) - return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP); - return BeginThenGen(AllocaIP, Builder.saveIP()); + return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocIP); + return BeginThenGen(AllocIP, Builder.saveIP(), DeallocIPs); }(); if (Err) @@ -7862,15 +7824,18 @@ static Expected createOutlinedFunction( if (OMPBuilder.Config.isTargetDevice()) OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func); - // Insert target deinit call in the device compilation pass. 
+ BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "target.exit"); BasicBlock *OutlinedBodyBB = splitBB(Builder, /*CreateBranch=*/true, "outlined.body"); llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc( Builder.saveIP(), - OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin())); + OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()), + OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin())); if (!AfterIP) return AfterIP.takeError(); - Builder.restoreIP(*AfterIP); + Builder.SetInsertPoint(ExitBB); + + // Insert target deinit call in the device compilation pass. if (OMPBuilder.Config.isTargetDevice()) OMPBuilder.createTargetDeinit(Builder); @@ -8319,7 +8284,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( auto OI = std::make_unique(); OI->EntryBB = TargetTaskAllocaBB; - OI->OuterAllocaBB = AllocaIP.getBlock(); + OI->OuterAllocBB = AllocaIP.getBlock(); // Add the thread ID argument. SmallVector ToBeDeleted; @@ -8587,7 +8552,8 @@ Error OpenMPIRBuilder::emitOffloadingArraysAndArgs( static void emitTargetCall( OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, - OpenMPIRBuilder::InsertPointTy AllocaIP, + OpenMPIRBuilder::InsertPointTy AllocIP, + ArrayRef DeallocIPs, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, @@ -8644,8 +8610,9 @@ static void emitTargetCall( }; auto &&EmitTargetCallElse = - [&](OpenMPIRBuilder::InsertPointTy AllocaIP, - OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { + [&](OpenMPIRBuilder::InsertPointTy AllocIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { // Assume no error was returned because EmitTargetCallFallbackCB doesn't // produce any. OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() { @@ -8655,7 +8622,7 @@ static void emitTargetCall( // OutlinedFnID=nullptr results in that call not being done. 
OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs; return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr, - /*RTLoc=*/nullptr, AllocaIP, + /*RTLoc=*/nullptr, AllocIP, Dependencies, EmptyRTArgs, HasNoWait); } return EmitTargetCallFallbackCB(CodeGenIP); @@ -8666,13 +8633,14 @@ static void emitTargetCall( }; auto &&EmitTargetCallThen = - [&](OpenMPIRBuilder::InsertPointTy AllocaIP, - OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { + [&](OpenMPIRBuilder::InsertPointTy AllocIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { Info.HasNoWait = HasNoWait; OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP()); OpenMPIRBuilder::TargetDataRTArgs RTArgs; if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs( - AllocaIP, CodeGenIP, Info, RTArgs, MapInfo, CustomMapperCB, + AllocIP, CodeGenIP, Info, RTArgs, MapInfo, CustomMapperCB, /*IsNonContiguous=*/true, /*ForEndCall=*/false)) return Err; @@ -8745,13 +8713,13 @@ static void emitTargetCall( // The presence of certain clauses on the target directive require the // explicit generation of the target task. if (RequiresOuterTargetTask) - return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP, + return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocIP, Dependencies, KArgs.RTArgs, Info.HasNoWait); return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, - DeviceID, RTLoc, AllocaIP); + DeviceID, RTLoc, AllocIP); }()); Builder.restoreIP(AfterIP); @@ -8762,24 +8730,24 @@ static void emitTargetCall( // wasn't created. In this case we just run the host fallback directly and // ignore any potential 'if' clauses. if (!OutlinedFnID) { - cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP())); + cantFail(EmitTargetCallElse(AllocIP, Builder.saveIP(), DeallocIPs)); return; } // If there's no 'if' clause, only generate the kernel launch code path. 
if (!IfCond) { - cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP())); + cantFail(EmitTargetCallThen(AllocIP, Builder.saveIP(), DeallocIPs)); return; } cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen, - EmitTargetCallElse, AllocaIP)); + EmitTargetCallElse, AllocIP)); } OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( - const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP, - InsertPointTy CodeGenIP, TargetDataInfo &Info, - TargetRegionEntryInfo &EntryInfo, + const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocIP, + InsertPointTy CodeGenIP, ArrayRef DeallocIPs, + TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl &Inputs, GenMapInfoCallbackTy GenMapInfoCB, @@ -8807,9 +8775,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( // to make a remote call (offload) to the previously outlined function // that represents the target region. Do that now. if (!Config.isTargetDevice()) - emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs, - IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB, - CustomMapperCB, Dependencies, HasNowait); + emitTargetCall(*this, Builder, AllocIP, DeallocIPs, Info, DefaultAttrs, + RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Inputs, + GenMapInfoCB, CustomMapperCB, Dependencies, HasNowait); return Builder.saveIP(); } @@ -9588,15 +9556,16 @@ void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn, Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, - InsertPointTy AllocaIP) { + InsertPointTy AllocIP, + ArrayRef DeallocIPs) { // If the condition constant folds and can be elided, try to avoid emitting // the condition and the dead arm of the if/else. 
if (auto *CI = dyn_cast(Cond)) { auto CondConstant = CI->getSExtValue(); if (CondConstant) - return ThenGen(AllocaIP, Builder.saveIP()); + return ThenGen(AllocIP, Builder.saveIP(), DeallocIPs); - return ElseGen(AllocaIP, Builder.saveIP()); + return ElseGen(AllocIP, Builder.saveIP(), DeallocIPs); } Function *CurFn = Builder.GetInsertBlock()->getParent(); @@ -9609,13 +9578,13 @@ Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, Builder.CreateCondBr(Cond, ThenBlock, ElseBlock); // Emit the 'then' code. emitBlock(ThenBlock, CurFn); - if (Error Err = ThenGen(AllocaIP, Builder.saveIP())) + if (Error Err = ThenGen(AllocIP, Builder.saveIP(), DeallocIPs)) return Err; emitBranch(ContBlock); // Emit the 'else' code if present. // There is no need to emit line number for unconditional branch. emitBlock(ElseBlock, CurFn); - if (Error Err = ElseGen(AllocaIP, Builder.saveIP())) + if (Error Err = ElseGen(AllocIP, Builder.saveIP(), DeallocIPs)) return Err; // There is no need to emit line number for unconditional branch. emitBranch(ContBlock); @@ -10323,13 +10292,14 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, // Generate the body of teams. InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin()); InsertPointTy CodeGenIP(BodyBB, BodyBB->begin()); - if (Error Err = BodyGenCB(AllocaIP, CodeGenIP)) + InsertPointTy DeallocIP(ExitBB, ExitBB->begin()); + if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, DeallocIP)) return Err; auto OI = std::make_unique(); OI->EntryBB = AllocaBB; OI->ExitBB = ExitBB; - OI->OuterAllocaBB = &OuterAllocaBB; + OI->OuterAllocBB = &OuterAllocaBB; // Insert fake values for global tid and bound tid. 
SmallVector ToBeDeleted; @@ -10385,14 +10355,13 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, return Builder.saveIP(); } -OpenMPIRBuilder::InsertPointOrErrorTy -OpenMPIRBuilder::createDistribute(const LocationDescription &Loc, - InsertPointTy OuterAllocaIP, - BodyGenCallbackTy BodyGenCB) { +OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createDistribute( + const LocationDescription &Loc, InsertPointTy OuterAllocIP, + ArrayRef OuterDeallocIPs, BodyGenCallbackTy BodyGenCB) { if (!updateToLocation(Loc)) return InsertPointTy(); - BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock(); + BasicBlock *OuterAllocaBB = OuterAllocIP.getBlock(); if (OuterAllocaBB == Builder.GetInsertBlock()) { BasicBlock *BodyBB = @@ -10409,16 +10378,21 @@ OpenMPIRBuilder::createDistribute(const LocationDescription &Loc, // Generate the body of distribute clause InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin()); InsertPointTy CodeGenIP(BodyBB, BodyBB->begin()); - if (Error Err = BodyGenCB(AllocaIP, CodeGenIP)) + InsertPointTy DeallocIP(ExitBB, ExitBB->begin()); + if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, DeallocIP)) return Err; // When using target we use different runtime functions which require a // callback. 
if (Config.isTargetDevice()) { auto OI = std::make_unique(); - OI->OuterAllocaBB = OuterAllocaIP.getBlock(); + OI->OuterAllocBB = OuterAllocIP.getBlock(); OI->EntryBB = AllocaBB; OI->ExitBB = ExitBB; + OI->OuterDeallocBBs.reserve(OuterDeallocIPs.size()); + for (InsertPointTy DeallocIP : OuterDeallocIPs) + OI->OuterDeallocBBs.push_back(DeallocIP.getBlock()); + addOutlineInfo(std::move(OI)); } Builder.SetInsertPoint(ExitBB, ExitBB->begin()); @@ -10484,32 +10458,33 @@ std::unique_ptr OpenMPIRBuilder::OutlineInfo::createCodeExtractor(ArrayRef Blocks, bool ArgsInZeroAddressSpace, Twine Suffix) { - return std::make_unique(Blocks, /* DominatorTree */ nullptr, - /* AggregateArgs */ true, - /* BlockFrequencyInfo */ nullptr, - /* BranchProbabilityInfo */ nullptr, - /* AssumptionCache */ nullptr, - /* AllowVarArgs */ true, - /* AllowAlloca */ true, - /* AllocationBlock*/ OuterAllocaBB, - /* DeallocationBlock */ nullptr, - /* Suffix */ Suffix.str(), - ArgsInZeroAddressSpace); + return std::make_unique( + Blocks, /* DominatorTree */ nullptr, + /* AggregateArgs */ true, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ true, + /* AllowAlloca */ true, + /* AllocationBlock*/ OuterAllocBB, + /* DeallocationBlocks */ ArrayRef(), + /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace); } std::unique_ptr DeviceSharedMemOutlineInfo::createCodeExtractor( ArrayRef Blocks, bool ArgsInZeroAddressSpace, Twine Suffix) { - // TODO: Initialize the DeallocationBlock with a proper pair to OuterAllocaBB. 
return std::make_unique( - OMPBuilder, AllocBlockOverride, Blocks, /* DominatorTree */ nullptr, + OMPBuilder, Blocks, /* DominatorTree */ nullptr, /* AggregateArgs */ true, /* BlockFrequencyInfo */ nullptr, /* BranchProbabilityInfo */ nullptr, /* AssumptionCache */ nullptr, /* AllowVarArgs */ true, /* AllowAlloca */ true, - /* AllocationBlock*/ OuterAllocaBB, - /* DeallocationBlock */ ExitBB, + /* AllocationBlock*/ OuterAllocBB, + /* DeallocationBlocks */ OuterDeallocBBs.empty() + ? SmallVector{ExitBB} + : OuterDeallocBBs, /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace); } diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index 57809017a75a4..e8f3c68f90980 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -721,7 +721,7 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { SubRegion, &*DT, /* AggregateArgs */ false, /* BFI */ nullptr, /* BPI */ nullptr, AC, /* AllowVarArgs */ false, /* AllowAlloca */ false, /* AllocaBlock */ nullptr, - /* DeallocationBlock */ nullptr, + /* DeallocationBlocks */ {}, /* Suffix */ "cold." 
+ std::to_string(OutlinedFunctionID)); if (CE.isEligible() && isSplittingBeneficial(CE, SubRegion, TTI) && diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index e2bc1f3e86740..73c48db131b28 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -2829,7 +2829,7 @@ unsigned IROutliner::doOutline(Module &M) { OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, - false, nullptr, nullptr, "outlined"); + false, nullptr, {}, "outlined"); findAddInputsOutputs(M, *OS, NotSame); if (!OS->IgnoreRegion) OutlinedRegions.push_back(OS); @@ -2940,7 +2940,7 @@ unsigned IROutliner::doOutline(Module &M) { OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, - false, nullptr, nullptr, "outlined"); + false, nullptr, {}, "outlined"); bool FunctionOutlined = extractSection(*OS); if (FunctionOutlined) { unsigned StartIdx = OS->Candidate->getStartIdx(); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 8428620e47ff0..12b61c27fc78b 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1086,7 +1086,8 @@ struct OpenMPOpt { SmallDenseMap> BB2PRMap; BasicBlock *StartBB = nullptr, *EndBB = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { BasicBlock *CGStartBB = CodeGenIP.getBlock(); BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); @@ -1126,7 +1127,8 @@ struct OpenMPOpt { const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc(); ParentBB->getTerminator()->eraseFromParent(); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy 
CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { BasicBlock *CGStartBB = CodeGenIP.getBlock(); BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); @@ -1256,8 +1258,9 @@ struct OpenMPOpt { // avoid overriding binding settings, and without explicit cancellation. OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPInfoCache.OMPBuilder.createParallel( - Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, - OMP_PROC_BIND_default, /* IsCancellable */ false)); + Loc, AllocaIP, /* DeallocIPs */ {}, BodyGenCB, PrivCB, FiniCB, + nullptr, nullptr, OMP_PROC_BIND_default, + /* IsCancellable */ false)); BranchInst::Create(AfterBB, AfterIP.getBlock()); // Perform the actual outlining. diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index d7b7abfd0391a..675413a963fd8 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -264,11 +264,11 @@ CodeExtractor::CodeExtractor(ArrayRef BBs, DominatorTree *DT, BranchProbabilityInfo *BPI, AssumptionCache *AC, bool AllowVarArgs, bool AllowAlloca, BasicBlock *AllocationBlock, - BasicBlock *DeallocationBlock, std::string Suffix, - bool ArgsInZeroAddressSpace) + ArrayRef DeallocationBlocks, + std::string Suffix, bool ArgsInZeroAddressSpace) : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI), BPI(BPI), AC(AC), AllocationBlock(AllocationBlock), - DeallocationBlock(DeallocationBlock), AllowVarArgs(AllowVarArgs), + DeallocationBlocks(DeallocationBlocks), AllowVarArgs(AllowVarArgs), Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)), Suffix(Suffix), ArgsInZeroAddressSpace(ArgsInZeroAddressSpace) {} @@ -2031,22 +2031,25 @@ CallInst *CodeExtractor::emitReplacerCall( {}, call); // Deallocate intermediate variables if they need explicit deallocation. 
- BasicBlock *DeallocBlock = codeReplacer; - BasicBlock::iterator DeallocIP = codeReplacer->end(); - if (DeallocationBlock) { - DeallocBlock = DeallocationBlock; - DeallocIP = DeallocationBlock->getFirstInsertionPt(); - } + auto deallocVars = [&](BasicBlock *DeallocBlock, + BasicBlock::iterator DeallocIP) { + int Index = 0; + for (Value *Output : outputs) { + if (!StructValues.contains(Output)) + deallocateVar(DeallocBlock, DeallocIP, ReloadOutputs[Index++], + Output->getType()); + } - int Index = 0; - for (Value *Output : outputs) { - if (!StructValues.contains(Output)) - deallocateVar(DeallocBlock, DeallocIP, ReloadOutputs[Index++], - Output->getType()); - } + if (Struct) + deallocateVar(DeallocBlock, DeallocIP, Struct, StructArgTy); + }; - if (Struct) - deallocateVar(DeallocBlock, DeallocIP, Struct, StructArgTy); + if (DeallocationBlocks.empty()) { + deallocVars(codeReplacer, codeReplacer->end()); + } else { + for (BasicBlock *DeallocationBlock : DeallocationBlocks) + deallocVars(DeallocationBlock, DeallocationBlock->getFirstInsertionPt()); + } return call; } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 5b22ac31b572e..b90b6a6923cac 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -56,8 +56,9 @@ using namespace omp; } #define BODYGENCB_WRAPPER(cb) \ - [&cb](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) -> Error { \ - cb(AllocaIP, CodeGenIP); \ + [&cb](InsertPointTy AllocIP, InsertPointTy CodeGenIP, \ + ArrayRef DeallocIPs) -> Error { \ + cb(AllocIP, CodeGenIP, DeallocIPs); \ return Error::success(); \ } @@ -666,10 +667,11 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) { unsigned NumPrivatizedVars = 0; unsigned NumFinalizationPoints = 0; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { 
++NumBodiesGenerated; - Builder.restoreIP(AllocaIP); + Builder.restoreIP(AllocIP); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); Builder.CreateStore(F->arg_begin(), PrivAI); @@ -717,8 +719,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createParallel( - Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, - nullptr, OMP_PROC_BIND_default, false)); + Loc, AllocaIP, {}, BodyGenCB, PrivCB, FiniCB, + nullptr, nullptr, OMP_PROC_BIND_default, false)); EXPECT_EQ(NumBodiesGenerated, 1U); EXPECT_EQ(NumPrivatizedVars, 1U); @@ -780,10 +782,11 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimple) { unsigned NumPrivatizedVars = 0; unsigned NumFinalizationPoints = 0; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumBodiesGenerated; - Builder.restoreIP(AllocaIP); + Builder.restoreIP(AllocIP); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); Builder.CreateStore(F->arg_begin(), PrivAI); @@ -831,8 +834,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimple) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createParallel( - Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, - nullptr, OMP_PROC_BIND_default, false)); + Loc, AllocaIP, {}, BodyGenCB, PrivCB, FiniCB, + nullptr, nullptr, OMP_PROC_BIND_default, false)); EXPECT_EQ(NumBodiesGenerated, 1U); EXPECT_EQ(NumPrivatizedVars, 1U); EXPECT_EQ(NumFinalizationPoints, 1U); @@ -889,7 +892,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) { unsigned NumOuterBodiesGenerated = 0; unsigned NumFinalizationPoints = 0; - auto InnerBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto InnerBodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumInnerBodiesGenerated; return 
Error::success(); }; @@ -912,7 +916,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) { return Error::success(); }; - auto OuterBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto OuterBodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumOuterBodiesGenerated; Builder.restoreIP(CodeGenIP); BasicBlock *CGBB = CodeGenIP.getBlock(); @@ -921,7 +926,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createParallel(InsertPointTy(CGBB, CGBB->end()), AllocaIP, + OMPBuilder.createParallel(InsertPointTy(CGBB, CGBB->end()), AllocIP, {}, InnerBodyGenCB, PrivCB, FiniCB, nullptr, nullptr, OMP_PROC_BIND_default, false)); @@ -933,7 +938,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createParallel( - Loc, AllocaIP, BODYGENCB_WRAPPER(OuterBodyGenCB), + Loc, AllocaIP, {}, BODYGENCB_WRAPPER(OuterBodyGenCB), PrivCB, FiniCB, nullptr, nullptr, OMP_PROC_BIND_default, false)); @@ -991,7 +996,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { unsigned NumOuterBodiesGenerated = 0; unsigned NumFinalizationPoints = 0; - auto InnerBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto InnerBodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumInnerBodiesGenerated; return Error::success(); }; @@ -1014,7 +1020,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { return Error::success(); }; - auto OuterBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto OuterBodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumOuterBodiesGenerated; Builder.restoreIP(CodeGenIP); BasicBlock *CGBB = CodeGenIP.getBlock(); @@ -1027,18 +1034,18 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { ASSERT_EXPECTED_INIT( 
OpenMPIRBuilder::InsertPointTy, AfterIP1, - OMPBuilder.createParallel(InsertPointTy(CGBB, CGBB->end()), AllocaIP, + OMPBuilder.createParallel(InsertPointTy(CGBB, CGBB->end()), AllocIP, {}, InnerBodyGenCB, PrivCB, FiniCB, nullptr, nullptr, OMP_PROC_BIND_default, false)); Builder.restoreIP(AfterIP1); Builder.CreateBr(NewBB1); - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP2, - OMPBuilder.createParallel( - InsertPointTy(NewBB1, NewBB1->end()), AllocaIP, - InnerBodyGenCB, PrivCB, FiniCB, nullptr, nullptr, - OMP_PROC_BIND_default, false)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP2, + OMPBuilder.createParallel(InsertPointTy(NewBB1, NewBB1->end()), AllocIP, + {}, InnerBodyGenCB, PrivCB, FiniCB, nullptr, + nullptr, OMP_PROC_BIND_default, false)); Builder.restoreIP(AfterIP2); Builder.CreateBr(NewBB2); @@ -1048,7 +1055,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createParallel( - Loc, AllocaIP, BODYGENCB_WRAPPER(OuterBodyGenCB), + Loc, AllocaIP, {}, BODYGENCB_WRAPPER(OuterBodyGenCB), PrivCB, FiniCB, nullptr, nullptr, OMP_PROC_BIND_default, false)); @@ -1113,10 +1120,11 @@ TEST_F(OpenMPIRBuilderTest, ParallelIfCond) { unsigned NumPrivatizedVars = 0; unsigned NumFinalizationPoints = 0; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumBodiesGenerated; - Builder.restoreIP(AllocaIP); + Builder.restoreIP(AllocIP); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); Builder.CreateStore(F->arg_begin(), PrivAI); @@ -1165,7 +1173,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelIfCond) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createParallel(Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, + OMPBuilder.createParallel(Loc, 
AllocaIP, {}, BodyGenCB, PrivCB, FiniCB, Builder.CreateIsNotNull(F->arg_begin()), nullptr, OMP_PROC_BIND_default, false)); @@ -1221,7 +1229,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelCancelBarrier) { unsigned NumFinalizationPoints = 0; CallInst *CheckedBarrier = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumBodiesGenerated; Builder.restoreIP(CodeGenIP); @@ -1289,11 +1298,12 @@ TEST_F(OpenMPIRBuilderTest, ParallelCancelBarrier) { IRBuilder<>::InsertPoint AllocaIP(&F->getEntryBlock(), F->getEntryBlock().getFirstInsertionPt()); - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createParallel( - Loc, AllocaIP, BODYGENCB_WRAPPER(BodyGenCB), PrivCB, - FiniCB, Builder.CreateIsNotNull(F->arg_begin()), - nullptr, OMP_PROC_BIND_default, true)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createParallel(Loc, AllocaIP, {}, BODYGENCB_WRAPPER(BodyGenCB), + PrivCB, FiniCB, + Builder.CreateIsNotNull(F->arg_begin()), + nullptr, OMP_PROC_BIND_default, true)); EXPECT_EQ(NumBodiesGenerated, 1U); EXPECT_EQ(NumPrivatizedVars, 0U); @@ -1359,7 +1369,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelForwardAsPointers) { Value *StructPtrVal = Builder.CreateCall(RetStructPtrFunc); Instruction *Internal; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { IRBuilder<>::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); Internal = Builder.CreateCall(TakeI32Func, I32Val); @@ -1379,8 +1390,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelForwardAsPointers) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createParallel( - Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, - nullptr, OMP_PROC_BIND_default, false)); + Loc, 
AllocaIP, {}, BodyGenCB, PrivCB, FiniCB, + nullptr, nullptr, OMP_PROC_BIND_default, false)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -2893,9 +2904,10 @@ TEST_F(OpenMPIRBuilderTest, MasterDirective) { BasicBlock *EntryBB = nullptr; BasicBlock *ThenBB = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - if (AllocaIP.isSet()) - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + if (AllocIP.isSet()) + Builder.restoreIP(AllocIP); else Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); @@ -2974,9 +2986,10 @@ TEST_F(OpenMPIRBuilderTest, MaskedDirective) { BasicBlock *EntryBB = nullptr; BasicBlock *ThenBB = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - if (AllocaIP.isSet()) - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + if (AllocIP.isSet()) + Builder.restoreIP(AllocIP); else Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); @@ -3053,7 +3066,8 @@ TEST_F(OpenMPIRBuilderTest, CriticalDirective) { AllocaInst *PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { // actual start for bodyCB llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock(); llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint(); @@ -3304,7 +3318,8 @@ TEST_F(OpenMPIRBuilderTest, OrderedDirectiveThreads) { AllocaInst *PrivAI = Builder.CreateAlloca(F->arg_begin()->getType(), nullptr, "priv.inst"); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + 
ArrayRef DeallocIPs) { llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock(); llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint(); EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst); @@ -3378,7 +3393,8 @@ TEST_F(OpenMPIRBuilderTest, OrderedDirectiveSimd) { AllocaInst *PrivAI = Builder.CreateAlloca(F->arg_begin()->getType(), nullptr, "priv.inst"); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock(); llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint(); EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst); @@ -3485,9 +3501,10 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) { BasicBlock *EntryBB = nullptr; BasicBlock *ThenBB = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - if (AllocaIP.isSet()) - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + if (AllocIP.isSet()) + Builder.restoreIP(AllocIP); else Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); @@ -3578,9 +3595,10 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) { BasicBlock *EntryBB = nullptr; BasicBlock *ThenBB = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - if (AllocaIP.isSet()) - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + if (AllocIP.isSet()) + Builder.restoreIP(AllocIP); else Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); @@ -3699,9 +3717,10 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) { Function *CopyFunc = Function::Create(CopyFuncTy, Function::PrivateLinkage, "copy_var", *M); - auto BodyGenCB = [&](InsertPointTy 
AllocaIP, InsertPointTy CodeGenIP) { - if (AllocaIP.isSet()) - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + if (AllocIP.isSet()) + Builder.restoreIP(AllocIP); else Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); @@ -4564,8 +4583,9 @@ TEST_F(OpenMPIRBuilderTest, CreateTeams) { AllocaInst *ValPtr128 = Builder.CreateAlloca(Builder.getInt128Ty()); Value *Val128 = Builder.CreateLoad(Builder.getInt128Ty(), ValPtr128, "load"); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + Builder.restoreIP(AllocIP); AllocaInst *Local128 = Builder.CreateAlloca(Builder.getInt128Ty(), nullptr, "bodygen.alloca128"); @@ -4646,7 +4666,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithThreadLimit) { Function::Create(FunctionType::get(Builder.getVoidTy(), false), GlobalValue::ExternalLinkage, "fakeFunction", M.get()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -4703,7 +4724,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsUpper) { Function::Create(FunctionType::get(Builder.getVoidTy(), false), GlobalValue::ExternalLinkage, "fakeFunction", M.get()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -4766,7 +4788,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsBoth) { Value *NumTeamsUpper = Builder.CreateAdd(F->arg_begin(), 
Builder.getInt32(10), "numTeamsUpper"); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -4834,7 +4857,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsAndThreadLimit) { Function::Create(FunctionType::get(Builder.getVoidTy(), false), GlobalValue::ExternalLinkage, "fakeFunction", M.get()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -4892,7 +4916,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithIfCondition) { Function::Create(FunctionType::get(Builder.getVoidTy(), false), GlobalValue::ExternalLinkage, "fakeFunction", M.get()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -4960,7 +4985,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithIfConditionAndNumTeams) { Function::Create(FunctionType::get(Builder.getVoidTy(), false), GlobalValue::ExternalLinkage, "fakeFunction", M.get()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -5179,7 +5205,8 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) { // xor of thread-id; // and store the result in global variables. 
InsertPointTy BodyIP, BodyAllocaIP; - auto BodyGenCB = [&](InsertPointTy InnerAllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy InnerAllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); @@ -5197,7 +5224,7 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) { Builder.CreateStore(Xor, XorReduced); BodyIP = Builder.saveIP(); - BodyAllocaIP = InnerAllocaIP; + BodyAllocaIP = InnerAllocIP; return Error::success(); }; @@ -5233,12 +5260,12 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) { // Do nothing in finalization. auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); }; - ASSERT_EXPECTED_INIT( - OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createParallel(Loc, OuterAllocaIP, BodyGenCB, PrivCB, FiniCB, - /* IfCondition */ nullptr, - /* NumThreads */ nullptr, OMP_PROC_BIND_default, - /* IsCancellable */ false)); + ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createParallel( + Loc, OuterAllocaIP, {}, BodyGenCB, PrivCB, FiniCB, + /* IfCondition */ nullptr, + /* NumThreads */ nullptr, OMP_PROC_BIND_default, + /* IsCancellable */ false)); Builder.restoreIP(AfterIP); OpenMPIRBuilder::ReductionInfo ReductionInfos[] = { @@ -5558,8 +5585,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { Builder.CreateStore(Builder.getInt32(1), XorReduced); InsertPointTy FirstBodyIP, FirstBodyAllocaIP; - auto FirstBodyGenCB = [&](InsertPointTy InnerAllocaIP, - InsertPointTy CodeGenIP) { + auto FirstBodyGenCB = [&](InsertPointTy InnerAllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); @@ -5574,13 +5601,14 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { Builder.CreateStore(Sum, SumReduced); FirstBodyIP = Builder.saveIP(); - FirstBodyAllocaIP = InnerAllocaIP; + FirstBodyAllocaIP = InnerAllocIP; return Error::success(); }; InsertPointTy 
SecondBodyIP, SecondBodyAllocaIP; - auto SecondBodyGenCB = [&](InsertPointTy InnerAllocaIP, - InsertPointTy CodeGenIP) { + auto SecondBodyGenCB = [&](InsertPointTy InnerAllocIP, + InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); @@ -5593,7 +5621,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { Builder.CreateStore(Xor, XorReduced); SecondBodyIP = Builder.saveIP(); - SecondBodyAllocaIP = InnerAllocaIP; + SecondBodyAllocaIP = InnerAllocIP; return Error::success(); }; @@ -5633,14 +5661,14 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP1, - OMPBuilder.createParallel(Loc, OuterAllocaIP, FirstBodyGenCB, PrivCB, + OMPBuilder.createParallel(Loc, OuterAllocaIP, {}, FirstBodyGenCB, PrivCB, FiniCB, /* IfCondition */ nullptr, /* NumThreads */ nullptr, OMP_PROC_BIND_default, /* IsCancellable */ false)); Builder.restoreIP(AfterIP1); ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP2, - OMPBuilder.createParallel({Builder.saveIP(), DL}, OuterAllocaIP, + OMPBuilder.createParallel({Builder.saveIP(), DL}, OuterAllocaIP, {}, SecondBodyGenCB, PrivCB, FiniCB, /* IfCondition */ nullptr, /* NumThreads */ nullptr, OMP_PROC_BIND_default, @@ -5734,7 +5762,8 @@ TEST_F(OpenMPIRBuilderTest, CreateSectionsSimple) { llvm::SmallVector SectionCBVector; auto FiniCB = [&](InsertPointTy IP) { return Error::success(); }; - auto SectionCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto SectionCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; SectionCBVector.push_back(SectionCB); @@ -5779,7 +5808,8 @@ TEST_F(OpenMPIRBuilderTest, CreateSections) { EXPECT_NE(IPBB->end(), IP.getPoint()); }; - auto SectionCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto SectionCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { 
++NumBodiesGenerated; CaseBBs.push_back(CodeGenIP.getBlock()); SwitchBB = CodeGenIP.getBlock()->getSinglePredecessor(); @@ -6119,7 +6149,7 @@ TEST_F(OpenMPIRBuilderTest, TargetEnterData) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTargetData( - Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(DeviceID), + Loc, AllocaIP, Builder.saveIP(), {}, Builder.getInt64(DeviceID), /* IfCond= */ nullptr, Info, GenMapInfoCB, CustomMapperCB, &RTLFunc)); Builder.restoreIP(AfterIP); @@ -6182,7 +6212,7 @@ TEST_F(OpenMPIRBuilderTest, TargetExitData) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTargetData( - Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(DeviceID), + Loc, AllocaIP, Builder.saveIP(), {}, Builder.getInt64(DeviceID), /* IfCond= */ nullptr, Info, GenMapInfoCB, CustomMapperCB, &RTLFunc)); Builder.restoreIP(AfterIP); @@ -6293,7 +6323,7 @@ TEST_F(OpenMPIRBuilderTest, TargetDataRegion) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, TargetDataIP1, - OMPBuilder.createTargetData(Loc, AllocaIP, Builder.saveIP(), + OMPBuilder.createTargetData(Loc, AllocaIP, Builder.saveIP(), {}, Builder.getInt64(DeviceID), /* IfCond= */ nullptr, Info, GenMapInfoCB, CustomMapperCB, nullptr, BodyCB)); @@ -6322,7 +6352,7 @@ TEST_F(OpenMPIRBuilderTest, TargetDataRegion) { }; ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, TargetDataIP2, - OMPBuilder.createTargetData(Loc, AllocaIP, Builder.saveIP(), + OMPBuilder.createTargetData(Loc, AllocaIP, Builder.saveIP(), {}, Builder.getInt64(DeviceID), /* IfCond= */ nullptr, Info, GenMapInfoCB, CustomMapperCB, nullptr, BodyTargetCB)); @@ -6373,8 +6403,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { Builder.CreateStore(Builder.getInt32(10), APtr); Builder.CreateStore(Builder.getInt32(20), BPtr); - auto BodyGenCB = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> InsertPointTy { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef 
DeallocIPs) -> InsertPointTy { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); Builder.restoreIP(CodeGenIP); @@ -6444,10 +6474,10 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), - Builder.saveIP(), Info, EntryInfo, DefaultAttrs, - RuntimeAttrs, /*IfCond=*/nullptr, Inputs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, - CustomMapperCB, {}, false)); + Builder.saveIP(), {}, Info, EntryInfo, + DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, + Inputs, GenMapInfoCB, BodyGenCB, + SimpleArgAccessorCB, CustomMapperCB, {}, false)); EXPECT_EQ(DL, Builder.getCurrentDebugLocation()); Builder.restoreIP(AfterIP); @@ -6598,8 +6628,9 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { }; auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; - auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, - OpenMPIRBuilder::InsertPointTy CodeGenIP) + auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> OpenMPIRBuilder::InsertPointTy { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); @@ -6624,7 +6655,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - Info, EntryInfo, DefaultAttrs, RuntimeAttrs, + {}, Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, CustomMapperCB, {}, false)); @@ -6710,7 +6741,14 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { Instruction *Value1 = &*OutlinedBlock->getFirstNonPHIIt(); EXPECT_EQ(Value1, Value); EXPECT_EQ(Value1->getNextNode(), TargetStore); - auto *Deinit = TargetStore->getNextNode(); + + auto *TargetExitBlockBr 
= TargetStore->getNextNode(); + EXPECT_TRUE(isa(TargetExitBlockBr)); + + auto *TargetExitBlock = TargetExitBlockBr->getSuccessor(0); + EXPECT_EQ(TargetExitBlock->getName(), "target.exit"); + + Instruction *Deinit = &*TargetExitBlock->getFirstNonPHIIt(); EXPECT_NE(Deinit, nullptr); auto *DeinitCall = dyn_cast(Deinit); @@ -6758,8 +6796,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) { IRBuilder<> Builder(BB); auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; - auto BodyGenCB = [&](InsertPointTy, - InsertPointTy CodeGenIP) -> InsertPointTy { + auto BodyGenCB = [&](InsertPointTy, InsertPointTy CodeGenIP, + ArrayRef) -> InsertPointTy { Builder.restoreIP(CodeGenIP); return Builder.saveIP(); }; @@ -6792,10 +6830,10 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), - Builder.saveIP(), Info, EntryInfo, DefaultAttrs, - RuntimeAttrs, /*IfCond=*/nullptr, Inputs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, - CustomMapperCB, {})); + Builder.saveIP(), {}, Info, EntryInfo, + DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, + Inputs, GenMapInfoCB, BodyGenCB, + SimpleArgAccessorCB, CustomMapperCB, {})); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -6879,7 +6917,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) { auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy, - OpenMPIRBuilder::InsertPointTy CodeGenIP) + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef) -> OpenMPIRBuilder::InsertPointTy { Builder.restoreIP(CodeGenIP); OutlinedFn = CodeGenIP.getBlock()->getParent(); @@ -6900,8 +6939,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) { ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget( - Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, Info, - EntryInfo, DefaultAttrs, RuntimeAttrs, + Loc, 
/*IsOffloadEntry=*/true, EntryIP, EntryIP, {}, + Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, CustomMapperCB, {})); Builder.restoreIP(AfterIP); @@ -7004,8 +7043,9 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { llvm::Value *RaiseAlloca = nullptr; auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; - auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, - OpenMPIRBuilder::InsertPointTy CodeGenIP) + auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> OpenMPIRBuilder::InsertPointTy { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); @@ -7031,7 +7071,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - Info, EntryInfo, DefaultAttrs, RuntimeAttrs, + {}, Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, CustomMapperCB, {}, false)); @@ -7110,7 +7150,14 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { EXPECT_TRUE(isa(Load2)); EXPECT_EQ(Load2, Value); EXPECT_EQ(Load2->getNextNode(), TargetStore); - auto *Deinit = TargetStore->getNextNode(); + + auto *TargetExitBlockBr = TargetStore->getNextNode(); + EXPECT_TRUE(isa(TargetExitBlockBr)); + + auto *TargetExitBlock = TargetExitBlockBr->getSuccessor(0); + EXPECT_EQ(TargetExitBlock->getName(), "target.exit"); + + Instruction *Deinit = &*TargetExitBlock->getFirstNonPHIIt(); EXPECT_NE(Deinit, nullptr); auto *DeinitCall = dyn_cast(Deinit); @@ -7141,8 +7188,9 @@ TEST_F(OpenMPIRBuilderTest, CreateTask) { Value *Val128 = Builder.CreateLoad(Builder.getInt128Ty(), ValPtr128, "bodygen.load"); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - 
Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + Builder.restoreIP(AllocIP); AllocaInst *Local128 = Builder.CreateAlloca(Builder.getInt128Ty(), nullptr, "bodygen.alloca128"); @@ -7170,7 +7218,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTask) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTask( Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), - BodyGenCB)); + /*DeallocIPs=*/{}, BodyGenCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); Builder.CreateRetVoid(); @@ -7270,7 +7318,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskNoArgs) { F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; @@ -7282,7 +7331,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskNoArgs) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTask( Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), - BodyGenCB)); + /*DeallocIPs=*/{}, BodyGenCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); Builder.CreateRetVoid(); @@ -7306,7 +7355,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskUntied) { OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; BasicBlock *AllocaBB = Builder.GetInsertBlock(); @@ -7317,7 +7367,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskUntied) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTask( Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), - BodyGenCB, + /*DeallocIPs=*/{}, BodyGenCB, /*Tied=*/false)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -7343,7 +7393,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskDepend) { OMPBuilder.initialize(); 
F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; BasicBlock *AllocaBB = Builder.GetInsertBlock(); @@ -7361,7 +7412,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskDepend) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTask( Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), - BodyGenCB, + /*DeallocIPs=*/{}, BodyGenCB, /*Tied=*/false, /*Final*/ nullptr, /*IfCondition*/ nullptr, DDS)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -7424,7 +7475,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskFinal) { OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "alloca.split"); @@ -7435,7 +7487,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskFinal) { ConstantInt::get(Type::getInt32Ty(M->getContext()), 0U)); OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createTask(Loc, AllocaIP, BodyGenCB, + OMPBuilder.createTask(Loc, AllocaIP, /*DeallocIPs=*/{}, + BodyGenCB, /*Tied=*/false, Final)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -7483,7 +7536,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskIfCondition) { OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "alloca.split"); @@ -7493,10 +7547,10 @@ TEST_F(OpenMPIRBuilderTest, 
CreateTaskIfCondition) { CmpInst::Predicate::ICMP_EQ, F->getArg(0), ConstantInt::get(Type::getInt32Ty(M->getContext()), 0U)); OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL); - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createTask(Loc, AllocaIP, BodyGenCB, - /*Tied=*/false, /*Final=*/nullptr, - IfCondition)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createTask(Loc, AllocaIP, /*DeallocIPs=*/{}, BodyGenCB, + /*Tied=*/false, /*Final=*/nullptr, IfCondition)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); Builder.CreateRetVoid(); @@ -7562,8 +7616,9 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) { Value *InternalStoreInst, *InternalLoad32, *InternalLoad128, *InternalIfCmp; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + Builder.restoreIP(AllocIP); AllocaInst *Local128 = Builder.CreateAlloca(Builder.getInt128Ty(), nullptr, "bodygen.alloca128"); @@ -7591,7 +7646,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTaskgroup( - Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), + Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), {}, BodyGenCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -7654,14 +7709,16 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroupWithTasks) { F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + Builder.restoreIP(AllocIP); AllocaInst *Alloca32 = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, "bodygen.alloca32"); AllocaInst *Alloca64 = Builder.CreateAlloca(Builder.getInt64Ty(), nullptr, 
"bodygen.alloca64"); Builder.restoreIP(CodeGenIP); - auto TaskBodyGenCB1 = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto TaskBodyGenCB1 = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); LoadInst *LoadValue = Builder.CreateLoad(Alloca64->getAllocatedType(), Alloca64); @@ -7670,11 +7727,13 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroupWithTasks) { return Error::success(); }; OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL); - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, TaskIP1, - OMPBuilder.createTask(Loc, AllocaIP, TaskBodyGenCB1)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, TaskIP1, + OMPBuilder.createTask(Loc, AllocIP, DeallocIPs, TaskBodyGenCB1)); Builder.restoreIP(TaskIP1); - auto TaskBodyGenCB2 = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto TaskBodyGenCB2 = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); LoadInst *LoadValue = Builder.CreateLoad(Alloca32->getAllocatedType(), Alloca32); @@ -7683,8 +7742,9 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroupWithTasks) { return Error::success(); }; OpenMPIRBuilder::LocationDescription Loc2(Builder.saveIP(), DL); - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, TaskIP2, - OMPBuilder.createTask(Loc2, AllocaIP, TaskBodyGenCB2)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, TaskIP2, + OMPBuilder.createTask(Loc2, AllocIP, DeallocIPs, TaskBodyGenCB2)); Builder.restoreIP(TaskIP2); }; @@ -7695,7 +7755,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroupWithTasks) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTaskgroup( - Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), + Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), {}, BODYGENCB_WRAPPER(BodyGenCB))); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); diff --git 
a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp index 6fd266a815dcf..d63e346e31a1d 100644 --- a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp +++ b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp @@ -712,7 +712,7 @@ TEST(CodeExtractor, OpenMPAggregateArgs) { /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocationBlock*/ &Func->getEntryBlock(), - /* DeallocationBlock */ nullptr, + /* DeallocationBlocks */ {}, /* Suffix */ ".outlined", /* ArgsInZeroAddressSpace */ true); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 09af46544441d..0c88110a7e2e0 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -71,14 +71,17 @@ convertToScheduleKind(std::optional schedKind) { /// ModuleTranslation stack frame for OpenMP operations. This keeps track of the /// insertion points for allocas. 
-class OpenMPAllocaStackFrame - : public StateStackFrameBase { +class OpenMPAllocStackFrame + : public StateStackFrameBase { public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OpenMPAllocaStackFrame) - - explicit OpenMPAllocaStackFrame(llvm::OpenMPIRBuilder::InsertPointTy allocaIP) - : allocaInsertPoint(allocaIP) {} - llvm::OpenMPIRBuilder::InsertPointTy allocaInsertPoint; + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OpenMPAllocStackFrame) + + explicit OpenMPAllocStackFrame( + llvm::OpenMPIRBuilder::InsertPointTy allocIP, + llvm::ArrayRef deallocIPs) + : allocInsertPoint(allocIP), deallocInsertPoints(deallocIPs) {} + llvm::OpenMPIRBuilder::InsertPointTy allocInsertPoint; + llvm::SmallVector deallocInsertPoints; }; /// Stack frame to hold a \see llvm::CanonicalLoopInfo representing the @@ -485,26 +488,33 @@ static LogicalResult handleError(llvm::Expected &result, Operation &op) { /// Find the insertion point for allocas given the current insertion point for /// normal operations in the builder. -static llvm::OpenMPIRBuilder::InsertPointTy -findAllocaInsertPoint(llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation) { - // If there is an alloca insertion point on stack, i.e. we are in a nested +static llvm::OpenMPIRBuilder::InsertPointTy findAllocInsertPoints( + llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, + llvm::SmallVectorImpl *deallocIPs = + nullptr) { + // If there is an allocation insertion point on stack, i.e. we are in a nested // operation and a specific point was provided by some surrounding operation, // use it. 
- llvm::OpenMPIRBuilder::InsertPointTy allocaInsertPoint; - WalkResult walkResult = moduleTranslation.stackWalk( - [&](OpenMPAllocaStackFrame &frame) { - allocaInsertPoint = frame.allocaInsertPoint; + llvm::OpenMPIRBuilder::InsertPointTy allocInsertPoint; + llvm::ArrayRef deallocInsertPoints; + WalkResult walkResult = moduleTranslation.stackWalk( + [&](OpenMPAllocStackFrame &frame) { + allocInsertPoint = frame.allocInsertPoint; + deallocInsertPoints = frame.deallocInsertPoints; return WalkResult::interrupt(); }); // In cases with multiple levels of outlining, the tree walk might find an - // alloca insertion point that is inside the original function while the - // builder insertion point is inside the outlined function. We need to make - // sure that we do not use it in those cases. + // insertion point that is inside the original function while the builder + // insertion point is inside the outlined function. We need to make sure that + // we do not use it in those cases. if (walkResult.wasInterrupted() && - allocaInsertPoint.getBlock()->getParent() == - builder.GetInsertBlock()->getParent()) - return allocaInsertPoint; + allocInsertPoint.getBlock()->getParent() == + builder.GetInsertBlock()->getParent()) { + if (deallocIPs) + deallocIPs->insert(deallocIPs->end(), deallocInsertPoints.begin(), + deallocInsertPoints.end()); + return allocInsertPoint; + } // Otherwise, insert to the entry block of the surrounding function. // If the current IRBuilder InsertPoint is the function's entry, it cannot @@ -512,7 +522,7 @@ findAllocaInsertPoint(llvm::IRBuilderBase &builder, // confusion. Create a new BasicBlock for the Builder and use the entry block // for the allocs. // TODO: Create a dedicated alloca BasicBlock at function creation such that - // we do not need to move the current InertPoint here. + // we do not need to move the current InsertPoint here. 
if (builder.GetInsertBlock() == &builder.GetInsertBlock()->getParent()->getEntryBlock()) { assert(builder.GetInsertPoint() == builder.GetInsertBlock()->end() && @@ -524,6 +534,16 @@ findAllocaInsertPoint(llvm::IRBuilderBase &builder, builder.SetInsertPoint(entryBB); } + // Collect exit blocks, which is where explicit deallocations should happen in + // this case. + if (deallocIPs) { + for (llvm::BasicBlock &block : *builder.GetInsertBlock()->getParent()) { + llvm::Instruction *terminator = block.getTerminator(); + if (isa_and_present(terminator)) + deallocIPs->emplace_back(&block, terminator->getIterator()); + } + } + llvm::BasicBlock &funcEntryBlock = builder.GetInsertBlock()->getParent()->getEntryBlock(); return llvm::OpenMPIRBuilder::InsertPointTy( @@ -711,7 +731,8 @@ convertOmpMasked(Operation &opInst, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(opInst))) return failure(); - auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { + auto bodyGenCB = [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) { // MaskedOp has only one region associated with it. auto ®ion = maskedOp.getRegion(); builder.restoreIP(codeGenIP); @@ -755,7 +776,8 @@ convertOmpMaster(Operation &opInst, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(opInst))) return failure(); - auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { + auto bodyGenCB = [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) { // MasterOp has only one region associated with it. 
auto ®ion = masterOp.getRegion(); builder.restoreIP(codeGenIP); @@ -790,7 +812,8 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(opInst))) return failure(); - auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { + auto bodyGenCB = [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) { // CriticalOp has only one region associated with it. auto ®ion = cast(opInst).getRegion(); builder.restoreIP(codeGenIP); @@ -1050,7 +1073,7 @@ convertOmpOrdered(Operation &opInst, llvm::IRBuilderBase &builder, indexVecValues++; } llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createOrderedDepend( ompLoc, allocaIP, numLoops, storeValues, ".cnt.addr", isDependSource)); @@ -1069,7 +1092,8 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(opInst))) return failure(); - auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { + auto bodyGenCB = [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) { // OrderedOp has only one region associated with it. 
auto ®ion = cast(opInst).getRegion(); builder.restoreIP(codeGenIP); @@ -1860,7 +1884,7 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, SmallVector reductionDecls; collectReductionDecls(sectionsOp, reductionDecls); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); SmallVector privateReductionVariables( sectionsOp.getNumReductionVars()); @@ -1884,7 +1908,8 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, Region ®ion = sectionOp.getRegion(); auto sectionCB = [§ionsOp, ®ion, &builder, &moduleTranslation]( - InsertPointTy allocaIP, InsertPointTy codeGenIP) { + InsertPointTy allocIP, InsertPointTy codeGenIP, + ArrayRef deallocIPs) { builder.restoreIP(codeGenIP); // map the omp.section reduction block argument to the omp.sections block @@ -1929,7 +1954,7 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, // called for variables which have destructors/finalizers. 
auto finiCB = [&](InsertPointTy codeGenIP) { return llvm::Error::success(); }; - allocaIP = findAllocaInsertPoint(builder, moduleTranslation); + allocaIP = findAllocInsertPoints(builder, moduleTranslation); bool isCancellable = constructIsCancellable(sectionsOp); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = @@ -1958,7 +1983,8 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(*singleOp))) return failure(); - auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { + auto bodyCB = [&](InsertPointTy allocIP, InsertPointTy codegenIP, + llvm::ArrayRef deallocIPs) { builder.restoreIP(codegenIP); return convertOmpOpRegions(singleOp.getRegion(), "omp.single.region", builder, moduleTranslation) @@ -2041,7 +2067,7 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, SmallVector privateReductionVariables(numReductionVars); llvm::ArrayRef isByRef; llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); // Only do teams reduction if there is no distribute op that captures the // reduction instead. 
@@ -2063,9 +2089,10 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, return failure(); } - auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { - LLVM::ModuleTranslation::SaveStack frame( - moduleTranslation, allocaIP); + auto bodyCB = [&](InsertPointTy allocIP, InsertPointTy codegenIP, + llvm::ArrayRef deallocIPs) { + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); builder.restoreIP(codegenIP); return convertOmpOpRegions(op.getRegion(), "omp.teams.region", builder, moduleTranslation) @@ -2322,9 +2349,9 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, // code outside of the outlined task region, which is what we want because // this way the initialization and copy regions are executed immediately while // the host variable data are still live. - - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); // Not using splitBB() because that requires the current block to have a // terminator. @@ -2354,8 +2381,8 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, // Save the alloca insertion point on ModuleTranslation stack for use in // nested regions. 
- LLVM::ModuleTranslation::SaveStack frame( - moduleTranslation, allocaIP); + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); // Allocate and initialize private variables builder.SetInsertPoint(initBlock->getTerminator()); @@ -2419,12 +2446,12 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, // Set up for call to createTask() builder.SetInsertPoint(taskStartBlock); - auto bodyCB = [&](InsertPointTy allocaIP, - InsertPointTy codegenIP) -> llvm::Error { + auto bodyCB = [&](InsertPointTy allocIP, InsertPointTy codegenIP, + llvm::ArrayRef deallocIPs) -> llvm::Error { // Save the alloca insertion point on ModuleTranslation stack for use in // nested regions. - LLVM::ModuleTranslation::SaveStack frame( - moduleTranslation, allocaIP); + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); // translate the body of the task: builder.restoreIP(codegenIP); @@ -2442,7 +2469,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, llvm::IRBuilderBase::InsertPointGuard guard(builder); llvm::Type *llvmAllocType = moduleTranslation.convertType(privDecl.getType()); - builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); + builder.SetInsertPoint(allocIP.getBlock()->getTerminator()); llvm::Value *llvmPrivateVar = builder.CreateAlloca( llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc"); @@ -2516,7 +2543,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = moduleTranslation.getOpenMPBuilder()->createTask( - ompLoc, allocaIP, bodyCB, !taskOp.getUntied(), + ompLoc, allocIP, deallocIPs, bodyCB, !taskOp.getUntied(), moduleTranslation.lookupValue(taskOp.getFinal()), moduleTranslation.lookupValue(taskOp.getIfExpr()), dds, taskOp.getMergeable(), @@ -2541,18 +2568,21 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder, 
if (failed(checkImplementationStatus(*tgOp))) return failure(); - auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { + auto bodyCB = [&](InsertPointTy allocIP, InsertPointTy codegenIP, + llvm::ArrayRef deallocIPs) { builder.restoreIP(codegenIP); return convertOmpOpRegions(tgOp.getRegion(), "omp.taskgroup.region", builder, moduleTranslation) .takeError(); }; - InsertPointTy allocaIP = findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = - moduleTranslation.getOpenMPBuilder()->createTaskgroup(ompLoc, allocaIP, - bodyCB); + moduleTranslation.getOpenMPBuilder()->createTaskgroup(ompLoc, allocIP, + deallocIPs, bodyCB); if (failed(handleError(afterIP, *tgOp))) return failure(); @@ -2602,8 +2632,9 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, SmallVector reductionDecls; collectReductionDecls(wsloopOp, reductionDecls); + llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); SmallVector privateReductionVariables( wsloopOp.getNumReductionVars()); @@ -2776,10 +2807,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, opInst.getNumReductionVars()); SmallVector deferredStores; - auto bodyGenCB = [&](InsertPointTy allocaIP, - InsertPointTy codeGenIP) -> llvm::Error { + auto bodyGenCB = + [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) -> llvm::Error { llvm::Expected afterAllocas = allocatePrivateVars( - opInst, builder, moduleTranslation, privateVarsInfo, allocaIP); + opInst, builder, moduleTranslation, privateVarsInfo, allocIP); if (handleError(afterAllocas, *opInst).failed()) return llvm::make_error(); @@ -2789,12 +2821,11 @@ 
convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, MutableArrayRef reductionArgs = cast(*opInst).getReductionBlockArgs(); - allocaIP = - InsertPointTy(allocaIP.getBlock(), - allocaIP.getBlock()->getTerminator()->getIterator()); + allocIP = InsertPointTy(allocIP.getBlock(), + allocIP.getBlock()->getTerminator()->getIterator()); if (failed(allocReductionVars( - opInst, reductionArgs, builder, moduleTranslation, allocaIP, + opInst, reductionArgs, builder, moduleTranslation, allocIP, reductionDecls, privateReductionVariables, reductionVariableMap, deferredStores, isByRef))) return llvm::make_error(); @@ -2823,8 +2854,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, // Save the alloca insertion point on ModuleTranslation stack for use in // nested regions. - LLVM::ModuleTranslation::SaveStack frame( - moduleTranslation, allocaIP); + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); // ParallelOp has only one region associated with it. 
llvm::Expected regionBlock = convertOmpOpRegions( @@ -2851,7 +2882,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointOrErrorTy contInsertPoint = ompBuilder->createReductions( - builder.saveIP(), allocaIP, reductionInfos, isByRef, + builder.saveIP(), allocIP, reductionInfos, isByRef, /*IsNoWait=*/false, /*IsTeamsReduction=*/false); if (!contInsertPoint) return contInsertPoint.takeError(); @@ -2912,13 +2943,15 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, pbKind = getProcBindKind(*bind); bool isCancellable = constructIsCancellable(opInst); - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + llvm::OpenMPIRBuilder::InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = - ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB, - ifCond, numThreads, pbKind, isCancellable); + ompBuilder->createParallel(ompLoc, allocIP, deallocIPs, bodyGenCB, privCB, + finiCB, ifCond, numThreads, pbKind, + isCancellable); if (failed(handleError(afterIP, *opInst))) return failure(); @@ -2963,7 +2996,7 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, assert(isByRef.size() == simdOp.getNumReductionVars()); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); llvm::Expected afterAllocas = allocatePrivateVars( simdOp, builder, moduleTranslation, privateVarsInfo, allocaIP); @@ -3343,7 +3376,7 @@ convertOmpAtomicRead(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + 
findAllocInsertPoints(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); @@ -3370,7 +3403,7 @@ convertOmpAtomicWrite(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::AtomicOrdering ao = convertAtomicOrdering(writeOp.getMemoryOrder()); @@ -3487,7 +3520,7 @@ convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst, extractAtomicControlFlags(opInst, isIgnoreDenormalMode, isFineGrainedMemory, isRemoteMemory); // Handle ambiguous alloca, if any. - auto allocaIP = findAllocaInsertPoint(builder, moduleTranslation); + auto allocaIP = findAllocInsertPoints(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = ompBuilder->createAtomicUpdate(ompLoc, allocaIP, llvmAtomicX, llvmExpr, @@ -3588,7 +3621,7 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp, extractAtomicControlFlags(atomicUpdateOp, isIgnoreDenormalMode, isFineGrainedMemory, isRemoteMemory); // Handle ambiguous alloca, if any. 
- auto allocaIP = findAllocaInsertPoint(builder, moduleTranslation); + auto allocaIP = findAllocInsertPoints(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = ompBuilder->createAtomicCapture( @@ -4881,7 +4914,7 @@ createAlteredByCaptureMap(MapInfoData &mapData, if (!isPtrTy) { auto curInsert = builder.saveIP(); llvm::DebugLoc DbgLoc = builder.getCurrentDebugLocation(); - builder.restoreIP(findAllocaInsertPoint(builder, moduleTranslation)); + builder.restoreIP(findAllocInsertPoints(builder, moduleTranslation)); auto *memTempAlloc = builder.CreateAlloca(builder.getPtrTy(), nullptr, ".casted"); builder.SetCurrentDebugLocation(DbgLoc); @@ -5273,18 +5306,21 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, }; llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + llvm::OpenMPIRBuilder::InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = [&]() { if (isa(op)) - return ompBuilder->createTargetData(ompLoc, allocaIP, builder.saveIP(), + return ompBuilder->createTargetData(ompLoc, allocIP, builder.saveIP(), + deallocIPs, builder.getInt64(deviceID), ifCond, info, genMapInfoCB, customMapperCB, /*MapperFunc=*/nullptr, bodyGenCB, /*DeviceAddrCB=*/nullptr); - return ompBuilder->createTargetData( - ompLoc, allocaIP, builder.saveIP(), builder.getInt64(deviceID), ifCond, - info, genMapInfoCB, customMapperCB, &RTLFn); + return ompBuilder->createTargetData(ompLoc, allocIP, builder.saveIP(), + deallocIPs, builder.getInt64(deviceID), + ifCond, info, genMapInfoCB, + customMapperCB, &RTLFn); }(); if (failed(handleError(afterIP, *op))) @@ -5320,7 +5356,7 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, 
collectReductionDecls(teamsOp, reductionDecls); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); MutableArrayRef reductionArgs = llvm::cast(*teamsOp) @@ -5334,19 +5370,20 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, } using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; - auto bodyGenCB = [&](InsertPointTy allocaIP, - InsertPointTy codeGenIP) -> llvm::Error { + auto bodyGenCB = + [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) -> llvm::Error { // Save the alloca insertion point on ModuleTranslation stack for use in // nested regions. - LLVM::ModuleTranslation::SaveStack frame( - moduleTranslation, allocaIP); + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); // DistributeOp has only one region associated with it. builder.restoreIP(codeGenIP); PrivateVarsInfo privVarsInfo(distributeOp); llvm::Expected afterAllocas = allocatePrivateVars( - distributeOp, builder, moduleTranslation, privVarsInfo, allocaIP); + distributeOp, builder, moduleTranslation, privVarsInfo, allocIP); if (handleError(afterAllocas, opInst).failed()) return llvm::make_error(); @@ -5389,7 +5426,7 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, findCurrentLoopInfo(moduleTranslation); llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP = ompBuilder->applyWorkshareLoop( - ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier, + ompLoc.DL, loopInfo, allocIP, loopNeedsBarrier, convertToScheduleKind(schedule), chunk, isSimd, scheduleMod == omp::ScheduleModifier::monotonic, scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered, @@ -5406,11 +5443,12 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, return llvm::Error::success(); }; - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector 
deallocIPs; + llvm::OpenMPIRBuilder::InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = - ompBuilder->createDistribute(ompLoc, allocaIP, bodyGenCB); + ompBuilder->createDistribute(ompLoc, allocIP, deallocIPs, bodyGenCB); if (failed(handleError(afterIP, opInst))) return failure(); @@ -5420,7 +5458,7 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, if (doDistributeReduction) { // Process the reductions if required. return createReductionsAndCleanup( - teamsOp, builder, moduleTranslation, allocaIP, reductionDecls, + teamsOp, builder, moduleTranslation, allocIP, reductionDecls, privateReductionVariables, isByRef, /*isNoWait*/ false, /*isTeamsReduction*/ true); } @@ -6103,7 +6141,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, } using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; - auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) + auto bodyCB = [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + ArrayRef deallocIPs) -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy { llvm::IRBuilderBase::InsertPointGuard guard(builder); builder.SetCurrentDebugLocation(llvm::DebugLoc()); @@ -6145,7 +6184,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, llvm::Expected afterAllocas = allocatePrivateVars(targetOp, builder, moduleTranslation, - privateVarsInfo, allocaIP, &mappedPrivateVars); + privateVarsInfo, allocIP, &mappedPrivateVars); if (failed(handleError(afterAllocas, *targetOp))) return llvm::make_error(); @@ -6170,6 +6209,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, return &privatizer.getDeallocRegion(); }); + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); llvm::Expected exitBlock = convertOmpOpRegions( targetRegion, "omp.target", builder, moduleTranslation); @@ -6274,8 
+6315,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, buildDependData(targetOp.getDependKinds(), targetOp.getDependVars(), moduleTranslation, dds); - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + llvm::OpenMPIRBuilder::InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::TargetDataInfo info( @@ -6297,9 +6339,10 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = moduleTranslation.getOpenMPBuilder()->createTarget( - ompLoc, isOffloadEntry, allocaIP, builder.saveIP(), info, entryInfo, - defaultAttrs, runtimeAttrs, ifCond, kernelInput, genMapInfoCB, bodyCB, - argAccessorCB, customMapperCB, dds, targetOp.getNowait()); + ompLoc, isOffloadEntry, allocIP, builder.saveIP(), deallocIPs, info, + entryInfo, defaultAttrs, runtimeAttrs, ifCond, kernelInput, + genMapInfoCB, bodyCB, argAccessorCB, customMapperCB, dds, + targetOp.getNowait()); if (failed(handleError(afterIP, opInst))) return failure(); diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir index 7b157aeef4fe4..866284a47050f 100644 --- a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir @@ -55,21 +55,21 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: define weak_odr protected amdgpu_kernel void @[[FUNC0:.*]]( // CHECK-SAME: ptr %[[TMP:.*]], ptr %[[TMP0:.*]]) #{{[0-9]+}} { // CHECK: %[[TMP1:.*]] = alloca [1 x ptr], align 8, addrspace(5) -// CHECK: %[[TMP2:.*]] = addrspacecast ptr addrspace(5) %[[TMP1]] to ptr -// CHECK: %[[TMP3:.*]] = alloca ptr, align 8, addrspace(5) -// CHECK: %[[TMP4:.*]] = addrspacecast ptr addrspace(5) %[[TMP3]] to ptr -// 
CHECK: store ptr %[[TMP0]], ptr %[[TMP4]], align 8 -// CHECK: %[[TMP5:.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{.*}} to ptr), ptr %[[TMP]]) -// CHECK: %[[EXEC_USER_CODE:.*]] = icmp eq i32 %[[TMP5]], -1 +// CHECK: %[[TMP2:.*]] = alloca ptr, align 8, addrspace(5) +// CHECK: %[[TMP3:.*]] = addrspacecast ptr addrspace(5) %[[TMP2]] to ptr +// CHECK: store ptr %[[TMP0]], ptr %[[TMP3]], align 8 +// CHECK: %[[TMP4:.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{.*}} to ptr), ptr %[[TMP]]) +// CHECK: %[[EXEC_USER_CODE:.*]] = icmp eq i32 %[[TMP4]], -1 // CHECK: br i1 %[[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[WORKER_EXIT:.*]] -// CHECK: %[[TMP6:.*]] = load ptr, ptr %[[TMP4]], align 8 +// CHECK: %[[TMP5:.*]] = addrspacecast ptr addrspace(5) %[[TMP1]] to ptr // CHECK: %[[STRUCTARG:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) +// CHECK: %[[TMP6:.*]] = load ptr, ptr %[[TMP3]], align 8 // CHECK: %[[OMP_GLOBAL_THREAD_NUM:.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr)) // CHECK: %[[GEP_:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0 // CHECK: store ptr %[[TMP6]], ptr %[[GEP_]], align 8 -// CHECK: %[[TMP7:.*]] = getelementptr inbounds [1 x ptr], ptr %[[TMP2]], i64 0, i64 0 +// CHECK: %[[TMP7:.*]] = getelementptr inbounds [1 x ptr], ptr %[[TMP5]], i64 0, i64 0 // CHECK: store ptr %[[STRUCTARG]], ptr %[[TMP7]], align 8 -// CHECK: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 1, i32 -1, i32 -1, ptr @[[FUNC1:.*]], ptr @[[FUNC1_WRAPPER:.*]], ptr %[[TMP2]], i64 1) +// CHECK: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 1, i32 -1, i32 -1, ptr @[[FUNC1:.*]], ptr @[[FUNC1_WRAPPER:.*]], ptr %[[TMP5]], i64 1) // CHECK: call void @__kmpc_free_shared(ptr %[[STRUCTARG]], i64 8) // CHECK: 
call void @__kmpc_target_deinit() diff --git a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir index 5a76871c180ab..3ebb79fef7474 100644 --- a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir @@ -56,7 +56,9 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: %[[B:.*]] = load i32, ptr %[[PTR_B]], align 4 // CHECK: %[[C:.*]] = add i32 %[[A]], %[[B]] // CHECK: store i32 %[[C]], ptr %[[PTR_C]], align 4 -// CHECK: br label %[[LABEL_DEINIT:.*]] +// CHECK: br label %[[LABEL_TARGET_EXIT:.*]] +// CHECK: [[LABEL_TARGET_EXIT]]: +// CHECK-NEXT: br label %[[LABEL_DEINIT:.*]] // CHECK: [[LABEL_DEINIT]]: // CHECK-NEXT: call void @__kmpc_target_deinit() // CHECK-NEXT: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir b/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir index 0ee9230b5af0e..2aa11f3a1aa34 100644 --- a/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir @@ -70,4 +70,6 @@ llvm.func @_FortranAAssign(!llvm.ptr, !llvm.ptr, !llvm.ptr, i32) -> !llvm.struct // CHECK: call void @dealloc_foo_1(ptr %[[DESC_TO_DEALLOC]]) // CHECK-NEXT: br label %[[CONT_BLOCK:.*]] // CHECK: [[CONT_BLOCK]]: +// CHECK-NEXT: br label %[[EXIT_BLOCK:.*]] +// CHECK: [[EXIT_BLOCK]]: // CHECK-NEXT: ret void From 2c3db47eef2071b736223b2d963c78dca76ac180 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 12 Sep 2025 11:26:40 +0100 Subject: [PATCH 08/22] [MLIR][OpenMP] Refactor omp.target_allocmem to allow reuse, NFC This patch moves tablegen definitions that could be used for all kinds of heap allocations out of `omp.target_allocmem` and into a new `OpenMP_HeapAllocClause` that can be reused. 
Descriptions are updated to follow the format of most other operations and the custom verifier for `omp.target_allocmem` is removed as it only made a redundant check on its result type. --- .../mlir/Dialect/OpenMP/OpenMPClauses.td | 53 ++++++ mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 80 ++++----- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 153 ++++++------------ mlir/test/Dialect/OpenMP/invalid.mlir | 14 ++ mlir/test/Dialect/OpenMP/ops.mlir | 24 +++ 5 files changed, 176 insertions(+), 148 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td index 8e43c4284d078..bfee763290757 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td @@ -20,6 +20,7 @@ #define OPENMP_CLAUSES include "mlir/Dialect/OpenMP/OpenMPOpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/SymbolInterfaces.td" //===----------------------------------------------------------------------===// @@ -547,6 +548,58 @@ class OpenMP_HasDeviceAddrClauseSkip< def OpenMP_HasDeviceAddrClause : OpenMP_HasDeviceAddrClauseSkip<>; +//===----------------------------------------------------------------------===// +// Not in the spec: Clause-like structure to hold heap allocation information. +//===----------------------------------------------------------------------===// + +class OpenMP_HeapAllocClauseSkip< + bit traits = false, bit arguments = false, bit assemblyFormat = false, + bit description = false, bit extraClassDeclaration = false + > : OpenMP_Clause { + let traits = [ + MemoryEffects<[MemAlloc]> + ]; + + let arguments = (ins + TypeAttr:$in_type, + OptionalAttr:$uniq_name, + OptionalAttr:$bindc_name, + Variadic:$typeparams, + Variadic:$shape + ); + + // The custom parser doesn't parse `uniq_name` and `bindc_name`. This is + // handled by the attr-dict, which must be present in the operation's + // `assemblyFormat`. 
+ let reqAssemblyFormat = [{ + custom($in_type, $typeparams, type($typeparams), $shape, + type($shape)) + }]; + + let extraClassDeclaration = [{ + mlir::Type getAllocatedType() { return getInTypeAttr().getValue(); } + }]; + + let description = [{ + The `in_type` is the type of the object for which memory is being allocated. + For arrays, this can be a static or dynamic array type. + + The optional `uniq_name` is a unique name for the allocated memory. + + The optional `bindc_name` is a name used for C interoperability. + + The `typeparams` are runtime type parameters for polymorphic or + parameterized types. These are typically integer values that define aspects + of a type not fixed at compile time. + + The `shape` holds runtime shape operands for dynamic arrays. Each operand is + an integer value representing the extent of a specific dimension. + }]; +} + +def OpenMP_HeapAllocClause : OpenMP_HeapAllocClauseSkip<>; + //===----------------------------------------------------------------------===// // V5.2: [5.4.7] `inclusive` clause //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index ec7d99b86cc8b..303144c3c6626 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2115,59 +2115,45 @@ def AllocateDirOp : OpenMP_Op<"allocate_dir", [AttrSizedOperandSegments], clause // TargetAllocMemOp //===----------------------------------------------------------------------===// -def TargetAllocMemOp : OpenMP_Op<"target_allocmem", - [MemoryEffects<[MemAlloc]>, AttrSizedOperandSegments]> { +def TargetAllocMemOp : OpenMP_Op<"target_allocmem", traits = [ + AttrSizedOperandSegments + ], clauses = [ + OpenMP_HeapAllocClause + ]> { let summary = "allocate storage on an openmp device for an object of a given type"; let description = [{ - Allocates memory on the specified OpenMP device for 
an object of the given type. - Returns an integer value representing the device pointer to the allocated memory. - The memory is uninitialized after allocation. Operations must be paired with - `omp.target_freemem` to avoid memory leaks. - - * `$device`: The integer ID of the OpenMP device where the memory will be allocated. - * `$in_type`: The type of the object for which memory is being allocated. - For arrays, this can be a static or dynamic array type. - * `$uniq_name`: An optional unique name for the allocated memory. - * `$bindc_name`: An optional name used for C interoperability. - * `$typeparams`: Runtime type parameters for polymorphic or parameterized types. - These are typically integer values that define aspects of a type not fixed at compile time. - * `$shape`: Runtime shape operands for dynamic arrays. - Each operand is an integer value representing the extent of a specific dimension. - - ```mlir - // Allocate a static 3x3 integer vector on device 0 - %device_0 = arith.constant 0 : i32 - %ptr_static = omp.target_allocmem %device_0 : i32, vector<3x3xi32> - // ... use %ptr_static ... - omp.target_freemem %device_0, %ptr_static : i32, i64 - - // Allocate a dynamic 2D Fortran array (fir.array) on device 1 - %device_1 = arith.constant 1 : i32 - %rows = arith.constant 10 : index - %cols = arith.constant 20 : index - %ptr_dynamic = omp.target_allocmem %device_1 : i32, !fir.array, %rows, %cols : index, index - // ... use %ptr_dynamic ... - omp.target_freemem %device_1, %ptr_dynamic : i32, i64 - ``` - }]; + Allocates memory on the specified OpenMP device for an object of the given + type. Returns an integer value representing the device pointer to the + allocated memory. The memory is uninitialized after allocation. Operations + must be paired with `omp.target_freemem` to avoid memory leaks. 
- let arguments = (ins - Arg:$device, - TypeAttr:$in_type, - OptionalAttr:$uniq_name, - OptionalAttr:$bindc_name, - Variadic:$typeparams, - Variadic:$shape - ); - let results = (outs I64); + ```mlir + // Allocate a static 3x3 integer vector on device 0 + %device_0 = arith.constant 0 : i32 + %ptr_static = omp.target_allocmem %device_0 : i32, vector<3x3xi32> + // ... use %ptr_static ... + omp.target_freemem %device_0, %ptr_static : i32, i64 + + // Allocate a dynamic 2D Fortran array (fir.array) on device 1 + %device_1 = arith.constant 1 : i32 + %rows = arith.constant 10 : index + %cols = arith.constant 20 : index + %ptr_dynamic = omp.target_allocmem %device_1 : i32, !fir.array, %rows, %cols : index, index + // ... use %ptr_dynamic ... + omp.target_freemem %device_1, %ptr_dynamic : i32, i64 + ``` - let hasCustomAssemblyFormat = 1; - let hasVerifier = 1; + The `device` is an integer ID of the OpenMP device where the memory will be + allocated. + }] # clausesDescription; - let extraClassDeclaration = [{ - mlir::Type getAllocatedType(); - }]; + let arguments = !con((ins Arg:$device), clausesArgs); + let results = (outs I64); + + // Override inherited assembly format to include `device`. 
+ let assemblyFormat = " $device `:` type($device) `,` " + # clausesReqAssemblyFormat # " attr-dict"; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 1ea34e6c898ab..f6455de8c6b76 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -797,6 +797,58 @@ static void printNumTasksClause(OpAsmPrinter &p, Operation *op, p, op, numTasksMod, numTasks, numTasksType, &stringifyClauseNumTasksType); } +//===----------------------------------------------------------------------===// +// Parser and printer for Heap Alloc Clause +//===----------------------------------------------------------------------===// + +/// operation ::= $in_type ( `(` $typeparams `)` )? ( `,` $shape )? +static ParseResult parseHeapAllocClause( + OpAsmParser &parser, TypeAttr &inTypeAttr, + SmallVectorImpl &typeparams, + SmallVectorImpl &typeparamsTypes, + SmallVectorImpl &shape, + SmallVectorImpl &shapeTypes) { + mlir::Type inType; + if (parser.parseType(inType)) + return mlir::failure(); + inTypeAttr = TypeAttr::get(inType); + + if (!parser.parseOptionalLParen()) { + // parse the LEN params of the derived type. ( : ) + if (parser.parseOperandList(typeparams, OpAsmParser::Delimiter::None) || + parser.parseColonTypeList(typeparamsTypes) || parser.parseRParen()) + return failure(); + } + + if (!parser.parseOptionalComma()) { + // parse size to scale by, vector of n dimensions of type index + if (parser.parseOperandList(shape, OpAsmParser::Delimiter::None)) + return failure(); + + // TODO: This overrides the actual types of the operands, which might cause + // issues when they don't match. At the moment this is done in place of + // making the corresponding operand type `Variadic` because index + // types are lowered to I64 prior to LLVM IR translation. 
+ shapeTypes.append(shape.size(), IndexType::get(parser.getContext())); + } + + return success(); +} + +static void printHeapAllocClause(OpAsmPrinter &p, Operation *op, + TypeAttr inType, ValueRange typeparams, + TypeRange typeparamsTypes, ValueRange shape, + TypeRange shapeTypes) { + p << inType; + if (!typeparams.empty()) { + p << '(' << typeparams << " : " << typeparamsTypes << ')'; + } + for (auto sh : shape) { + p << ", "; + p.printOperand(sh); + } +} + //===----------------------------------------------------------------------===// // Parsers for operations including clauses that define entry block arguments. //===----------------------------------------------------------------------===// @@ -4284,107 +4336,6 @@ LogicalResult AllocateDirOp::verify() { return success(); } -//===----------------------------------------------------------------------===// -// TargetAllocMemOp -//===----------------------------------------------------------------------===// - -mlir::Type omp::TargetAllocMemOp::getAllocatedType() { - return getInTypeAttr().getValue(); -} - -/// operation ::= %res = (`omp.target_alloc_mem`) $device : devicetype, -/// $in_type ( `(` $typeparams `)` )? ( `,` $shape )? 
-/// attr-dict-without-keyword -static mlir::ParseResult parseTargetAllocMemOp(mlir::OpAsmParser &parser, - mlir::OperationState &result) { - auto &builder = parser.getBuilder(); - bool hasOperands = false; - std::int32_t typeparamsSize = 0; - - // Parse device number as a new operand - mlir::OpAsmParser::UnresolvedOperand deviceOperand; - mlir::Type deviceType; - if (parser.parseOperand(deviceOperand) || parser.parseColonType(deviceType)) - return mlir::failure(); - if (parser.resolveOperand(deviceOperand, deviceType, result.operands)) - return mlir::failure(); - if (parser.parseComma()) - return mlir::failure(); - - mlir::Type intype; - if (parser.parseType(intype)) - return mlir::failure(); - result.addAttribute("in_type", mlir::TypeAttr::get(intype)); - llvm::SmallVector operands; - llvm::SmallVector typeVec; - if (!parser.parseOptionalLParen()) { - // parse the LEN params of the derived type. ( : ) - if (parser.parseOperandList(operands, mlir::OpAsmParser::Delimiter::None) || - parser.parseColonTypeList(typeVec) || parser.parseRParen()) - return mlir::failure(); - typeparamsSize = operands.size(); - hasOperands = true; - } - std::int32_t shapeSize = 0; - if (!parser.parseOptionalComma()) { - // parse size to scale by, vector of n dimensions of type index - if (parser.parseOperandList(operands, mlir::OpAsmParser::Delimiter::None)) - return mlir::failure(); - shapeSize = operands.size() - typeparamsSize; - auto idxTy = builder.getIndexType(); - for (std::int32_t i = typeparamsSize, end = operands.size(); i != end; ++i) - typeVec.push_back(idxTy); - hasOperands = true; - } - if (hasOperands && - parser.resolveOperands(operands, typeVec, parser.getNameLoc(), - result.operands)) - return mlir::failure(); - - mlir::Type restype = builder.getIntegerType(64); - if (!restype) { - parser.emitError(parser.getNameLoc(), "invalid allocate type: ") << intype; - return mlir::failure(); - } - llvm::SmallVector segmentSizes{1, typeparamsSize, shapeSize}; - 
result.addAttribute("operandSegmentSizes", - builder.getDenseI32ArrayAttr(segmentSizes)); - if (parser.parseOptionalAttrDict(result.attributes) || - parser.addTypeToList(restype, result.types)) - return mlir::failure(); - return mlir::success(); -} - -mlir::ParseResult omp::TargetAllocMemOp::parse(mlir::OpAsmParser &parser, - mlir::OperationState &result) { - return parseTargetAllocMemOp(parser, result); -} - -void omp::TargetAllocMemOp::print(mlir::OpAsmPrinter &p) { - p << " "; - p.printOperand(getDevice()); - p << " : "; - p << getDevice().getType(); - p << ", "; - p << getInType(); - if (!getTypeparams().empty()) { - p << '(' << getTypeparams() << " : " << getTypeparams().getTypes() << ')'; - } - for (auto sh : getShape()) { - p << ", "; - p.printOperand(sh); - } - p.printOptionalAttrDict((*this)->getAttrs(), - {"in_type", "operandSegmentSizes"}); -} - -llvm::LogicalResult omp::TargetAllocMemOp::verify() { - mlir::Type outType = getType(); - if (!mlir::dyn_cast(outType)) - return emitOpError("must be a integer type"); - return mlir::success(); -} - //===----------------------------------------------------------------------===// // WorkdistributeOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 6777a03a4f026..385b89fd1dc7c 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -3133,3 +3133,17 @@ func.func @invalid_workdistribute() -> () { } return } + +// ----- +func.func @target_allocmem_invalid_uniq_name(%device : i32) -> () { +// expected-error @below {{op attribute 'uniq_name' failed to satisfy constraint: string attribute}} + %0 = omp.target_allocmem %device : i32, i64 {uniq_name=2} + return +} + +// ----- +func.func @target_allocmem_invalid_bindc_name(%device : i32) -> () { +// expected-error @below {{op attribute 'bindc_name' failed to satisfy constraint: string attribute}} + %0 = 
omp.target_allocmem %device : i32, i64 {bindc_name=2} + return +} diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index ac29e20907b55..3ab439f30b76f 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -3367,3 +3367,27 @@ func.func @omp_target_map_clause_type_test(%arg0 : memref) -> () { return } + +// CHECK-LABEL: func.func @omp_target_allocmem( +// CHECK-SAME: %[[DEVICE:.*]]: i32, %[[X:.*]]: index, %[[Y:.*]]: index, %[[Z:.*]]: i32) { +func.func @omp_target_allocmem(%device: i32, %x: index, %y: index, %z: i32) { + // CHECK: %{{.*}} = omp.target_allocmem %[[DEVICE]] : i32, i64 + %0 = omp.target_allocmem %device : i32, i64 + // CHECK: %{{.*}} = omp.target_allocmem %[[DEVICE]] : i32, vector<16x16xf32> {bindc_name = "bindc", uniq_name = "uniq"} + %1 = omp.target_allocmem %device : i32, vector<16x16xf32> {uniq_name="uniq", bindc_name="bindc"} + // CHECK: %{{.*}} = omp.target_allocmem %[[DEVICE]] : i32, !llvm.ptr(%[[X]], %[[Y]], %[[Z]] : index, index, i32) + %2 = omp.target_allocmem %device : i32, !llvm.ptr(%x, %y, %z : index, index, i32) + // CHECK: %{{.*}} = omp.target_allocmem %[[DEVICE]] : i32, !llvm.ptr, %[[X]], %[[Y]] + %3 = omp.target_allocmem %device : i32, !llvm.ptr, %x, %y + // CHECK: %{{.*}} = omp.target_allocmem %[[DEVICE]] : i32, !llvm.ptr(%[[X]], %[[Y]], %[[Z]] : index, index, i32), %[[X]], %[[Y]] + %4 = omp.target_allocmem %device : i32, !llvm.ptr(%x, %y, %z : index, index, i32), %x, %y + return +} + +// CHECK-LABEL: func.func @omp_target_freemem( +// CHECK-SAME: %[[DEVICE:.*]]: i32, %[[PTR:.*]]: i64) { +func.func @omp_target_freemem(%device : i32, %ptr : i64) { + // CHECK: omp.target_freemem %[[DEVICE]], %[[PTR]] : i32, i64 + omp.target_freemem %device, %ptr : i32, i64 + return +} From bda01aec8b263a15e20c35f8cc08ac0f15414116 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 12 Sep 2025 15:56:04 +0100 Subject: [PATCH 09/22] [Flang][MLIR][OpenMP] Add explicit shared memory 
(de-)allocation ops This patch introduces the `omp.alloc_shared_mem` and `omp.free_shared_mem` operations to represent explicit allocations and deallocations of shared memory across threads in a team, mirroring the existing `omp.target_allocmem` and `omp.target_freemem`. The `omp.alloc_shared_mem` op goes through the same Flang-specific transformations as `omp.target_allocmem`, so that the size of the buffer can be properly calculated when translating to LLVM IR. The corresponding runtime functions produced for these new operations are `__kmpc_alloc_shared` and `__kmpc_free_shared`, which previously could only be created for implicit allocations (e.g. privatized and reduction variables). --- flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp | 42 ++++++++----- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 23 +++++++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 29 ++++++--- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 62 ++++++++++++++++++ mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 22 +++++++ .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 63 ++++++++++++++++--- mlir/test/Dialect/OpenMP/invalid.mlir | 28 +++++++++ mlir/test/Dialect/OpenMP/ops.mlir | 31 ++++++++- 8 files changed, 265 insertions(+), 35 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp index f74d635d50a75..ea0f7dff9f99e 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp @@ -222,36 +222,47 @@ static mlir::Type convertObjectType(const fir::LLVMTypeConverter &converter, return converter.convertType(firType); } -// FIR Op specific conversion for TargetAllocMemOp -struct TargetAllocMemOpConversion - : public OpenMPFIROpConversion { - using OpenMPFIROpConversion::OpenMPFIROpConversion; +// FIR Op specific conversion for allocation operations +template +struct AllocMemOpConversion : public OpenMPFIROpConversion { + using OpenMPFIROpConversion::OpenMPFIROpConversion; llvm::LogicalResult - 
matchAndRewrite(mlir::omp::TargetAllocMemOp allocmemOp, OpAdaptor adaptor, + matchAndRewrite(T allocmemOp, + typename OpenMPFIROpConversion::OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { mlir::Type heapTy = allocmemOp.getAllocatedType(); mlir::Location loc = allocmemOp.getLoc(); - auto ity = lowerTy().indexType(); + auto ity = OpenMPFIROpConversion::lowerTy().indexType(); mlir::Type dataTy = fir::unwrapRefType(heapTy); - mlir::Type llvmObjectTy = convertObjectType(lowerTy(), dataTy); + mlir::Type llvmObjectTy = + convertObjectType(OpenMPFIROpConversion::lowerTy(), dataTy); if (fir::isRecordWithTypeParameters(fir::unwrapSequenceType(dataTy))) - TODO(loc, "omp.target_allocmem codegen of derived type with length " - "parameters"); + TODO(loc, allocmemOp->getName().getStringRef() + + " codegen of derived type with length parameters"); mlir::Value size = fir::computeElementDistance( - loc, llvmObjectTy, ity, rewriter, lowerTy().getDataLayout()); + loc, llvmObjectTy, ity, rewriter, + OpenMPFIROpConversion::lowerTy().getDataLayout()); if (auto scaleSize = fir::genAllocationScaleSize( loc, allocmemOp.getInType(), ity, rewriter)) size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size, scaleSize); - for (mlir::Value opnd : adaptor.getOperands().drop_front()) + for (mlir::Value opnd : adaptor.getTypeparams()) + size = mlir::LLVM::MulOp::create( + rewriter, loc, ity, size, + integerCast(OpenMPFIROpConversion::lowerTy(), loc, rewriter, ity, + opnd)); + for (mlir::Value opnd : adaptor.getShape()) size = mlir::LLVM::MulOp::create( rewriter, loc, ity, size, - integerCast(lowerTy(), loc, rewriter, ity, opnd)); - auto mallocTyWidth = lowerTy().getIndexTypeBitwidth(); + integerCast(OpenMPFIROpConversion::lowerTy(), loc, rewriter, ity, + opnd)); + auto mallocTyWidth = + OpenMPFIROpConversion::lowerTy().getIndexTypeBitwidth(); auto mallocTy = mlir::IntegerType::get(rewriter.getContext(), mallocTyWidth); if (mallocTyWidth != 
ity.getIntOrFloatBitWidth()) - size = integerCast(lowerTy(), loc, rewriter, mallocTy, size); + size = integerCast(OpenMPFIROpConversion::lowerTy(), loc, rewriter, + mallocTy, size); rewriter.modifyOpInPlace(allocmemOp, [&]() { allocmemOp.setInType(rewriter.getI8Type()); allocmemOp.getTypeparamsMutable().clear(); @@ -266,5 +277,6 @@ void fir::populateOpenMPFIRToLLVMConversionPatterns( const LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns) { patterns.add(converter); patterns.add(converter); - patterns.add(converter); + patterns.add, + AllocMemOpConversion>(converter); } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index f5e890e7052f0..5e90417836f4a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2961,6 +2961,17 @@ class OpenMPIRBuilder { LLVM_ABI CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name = ""); + /// Create a runtime call for kmpc_alloc_shared. + /// + /// \param Loc The insert and source location description. + /// \param Size Size of allocated memory space. + /// \param Name Name of call Instruction. + /// + /// \returns CallInst to the kmpc_alloc_shared call. + LLVM_ABI CallInst *createOMPAllocShared(const LocationDescription &Loc, + Value *Size, + const Twine &Name = Twine("")); + /// Create a runtime call for kmpc_alloc_shared. /// /// \param Loc The insert and source location description. @@ -2972,6 +2983,18 @@ class OpenMPIRBuilder { Type *VarType, const Twine &Name = Twine("")); + /// Create a runtime call for kmpc_free_shared. + /// + /// \param Loc The insert and source location description. + /// \param Addr Value obtained from the corresponding kmpc_alloc_shared call. + /// \param Size Size of allocated memory space. + /// \param Name Name of call Instruction. + /// + /// \returns CallInst to the kmpc_free_shared call. 
+ LLVM_ABI CallInst *createOMPFreeShared(const LocationDescription &Loc, + Value *Addr, Value *Size, + const Twine &Name = Twine("")); + /// Create a runtime call for kmpc_free_shared. /// /// \param Loc The insert and source location description. diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 75295d45cb958..73c751f2851b7 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6891,32 +6891,45 @@ CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc, } CallInst *OpenMPIRBuilder::createOMPAllocShared(const LocationDescription &Loc, - Type *VarType, + Value *Size, const Twine &Name) { IRBuilder<>::InsertPointGuard IPG(Builder); updateToLocation(Loc); - const DataLayout &DL = M.getDataLayout(); - Value *Args[] = {Builder.getInt64(DL.getTypeStoreSize(VarType))}; + Value *Args[] = {Size}; Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc_shared); CallInst *Call = Builder.CreateCall(Fn, Args, Name); - Call->addRetAttr( - Attribute::getWithAlignment(M.getContext(), DL.getPrefTypeAlign(Int64))); + Call->addRetAttr(Attribute::getWithAlignment( + M.getContext(), M.getDataLayout().getPrefTypeAlign(Int64))); return Call; } +CallInst *OpenMPIRBuilder::createOMPAllocShared(const LocationDescription &Loc, + Type *VarType, + const Twine &Name) { + return createOMPAllocShared( + Loc, Builder.getInt64(M.getDataLayout().getTypeStoreSize(VarType)), Name); +} + CallInst *OpenMPIRBuilder::createOMPFreeShared(const LocationDescription &Loc, - Value *Addr, Type *VarType, + Value *Addr, Value *Size, const Twine &Name) { IRBuilder<>::InsertPointGuard IPG(Builder); updateToLocation(Loc); - Value *Args[] = { - Addr, Builder.getInt64(M.getDataLayout().getTypeStoreSize(VarType))}; + Value *Args[] = {Addr, Size}; Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free_shared); return Builder.CreateCall(Fn, Args, Name); } +CallInst 
*OpenMPIRBuilder::createOMPFreeShared(const LocationDescription &Loc, + Value *Addr, Type *VarType, + const Twine &Name) { + return createOMPFreeShared( + Loc, Addr, Builder.getInt64(M.getDataLayout().getTypeStoreSize(VarType)), + Name); +} + CallInst *OpenMPIRBuilder::createOMPInteropInit( const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 303144c3c6626..414a61d949914 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2189,6 +2189,68 @@ def TargetFreeMemOp : OpenMP_Op<"target_freemem", Arg:$heapref ); let assemblyFormat = "$device `,` $heapref attr-dict `:` type($device) `,` qualified(type($heapref))"; + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// AllocSharedMemOp +//===----------------------------------------------------------------------===// + +def AllocSharedMemOp : OpenMP_Op<"alloc_shared_mem", traits = [ + AttrSizedOperandSegments + ], clauses = [ + OpenMP_HeapAllocClause + ]> { + let summary = "allocate storage on shared memory for an object of a given type"; + + let description = [{ + Allocates memory shared across threads of a team for an object of the given + type. Returns a pointer representing the allocated memory. The memory is + uninitialized after allocation. Operations must be paired with + `omp.free_shared_mem` to avoid memory leaks. + + ```mlir + // Allocate a static 3x3 integer vector. + %ptr_shared = omp.alloc_shared_mem vector<3x3xi32> : !llvm.ptr + // ...
+ omp.free_shared_mem %ptr_shared : !llvm.ptr + ``` + }] # clausesDescription; + + let results = (outs OpenMP_PointerLikeType); + let assemblyFormat = clausesAssemblyFormat # " attr-dict `:` type(results)"; +} + +//===----------------------------------------------------------------------===// +// FreeSharedMemOp +//===----------------------------------------------------------------------===// + +def FreeSharedMemOp : OpenMP_Op<"free_shared_mem", [MemoryEffects<[MemFree]>]> { + let summary = "free shared memory"; + + let description = [{ + Deallocates shared memory that was previously allocated by an + `omp.alloc_shared_mem` operation. After this operation, the deallocated + memory is in an undefined state and should not be accessed. + It is crucial to ensure that all accesses to the memory region are completed + before `omp.free_shared_mem` is called to avoid undefined behavior. + + ```mlir + // Example of allocating and freeing shared memory. + %ptr_shared = omp.alloc_shared_mem vector<3x3xi32> : !llvm.ptr + // ... + omp.free_shared_mem %ptr_shared : !llvm.ptr + ``` + + The `heapref` operand represents the pointer to shared memory to be + deallocated, previously returned by `omp.alloc_shared_mem`.
+ }]; + + let arguments = (ins + Arg:$heapref + ); + let assemblyFormat = "$heapref attr-dict `:` type($heapref)"; + let hasVerifier = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index f6455de8c6b76..8b92f942d05e5 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -4336,6 +4336,28 @@ LogicalResult AllocateDirOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// TargetFreeMemOp +//===----------------------------------------------------------------------===// + +LogicalResult TargetFreeMemOp::verify() { + return getHeapref().getDefiningOp() + ? success() + : emitOpError() << "'heapref' operand must be defined by an " + "'omp.target_allocmem' op"; +} + +//===----------------------------------------------------------------------===// +// FreeSharedMemOp +//===----------------------------------------------------------------------===// + +LogicalResult FreeSharedMemOp::verify() { + return getHeapref().getDefiningOp() + ? 
success() + : emitOpError() << "'heapref' operand must be defined by an " + "'omp.alloc_shared_memory' op"; +} + //===----------------------------------------------------------------------===// // WorkdistributeOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 0c88110a7e2e0..5151bc63e6a04 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -6676,8 +6676,7 @@ static bool isHostDeviceOp(Operation *op) { if (op->getParentOfType()) return false; - if (mlir::isa(op) || - mlir::isa(op)) + if (mlir::isa(op)) return false; if (auto parentFn = op->getParentOfType()) { @@ -6707,6 +6706,21 @@ static llvm::Function *getOmpTargetAlloc(llvm::IRBuilderBase &builder, return func; } +static llvm::Value * +getAllocationSize(llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, Type allocatedTy, + OperandRange typeparams, OperandRange shape) { + llvm::DataLayout dataLayout = + moduleTranslation.getLLVMModule()->getDataLayout(); + llvm::Type *llvmHeapTy = moduleTranslation.convertType(allocatedTy); + llvm::TypeSize typeSize = dataLayout.getTypeStoreSize(llvmHeapTy); + llvm::Value *allocSize = builder.getInt64(typeSize.getFixedValue()); + for (auto typeParam : typeparams) + allocSize = + builder.CreateMul(allocSize, moduleTranslation.lookupValue(typeParam)); + return allocSize; +} + static LogicalResult convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { @@ -6721,14 +6735,9 @@ convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, mlir::Value deviceNum = allocMemOp.getDevice(); llvm::Value *llvmDeviceNum = moduleTranslation.lookupValue(deviceNum); // Get the allocation size. 
- llvm::DataLayout dataLayout = llvmModule->getDataLayout(); - mlir::Type heapTy = allocMemOp.getAllocatedType(); - llvm::Type *llvmHeapTy = moduleTranslation.convertType(heapTy); - llvm::TypeSize typeSize = dataLayout.getTypeStoreSize(llvmHeapTy); - llvm::Value *allocSize = builder.getInt64(typeSize.getFixedValue()); - for (auto typeParam : allocMemOp.getTypeparams()) - allocSize = - builder.CreateMul(allocSize, moduleTranslation.lookupValue(typeParam)); + llvm::Value *allocSize = getAllocationSize( + builder, moduleTranslation, allocMemOp.getAllocatedType(), + allocMemOp.getTypeparams(), allocMemOp.getShape()); // Create call to "omp_target_alloc" with the args as translated llvm values. llvm::CallInst *call = builder.CreateCall(ompTargetAllocFunc, {allocSize, llvmDeviceNum}); @@ -6739,6 +6748,19 @@ convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } +static LogicalResult +convertAllocSharedMemOp(omp::AllocSharedMemOp allocMemOp, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + llvm::Value *size = getAllocationSize( + builder, moduleTranslation, allocMemOp.getAllocatedType(), + allocMemOp.getTypeparams(), allocMemOp.getShape()); + moduleTranslation.mapValue(allocMemOp.getResult(), + ompBuilder->createOMPAllocShared(builder, size)); + return success(); +} + static llvm::Function *getOmpTargetFree(llvm::IRBuilderBase &builder, llvm::Module *llvmModule) { llvm::Type *ptrTy = builder.getPtrTy(0); @@ -6774,6 +6796,21 @@ convertTargetFreeMemOp(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } +static LogicalResult +convertFreeSharedMemOp(omp::FreeSharedMemOp freeMemOp, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + auto allocMemOp = + freeMemOp.getHeapref().getDefiningOp(); + llvm::Value *size 
= getAllocationSize( + builder, moduleTranslation, allocMemOp.getAllocatedType(), + allocMemOp.getTypeparams(), allocMemOp.getShape()); + ompBuilder->createOMPFreeShared( + builder, moduleTranslation.lookupValue(freeMemOp.getHeapref()), size); + return success(); +} + /// Given an OpenMP MLIR operation, create the corresponding LLVM IR /// (including OpenMP runtime calls). LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation( @@ -6964,6 +7001,12 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation( .Case([&](omp::TargetFreeMemOp) { return convertTargetFreeMemOp(*op, builder, moduleTranslation); }) + .Case([&](omp::AllocSharedMemOp op) { + return convertAllocSharedMemOp(op, builder, moduleTranslation); + }) + .Case([&](omp::FreeSharedMemOp op) { + return convertFreeSharedMemOp(op, builder, moduleTranslation); + }) .Default([&](Operation *inst) { return inst->emitError() << "not yet implemented: " << inst->getName(); diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 385b89fd1dc7c..c9d0a413aaa38 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -3147,3 +3147,31 @@ func.func @target_allocmem_invalid_bindc_name(%device : i32) -> () { %0 = omp.target_allocmem %device : i32, i64 {bindc_name=2} return } + +// ----- +func.func @target_freemem_invalid_ptr(%device : i32, %ptr : i64) -> () { + // expected-error @below {{op 'heapref' operand must be defined by an 'omp.target_allocmem' op}} + omp.target_freemem %device, %ptr : i32, i64 + return +} + +// ----- +func.func @alloc_shared_mem_invalid_uniq_name() -> () { + // expected-error @below {{op attribute 'uniq_name' failed to satisfy constraint: string attribute}} + %0 = omp.alloc_shared_mem i64 {uniq_name=2} + return +} + +// ----- +func.func @alloc_shared_mem_invalid_bindc_name() -> () { + // expected-error @below {{op attribute 'bindc_name' failed to satisfy constraint: string attribute}} + 
%0 = omp.alloc_shared_mem i64 {bindc_name=2} + return +} + +// ----- +func.func @free_shared_mem_invalid_ptr(%ptr : !llvm.ptr) -> () { + // expected-error @below {{op 'heapref' operand must be defined by an 'omp.alloc_shared_memory' op}} + omp.free_shared_mem %ptr : !llvm.ptr + return +} diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 3ab439f30b76f..b9bf8c3f39468 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -3385,9 +3385,36 @@ func.func @omp_target_allocmem(%device: i32, %x: index, %y: index, %z: i32) { } // CHECK-LABEL: func.func @omp_target_freemem( -// CHECK-SAME: %[[DEVICE:.*]]: i32, %[[PTR:.*]]: i64) { -func.func @omp_target_freemem(%device : i32, %ptr : i64) { +// CHECK-SAME: %[[DEVICE:.*]]: i32) { +func.func @omp_target_freemem(%device : i32) { + // CHECK: %[[PTR:.*]] = omp.target_allocmem + %ptr = omp.target_allocmem %device : i32, i64 // CHECK: omp.target_freemem %[[DEVICE]], %[[PTR]] : i32, i64 omp.target_freemem %device, %ptr : i32, i64 return } + +// CHECK-LABEL: func.func @omp_alloc_shared_mem( +// CHECK-SAME: %[[X:.*]]: index, %[[Y:.*]]: index, %[[Z:.*]]: i32) { +func.func @omp_alloc_shared_mem(%x: index, %y: index, %z: i32) { + // CHECK: %{{.*}} = omp.alloc_shared_mem i64 : !llvm.ptr + %0 = omp.alloc_shared_mem i64 : !llvm.ptr + // CHECK: %{{.*}} = omp.alloc_shared_mem vector<16x16xf32> {bindc_name = "bindc", uniq_name = "uniq"} : !llvm.ptr + %1 = omp.alloc_shared_mem vector<16x16xf32> {uniq_name="uniq", bindc_name="bindc"} : !llvm.ptr + // CHECK: %{{.*}} = omp.alloc_shared_mem !llvm.ptr(%[[X]], %[[Y]], %[[Z]] : index, index, i32) : !llvm.ptr + %2 = omp.alloc_shared_mem !llvm.ptr(%x, %y, %z : index, index, i32) : !llvm.ptr + // CHECK: %{{.*}} = omp.alloc_shared_mem !llvm.ptr, %[[X]], %[[Y]] : !llvm.ptr + %3 = omp.alloc_shared_mem !llvm.ptr, %x, %y : !llvm.ptr + // CHECK: %{{.*}} = omp.alloc_shared_mem !llvm.ptr(%[[X]], %[[Y]], %[[Z]] : index, index, i32), %[[X]], 
%[[Y]] : !llvm.ptr + %4 = omp.alloc_shared_mem !llvm.ptr(%x, %y, %z : index, index, i32), %x, %y : !llvm.ptr + return +} + +// CHECK-LABEL: func.func @omp_free_shared_mem() { +func.func @omp_free_shared_mem() { + // CHECK: %[[PTR:.*]] = omp.alloc_shared_mem + %0 = omp.alloc_shared_mem i64 : !llvm.ptr + // CHECK: omp.free_shared_mem %[[PTR]] : !llvm.ptr + omp.free_shared_mem %0 : !llvm.ptr + return +} From 4ebdc72b904c83b0c6aa68329aace117bb5812cd Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 16 Sep 2025 13:45:43 +0100 Subject: [PATCH 10/22] [Flang][OpenMP] Add pass to replace allocas with device shared memory This patch introduces a new Flang OpenMP MLIR pass, only ran for target device modules, that identifies `fir.alloca` operations that should use device shared memory and replaces them with pairs of `omp.alloc_shared_mem` and `omp.free_shared_mem` operations. This works in conjunction to the MLIR to LLVM IR translation pass' handling of privatization, mapping and reductions in the OpenMP dialect to properly select the right memory space for allocations based on where they are made and where they are used. This pass, in particular, handles explicit stack allocations in MLIR, whereas the aforementioned translation pass takes care of implicit ones represented by entry block arguments. 
--- .../include/flang/Optimizer/OpenMP/Passes.td | 17 ++ flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 + flang/lib/Optimizer/OpenMP/StackToShared.cpp | 162 +++++++++++++ flang/lib/Optimizer/Passes/Pipelines.cpp | 1 + .../Transforms/OpenMP/stack-to-shared.mlir | 215 ++++++++++++++++++ 5 files changed, 396 insertions(+) create mode 100644 flang/lib/Optimizer/OpenMP/StackToShared.cpp create mode 100644 flang/test/Transforms/OpenMP/stack-to-shared.mlir diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index 8d30f165dd8b6..a024849ed04b2 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -149,4 +149,21 @@ def AutomapToTargetDataPass let dependentDialects = ["mlir::omp::OpenMPDialect"]; } +def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::func::FuncOp"> { + let summary = "Replaces stack allocations with shared memory."; + let description = [{ + `fir.alloca` operations defining values in a target region and then used + inside of an `omp.parallel` region are replaced by this pass with + `omp.alloc_shared_mem` and `omp.free_shared_mem`. This is also done for + top-level function `fir.alloca`s used in the same way when the parent + function is a target device function. + + This ensures that explicit private allocations, intended to be shared across + threads, use the proper memory space on a target device while supporting the + case of parallel regions indirectly reached from within a target region via + function calls. 
+ }]; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index 304333fa8830e..5a72fe47fa0e7 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -13,6 +13,7 @@ add_flang_library(FlangOpenMPTransforms LowerWorkshare.cpp LowerNontemporal.cpp SimdOnly.cpp + StackToShared.cpp DEPENDS FIRDialect diff --git a/flang/lib/Optimizer/OpenMP/StackToShared.cpp b/flang/lib/Optimizer/OpenMP/StackToShared.cpp new file mode 100644 index 0000000000000..e666e2ed8f9b9 --- /dev/null +++ b/flang/lib/Optimizer/OpenMP/StackToShared.cpp @@ -0,0 +1,162 @@ +//===- StackToShared.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements transforms to swap stack allocations on the target +// device with device shared memory where applicable. 
+// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" +#include "flang/Optimizer/OpenMP/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h" + +namespace flangomp { +#define GEN_PASS_DEF_STACKTOSHAREDPASS +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp + +using namespace mlir; + +namespace { +class StackToSharedPass + : public flangomp::impl::StackToSharedPassBase { +public: + StackToSharedPass() = default; + + void runOnOperation() override { + MLIRContext *context = &getContext(); + OpBuilder builder(context); + + func::FuncOp funcOp = getOperation(); + auto offloadIface = funcOp->getParentOfType(); + if (!offloadIface || !offloadIface.getIsTargetDevice()) + return; + + funcOp->walk([&](fir::AllocaOp allocaOp) { + if (!shouldReplaceAlloca(*allocaOp)) + return; + + // Replace fir.alloca with omp.alloc_shared_mem. + builder.setInsertionPoint(allocaOp); + auto sharedAllocOp = omp::AllocSharedMemOp::create( + builder, allocaOp->getLoc(), allocaOp.getResult().getType(), + allocaOp.getInType(), allocaOp.getUniqNameAttr(), + allocaOp.getBindcNameAttr(), allocaOp.getTypeparams(), + allocaOp.getShape()); + allocaOp.replaceAllUsesWith(sharedAllocOp.getOperation()); + allocaOp.erase(); + + // Create a new omp.free_shared_mem for the allocated buffer prior to + // exiting the region. 
+ Block *allocaBlock = sharedAllocOp->getBlock(); + DominanceInfo domInfo; + for (Block &block : sharedAllocOp->getParentRegion()->getBlocks()) { + Operation *terminator = block.getTerminator(); + if (!terminator->hasSuccessors() && + domInfo.dominates(allocaBlock, &block)) { + builder.setInsertionPoint(terminator); + omp::FreeSharedMemOp::create(builder, sharedAllocOp.getLoc(), + sharedAllocOp); + } + } + }); + } + +private: + // TODO: Refactor the logic in `shouldReplaceAlloca` and `checkAllocaUses` to + // be reusable by the MLIR to LLVM IR translation stage, as something very + // similar is also implemented there to choose between allocas and device + // shared memory allocations when processing OpenMP reductions, mapping and + // privatization. + + // Decide whether to replace a fir.alloca with a pair of device shared memory + // allocation/deallocation pair based on the location of the allocation and + // its uses. + // + // In summary, it should be done whenever the allocation is placed outside any + // parallel regions and inside either a target device function or a generic + // kernel, while being used inside of a parallel region. + bool shouldReplaceAlloca(Operation &op) { + auto targetOp = op.getParentOfType(); + + // It must be inside of a generic omp.target or in a target device function, + // and not inside of omp.parallel. 
+ if (auto parallelOp = op.getParentOfType()) { + if (!targetOp || !targetOp->isProperAncestor(parallelOp)) + return false; + } + + if (targetOp) { + if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) != + mlir::omp::TargetExecMode::generic) + return false; + } else { + auto declTargetIface = dyn_cast( + *op.getParentOfType()); + if (!declTargetIface || !declTargetIface.isDeclareTarget() || + declTargetIface.getDeclareTargetDeviceType() == + mlir::omp::DeclareTargetDeviceType::host) + return false; + } + + return checkAllocaUses(op.getUses()); + } + + // When a use takes place inside an omp.parallel region and it's not as a + // private clause argument, or when it is a reduction argument passed to + // omp.parallel, then the defining allocation is eligible for replacement with + // shared memory. + // + // Only one of the uses needs to meet these conditions to return true. + bool checkAllocaUses(const Operation::use_range &uses) { + auto checkUse = [&](const OpOperand &use) { + Operation *owner = use.getOwner(); + auto moduleOp = owner->getParentOfType(); + if (auto parallelOp = dyn_cast(owner)) { + if (llvm::is_contained(parallelOp.getReductionVars(), use.get())) + return true; + } else if (owner->getParentOfType()) { + // If it is used directly inside of a parallel region, it has to be + // replaced unless the use is a private clause. + if (auto argIface = dyn_cast(owner)) { + if (auto privateSyms = llvm::cast_or_null( + owner->getAttr("private_syms"))) { + for (auto [var, sym] : + llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) { + if (var != use.get()) + continue; + + auto privateOp = cast( + moduleOp.lookupSymbol(cast(sym))); + return privateOp.getDataSharingType() != + omp::DataSharingClauseType::Private; + } + } + } + return true; + } + return false; + }; + + // Check direct uses and also follow hlfir.declare uses. 
+ for (const OpOperand &use : uses) { + if (auto declareOp = dyn_cast(use.getOwner())) { + if (checkAllocaUses(declareOp->getUses())) + return true; + } else if (checkUse(use)) { + return true; + } + } + + return false; + } +}; +} // namespace diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index d9b1287829cac..2129ca44e7a8d 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -338,6 +338,7 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm, pm.addPass(flangomp::createMarkDeclareTargetPass()); pm.addPass(flangomp::createGenericLoopConversionPass()); if (opts.isTargetDevice) { + pm.addPass(flangomp::createStackToSharedPass()); pm.addPass(flangomp::createFunctionFilteringPass()); if (opts.enableOffloadGlobalFiltering) diff --git a/flang/test/Transforms/OpenMP/stack-to-shared.mlir b/flang/test/Transforms/OpenMP/stack-to-shared.mlir new file mode 100644 index 0000000000000..a7842048a8411 --- /dev/null +++ b/flang/test/Transforms/OpenMP/stack-to-shared.mlir @@ -0,0 +1,215 @@ +// RUN: fir-opt --split-input-file --omp-stack-to-shared %s | FileCheck %s + +module attributes {omp.is_target_device = true} { + omp.declare_reduction @add_reduction_i32 : i32 init { + ^bb0(%arg0: i32): + %c0_i32 = arith.constant 0 : i32 + omp.yield(%c0_i32 : i32) + } combiner { + ^bb0(%arg0: i32, %arg1: i32): + %0 = arith.addi %arg0, %arg1 : i32 + omp.yield(%0 : i32) + } + + omp.private {type = private} @privatizer_i32 : i32 + omp.private {type = firstprivate} @firstprivatizer_i32 : i32 copy { + ^bb0(%arg0: i32, %arg1: i32): + omp.yield(%arg0 : i32) + } + + // Verify that target device functions are searched for allocas shared across + // threads of a parallel region. 
+ // + // Also ensure that all fir.alloca information is adequately forwarded to the + // new allocation, that uses of the allocation through hlfir.declare are + // detected and that only the expected types of uses (parallel reduction and + // non-private uses inside of a parallel region) are replaced. + // CHECK-LABEL: func.func @standalone_func + func.func @standalone_func(%lb: i32, %ub: i32, %step: i32) attributes {omp.declare_target = #omp.declaretarget} { + // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref + %0 = fir.alloca i32 {uniq_name = "x"} + %c = arith.constant 1 : index + // CHECK: %[[ALLOC_1:.*]] = omp.alloc_shared_mem !fir.char<1,?>(%[[C:.*]] : index), %[[C]] {bindc_name = "y", uniq_name = "y"} : !fir.ref> + %1 = fir.alloca !fir.char<1,?>(%c : index), %c {bindc_name = "y", uniq_name = "y"} + // CHECK: %{{.*}}:2 = hlfir.declare %[[ALLOC_1]] typeparams %[[C]] {uniq_name = "y"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) + %decl:2 = hlfir.declare %1 typeparams %c {uniq_name = "y"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) + // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "z"} + %2 = fir.alloca i32 {uniq_name = "z"} + // CHECK: %[[ALLOC_2:.*]] = omp.alloc_shared_mem i32 {uniq_name = "a"} : !fir.ref + %3 = fir.alloca i32 {uniq_name = "a"} + // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "b"} + %4 = fir.alloca i32 {uniq_name = "b"} + omp.parallel reduction(@add_reduction_i32 %0 -> %arg0 : !fir.ref) { + // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "c"} + %5 = fir.alloca i32 {uniq_name = "c"} + %6:2 = fir.unboxchar %decl#0 : (!fir.boxchar<1>) -> (!fir.ref>, index) + omp.wsloop private(@privatizer_i32 %2 -> %arg1, @firstprivatizer_i32 %3 -> %arg2 : !fir.ref, !fir.ref) { + omp.loop_nest (%arg3) : i32 = (%lb) to (%ub) inclusive step (%step) { + %7 = fir.load %5 : !fir.ref + omp.yield + } + } + omp.terminator + } + %5 = fir.load %4 : !fir.ref + // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref + // 
CHECK-NEXT: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref> + // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref + // CHECK-NEXT: return + return + } + + // Verify that generic target regions are searched for allocas shared across + // threads of a parallel region. + // CHECK-LABEL: func.func @target_generic + func.func @target_generic() { + // CHECK: omp.target + omp.target { + %c = arith.constant 0 : i32 + // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref + %0 = fir.alloca i32 {uniq_name = "x"} + // CHECK: omp.teams + omp.teams { + // CHECK: %[[ALLOC_1:.*]] = omp.alloc_shared_mem i32 {uniq_name = "y"} : !fir.ref + %1 = fir.alloca i32 {uniq_name = "y"} + // CHECK: omp.distribute + omp.distribute { + omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) { + // CHECK: %[[ALLOC_2:.*]] = omp.alloc_shared_mem i32 {uniq_name = "z"} : !fir.ref + %2 = fir.alloca i32 {uniq_name = "z"} + // CHECK: omp.parallel + omp.parallel { + %3 = fir.load %0 : !fir.ref + %4 = fir.load %1 : !fir.ref + %5 = fir.load %2 : !fir.ref + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref + // CHECK: omp.yield + omp.yield + } + } + // CHECK: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref + // CHECK: omp.terminator + omp.terminator + } + // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref + // CHECK: omp.terminator + omp.terminator + } + // CHECK: return + return + } + + // Make sure that uses not shared across threads on a parallel region inside + // of target are not incorrectly detected as such if there's another parallel + // region in the host wrapping the whole target region. 
+ // CHECK-LABEL: func.func @target_generic_in_parallel + func.func @target_generic_in_parallel() { + // CHECK-NOT: omp.alloc_shared_mem + // CHECK-NOT: omp.free_shared_mem + omp.parallel { + omp.target { + %c = arith.constant 0 : i32 + %0 = fir.alloca i32 {uniq_name = "x"} + omp.teams { + %1 = fir.alloca i32 {uniq_name = "y"} + omp.distribute { + omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) { + %3 = fir.load %0 : !fir.ref + %4 = fir.load %1 : !fir.ref + omp.parallel { + omp.terminator + } + omp.yield + } + } + omp.terminator + } + omp.terminator + } + omp.terminator + } + // CHECK: return + return + } + + // Ensure that allocations within SPMD target regions are not replaced with + // device shared memory regardless of use. + // CHECK-LABEL: func.func @target_spmd + func.func @target_spmd() { + // CHECK-NOT: omp.alloc_shared_mem + // CHECK-NOT: omp.free_shared_mem + omp.target { + %c = arith.constant 0 : i32 + %0 = fir.alloca i32 {uniq_name = "x"} + omp.teams { + %1 = fir.alloca i32 {uniq_name = "y"} + omp.parallel { + %2 = fir.alloca i32 {uniq_name = "z"} + %3 = fir.load %0 : !fir.ref + %4 = fir.load %1 : !fir.ref + omp.distribute { + omp.wsloop { + omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) { + %5 = fir.load %2 : !fir.ref + omp.yield + } + } {omp.composite} + } {omp.composite} + omp.terminator + } {omp.composite} + omp.terminator + } + omp.terminator + } + // CHECK: return + return + } +} + +// ----- + +// No transformations must be done when targeting the host device. 
+// CHECK-LABEL: func.func @host_standalone +func.func @host_standalone() { + // CHECK-NOT: omp.alloc_shared_mem + // CHECK-NOT: omp.free_shared_mem + %0 = fir.alloca i32 {uniq_name = "x"} + omp.parallel { + %1 = fir.load %0 : !fir.ref + omp.terminator + } + // CHECK: return + return +} + +// CHECK-LABEL: func.func @host_target +func.func @host_target() { + // CHECK-NOT: omp.alloc_shared_mem + // CHECK-NOT: omp.free_shared_mem + omp.target { + %c = arith.constant 0 : i32 + %0 = fir.alloca i32 {uniq_name = "x"} + omp.teams { + %1 = fir.alloca i32 {uniq_name = "y"} + omp.distribute { + omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) { + %2 = fir.alloca i32 {uniq_name = "z"} + omp.parallel { + %3 = fir.load %0 : !fir.ref + %4 = fir.load %1 : !fir.ref + %5 = fir.load %2 : !fir.ref + omp.terminator + } + omp.yield + } + } + omp.terminator + } + omp.terminator + } + // CHECK: return + return +} From 58abac47ac160144444b793059b795d0b08aa4d4 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 16 Sep 2025 14:18:39 +0100 Subject: [PATCH 11/22] [MLIR][OpenMP][OMPIRBuilder] Improve shared memory checks This patch refines checks to decide whether to use device shared memory or regular stack allocations. In particular, it adds support for parallel regions residing on standalone target device functions. The changes are: - Shared memory is introduced for `omp.target` implicit allocations, such as those related to privatization and mapping, as long as they are shared across threads in a nested parallel region. - Standalone target device functions are interpreted as being part of a Generic kernel, since the fact that they are present in the module after filtering means they must be reachable from a target region. - Prevent allocations whose only shared uses inside of an `omp.parallel` region are as part of a `private` clause from being moved to device shared memory. 
--- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 4 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 28 ++-- .../Frontend/OpenMPIRBuilderTest.cpp | 55 +++++--- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 122 ++++++++++++------ .../LLVMIR/omptarget-parallel-llvm.mlir | 8 +- .../LLVMIR/omptarget-parallel-wsloop.mlir | 7 +- .../fortran/target-generic-outlined-loops.f90 | 109 ++++++++++++++++ 7 files changed, 258 insertions(+), 75 deletions(-) create mode 100644 offload/test/offloading/fortran/target-generic-outlined-loops.f90 diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 5e90417836f4a..9769dff6c26f4 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -3303,8 +3303,8 @@ class OpenMPIRBuilder { ArrayRef DeallocIPs)>; using TargetGenArgAccessorsCallbackTy = function_ref; + Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocIP, + InsertPointTy CodeGenIP, ArrayRef DeallocIPs)>; /// Generator for '#omp target' /// diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 73c751f2851b7..4f7ada8aa290d 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -312,6 +312,12 @@ getTargetKernelExecMode(Function &Kernel) { return static_cast(KernelMode->getZExtValue()); } +static bool isGenericKernel(Function &Fn) { + std::optional ExecMode = + getTargetKernelExecMode(Fn); + return !ExecMode || (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC); +} + /// Make \p Source branch to \p Target. /// /// Handles two situations: @@ -1566,11 +1572,9 @@ static void targetParallelCallback( IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32) : Builder.getInt32(1); - // If this is not a Generic kernel, we can skip generating the wrapper. 
- std::optional ExecMode = - getTargetKernelExecMode(*OuterFn); + // If this is a Generic kernel, we can generate the wrapper. Value *WrapperFn; - if (ExecMode && (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC)) + if (isGenericKernel(*OuterFn)) WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn); else WrapperFn = Constant::getNullValue(PtrTy); @@ -1845,13 +1849,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( auto OI = [&]() -> std::unique_ptr { if (Config.isTargetDevice()) { - std::optional ExecMode = - getTargetKernelExecMode(*OuterFn); - - // If OuterFn is not a Generic kernel, skip custom allocation. This causes - // the CodeExtractor to follow its default behavior. Otherwise, we need to - // use device shared memory to allocate argument structures. - if (ExecMode && *ExecMode & OMP_TGT_EXEC_MODE_GENERIC) + // If OuterFn is a Generic kernel, we need to use device shared memory to + // allocate argument structures. Otherwise, we use stack allocations as + // usual. 
+ if (isGenericKernel(*OuterFn)) return std::make_unique(*this); } return std::make_unique(); @@ -7909,8 +7910,9 @@ static Expected createOutlinedFunction( Argument &Arg = std::get<1>(InArg); Value *InputCopy = nullptr; - llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = - ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = ArgAccessorFuncCB( + Arg, Input, InputCopy, AllocaIP, Builder.saveIP(), + OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin())); if (!AfterIP) return AfterIP.takeError(); Builder.restoreIP(*AfterIP); diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index b90b6a6923cac..6c100619d04d7 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -747,8 +747,10 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) { EXPECT_EQ(OutlinedFn->getArg(2)->getType(), PointerType::get(M->getContext(), 0)); EXPECT_EQ(&OutlinedFn->getEntryBlock(), PrivAI->getParent()); - EXPECT_TRUE(OutlinedFn->hasOneUse()); - User *Usr = OutlinedFn->user_back(); + EXPECT_TRUE(OutlinedFn->hasNUses(2)); + User *Usr = *OutlinedFn->users().begin(); + User *WrapperUsr = *++OutlinedFn->users().begin(); + ASSERT_TRUE(isa(Usr)); CallInst *Parallel51CI = dyn_cast(Usr); ASSERT_NE(Parallel51CI, nullptr); @@ -759,6 +761,20 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) { EXPECT_TRUE( isa(Parallel51CI->getArgOperand(0)->stripPointerCasts())); EXPECT_EQ(Parallel51CI, Usr); + + ASSERT_TRUE(isa(WrapperUsr)); + CallInst *OutlinedCI = dyn_cast(WrapperUsr); + ASSERT_NE(OutlinedCI, nullptr); + EXPECT_EQ(OutlinedCI->getCalledFunction(), OutlinedFn); + + Function *WrapperFn = OutlinedCI->getFunction(); + EXPECT_TRUE(WrapperFn->hasInternalLinkage()); + EXPECT_EQ(WrapperFn->arg_size(), 2U); + EXPECT_EQ(WrapperFn->getArg(0)->getType(), + IntegerType::getInt16Ty(M->getContext())); + 
EXPECT_EQ(WrapperFn->getArg(1)->getType(), + IntegerType::getInt32Ty(M->getContext())); + M->setDataLayout(oldDLStr); } @@ -6423,7 +6439,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { auto SimpleArgAccessorCB = [&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal, llvm::OpenMPIRBuilder::InsertPointTy AllocaIP, - llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) { + llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP, + llvm::ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); if (!OMPBuilder.Config.isTargetDevice()) { @@ -6594,7 +6611,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { auto SimpleArgAccessorCB = [&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal, llvm::OpenMPIRBuilder::InsertPointTy AllocaIP, - llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) { + llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP, + llvm::ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); if (!OMPBuilder.Config.isTargetDevice()) { @@ -6802,12 +6820,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) { return Builder.saveIP(); }; - auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&, - OpenMPIRBuilder::InsertPointTy, - OpenMPIRBuilder::InsertPointTy CodeGenIP) { - Builder.restoreIP(CodeGenIP); - return Builder.saveIP(); - }; + auto SimpleArgAccessorCB = + [&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + llvm::ArrayRef) { + Builder.restoreIP(CodeGenIP); + return Builder.saveIP(); + }; SmallVector Inputs; OpenMPIRBuilder::MapInfosTy CombinedInfos; @@ -6902,12 +6921,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) { Function *OutlinedFn = nullptr; SmallVector CapturedArgs; - auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&, - OpenMPIRBuilder::InsertPointTy, - OpenMPIRBuilder::InsertPointTy CodeGenIP) { - Builder.restoreIP(CodeGenIP); 
- return Builder.saveIP(); - }; + auto SimpleArgAccessorCB = + [&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + llvm::ArrayRef) { + Builder.restoreIP(CodeGenIP); + return Builder.saveIP(); + }; OpenMPIRBuilder::MapInfosTy CombinedInfos; auto GenMapInfoCB = @@ -7007,7 +7027,8 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { auto SimpleArgAccessorCB = [&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal, llvm::OpenMPIRBuilder::InsertPointTy AllocaIP, - llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) { + llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP, + llvm::ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); if (!OMPBuilder.Config.isTargetDevice()) { diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 5151bc63e6a04..d0bb721a5c302 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1130,9 +1130,10 @@ struct DeferredStore { } // namespace /// Check whether allocations for the given operation might potentially have to -/// be done in device shared memory. That means we're compiling for a offloading -/// target, the operation is an `omp::TargetOp` or nested inside of one and that -/// target region represents a Generic (non-SPMD) kernel. +/// be done in device shared memory. That means we're compiling for an +/// offloading target, the operation is neither an `omp::TargetOp` nor nested +/// inside of one, or it is and that target region represents a Generic +/// (non-SPMD) kernel. /// /// This represents a necessary but not sufficient set of conditions to use /// device shared memory in place of regular allocas. 
For some variables, the @@ -1148,7 +1149,7 @@ mightAllocInDeviceSharedMemory(Operation &op, if (!targetOp) targetOp = op.getParentOfType(); - return targetOp && + return !targetOp || targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) == omp::TargetExecMode::generic; } @@ -1162,18 +1163,36 @@ mightAllocInDeviceSharedMemory(Operation &op, /// operation that owns the specified block argument. static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) { Operation *parentOp = value.getOwner()->getParentOp(); - auto targetOp = dyn_cast(parentOp); - if (!targetOp) - targetOp = parentOp->getParentOfType(); - assert(targetOp && "expected a parent omp.target operation"); - + auto moduleOp = parentOp->getParentOfType(); for (auto *user : value.getUsers()) { if (auto parallelOp = dyn_cast(user)) { if (llvm::is_contained(parallelOp.getReductionVars(), value)) return true; } else if (auto parallelOp = user->getParentOfType()) { - if (parentOp->isProperAncestor(parallelOp)) - return true; + if (parentOp->isProperAncestor(parallelOp)) { + // If it is used directly inside of a parallel region, skip private + // clause uses. 
+ bool isPrivateClauseUse = false; + if (auto argIface = dyn_cast(user)) { + if (auto privateSyms = llvm::cast_or_null( + user->getAttr("private_syms"))) { + for (auto [var, sym] : + llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) { + if (var != value) + continue; + + auto privateOp = cast( + moduleOp.lookupSymbol(cast(sym))); + if (privateOp.getCopyRegion().empty()) { + isPrivateClauseUse = true; + break; + } + } + } + } + if (!isPrivateClauseUse) + return true; + } } } @@ -1198,8 +1217,8 @@ allocReductionVars(T op, ArrayRef reductionArgs, builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - bool useDeviceSharedMem = - isa(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder); + bool useDeviceSharedMem = isa(*op) && + mightAllocInDeviceSharedMemory(*op, *ompBuilder); // delay creating stores until after all allocas deferredStores.reserve(op.getNumReductionVars()); @@ -1320,8 +1339,8 @@ initReductionVars(OP op, ArrayRef reductionArgs, return success(); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - bool useDeviceSharedMem = - isa(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder); + bool useDeviceSharedMem = isa(*op) && + mightAllocInDeviceSharedMemory(*op, *ompBuilder); llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init"); auto allocaIP = llvm::IRBuilderBase::InsertPoint( @@ -1537,8 +1556,8 @@ static LogicalResult createReductionsAndCleanup( reductionRegions, privateReductionVariables, moduleTranslation, builder, "omp.reduction.cleanup"); - bool useDeviceSharedMem = - isa(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder); + bool useDeviceSharedMem = isa(*op) && + mightAllocInDeviceSharedMemory(*op, *ompBuilder); if (useDeviceSharedMem) { for (auto [var, reductionDecl] : llvm::zip_equal(privateReductionVariables, reductionDecls)) @@ -1718,7 +1737,7 @@ allocatePrivateVars(T op, llvm::IRBuilderBase &builder, 
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); bool mightUseDeviceSharedMem = - isa(*op) && + isa(*op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder); unsigned int allocaAS = moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace(); @@ -1836,7 +1855,7 @@ cleanupPrivateVars(T op, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); bool mightUseDeviceSharedMem = - isa(*op) && + isa(*op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder); for (auto [privDecl, llvmPrivVar, blockArg] : llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars, @@ -5628,42 +5647,68 @@ handleDeclareTargetMapVar(MapInfoData &mapData, // a store of the kernel argument into this allocated memory which // will then be loaded from, ByCopy will use the allocated memory // directly. -static llvm::IRBuilderBase::InsertPoint -createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg, - llvm::Value *input, llvm::Value *&retVal, - llvm::IRBuilderBase &builder, - llvm::OpenMPIRBuilder &ompBuilder, - LLVM::ModuleTranslation &moduleTranslation, - llvm::IRBuilderBase::InsertPoint allocaIP, - llvm::IRBuilderBase::InsertPoint codeGenIP) { +static llvm::IRBuilderBase::InsertPoint createDeviceArgumentAccessor( + omp::TargetOp targetOp, MapInfoData &mapData, llvm::Argument &arg, + llvm::Value *input, llvm::Value *&retVal, llvm::IRBuilderBase &builder, + llvm::OpenMPIRBuilder &ompBuilder, + LLVM::ModuleTranslation &moduleTranslation, + llvm::IRBuilderBase::InsertPoint allocIP, + llvm::IRBuilderBase::InsertPoint codeGenIP, + llvm::ArrayRef deallocIPs) { assert(ompBuilder.Config.isTargetDevice() && "function only supported for target device codegen"); - builder.restoreIP(allocaIP); + builder.restoreIP(allocIP); omp::VariableCaptureKind capture = omp::VariableCaptureKind::ByRef; LLVM::TypeToLLVMIRTranslator typeToLLVMIRTranslator( ompBuilder.M.getContext()); unsigned alignmentValue = 0; + 
BlockArgument mlirArg; // Find the associated MapInfoData entry for the current input - for (size_t i = 0; i < mapData.MapClause.size(); ++i) + for (size_t i = 0; i < mapData.MapClause.size(); ++i) { if (mapData.OriginalValue[i] == input) { auto mapOp = cast(mapData.MapClause[i]); capture = mapOp.getMapCaptureType(); // Get information of alignment of mapped object alignmentValue = typeToLLVMIRTranslator.getPreferredAlignment( mapOp.getVarType(), ompBuilder.M.getDataLayout()); + // Get the corresponding target entry block argument + mlirArg = + cast(*targetOp).getMapBlockArgs()[i]; break; } + } unsigned int allocaAS = ompBuilder.M.getDataLayout().getAllocaAddrSpace(); unsigned int defaultAS = ompBuilder.M.getDataLayout().getProgramAddressSpace(); - // Create the alloca for the argument the current point. - llvm::Value *v = builder.CreateAlloca(arg.getType(), allocaAS, nullptr); + // Create the allocation for the argument. + llvm::Value *v = nullptr; + if (mightAllocInDeviceSharedMemory(*targetOp, ompBuilder) && + mustAllocPrivateVarInDeviceSharedMemory(mlirArg)) { + // Use the beginning of the codeGenIP rather than the usual allocation point + // for shared memory allocations because otherwise these would be done prior + // to the target initialization call. Also, the exit block (where the + // deallocation is placed) is only executed if the initialization call + // succeeds. + builder.SetInsertPoint(codeGenIP.getBlock()->getFirstInsertionPt()); + v = ompBuilder.createOMPAllocShared(builder, arg.getType()); + + // Create deallocations in all provided deallocation points and then restore + // the insertion point to right after the new allocations. + llvm::IRBuilderBase::InsertPointGuard guard(builder); + for (auto deallocIP : deallocIPs) { + builder.SetInsertPoint(deallocIP.getBlock(), deallocIP.getPoint()); + ompBuilder.createOMPFreeShared(builder, v, arg.getType()); + } + } else { + // Use the current point, which was previously set to allocIP. 
+ v = builder.CreateAlloca(arg.getType(), allocaAS, nullptr); - if (allocaAS != defaultAS && arg.getType()->isPointerTy()) - v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS)); + if (allocaAS != defaultAS && arg.getType()->isPointerTy()) + v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS)); + } builder.CreateStore(&arg, v); @@ -6254,8 +6299,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, }; auto argAccessorCB = [&](llvm::Argument &arg, llvm::Value *input, - llvm::Value *&retVal, InsertPointTy allocaIP, - InsertPointTy codeGenIP) + llvm::Value *&retVal, InsertPointTy allocIP, + InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy { llvm::IRBuilderBase::InsertPointGuard guard(builder); builder.SetCurrentDebugLocation(llvm::DebugLoc()); @@ -6269,9 +6315,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, return codeGenIP; } - return createDeviceArgumentAccessor(mapData, arg, input, retVal, builder, - *ompBuilder, moduleTranslation, - allocaIP, codeGenIP); + return createDeviceArgumentAccessor(targetOp, mapData, arg, input, retVal, + builder, *ompBuilder, moduleTranslation, + allocIP, codeGenIP, deallocIPs); }; llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs runtimeAttrs; diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir index 866284a47050f..1498a7206b6cd 100644 --- a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir @@ -55,15 +55,14 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: define weak_odr protected amdgpu_kernel void @[[FUNC0:.*]]( // CHECK-SAME: ptr %[[TMP:.*]], ptr %[[TMP0:.*]]) #{{[0-9]+}} { // CHECK: %[[TMP1:.*]] = alloca [1 x ptr], align 8, addrspace(5) -// CHECK: %[[TMP2:.*]] = alloca ptr, align 8, addrspace(5) -// CHECK: %[[TMP3:.*]] = addrspacecast ptr 
addrspace(5) %[[TMP2]] to ptr -// CHECK: store ptr %[[TMP0]], ptr %[[TMP3]], align 8 // CHECK: %[[TMP4:.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{.*}} to ptr), ptr %[[TMP]]) // CHECK: %[[EXEC_USER_CODE:.*]] = icmp eq i32 %[[TMP4]], -1 // CHECK: br i1 %[[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[WORKER_EXIT:.*]] // CHECK: %[[TMP5:.*]] = addrspacecast ptr addrspace(5) %[[TMP1]] to ptr // CHECK: %[[STRUCTARG:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) -// CHECK: %[[TMP6:.*]] = load ptr, ptr %[[TMP3]], align 8 +// CHECK: %[[TMP2:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) +// CHECK: store ptr %[[TMP0]], ptr %[[TMP2]], align 8 +// CHECK: %[[TMP6:.*]] = load ptr, ptr %[[TMP2]], align 8 // CHECK: %[[OMP_GLOBAL_THREAD_NUM:.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr)) // CHECK: %[[GEP_:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0 // CHECK: store ptr %[[TMP6]], ptr %[[GEP_]], align 8 @@ -71,6 +70,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: store ptr %[[STRUCTARG]], ptr %[[TMP7]], align 8 // CHECK: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 1, i32 -1, i32 -1, ptr @[[FUNC1:.*]], ptr @[[FUNC1_WRAPPER:.*]], ptr %[[TMP5]], i64 1) // CHECK: call void @__kmpc_free_shared(ptr %[[STRUCTARG]], i64 8) +// CHECK: call void @__kmpc_free_shared(ptr %[[TMP2]], i64 8) // CHECK: call void @__kmpc_target_deinit() // CHECK: define internal void @[[FUNC1]]( diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir index 2df2b8db0e5f7..98db59c288dc8 100644 --- a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir @@ -30,7 +30,7 @@ module attributes {dlti.dl_spec = 
#dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: call void @__kmpc_parallel_51(ptr addrspacecast // CHECK-SAME: (ptr addrspace(1) @[[GLOB:[0-9]+]] to ptr), // CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 1, i32 -1, i32 -1, -// CHECK-SAME: ptr @[[PARALLEL_FUNC:.*]], ptr null, ptr %[[PARALLEL_ARGS:.*]], i64 1) +// CHECK-SAME: ptr @[[PARALLEL_FUNC:.*]], ptr @[[PARALLEL_WRAPPER:.*]], ptr %[[PARALLEL_ARGS:.*]], i64 1) // CHECK: define internal void @[[PARALLEL_FUNC]] // CHECK-SAME: (ptr noalias noundef %[[TID_ADDR:.*]], ptr noalias noundef %[[ZERO_ADDR:.*]], @@ -42,6 +42,11 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) #[[ATTRS2:[0-9]+]] { +// CHECK: define internal void @[[PARALLEL_WRAPPER]](i16 {{.*}}, i32 {{.*}}) { +// CHECK-NOT: ret {{.*}} +// CHECK: call void @[[PARALLEL_FUNC]]({{.*}}) +// CHECK-NEXT: ret void + // CHECK: attributes #[[ATTRS1]] = { // CHECK-SAME: "target-cpu"="gfx90a" // CHECK-SAME: "target-features"="+gfx9-insts,+wavefrontsize64" diff --git a/offload/test/offloading/fortran/target-generic-outlined-loops.f90 b/offload/test/offloading/fortran/target-generic-outlined-loops.f90 new file mode 100644 index 0000000000000..594809027e115 --- /dev/null +++ b/offload/test/offloading/fortran/target-generic-outlined-loops.f90 @@ -0,0 +1,109 @@ +! Offloading test for generic target regions containing different kinds of +! loop constructs inside, moving parallel regions into a separate subroutine. +! REQUIRES: flang, amdgpu + +! 
RUN: %libomptarget-compile-fortran-run-and-check-generic +subroutine parallel_loop(n, counter) + implicit none + integer, intent(in) :: n + integer, intent(inout) :: counter + integer :: i + + !$omp parallel do reduction(+:counter) + do i=1, n + counter = counter + 1 + end do +end subroutine + +program main + integer :: i1, i2, n1, n2, counter + + n1 = 100 + n2 = 50 + + counter = 0 + !$omp target map(tofrom:counter) + !$omp teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 1 100 + print '(I2" "I0)', 1, counter + + counter = 0 + !$omp target map(tofrom:counter) + call parallel_loop(n1, counter) + call parallel_loop(n1, counter) + !$omp end target + + ! CHECK: 2 200 + print '(I2" "I0)', 2, counter + + counter = 0 + !$omp target map(tofrom:counter) + counter = counter + 1 + call parallel_loop(n1, counter) + counter = counter + 1 + call parallel_loop(n1, counter) + counter = counter + 1 + !$omp end target + + ! CHECK: 3 203 + print '(I2" "I0)', 3, counter + + counter = 0 + !$omp target map(tofrom: counter) + counter = counter + 1 + call parallel_loop(n1, counter) + counter = counter + 1 + !$omp end target + + ! CHECK: 4 102 + print '(I2" "I0)', 4, counter + + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + call parallel_loop(n2, counter) + end do + + ! CHECK: 5 5000 + print '(I2" "I0)', 5, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + call parallel_loop(n2, counter) + counter = counter + 1 + end do + + ! CHECK: 6 5200 + print '(I2" "I0)', 6, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + call parallel_loop(n2, counter) + call parallel_loop(n2, counter) + end do + + ! 
CHECK: 7 10000 + print '(I2" "I0)', 7, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + call parallel_loop(n2, counter) + counter = counter + 1 + call parallel_loop(n2, counter) + counter = counter + 1 + end do + + ! CHECK: 8 10300 + print '(I2" "I0)', 8, counter +end program From 4af004b82e8d740a50641020a21051402705b5ad Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 14 Oct 2025 09:15:52 -0500 Subject: [PATCH 12/22] Address test failures: enable passing test and fix omp.target private variables deallocation --- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 32 ++++++------------- .../LLVMIR/omptarget-parallel-llvm.mlir | 1 - 2 files changed, 9 insertions(+), 24 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index d0bb721a5c302..427776cd188a5 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -6247,35 +6247,21 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, targetOp.getPrivateNeedsBarrier(), &mappedPrivateVars))) return llvm::make_error(); - SmallVector privateCleanupRegions; - llvm::transform(privateVarsInfo.privatizers, - std::back_inserter(privateCleanupRegions), - [](omp::PrivateClauseOp privatizer) { - return &privatizer.getDeallocRegion(); - }); - LLVM::ModuleTranslation::SaveStack frame( moduleTranslation, allocIP, deallocIPs); llvm::Expected exitBlock = convertOmpOpRegions( targetRegion, "omp.target", builder, moduleTranslation); - if (!exitBlock) - return exitBlock.takeError(); - - builder.SetInsertPoint(*exitBlock); - if (!privateCleanupRegions.empty()) { - if (failed(inlineOmpRegionCleanup( - privateCleanupRegions, privateVarsInfo.llvmVars, - moduleTranslation, builder, "omp.targetop.private.cleanup", - /*shouldLoadCleanupRegionArg=*/false))) 
{ - return llvm::createStringError( - "failed to inline `dealloc` region of `omp.private` " - "op in the target region"); - } - return builder.saveIP(); - } + if (failed(handleError(exitBlock, *targetOp))) + return llvm::make_error(); - return InsertPointTy(exitBlock.get(), exitBlock.get()->end()); + builder.SetInsertPoint(exitBlock.get()->getTerminator()); + + if (failed(cleanupPrivateVars(targetOp, builder, moduleTranslation, + targetOp.getLoc(), privateVarsInfo))) + return llvm::make_error(); + + return builder.saveIP(); }; StringRef parentName = parentFn.getName(); diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir index 1498a7206b6cd..c6eba6553fe54 100644 --- a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir @@ -1,5 +1,4 @@ // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s -// XFAIL: * // The aim of the test is to check the LLVM IR codegen for the device // for omp target parallel construct From 424886ad029c2b577aa94541563fe965f7193746 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Mon, 20 Oct 2025 09:35:51 -0500 Subject: [PATCH 13/22] [Flang][MLIR][OpenMP] Support passing local values to device functions This patch updates MLIR lowering of `fir.embox` and `fircg.ext_embox` operations to potentially use OpenMP device shared memory for the created descriptor when compiling for a target device. Any operations introducing stack allocations inside of a target or teams constructs but outside of a parallel region, and passing that value into a parallel region or to another function that might contain one, need to instead use device shared memory for correctness when running on a GPU. Also, the logic deciding whether to use device shared memory in place of stack allocations is updated to also use the former when that memory is passed as an argument to a function. 
--- .../flang/Optimizer/CodeGen/FIROpPatterns.h | 8 +- flang/include/flang/Utils/OpenMP.h | 20 +++ flang/lib/Optimizer/CodeGen/CodeGen.cpp | 48 ++++--- flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp | 24 +++- flang/lib/Optimizer/OpenMP/StackToShared.cpp | 106 +-------------- flang/lib/Utils/OpenMP.cpp | 123 ++++++++++++++++-- flang/test/Fir/OpenMP/embox-to-shared-mem.fir | 29 +++++ .../OpenMP/threadprivate-target-device.f90 | 14 +- .../Transforms/OpenMP/stack-to-shared.mlir | 21 ++- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 7 +- .../omptarget-constant-alloca-raise.mlir | 3 +- .../openmp-target-private-shared-mem.mlir | 76 +++++++++++ 12 files changed, 325 insertions(+), 154 deletions(-) create mode 100644 flang/test/Fir/OpenMP/embox-to-shared-mem.fir create mode 100644 mlir/test/Target/LLVMIR/openmp-target-private-shared-mem.mlir diff --git a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h index 7d816a8843371..f4c0e273a6e3a 100644 --- a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h +++ b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h @@ -163,10 +163,16 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern { // program address space we perform a cast. In the case of most architectures // the program and allocation address space will be the default of 0 and no // cast will be emitted. + // + // If `useDeviceSharedMem = true`, an `omp.alloc_shared_mem` operation for the + // same type will be used instead, with no address space cast. This is only + // intended for allocations on an OpenMP application when compiling for a + // target device. 
mlir::Value genAllocaAndAddrCastWithType(mlir::Location loc, mlir::Type llvmObjectTy, unsigned alignment, - mlir::ConversionPatternRewriter &rewriter) const; + mlir::ConversionPatternRewriter &rewriter, + bool useDeviceSharedMem = false) const; const fir::LLVMTypeConverter &lowerTy() const { return *static_cast( diff --git a/flang/include/flang/Utils/OpenMP.h b/flang/include/flang/Utils/OpenMP.h index bad0abb6f5788..433d825168296 100644 --- a/flang/include/flang/Utils/OpenMP.h +++ b/flang/include/flang/Utils/OpenMP.h @@ -59,6 +59,26 @@ mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder, /// maps. void cloneOrMapRegionOutsiders( fir::FirOpBuilder &firOpBuilder, mlir::omp::TargetOp targetOp); + +/// Tell whether to replace an operation representing a stack allocation with a +/// device shared memory allocation/deallocation pair based on the location of +/// the allocation and its uses. +/// +/// For it to be replaced, an allocation must be: +/// - located in an OpenMP module for a target device; +/// - located outside of any parallel regions; +/// - located inside of a generic target region or a device function; and +/// - used as a parallel reduction variable, passed as function argument or +/// used inside of a parallel region in any way except as argument to an +/// OpenMP private clause. +bool shouldReplaceAllocaWithDeviceSharedMem(mlir::Operation &op); + +/// Based on the location of the definition of the given value representing the +/// result of a device shared memory allocation, find the corresponding points +/// where its deallocation should be placed and introduce `omp.free_shared_mem` +/// ops at those points. 
+void insertDeviceSharedMemDeallocation( + mlir::OpBuilder &builder, mlir::Value allocVal); } // namespace Fortran::utils::openmp #endif // FORTRAN_UTILS_OPENMP_H_ diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 478ab151b96d0..4826406635f1b 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -31,6 +31,7 @@ #include "flang/Runtime/allocator-registry-consts.h" #include "flang/Runtime/descriptor-consts.h" #include "flang/Semantics/runtime-type-info.h" +#include "flang/Utils/OpenMP.h" #include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h" @@ -1818,15 +1819,21 @@ struct EmboxCommonConversion : public fir::FIROpConversion { } /// If the embox is not in a globalOp body, allocate storage for the box; - /// store the value inside and return the generated alloca. Return the input - /// value otherwise. - mlir::Value + /// store the value inside and replace the original embox with the generated + /// alloca. Replace it with the input value otherwise. + /// + /// The allocated storage might be OpenMP device shared memory, if required by + /// the context and uses of the operation. In that case, the corresponding + /// explicit deallocation is also introduced at the applicable block(s). 
+ void placeInMemoryIfNotGlobalInit(mlir::ConversionPatternRewriter &rewriter, - mlir::Location loc, mlir::Type boxTy, - mlir::Value boxValue, + mlir::Operation *embox, mlir::Location loc, + mlir::Type boxTy, mlir::Value boxValue, bool needDeviceAllocation = false) const { - if (isInGlobalOp(rewriter)) - return boxValue; + if (isInGlobalOp(rewriter)) { + rewriter.replaceOp(embox, boxValue); + return; + } mlir::Type llvmBoxTy = boxValue.getType(); mlir::Value storage; if (needDeviceAllocation) { @@ -1835,13 +1842,19 @@ struct EmboxCommonConversion : public fir::FIROpConversion { storage = genCUFAllocDescriptor(loc, rewriter, mod, baseBoxTy, this->lowerTy()); } else { - storage = this->genAllocaAndAddrCastWithType(loc, llvmBoxTy, defaultAlign, - rewriter); + storage = this->genAllocaAndAddrCastWithType( + loc, llvmBoxTy, defaultAlign, rewriter, + Fortran::utils::openmp::shouldReplaceAllocaWithDeviceSharedMem( + *embox)); } auto storeOp = mlir::LLVM::StoreOp::create(rewriter, loc, boxValue, storage); this->attachTBAATag(storeOp, boxTy, boxTy, nullptr); - return storage; + + rewriter.replaceOp(embox, storage); + if (mlir::isa(storage.getDefiningOp())) + Fortran::utils::openmp::insertDeviceSharedMemDeallocation(rewriter, + storage); } /// Compute the extent of a triplet slice (lb:ub:step). 
@@ -1890,9 +1903,8 @@ struct EmboxOpConversion : public EmboxCommonConversion { "fir.embox codegen of derived with length parameters"); return mlir::failure(); } - auto result = - placeInMemoryIfNotGlobalInit(rewriter, embox.getLoc(), boxTy, dest); - rewriter.replaceOp(embox, result); + placeInMemoryIfNotGlobalInit(rewriter, embox, embox.getLoc(), + boxTy, dest); return mlir::success(); } }; @@ -2138,10 +2150,9 @@ struct XEmboxOpConversion : public EmboxCommonConversion { dest = insertBaseAddress(rewriter, loc, dest, base); if (fir::isDerivedTypeWithLenParams(boxTy)) TODO(loc, "fir.embox codegen of derived with length parameters"); - mlir::Value result = placeInMemoryIfNotGlobalInit( - rewriter, loc, boxTy, dest, + placeInMemoryIfNotGlobalInit( + rewriter, xbox, loc, boxTy, dest, isDeviceAllocation(xbox.getMemref(), adaptor.getMemref())); - rewriter.replaceOp(xbox, result); return mlir::success(); } @@ -2256,10 +2267,9 @@ struct XReboxOpConversion : public EmboxCommonConversion { dest = insertStride(rewriter, loc, dest, dim, std::get<1>(iter.value())); } dest = insertBaseAddress(rewriter, loc, dest, base); - mlir::Value result = placeInMemoryIfNotGlobalInit( - rewriter, rebox.getLoc(), destBoxTy, dest, + placeInMemoryIfNotGlobalInit( + rewriter, rebox, rebox.getLoc(), destBoxTy, dest, isDeviceAllocation(rebox.getBox(), adaptor.getBox())); - rewriter.replaceOp(rebox, result); return mlir::success(); } diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp index 96e3caa481f51..6304cdf36f44f 100644 --- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp +++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp @@ -12,6 +12,7 @@ #include "flang/Optimizer/CodeGen/FIROpPatterns.h" #include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Utils/OpenMP.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "llvm/Support/Debug.h" @@ -313,9 +314,14 @@ mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert( // 
program address space we perform a cast. In the case of most architectures // the program and allocation address space will be the default of 0 and no // cast will be emitted. +// +// If `useDeviceSharedMem = true`, an `omp.alloc_shared_mem` operation for the +// same type will be used instead, with no address space cast. This is only +// intended for allocations on an OpenMP application when compiling for a +// target device. mlir::Value ConvertFIRToLLVMPattern::genAllocaAndAddrCastWithType( mlir::Location loc, mlir::Type llvmObjectTy, unsigned alignment, - mlir::ConversionPatternRewriter &rewriter) const { + mlir::ConversionPatternRewriter &rewriter, bool useDeviceSharedMem) const { auto thisPt = rewriter.saveInsertionPoint(); mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp(); mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent(); @@ -325,16 +331,24 @@ mlir::Value ConvertFIRToLLVMPattern::genAllocaAndAddrCastWithType( unsigned allocaAs = getAllocaAddressSpace(rewriter); unsigned programAs = getProgramAddressSpace(rewriter); - mlir::Value al = mlir::LLVM::AllocaOp::create( - rewriter, loc, ::getLlvmPtrType(llvmObjectTy.getContext(), allocaAs), - llvmObjectTy, size, alignment); + mlir::Value al; + if (useDeviceSharedMem) { + al = mlir::omp::AllocSharedMemOp::create( + rewriter, loc, ::getLlvmPtrType(llvmObjectTy.getContext()), + llvmObjectTy, /*uniq_name=*/nullptr, /*bindc_name=*/nullptr, + /*typeparams=*/{}, /*shape=*/{size}); + } else { + al = mlir::LLVM::AllocaOp::create( + rewriter, loc, ::getLlvmPtrType(llvmObjectTy.getContext(), allocaAs), + llvmObjectTy, size, alignment); + } // if our allocation address space, is not the same as the program address // space, then we must emit a cast to the program address space before use. // An example case would be on AMDGPU, where the allocation address space is // the numeric value 5 (private), and the program address space is 0 // (generic). 
- if (allocaAs != programAs) { + if (!useDeviceSharedMem && allocaAs != programAs) { al = mlir::LLVM::AddrSpaceCastOp::create( rewriter, loc, ::getLlvmPtrType(llvmObjectTy.getContext(), programAs), al); diff --git a/flang/lib/Optimizer/OpenMP/StackToShared.cpp b/flang/lib/Optimizer/OpenMP/StackToShared.cpp index e666e2ed8f9b9..846ff8819432c 100644 --- a/flang/lib/Optimizer/OpenMP/StackToShared.cpp +++ b/flang/lib/Optimizer/OpenMP/StackToShared.cpp @@ -14,6 +14,7 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/OpenMP/Passes.h" +#include "flang/Utils/OpenMP.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Dialect/OpenMP/OpenMPInterfaces.h" @@ -41,7 +42,8 @@ class StackToSharedPass return; funcOp->walk([&](fir::AllocaOp allocaOp) { - if (!shouldReplaceAlloca(*allocaOp)) + if (!Fortran::utils::openmp::shouldReplaceAllocaWithDeviceSharedMem( + *allocaOp)) return; // Replace fir.alloca with omp.alloc_shared_mem. @@ -56,107 +58,9 @@ class StackToSharedPass // Create a new omp.free_shared_mem for the allocated buffer prior to // exiting the region. 
- Block *allocaBlock = sharedAllocOp->getBlock(); - DominanceInfo domInfo; - for (Block &block : sharedAllocOp->getParentRegion()->getBlocks()) { - Operation *terminator = block.getTerminator(); - if (!terminator->hasSuccessors() && - domInfo.dominates(allocaBlock, &block)) { - builder.setInsertionPoint(terminator); - omp::FreeSharedMemOp::create(builder, sharedAllocOp.getLoc(), - sharedAllocOp); - } - } + Fortran::utils::openmp::insertDeviceSharedMemDeallocation( + builder, sharedAllocOp.getResult()); }); } - -private: - // TODO: Refactor the logic in `shouldReplaceAlloca` and `checkAllocaUses` to - // be reusable by the MLIR to LLVM IR translation stage, as something very - // similar is also implemented there to choose between allocas and device - // shared memory allocations when processing OpenMP reductions, mapping and - // privatization. - - // Decide whether to replace a fir.alloca with a pair of device shared memory - // allocation/deallocation pair based on the location of the allocation and - // its uses. - // - // In summary, it should be done whenever the allocation is placed outside any - // parallel regions and inside either a target device function or a generic - // kernel, while being used inside of a parallel region. - bool shouldReplaceAlloca(Operation &op) { - auto targetOp = op.getParentOfType(); - - // It must be inside of a generic omp.target or in a target device function, - // and not inside of omp.parallel. 
- if (auto parallelOp = op.getParentOfType()) { - if (!targetOp || !targetOp->isProperAncestor(parallelOp)) - return false; - } - - if (targetOp) { - if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) != - mlir::omp::TargetExecMode::generic) - return false; - } else { - auto declTargetIface = dyn_cast( - *op.getParentOfType()); - if (!declTargetIface || !declTargetIface.isDeclareTarget() || - declTargetIface.getDeclareTargetDeviceType() == - mlir::omp::DeclareTargetDeviceType::host) - return false; - } - - return checkAllocaUses(op.getUses()); - } - - // When a use takes place inside an omp.parallel region and it's not as a - // private clause argument, or when it is a reduction argument passed to - // omp.parallel, then the defining allocation is eligible for replacement with - // shared memory. - // - // Only one of the uses needs to meet these conditions to return true. - bool checkAllocaUses(const Operation::use_range &uses) { - auto checkUse = [&](const OpOperand &use) { - Operation *owner = use.getOwner(); - auto moduleOp = owner->getParentOfType(); - if (auto parallelOp = dyn_cast(owner)) { - if (llvm::is_contained(parallelOp.getReductionVars(), use.get())) - return true; - } else if (owner->getParentOfType()) { - // If it is used directly inside of a parallel region, it has to be - // replaced unless the use is a private clause. - if (auto argIface = dyn_cast(owner)) { - if (auto privateSyms = llvm::cast_or_null( - owner->getAttr("private_syms"))) { - for (auto [var, sym] : - llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) { - if (var != use.get()) - continue; - - auto privateOp = cast( - moduleOp.lookupSymbol(cast(sym))); - return privateOp.getDataSharingType() != - omp::DataSharingClauseType::Private; - } - } - } - return true; - } - return false; - }; - - // Check direct uses and also follow hlfir.declare uses. 
- for (const OpOperand &use : uses) { - if (auto declareOp = dyn_cast(use.getOwner())) { - if (checkAllocaUses(declareOp->getUses())) - return true; - } else if (checkUse(use)) { - return true; - } - } - - return false; - } }; } // namespace diff --git a/flang/lib/Utils/OpenMP.cpp b/flang/lib/Utils/OpenMP.cpp index b07caf853191a..73f5b9bc73129 100644 --- a/flang/lib/Utils/OpenMP.cpp +++ b/flang/lib/Utils/OpenMP.cpp @@ -17,12 +17,11 @@ #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Transforms/RegionUtils.h" -namespace Fortran::utils::openmp { -mlir::omp::MapInfoOp createMapInfoOp(mlir::OpBuilder &builder, - mlir::Location loc, mlir::Value baseAddr, mlir::Value varPtrPtr, - llvm::StringRef name, llvm::ArrayRef bounds, - llvm::ArrayRef members, mlir::ArrayAttr membersIndex, - mlir::omp::ClauseMapFlags mapType, +mlir::omp::MapInfoOp Fortran::utils::openmp::createMapInfoOp( + mlir::OpBuilder &builder, mlir::Location loc, mlir::Value baseAddr, + mlir::Value varPtrPtr, llvm::StringRef name, + llvm::ArrayRef bounds, llvm::ArrayRef members, + mlir::ArrayAttr membersIndex, mlir::omp::ClauseMapFlags mapType, mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, bool partialMap, mlir::FlatSymbolRefAttr mapperId) { @@ -50,8 +49,9 @@ mlir::omp::MapInfoOp createMapInfoOp(mlir::OpBuilder &builder, return op; } -mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder, - mlir::omp::TargetOp targetOp, mlir::Value val, llvm::StringRef name) { +mlir::Value Fortran::utils::openmp::mapTemporaryValue( + fir::FirOpBuilder &firOpBuilder, mlir::omp::TargetOp targetOp, + mlir::Value val, llvm::StringRef name) { mlir::OpBuilder::InsertionGuard guard(firOpBuilder); mlir::Operation *valOp = val.getDefiningOp(); @@ -116,7 +116,7 @@ mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder, return loadOp.getResult(); } -void cloneOrMapRegionOutsiders( +void Fortran::utils::openmp::cloneOrMapRegionOutsiders( fir::FirOpBuilder &firOpBuilder, mlir::omp::TargetOp 
targetOp) { mlir::Region ®ion = targetOp.getRegion(); mlir::Block *entryBlock = ®ion.getBlocks().front(); @@ -156,4 +156,107 @@ void cloneOrMapRegionOutsiders( } } -} // namespace Fortran::utils::openmp +/// When a use takes place inside an omp.parallel region and it's not as a +/// private clause argument, or when it is a reduction argument passed to +/// omp.parallel or a function call argument, then the defining allocation is +/// eligible for replacement with shared memory. +static bool allocaUseRequiresDeviceSharedMem(const mlir::OpOperand &use) { + mlir::Operation *owner = use.getOwner(); + if (auto parallelOp = llvm::dyn_cast(owner)) { + if (llvm::is_contained(parallelOp.getReductionVars(), use.get())) + return true; + } else if (auto callOp = llvm::dyn_cast(owner)) { + if (llvm::is_contained(callOp.getArgOperands(), use.get())) + return true; + } + + // If it is used directly inside of a parallel region, it has to be replaced + // unless the use is a private clause. + if (owner->getParentOfType()) { + if (auto argIface = + llvm::dyn_cast(owner)) { + if (auto privateSyms = llvm::cast_or_null( + owner->getAttr("private_syms"))) { + for (auto [var, sym] : + llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) { + if (var != use.get()) + continue; + + auto moduleOp = owner->getParentOfType(); + auto privateOp = llvm::cast( + moduleOp.lookupSymbol(llvm::cast(sym))); + return privateOp.getDataSharingType() != + mlir::omp::DataSharingClauseType::Private; + } + } + } + return true; + } + return false; +} + +static bool shouldReplaceAllocaWithUses( + const mlir::Operation::use_range &uses) { + // Check direct uses and also follow hlfir.declare/fir.convert uses. 
+ for (const mlir::OpOperand &use : uses) { + if (llvm::isa(use.getOwner())) { + if (shouldReplaceAllocaWithUses(use.getOwner()->getUses())) + return true; + } else if (allocaUseRequiresDeviceSharedMem(use)) { + return true; + } + } + + return false; +} + +// TODO: Refactor the logic in `shouldReplaceAllocaWithDeviceSharedMem`, +// `shouldReplaceAllocaWithUses` and `allocaUseRequiresDeviceSharedMem` to +// be reusable by the MLIR to LLVM IR translation stage, as something very +// similar is also implemented there to choose between allocas and device +// shared memory allocations when processing OpenMP reductions, mapping and +// privatization. +bool Fortran::utils::openmp::shouldReplaceAllocaWithDeviceSharedMem( + mlir::Operation &op) { + auto offloadIface = op.getParentOfType(); + if (!offloadIface || !offloadIface.getIsTargetDevice()) + return false; + + auto targetOp = op.getParentOfType(); + + // It must be inside of a generic omp.target or in a target device function, + // and not inside of omp.parallel. 
+ if (auto parallelOp = op.getParentOfType()) { + if (!targetOp || !targetOp->isProperAncestor(parallelOp)) + return false; + } + + if (targetOp) { + if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) != + mlir::omp::TargetExecMode::generic) + return false; + } else { + auto declTargetIface = + op.getParentOfType(); + if (!declTargetIface || !declTargetIface.isDeclareTarget() || + declTargetIface.getDeclareTargetDeviceType() == + mlir::omp::DeclareTargetDeviceType::host) + return false; + } + + return shouldReplaceAllocaWithUses(op.getUses()); +} + +void Fortran::utils::openmp::insertDeviceSharedMemDeallocation( + mlir::OpBuilder &builder, mlir::Value allocVal) { + mlir::Block *allocaBlock = allocVal.getParentBlock(); + mlir::DominanceInfo domInfo; + for (mlir::Block &block : allocVal.getParentRegion()->getBlocks()) { + mlir::Operation *terminator = block.getTerminator(); + if (!terminator->hasSuccessors() && + domInfo.dominates(allocaBlock, &block)) { + builder.setInsertionPoint(terminator); + mlir::omp::FreeSharedMemOp::create(builder, allocVal.getLoc(), allocVal); + } + } +} diff --git a/flang/test/Fir/OpenMP/embox-to-shared-mem.fir b/flang/test/Fir/OpenMP/embox-to-shared-mem.fir new file mode 100644 index 0000000000000..eaa5eb6bbb905 --- /dev/null +++ b/flang/test/Fir/OpenMP/embox-to-shared-mem.fir @@ -0,0 +1,29 @@ +// RUN: tco -o - %s | FileCheck %s + +module attributes {omp.is_target_device = true} { + // CHECK-LABEL: declare void @scalar(ptr) + func.func private @scalar(%x : !fir.box) + // CHECK-LABEL: declare void @array(ptr) + func.func private @array(%x : !fir.box>) + + // CHECK-LABEL: define void @embox + func.func @embox(%arg0 : !fir.ref, %arg1 : !fir.ref>) attributes {omp.declare_target = #omp.declaretarget} { + // CHECK: %[[DESC2:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 48) + // CHECK: %[[DESC1:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 24) + %a = fir.embox %arg0 : (!fir.ref) -> !fir.box + %c0 = arith.constant 0 : i64 
+ %b = fircg.ext_embox %arg1(%c0) origin %c0[%c0, %c0, %c0] : (!fir.ref>, i64, i64, i64, i64, i64) -> !fir.box> + + // CHECK: call void @scalar(ptr %[[DESC1]]) + fir.call @scalar(%a) : (!fir.box) -> () + + // CHECK: call void @array(ptr %[[DESC2]]) + fir.call @array(%b) : (!fir.box>) -> () + + // CHECK: call void @__kmpc_free_shared(ptr %[[DESC1]], i64 24) + // CHECK: call void @__kmpc_free_shared(ptr %[[DESC2]], i64 48) + // CHECK: ret void + return + } + +} diff --git a/flang/test/Integration/OpenMP/threadprivate-target-device.f90 b/flang/test/Integration/OpenMP/threadprivate-target-device.f90 index 662d6c6357af0..2d5d073520abe 100644 --- a/flang/test/Integration/OpenMP/threadprivate-target-device.f90 +++ b/flang/test/Integration/OpenMP/threadprivate-target-device.f90 @@ -14,16 +14,14 @@ ! target code in the same function. ! CHECK: define weak_odr protected amdgpu_kernel void @{{.*}}(ptr %{{.*}}, ptr %[[ARG1:.*]], ptr %[[ARG2:.*]]) #{{[0-9]+}} { -! CHECK: %[[ALLOCA_X:.*]] = alloca ptr, align 8, addrspace(5) -! CHECK: %[[ASCAST_X:.*]] = addrspacecast ptr addrspace(5) %[[ALLOCA_X]] to ptr -! CHECK: store ptr %[[ARG1]], ptr %[[ASCAST_X]], align 8 +! CHECK: %[[ALLOC_N:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) +! CHECK: store ptr %[[ARG2]], ptr %[[ALLOC_N]], align 8 -! CHECK: %[[ALLOCA_N:.*]] = alloca ptr, align 8, addrspace(5) -! CHECK: %[[ASCAST_N:.*]] = addrspacecast ptr addrspace(5) %[[ALLOCA_N]] to ptr -! CHECK: store ptr %[[ARG2]], ptr %[[ASCAST_N]], align 8 +! CHECK: %[[ALLOC_X:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) +! CHECK: store ptr %[[ARG1]], ptr %[[ALLOC_X]], align 8 -! CHECK: %[[LOAD_X:.*]] = load ptr, ptr %[[ASCAST_X]], align 8 -! CHECK: call void @bar_(ptr %[[LOAD_X]], ptr %[[ASCAST_N]]) +! CHECK: %[[LOAD_X:.*]] = load ptr, ptr %[[ALLOC_X]], align 8 +! 
CHECK: call void @bar_(ptr %[[LOAD_X]], ptr %[[ALLOC_N]]) module test implicit none diff --git a/flang/test/Transforms/OpenMP/stack-to-shared.mlir b/flang/test/Transforms/OpenMP/stack-to-shared.mlir index a7842048a8411..269b3b28afe2c 100644 --- a/flang/test/Transforms/OpenMP/stack-to-shared.mlir +++ b/flang/test/Transforms/OpenMP/stack-to-shared.mlir @@ -17,13 +17,16 @@ module attributes {omp.is_target_device = true} { omp.yield(%arg0 : i32) } + func.func private @foo(%b : !fir.ref) + // Verify that target device functions are searched for allocas shared across // threads of a parallel region. // // Also ensure that all fir.alloca information is adequately forwarded to the // new allocation, that uses of the allocation through hlfir.declare are - // detected and that only the expected types of uses (parallel reduction and - // non-private uses inside of a parallel region) are replaced. + // detected and that only the expected types of uses (parallel reduction, + // non-private uses inside of a parallel region and function calls) are + // replaced. 
// CHECK-LABEL: func.func @standalone_func func.func @standalone_func(%lb: i32, %ub: i32, %step: i32) attributes {omp.declare_target = #omp.declaretarget} { // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref @@ -39,22 +42,26 @@ module attributes {omp.is_target_device = true} { %3 = fir.alloca i32 {uniq_name = "a"} // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "b"} %4 = fir.alloca i32 {uniq_name = "b"} + // CHECK: %[[ALLOC_3:.*]] = omp.alloc_shared_mem i32 {uniq_name = "c"} : !fir.ref + %5 = fir.alloca i32 {uniq_name = "c"} + fir.call @foo(%5) : (!fir.ref) -> () omp.parallel reduction(@add_reduction_i32 %0 -> %arg0 : !fir.ref) { - // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "c"} - %5 = fir.alloca i32 {uniq_name = "c"} - %6:2 = fir.unboxchar %decl#0 : (!fir.boxchar<1>) -> (!fir.ref>, index) + // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "d"} + %6 = fir.alloca i32 {uniq_name = "d"} + %7:2 = fir.unboxchar %decl#0 : (!fir.boxchar<1>) -> (!fir.ref>, index) omp.wsloop private(@privatizer_i32 %2 -> %arg1, @firstprivatizer_i32 %3 -> %arg2 : !fir.ref, !fir.ref) { omp.loop_nest (%arg3) : i32 = (%lb) to (%ub) inclusive step (%step) { - %7 = fir.load %5 : !fir.ref + %8 = fir.load %6 : !fir.ref omp.yield } } omp.terminator } - %5 = fir.load %4 : !fir.ref + %9 = fir.load %4 : !fir.ref // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref> // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref + // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_3]] : !fir.ref // CHECK-NEXT: return return } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 427776cd188a5..605568a28fb28 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1168,7 +1168,12 @@ static bool 
mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) { if (auto parallelOp = dyn_cast(user)) { if (llvm::is_contained(parallelOp.getReductionVars(), value)) return true; - } else if (auto parallelOp = user->getParentOfType()) { + } else if (auto callOp = dyn_cast(user)) { + if (llvm::is_contained(callOp.getArgOperands(), value)) + return true; + } + + if (auto parallelOp = user->getParentOfType()) { if (parentOp->isProperAncestor(parallelOp)) { // If it is used directly inside of a parallel region, skip private // clause uses. diff --git a/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir b/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir index 724e03885d146..fc6f80e1970fc 100644 --- a/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir @@ -39,6 +39,5 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK-NEXT: entry: // CHECK-NEXT: %[[MOVED_ALLOCA1:.*]] = alloca { ptr }, align 8 // CHECK-NEXT: %[[MOVED_ALLOCA2:.*]] = alloca i32, i64 1, align 4 -// CHECK-NEXT: %[[MAP_ARG_ALLOCA:.*]] = alloca ptr, align 8 - // CHECK: user_code.entry: ; preds = %entry +// CHECK-NEXT: %[[MAP_ARG_ALLOC:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) diff --git a/mlir/test/Target/LLVMIR/openmp-target-private-shared-mem.mlir b/mlir/test/Target/LLVMIR/openmp-target-private-shared-mem.mlir new file mode 100644 index 0000000000000..bfa679f769c46 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-private-shared-mem.mlir @@ -0,0 +1,76 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {omp.is_target_device = true, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, !llvm.ptr<1> = dense<64> : vector<4xi64>, !llvm.ptr<2> = dense<32> : vector<4xi64>, !llvm.ptr<3> = dense<32> : vector<4xi64>, !llvm.ptr<4> = dense<64> : vector<4xi64>, !llvm.ptr<5> = dense<32> 
: vector<4xi64>, !llvm.ptr<6> = dense<32> : vector<4xi64>, !llvm.ptr<7> = dense<[160, 256, 256, 32]> : vector<4xi64>, !llvm.ptr<8> = dense<[128, 128, 128, 48]> : vector<4xi64>, !llvm.ptr<9> = dense<[192, 256, 256, 32]> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.legal_int_widths" = array, "dlti.stack_alignment" = 32 : i64, "dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>} { + omp.private {type = private} @simple_var.privatizer : i32 + omp.declare_reduction @simple_var.reducer : i32 init { + ^bb0(%arg0: i32): + %0 = llvm.mlir.constant(0 : i32) : i32 + omp.yield(%0 : i32) + } combiner { + ^bb0(%arg0: i32, %arg1: i32): + %0 = llvm.add %arg0, %arg1 : i32 + omp.yield(%0 : i32) + } + + // CHECK-LABEL: declare void @device_func(ptr) + llvm.func @device_func(!llvm.ptr) attributes {omp.declare_target = #omp.declaretarget} + + // CHECK-NOT: define {{.*}} void @target_map_single_shared_mem_private + llvm.func @target_map_single_shared_mem_private() attributes {omp.declare_target = #omp.declaretarget} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + + // CHECK-LABEL: define {{.*}} void @__omp_offloading_{{.*}}target_map_single_shared_mem_private{{.*}}({{.*}}) + // CHECK: call i32 @__kmpc_target_init + // CHECK: %[[ALLOC0:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + // CHECK: call void @device_func(ptr %[[ALLOC0]]) + // CHECK: call void @__kmpc_free_shared(ptr %[[ALLOC0]], i64 4) + // CHECK: call void @__kmpc_target_deinit + omp.target private(@simple_var.privatizer %2 -> %arg0 : !llvm.ptr) { + llvm.call @device_func(%arg0) : (!llvm.ptr) -> () + omp.terminator + } + + // 
CHECK-LABEL: define {{.*}} void @__omp_offloading_{{.*}}target_map_single_shared_mem_private{{.*}}({{.*}}) + // CHECK: call i32 @__kmpc_target_init + // CHECK: %[[ALLOC_ARGS0:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) + // CHECK: %[[ALLOC1:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + // CHECK: %[[GEP0:.*]] = getelementptr { ptr }, ptr %[[ALLOC_ARGS0]], i32 0, i32 0 + // CHECK: store ptr %[[ALLOC1]], ptr %[[GEP0]], align 8 + // CHECK: %[[GEP1:.*]] = getelementptr inbounds [1 x ptr], ptr %[[PAR_ARGS0:.*]], i64 0, i64 0 + // CHECK: store ptr %[[ALLOC_ARGS0]], ptr %[[GEP1]], align 8 + // CHECK: call void @__kmpc_parallel_51(ptr {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr %[[PAR_ARGS0]], i64 1) + // CHECK: call void @__kmpc_free_shared(ptr %[[ALLOC1]], i64 4) + // CHECK: call void @__kmpc_free_shared(ptr %[[ALLOC_ARGS0]], i64 8) + // CHECK: call void @__kmpc_target_deinit + omp.target private(@simple_var.privatizer %2 -> %arg0 : !llvm.ptr) { + omp.parallel reduction(@simple_var.reducer %arg0 -> %arg1 : !llvm.ptr) { + %3 = llvm.load %arg1 : !llvm.ptr -> i32 + omp.terminator + } + omp.terminator + } + + // CHECK-LABEL: define {{.*}} void @__omp_offloading_{{.*}}target_map_single_shared_mem_private{{.*}}({{.*}}) + // CHECK: call i32 @__kmpc_target_init + // CHECK: %[[ALLOC_ARGS1:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) + // CHECK: %[[ALLOC2:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + // CHECK: %[[GEP2:.*]] = getelementptr { ptr }, ptr %[[ALLOC_ARGS1]], i32 0, i32 0 + // CHECK: store ptr %[[ALLOC2]], ptr %[[GEP2]], align 8 + // CHECK: %[[GEP3:.*]] = getelementptr inbounds [1 x ptr], ptr %[[PAR_ARGS1:.*]], i64 0, i64 0 + // CHECK: store ptr %[[ALLOC_ARGS1]], ptr %[[GEP3]], align 8 + // CHECK: call void @__kmpc_parallel_51(ptr {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr %[[PAR_ARGS1]], i64 1) + // CHECK: call void @__kmpc_free_shared(ptr %[[ALLOC2]], i64 
4) + // CHECK: call void @__kmpc_free_shared(ptr %[[ALLOC_ARGS1]], i64 8) + // CHECK: call void @__kmpc_target_deinit + omp.target private(@simple_var.privatizer %2 -> %arg0 : !llvm.ptr) { + omp.parallel { + %4 = llvm.load %arg0 : !llvm.ptr -> i32 + omp.terminator + } + omp.terminator + } + llvm.return + } +} From 64b584ba69f78670b9e5cdb0b417376d07b73e2d Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Wed, 22 Oct 2025 07:36:01 -0500 Subject: [PATCH 14/22] delay stack to shared pass to process all llvm.mlir.allocas --- .../flang/Optimizer/CodeGen/FIROpPatterns.h | 8 +--- .../include/flang/Optimizer/OpenMP/Passes.td | 13 ++--- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 48 ++++++++----------- flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp | 6 ++- flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp | 24 ++-------- flang/lib/Optimizer/OpenMP/StackToShared.cpp | 47 +++++++++++++----- flang/lib/Optimizer/Passes/Pipelines.cpp | 5 +- flang/lib/Utils/OpenMP.cpp | 6 ++- flang/test/Fir/basic-program.fir | 2 + mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 1 + .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 12 +++-- 11 files changed, 91 insertions(+), 81 deletions(-) diff --git a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h index f4c0e273a6e3a..7d816a8843371 100644 --- a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h +++ b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h @@ -163,16 +163,10 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern { // program address space we perform a cast. In the case of most architectures // the program and allocation address space will be the default of 0 and no // cast will be emitted. - // - // If `useDeviceSharedMem = true`, an `omp.alloc_shared_mem` operation for the - // same type will be used instead, with no address space cast. This is only - // intended for allocations on an OpenMP application when compiling for a - // target device. 
mlir::Value genAllocaAndAddrCastWithType(mlir::Location loc, mlir::Type llvmObjectTy, unsigned alignment, - mlir::ConversionPatternRewriter &rewriter, - bool useDeviceSharedMem = false) const; + mlir::ConversionPatternRewriter &rewriter) const; const fir::LLVMTypeConverter &lowerTy() const { return *static_cast( diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index a024849ed04b2..fd68112f555c7 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -149,14 +149,15 @@ def AutomapToTargetDataPass let dependentDialects = ["mlir::omp::OpenMPDialect"]; } -def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::func::FuncOp"> { +// TODO: Move pass to the omp dialect. +def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::LLVM::LLVMFuncOp"> { let summary = "Replaces stack allocations with shared memory."; let description = [{ - `fir.alloca` operations defining values in a target region and then used - inside of an `omp.parallel` region are replaced by this pass with - `omp.alloc_shared_mem` and `omp.free_shared_mem`. This is also done for - top-level function `fir.alloca`s used in the same way when the parent - function is a target device function. + `llvm.mlir.alloca` operations defining values in a target region and then + potentially used inside of an `omp.parallel` region are replaced by this + pass with `omp.alloc_shared_mem` and `omp.free_shared_mem`. This is also + done for top-level function `llvm.mlir.alloca`s used in the same way when + the parent function is a target device function. 
This ensures that explicit private allocations, intended to be shared across threads, use the proper memory space on a target device while supporting the diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 4826406635f1b..478ab151b96d0 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -31,7 +31,6 @@ #include "flang/Runtime/allocator-registry-consts.h" #include "flang/Runtime/descriptor-consts.h" #include "flang/Semantics/runtime-type-info.h" -#include "flang/Utils/OpenMP.h" #include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h" @@ -1819,21 +1818,15 @@ struct EmboxCommonConversion : public fir::FIROpConversion { } /// If the embox is not in a globalOp body, allocate storage for the box; - /// store the value inside and replace the original embox with the generated - /// alloca. Replace it with the input value otherwise. - /// - /// The allocated storage might be OpenMP device shared memory, if required by - /// the context and uses of the operation. In that case, the corresponding - /// explicit deallocation is also introduced at the applicable block(s). - void + /// store the value inside and return the generated alloca. Return the input + /// value otherwise. 
+ mlir::Value placeInMemoryIfNotGlobalInit(mlir::ConversionPatternRewriter &rewriter, - mlir::Operation *embox, mlir::Location loc, - mlir::Type boxTy, mlir::Value boxValue, + mlir::Location loc, mlir::Type boxTy, + mlir::Value boxValue, bool needDeviceAllocation = false) const { - if (isInGlobalOp(rewriter)) { - rewriter.replaceOp(embox, boxValue); - return; - } + if (isInGlobalOp(rewriter)) + return boxValue; mlir::Type llvmBoxTy = boxValue.getType(); mlir::Value storage; if (needDeviceAllocation) { @@ -1842,19 +1835,13 @@ struct EmboxCommonConversion : public fir::FIROpConversion { storage = genCUFAllocDescriptor(loc, rewriter, mod, baseBoxTy, this->lowerTy()); } else { - storage = this->genAllocaAndAddrCastWithType( - loc, llvmBoxTy, defaultAlign, rewriter, - Fortran::utils::openmp::shouldReplaceAllocaWithDeviceSharedMem( - *embox)); + storage = this->genAllocaAndAddrCastWithType(loc, llvmBoxTy, defaultAlign, + rewriter); } auto storeOp = mlir::LLVM::StoreOp::create(rewriter, loc, boxValue, storage); this->attachTBAATag(storeOp, boxTy, boxTy, nullptr); - - rewriter.replaceOp(embox, storage); - if (mlir::isa(storage.getDefiningOp())) - Fortran::utils::openmp::insertDeviceSharedMemDeallocation(rewriter, - storage); + return storage; } /// Compute the extent of a triplet slice (lb:ub:step). 
@@ -1903,8 +1890,9 @@ struct EmboxOpConversion : public EmboxCommonConversion { "fir.embox codegen of derived with length parameters"); return mlir::failure(); } - placeInMemoryIfNotGlobalInit(rewriter, embox, embox.getLoc(), - boxTy, dest); + auto result = + placeInMemoryIfNotGlobalInit(rewriter, embox.getLoc(), boxTy, dest); + rewriter.replaceOp(embox, result); return mlir::success(); } }; @@ -2150,9 +2138,10 @@ struct XEmboxOpConversion : public EmboxCommonConversion { dest = insertBaseAddress(rewriter, loc, dest, base); if (fir::isDerivedTypeWithLenParams(boxTy)) TODO(loc, "fir.embox codegen of derived with length parameters"); - placeInMemoryIfNotGlobalInit( - rewriter, xbox, loc, boxTy, dest, + mlir::Value result = placeInMemoryIfNotGlobalInit( + rewriter, loc, boxTy, dest, isDeviceAllocation(xbox.getMemref(), adaptor.getMemref())); + rewriter.replaceOp(xbox, result); return mlir::success(); } @@ -2267,9 +2256,10 @@ struct XReboxOpConversion : public EmboxCommonConversion { dest = insertStride(rewriter, loc, dest, dim, std::get<1>(iter.value())); } dest = insertBaseAddress(rewriter, loc, dest, base); - placeInMemoryIfNotGlobalInit( - rewriter, rebox, rebox.getLoc(), destBoxTy, dest, + mlir::Value result = placeInMemoryIfNotGlobalInit( + rewriter, rebox.getLoc(), destBoxTy, dest, isDeviceAllocation(rebox.getBox(), adaptor.getBox())); + rewriter.replaceOp(rebox, result); return mlir::success(); } diff --git a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp index ea0f7dff9f99e..c4e9505e74094 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp @@ -277,6 +277,8 @@ void fir::populateOpenMPFIRToLLVMConversionPatterns( const LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns) { patterns.add(converter); patterns.add(converter); - patterns.add, - AllocMemOpConversion>(converter); + // TODO: Undo refactoring in previous commit here. 
+ patterns.add/*, + AllocMemOpConversion*/>( + converter); } diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp index 6304cdf36f44f..96e3caa481f51 100644 --- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp +++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp @@ -12,7 +12,6 @@ #include "flang/Optimizer/CodeGen/FIROpPatterns.h" #include "flang/Optimizer/Builder/FIRBuilder.h" -#include "flang/Utils/OpenMP.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "llvm/Support/Debug.h" @@ -314,14 +313,9 @@ mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert( // program address space we perform a cast. In the case of most architectures // the program and allocation address space will be the default of 0 and no // cast will be emitted. -// -// If `useDeviceSharedMem = true`, an `omp.alloc_shared_mem` operation for the -// same type will be used instead, with no address space cast. This is only -// intended for allocations on an OpenMP application when compiling for a -// target device. 
mlir::Value ConvertFIRToLLVMPattern::genAllocaAndAddrCastWithType( mlir::Location loc, mlir::Type llvmObjectTy, unsigned alignment, - mlir::ConversionPatternRewriter &rewriter, bool useDeviceSharedMem) const { + mlir::ConversionPatternRewriter &rewriter) const { auto thisPt = rewriter.saveInsertionPoint(); mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp(); mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent(); @@ -331,24 +325,16 @@ mlir::Value ConvertFIRToLLVMPattern::genAllocaAndAddrCastWithType( unsigned allocaAs = getAllocaAddressSpace(rewriter); unsigned programAs = getProgramAddressSpace(rewriter); - mlir::Value al; - if (useDeviceSharedMem) { - al = mlir::omp::AllocSharedMemOp::create( - rewriter, loc, ::getLlvmPtrType(llvmObjectTy.getContext()), - llvmObjectTy, /*uniq_name=*/nullptr, /*bindc_name=*/nullptr, - /*typeparams=*/{}, /*shape=*/{size}); - } else { - al = mlir::LLVM::AllocaOp::create( - rewriter, loc, ::getLlvmPtrType(llvmObjectTy.getContext(), allocaAs), - llvmObjectTy, size, alignment); - } + mlir::Value al = mlir::LLVM::AllocaOp::create( + rewriter, loc, ::getLlvmPtrType(llvmObjectTy.getContext(), allocaAs), + llvmObjectTy, size, alignment); // if our allocation address space, is not the same as the program address // space, then we must emit a cast to the program address space before use. // An example case would be on AMDGPU, where the allocation address space is // the numeric value 5 (private), and the program address space is 0 // (generic). 
- if (!useDeviceSharedMem && allocaAs != programAs) { + if (allocaAs != programAs) { al = mlir::LLVM::AddrSpaceCastOp::create( rewriter, loc, ::getLlvmPtrType(llvmObjectTy.getContext(), programAs), al); diff --git a/flang/lib/Optimizer/OpenMP/StackToShared.cpp b/flang/lib/Optimizer/OpenMP/StackToShared.cpp index 846ff8819432c..8de2170d544ff 100644 --- a/flang/lib/Optimizer/OpenMP/StackToShared.cpp +++ b/flang/lib/Optimizer/OpenMP/StackToShared.cpp @@ -11,13 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "flang/Optimizer/Dialect/FIROps.h" -#include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Utils/OpenMP.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" -#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h" namespace flangomp { #define GEN_PASS_DEF_STACKTOSHAREDPASS @@ -36,31 +34,56 @@ class StackToSharedPass MLIRContext *context = &getContext(); OpBuilder builder(context); - func::FuncOp funcOp = getOperation(); + LLVM::LLVMFuncOp funcOp = getOperation(); auto offloadIface = funcOp->getParentOfType(); if (!offloadIface || !offloadIface.getIsTargetDevice()) return; - funcOp->walk([&](fir::AllocaOp allocaOp) { + llvm::SmallVector toBeDeleted; + funcOp->walk([&](LLVM::AllocaOp allocaOp) { if (!Fortran::utils::openmp::shouldReplaceAllocaWithDeviceSharedMem( *allocaOp)) return; + // Replace llvm.alloca with omp.alloc_shared_mem. + Type resultType = allocaOp.getResult().getType(); + + // TODO: The handling of non-default address spaces might need to be + // improved. This currently only handles the case where an alloca to + // non-default address space must only be used by a single addrspacecast + // to default address space. 
+ bool nonDefaultAddrSpace = false; + if (auto llvmPtrType = dyn_cast(resultType)) + nonDefaultAddrSpace = llvmPtrType.getAddressSpace() != 0; - // Replace fir.alloca with omp.alloc_shared_mem. builder.setInsertionPoint(allocaOp); auto sharedAllocOp = omp::AllocSharedMemOp::create( - builder, allocaOp->getLoc(), allocaOp.getResult().getType(), - allocaOp.getInType(), allocaOp.getUniqNameAttr(), - allocaOp.getBindcNameAttr(), allocaOp.getTypeparams(), - allocaOp.getShape()); - allocaOp.replaceAllUsesWith(sharedAllocOp.getOperation()); - allocaOp.erase(); + builder, allocaOp->getLoc(), LLVM::LLVMPointerType::get(context), + allocaOp.getElemType(), + /*uniq_name=*/nullptr, + /*bindc_name=*/nullptr, /*typeparams=*/{allocaOp.getArraySize()}, + /*shape=*/{}); + if (nonDefaultAddrSpace) { + assert(allocaOp->hasOneUse() && "alloca must have only one use"); + auto asCastOp = + cast(*allocaOp->getUsers().begin()); + asCastOp.replaceAllUsesWith(sharedAllocOp.getOperation()); + // Delete later because we can't delete the cast op before the top-level + // iteration visits it. Also, the alloca can't be deleted before because + // it's used by it. + toBeDeleted.push_back(asCastOp); + toBeDeleted.push_back(allocaOp); + } else { + allocaOp.replaceAllUsesWith(sharedAllocOp.getOperation()); + allocaOp.erase(); + } // Create a new omp.free_shared_mem for the allocated buffer prior to // exiting the region. 
Fortran::utils::openmp::insertDeviceSharedMemDeallocation( builder, sharedAllocOp.getResult()); }); + for (Operation *op : toBeDeleted) + op->erase(); } }; } // namespace diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 2129ca44e7a8d..79ffc07eddcd1 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -338,7 +338,7 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm, pm.addPass(flangomp::createMarkDeclareTargetPass()); pm.addPass(flangomp::createGenericLoopConversionPass()); if (opts.isTargetDevice) { - pm.addPass(flangomp::createStackToSharedPass()); + // pm.addPass(flangomp::createStackToSharedPass()); pm.addPass(flangomp::createFunctionFilteringPass()); if (opts.enableOffloadGlobalFiltering) @@ -409,6 +409,9 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, } fir::addFIRToLLVMPass(pm, config); + + if (config.EnableOpenMP && !config.EnableOpenMPSimd) + pm.addPass(flangomp::createStackToSharedPass()); } /// Create a pass pipeline for lowering from MLIR to LLVM IR diff --git a/flang/lib/Utils/OpenMP.cpp b/flang/lib/Utils/OpenMP.cpp index 73f5b9bc73129..821ec2545d0ad 100644 --- a/flang/lib/Utils/OpenMP.cpp +++ b/flang/lib/Utils/OpenMP.cpp @@ -14,6 +14,7 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Transforms/RegionUtils.h" @@ -199,8 +200,9 @@ static bool shouldReplaceAllocaWithUses( const mlir::Operation::use_range &uses) { // Check direct uses and also follow hlfir.declare/fir.convert uses. 
for (const mlir::OpOperand &use : uses) { - if (llvm::isa(use.getOwner())) { - if (shouldReplaceAllocaWithUses(use.getOwner()->getUses())) + mlir::Operation *owner = use.getOwner(); + if (llvm::isa(owner)) { + if (shouldReplaceAllocaWithUses(owner->getUses())) return true; } else if (allocaUseRequiresDeviceSharedMem(use)) { return true; diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 6d2beae4da1c8..8fbec2a9ac63d 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -162,4 +162,6 @@ func.func @_QQmain() { // PASSES-NEXT: FIRToLLVMLowering // PASSES-NEXT: ReconcileUnrealizedCasts // PASSES-NEXT: PrepareForOMPOffloadPrivatizationPass +// PASSES-NEXT: 'llvm.func' Pipeline +// PASSES-NEXT: StackToSharedPass // PASSES-NEXT: LLVMIRLoweringPass diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 414a61d949914..a9ae260cb917d 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2196,6 +2196,7 @@ def TargetFreeMemOp : OpenMP_Op<"target_freemem", // AllocSharedMemOp //===----------------------------------------------------------------------===// +// TODO: Update design to be used in place of llvm.alloca. 
def AllocSharedMemOp : OpenMP_Op<"alloc_shared_mem", traits = [ AttrSizedOperandSegments ], clauses = [ diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 605568a28fb28..4c8af9d90f6f8 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -6752,9 +6752,13 @@ getAllocationSize(llvm::IRBuilderBase &builder, llvm::Type *llvmHeapTy = moduleTranslation.convertType(allocatedTy); llvm::TypeSize typeSize = dataLayout.getTypeStoreSize(llvmHeapTy); llvm::Value *allocSize = builder.getInt64(typeSize.getFixedValue()); - for (auto typeParam : typeparams) - allocSize = - builder.CreateMul(allocSize, moduleTranslation.lookupValue(typeParam)); + for (auto typeParam : typeparams) { + allocSize = builder.CreateMul( + allocSize, + builder.CreateIntCast(moduleTranslation.lookupValue(typeParam), + builder.getInt64Ty(), + /*isSigned=*/false)); + } return allocSize; } @@ -6785,6 +6789,8 @@ convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } +// TODO: Update after changing op. Currently shape will be ignored, which holds +// the original array size. 
static LogicalResult convertAllocSharedMemOp(omp::AllocSharedMemOp allocMemOp, llvm::IRBuilderBase &builder, From 09520daee5f2c0dd4b52c9394fe6d5b4ce28bd62 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Wed, 22 Oct 2025 07:38:18 -0500 Subject: [PATCH 15/22] move stack-to-shared pass from Flang to the OpenMP dialect --- .../include/flang/Optimizer/OpenMP/Passes.td | 18 -- flang/include/flang/Utils/OpenMP.h | 19 -- flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 - flang/lib/Optimizer/OpenMP/StackToShared.cpp | 89 -------- flang/lib/Optimizer/Passes/Pipelines.cpp | 4 +- flang/lib/Utils/OpenMP.cpp | 107 ---------- .../Dialect/OpenMP/Transforms/CMakeLists.txt | 2 +- .../mlir/Dialect/OpenMP/Transforms/Passes.h | 6 +- .../mlir/Dialect/OpenMP/Transforms/Passes.td | 18 ++ mlir/lib/Dialect/OpenMP/CMakeLists.txt | 20 +- mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt | 18 ++ .../Dialect/OpenMP/Transforms/CMakeLists.txt | 10 + .../OpenMP/Transforms/StackToShared.cpp | 190 ++++++++++++++++++ 13 files changed, 245 insertions(+), 257 deletions(-) delete mode 100644 flang/lib/Optimizer/OpenMP/StackToShared.cpp create mode 100644 mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt create mode 100644 mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td index fd68112f555c7..8d30f165dd8b6 100644 --- a/flang/include/flang/Optimizer/OpenMP/Passes.td +++ b/flang/include/flang/Optimizer/OpenMP/Passes.td @@ -149,22 +149,4 @@ def AutomapToTargetDataPass let dependentDialects = ["mlir::omp::OpenMPDialect"]; } -// TODO: Move pass to the omp dialect. 
-def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::LLVM::LLVMFuncOp"> { - let summary = "Replaces stack allocations with shared memory."; - let description = [{ - `llvm.mlir.alloca` operations defining values in a target region and then - potentially used inside of an `omp.parallel` region are replaced by this - pass with `omp.alloc_shared_mem` and `omp.free_shared_mem`. This is also - done for top-level function `llvm.mlir.alloca`s used in the same way when - the parent function is a target device function. - - This ensures that explicit private allocations, intended to be shared across - threads, use the proper memory space on a target device while supporting the - case of parallel regions indirectly reached from within a target region via - function calls. - }]; - let dependentDialects = ["mlir::omp::OpenMPDialect"]; -} - #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES diff --git a/flang/include/flang/Utils/OpenMP.h b/flang/include/flang/Utils/OpenMP.h index 433d825168296..334f8866fa560 100644 --- a/flang/include/flang/Utils/OpenMP.h +++ b/flang/include/flang/Utils/OpenMP.h @@ -60,25 +60,6 @@ mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder, void cloneOrMapRegionOutsiders( fir::FirOpBuilder &firOpBuilder, mlir::omp::TargetOp targetOp); -/// Tell whether to replace an operation representing a stack allocation with a -/// device shared memory allocation/deallocation pair based on the location of -/// the allocation and its uses. -/// -/// For it to be replaced, an allocation must be: -/// - located in an OpenMP module for a target device; -/// - located outside of any parallel regions; -/// - located inside of a generic target region or a device function; and -/// - used as a parallel reduction variable, passed as function argument or -/// used inside of a parallel region in any way except as argument to an -/// OpenMP private clause. 
-bool shouldReplaceAllocaWithDeviceSharedMem(mlir::Operation &op); - -/// Based on the location of the definition of the given value representing the -/// result of a device shared memory allocation, find the corresponding points -/// where its deallocation should be placed and introduce `omp.free_shared_mem` -/// ops at those points. -void insertDeviceSharedMemDeallocation( - mlir::OpBuilder &builder, mlir::Value allocVal); } // namespace Fortran::utils::openmp #endif // FORTRAN_UTILS_OPENMP_H_ diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index 5a72fe47fa0e7..304333fa8830e 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -13,7 +13,6 @@ add_flang_library(FlangOpenMPTransforms LowerWorkshare.cpp LowerNontemporal.cpp SimdOnly.cpp - StackToShared.cpp DEPENDS FIRDialect diff --git a/flang/lib/Optimizer/OpenMP/StackToShared.cpp b/flang/lib/Optimizer/OpenMP/StackToShared.cpp deleted file mode 100644 index 8de2170d544ff..0000000000000 --- a/flang/lib/Optimizer/OpenMP/StackToShared.cpp +++ /dev/null @@ -1,89 +0,0 @@ -//===- StackToShared.cpp -------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements transforms to swap stack allocations on the target -// device with device shared memory where applicable. 
-// -//===----------------------------------------------------------------------===// - -#include "flang/Optimizer/OpenMP/Passes.h" -#include "flang/Utils/OpenMP.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/OpenMP/OpenMPDialect.h" - -namespace flangomp { -#define GEN_PASS_DEF_STACKTOSHAREDPASS -#include "flang/Optimizer/OpenMP/Passes.h.inc" -} // namespace flangomp - -using namespace mlir; - -namespace { -class StackToSharedPass - : public flangomp::impl::StackToSharedPassBase { -public: - StackToSharedPass() = default; - - void runOnOperation() override { - MLIRContext *context = &getContext(); - OpBuilder builder(context); - - LLVM::LLVMFuncOp funcOp = getOperation(); - auto offloadIface = funcOp->getParentOfType(); - if (!offloadIface || !offloadIface.getIsTargetDevice()) - return; - - llvm::SmallVector toBeDeleted; - funcOp->walk([&](LLVM::AllocaOp allocaOp) { - if (!Fortran::utils::openmp::shouldReplaceAllocaWithDeviceSharedMem( - *allocaOp)) - return; - // Replace llvm.alloca with omp.alloc_shared_mem. - Type resultType = allocaOp.getResult().getType(); - - // TODO: The handling of non-default address spaces might need to be - // improved. This currently only handles the case where an alloca to - // non-default address space must only be used by a single addrspacecast - // to default address space. 
- bool nonDefaultAddrSpace = false; - if (auto llvmPtrType = dyn_cast(resultType)) - nonDefaultAddrSpace = llvmPtrType.getAddressSpace() != 0; - - builder.setInsertionPoint(allocaOp); - auto sharedAllocOp = omp::AllocSharedMemOp::create( - builder, allocaOp->getLoc(), LLVM::LLVMPointerType::get(context), - allocaOp.getElemType(), - /*uniq_name=*/nullptr, - /*bindc_name=*/nullptr, /*typeparams=*/{allocaOp.getArraySize()}, - /*shape=*/{}); - if (nonDefaultAddrSpace) { - assert(allocaOp->hasOneUse() && "alloca must have only one use"); - auto asCastOp = - cast(*allocaOp->getUsers().begin()); - asCastOp.replaceAllUsesWith(sharedAllocOp.getOperation()); - // Delete later because we can't delete the cast op before the top-level - // iteration visits it. Also, the alloca can't be deleted before because - // it's used by it. - toBeDeleted.push_back(asCastOp); - toBeDeleted.push_back(allocaOp); - } else { - allocaOp.replaceAllUsesWith(sharedAllocOp.getOperation()); - allocaOp.erase(); - } - - // Create a new omp.free_shared_mem for the allocated buffer prior to - // exiting the region. - Fortran::utils::openmp::insertDeviceSharedMemDeallocation( - builder, sharedAllocOp.getResult()); - }); - for (Operation *op : toBeDeleted) - op->erase(); - } -}; -} // namespace diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 79ffc07eddcd1..65f61439a1219 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -10,6 +10,7 @@ /// common to flang and the test tools. #include "flang/Optimizer/Passes/Pipelines.h" +#include "mlir/Dialect/OpenMP/Transforms/Passes.h" #include "llvm/Support/CommandLine.h" /// Force setting the no-alias attribute on fuction arguments when possible. 
@@ -338,7 +339,6 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm, pm.addPass(flangomp::createMarkDeclareTargetPass()); pm.addPass(flangomp::createGenericLoopConversionPass()); if (opts.isTargetDevice) { - // pm.addPass(flangomp::createStackToSharedPass()); pm.addPass(flangomp::createFunctionFilteringPass()); if (opts.enableOffloadGlobalFiltering) @@ -411,7 +411,7 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, fir::addFIRToLLVMPass(pm, config); if (config.EnableOpenMP && !config.EnableOpenMPSimd) - pm.addPass(flangomp::createStackToSharedPass()); + pm.addPass(mlir::omp::createStackToSharedPass()); } /// Create a pass pipeline for lowering from MLIR to LLVM IR diff --git a/flang/lib/Utils/OpenMP.cpp b/flang/lib/Utils/OpenMP.cpp index 821ec2545d0ad..c2801d97004c0 100644 --- a/flang/lib/Utils/OpenMP.cpp +++ b/flang/lib/Utils/OpenMP.cpp @@ -14,7 +14,6 @@ #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Transforms/RegionUtils.h" @@ -156,109 +155,3 @@ void Fortran::utils::openmp::cloneOrMapRegionOutsiders( mlir::getUsedValuesDefinedAbove(region, valuesDefinedAbove); } } - -/// When a use takes place inside an omp.parallel region and it's not as a -/// private clause argument, or when it is a reduction argument passed to -/// omp.parallel or a function call argument, then the defining allocation is -/// eligible for replacement with shared memory. 
-static bool allocaUseRequiresDeviceSharedMem(const mlir::OpOperand &use) { - mlir::Operation *owner = use.getOwner(); - if (auto parallelOp = llvm::dyn_cast(owner)) { - if (llvm::is_contained(parallelOp.getReductionVars(), use.get())) - return true; - } else if (auto callOp = llvm::dyn_cast(owner)) { - if (llvm::is_contained(callOp.getArgOperands(), use.get())) - return true; - } - - // If it is used directly inside of a parallel region, it has to be replaced - // unless the use is a private clause. - if (owner->getParentOfType()) { - if (auto argIface = - llvm::dyn_cast(owner)) { - if (auto privateSyms = llvm::cast_or_null( - owner->getAttr("private_syms"))) { - for (auto [var, sym] : - llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) { - if (var != use.get()) - continue; - - auto moduleOp = owner->getParentOfType(); - auto privateOp = llvm::cast( - moduleOp.lookupSymbol(llvm::cast(sym))); - return privateOp.getDataSharingType() != - mlir::omp::DataSharingClauseType::Private; - } - } - } - return true; - } - return false; -} - -static bool shouldReplaceAllocaWithUses( - const mlir::Operation::use_range &uses) { - // Check direct uses and also follow hlfir.declare/fir.convert uses. - for (const mlir::OpOperand &use : uses) { - mlir::Operation *owner = use.getOwner(); - if (llvm::isa(owner)) { - if (shouldReplaceAllocaWithUses(owner->getUses())) - return true; - } else if (allocaUseRequiresDeviceSharedMem(use)) { - return true; - } - } - - return false; -} - -// TODO: Refactor the logic in `shouldReplaceAllocaWithDeviceSharedMem`, -// `shouldReplaceAllocaWithUses` and `allocaUseRequiresDeviceSharedMem` to -// be reusable by the MLIR to LLVM IR translation stage, as something very -// similar is also implemented there to choose between allocas and device -// shared memory allocations when processing OpenMP reductions, mapping and -// privatization. 
-bool Fortran::utils::openmp::shouldReplaceAllocaWithDeviceSharedMem( - mlir::Operation &op) { - auto offloadIface = op.getParentOfType(); - if (!offloadIface || !offloadIface.getIsTargetDevice()) - return false; - - auto targetOp = op.getParentOfType(); - - // It must be inside of a generic omp.target or in a target device function, - // and not inside of omp.parallel. - if (auto parallelOp = op.getParentOfType()) { - if (!targetOp || !targetOp->isProperAncestor(parallelOp)) - return false; - } - - if (targetOp) { - if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) != - mlir::omp::TargetExecMode::generic) - return false; - } else { - auto declTargetIface = - op.getParentOfType(); - if (!declTargetIface || !declTargetIface.isDeclareTarget() || - declTargetIface.getDeclareTargetDeviceType() == - mlir::omp::DeclareTargetDeviceType::host) - return false; - } - - return shouldReplaceAllocaWithUses(op.getUses()); -} - -void Fortran::utils::openmp::insertDeviceSharedMemDeallocation( - mlir::OpBuilder &builder, mlir::Value allocVal) { - mlir::Block *allocaBlock = allocVal.getParentBlock(); - mlir::DominanceInfo domInfo; - for (mlir::Block &block : allocVal.getParentRegion()->getBlocks()) { - mlir::Operation *terminator = block.getTerminator(); - if (!terminator->hasSuccessors() && - domInfo.dominates(allocaBlock, &block)) { - builder.setInsertionPoint(terminator); - mlir::omp::FreeSharedMemOp::create(builder, allocVal.getLoc(), allocVal); - } - } -} diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt index 22f0d92ea4cbf..9c6a607ce6f2a 100644 --- a/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls -name OpenMP) -add_public_tablegen_target(MLIROpenMPPassIncGen) 
+add_mlir_dialect_tablegen_target(MLIROpenMPPassIncGen) add_mlir_doc(Passes OpenMPPasses ./ -gen-pass-doc) diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h index 21b6d1f466558..ddbe662be69fc 100644 --- a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h @@ -13,6 +13,10 @@ namespace mlir { +namespace LLVM { +class LLVMFuncOp; +} // namespace LLVM + namespace omp { /// Generate the code for registering conversion passes. @@ -23,4 +27,4 @@ namespace omp { } // namespace omp } // namespace mlir -#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES_H +#endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td index 1fde7e08ab433..73ec455a3aef5 100644 --- a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td @@ -23,4 +23,22 @@ def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prep }]; let dependentDialects = ["LLVM::LLVMDialect"]; } + +def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::LLVM::LLVMFuncOp"> { + let summary = "Replaces stack allocations target devices with shared memory."; + let description = [{ + `llvm.mlir.alloca` operations defining values in a non-SPMD target region + and then potentially used inside of an `omp.parallel` region are replaced by + this pass with `omp.alloc_shared_mem` and `omp.free_shared_mem`. This is + also done for top-level function `llvm.mlir.alloca`s used in the same way + when the parent function is a target device function. + + This ensures that explicit private allocations, intended to be shared across + threads, use the proper memory space on a target device while supporting the + case of parallel regions indirectly reached from within a target region via + function calls. 
+ }]; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + #endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt index f3c02da458508..9f57627c321fb 100644 --- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt @@ -1,20 +1,2 @@ +add_subdirectory(IR) add_subdirectory(Transforms) - -add_mlir_dialect_library(MLIROpenMPDialect - IR/OpenMPDialect.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP - - DEPENDS - omp_gen - MLIROpenMPOpsIncGen - MLIROpenMPOpsInterfacesIncGen - MLIROpenMPTypeInterfacesIncGen - - LINK_LIBS PUBLIC - MLIRIR - MLIRLLVMDialect - MLIRFuncDialect - MLIROpenACCMPCommon - ) diff --git a/mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt new file mode 100644 index 0000000000000..05923032d9077 --- /dev/null +++ b/mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt @@ -0,0 +1,18 @@ +add_mlir_dialect_library(MLIROpenMPDialect + OpenMPDialect.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP + + DEPENDS + omp_gen + MLIROpenMPOpsIncGen + MLIROpenMPOpsInterfacesIncGen + MLIROpenMPTypeInterfacesIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRLLVMDialect + MLIRFuncDialect + MLIROpenACCMPCommon + ) diff --git a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt index b9b8eda9ed51b..b00ca178dd9df 100644 --- a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt @@ -1,14 +1,24 @@ add_mlir_dialect_library(MLIROpenMPTransforms OpenMPOffloadPrivatizationPrepare.cpp + StackToShared.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP DEPENDS + omp_gen MLIROpenMPPassIncGen + MLIROpenMPOpsIncGen + MLIROpenMPOpsInterfacesIncGen + MLIROpenMPTypeInterfacesIncGen LINK_LIBS PUBLIC MLIRIR MLIRFuncDialect MLIRLLVMDialect + MLIROpenACCMPCommon 
MLIROpenMPDialect MLIRPass + MLIRSupport MLIRTransforms ) diff --git a/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp b/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp new file mode 100644 index 0000000000000..721e1f8dcf334 --- /dev/null +++ b/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp @@ -0,0 +1,190 @@ +//===- StackToShared.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements transforms to swap stack allocations on the target +// device with device shared memory where applicable. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/OpenMP/Transforms/Passes.h" + +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace omp { +#define GEN_PASS_DEF_STACKTOSHAREDPASS +#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc" +} // namespace omp +} // namespace mlir + +using namespace mlir; + +/// When a use takes place inside an omp.parallel region and it's not as a +/// private clause argument, or when it is a reduction argument passed to +/// omp.parallel or a function call argument, then the defining allocation is +/// eligible for replacement with shared memory. 
+static bool allocaUseRequiresDeviceSharedMem(const OpOperand &use) { + Operation *owner = use.getOwner(); + if (auto parallelOp = dyn_cast(owner)) { + if (llvm::is_contained(parallelOp.getReductionVars(), use.get())) + return true; + } else if (auto callOp = dyn_cast(owner)) { + if (llvm::is_contained(callOp.getArgOperands(), use.get())) + return true; + } + + // If it is used directly inside of a parallel region, it has to be replaced + // unless the use is a private clause. + if (owner->getParentOfType()) { + if (auto argIface = dyn_cast(owner)) { + if (auto privateSyms = + cast_or_null(owner->getAttr("private_syms"))) { + for (auto [var, sym] : + llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) { + if (var != use.get()) + continue; + + auto moduleOp = owner->getParentOfType(); + auto privateOp = cast( + moduleOp.lookupSymbol(cast(sym))); + return privateOp.getDataSharingType() != + omp::DataSharingClauseType::Private; + } + } + } + return true; + } + return false; +} + +static bool shouldReplaceAllocaWithUses(const Operation::use_range &uses) { + // Check direct uses and also follow hlfir.declare/fir.convert uses. + for (const OpOperand &use : uses) { + Operation *owner = use.getOwner(); + if (llvm::isa(owner)) { + if (shouldReplaceAllocaWithUses(owner->getUses())) + return true; + } else if (allocaUseRequiresDeviceSharedMem(use)) { + return true; + } + } + + return false; +} + +// TODO: Refactor the logic in `shouldReplaceAllocaWithDeviceSharedMem`, +// `shouldReplaceAllocaWithUses` and `allocaUseRequiresDeviceSharedMem` to +// be reusable by the MLIR to LLVM IR translation stage, as something very +// similar is also implemented there to choose between allocas and device +// shared memory allocations when processing OpenMP reductions, mapping and +// privatization. 
+bool shouldReplaceAllocaWithDeviceSharedMem(Operation &op) { + auto offloadIface = op.getParentOfType(); + if (!offloadIface || !offloadIface.getIsTargetDevice()) + return false; + + auto targetOp = op.getParentOfType(); + + // It must be inside of a generic omp.target or in a target device function, + // and not inside of omp.parallel. + if (auto parallelOp = op.getParentOfType()) { + if (!targetOp || !targetOp->isProperAncestor(parallelOp)) + return false; + } + + if (targetOp) { + if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) != + omp::TargetExecMode::generic) + return false; + } else { + auto declTargetIface = op.getParentOfType(); + if (!declTargetIface || !declTargetIface.isDeclareTarget() || + declTargetIface.getDeclareTargetDeviceType() == + omp::DeclareTargetDeviceType::host) + return false; + } + + return shouldReplaceAllocaWithUses(op.getUses()); +} + +void insertDeviceSharedMemDeallocation(OpBuilder &builder, Value allocVal) { + Block *allocaBlock = allocVal.getParentBlock(); + DominanceInfo domInfo; + for (Block &block : allocVal.getParentRegion()->getBlocks()) { + Operation *terminator = block.getTerminator(); + if (!terminator->hasSuccessors() && + domInfo.dominates(allocaBlock, &block)) { + builder.setInsertionPoint(terminator); + omp::FreeSharedMemOp::create(builder, allocVal.getLoc(), allocVal); + } + } +} + +namespace { +class StackToSharedPass + : public omp::impl::StackToSharedPassBase { +public: + StackToSharedPass() = default; + + void runOnOperation() override { + MLIRContext *context = &getContext(); + OpBuilder builder(context); + + LLVM::LLVMFuncOp funcOp = getOperation(); + auto offloadIface = funcOp->getParentOfType(); + if (!offloadIface || !offloadIface.getIsTargetDevice()) + return; + + llvm::SmallVector toBeDeleted; + funcOp->walk([&](LLVM::AllocaOp allocaOp) { + if (!shouldReplaceAllocaWithDeviceSharedMem(*allocaOp)) + return; + // Replace llvm.alloca with omp.alloc_shared_mem. 
+ Type resultType = allocaOp.getResult().getType(); + + // TODO: The handling of non-default address spaces might need to be + // improved. This currently only handles the case where an alloca to + // non-default address space must only be used by a single addrspacecast + // to default address space. + bool nonDefaultAddrSpace = false; + if (auto llvmPtrType = dyn_cast(resultType)) + nonDefaultAddrSpace = llvmPtrType.getAddressSpace() != 0; + + builder.setInsertionPoint(allocaOp); + auto sharedAllocOp = omp::AllocSharedMemOp::create( + builder, allocaOp->getLoc(), LLVM::LLVMPointerType::get(context), + allocaOp.getElemType(), + /*uniq_name=*/nullptr, + /*bindc_name=*/nullptr, /*typeparams=*/{allocaOp.getArraySize()}, + /*shape=*/{}); + if (nonDefaultAddrSpace) { + assert(allocaOp->hasOneUse() && "alloca must have only one use"); + auto asCastOp = + cast(*allocaOp->getUsers().begin()); + asCastOp.replaceAllUsesWith(sharedAllocOp.getOperation()); + // Delete later because we can't delete the cast op before the top-level + // iteration visits it. Also, the alloca can't be deleted before because + // it's used by it. + toBeDeleted.push_back(asCastOp); + toBeDeleted.push_back(allocaOp); + } else { + allocaOp.replaceAllUsesWith(sharedAllocOp.getOperation()); + allocaOp.erase(); + } + + // Create a new omp.free_shared_mem for the allocated buffer prior to + // exiting the region. 
+ insertDeviceSharedMemDeallocation(builder, sharedAllocOp.getResult()); + }); + for (Operation *op : toBeDeleted) + op->erase(); + } +}; +} // namespace From 9bbee8f7c6576fb1b86a777894cd0c27e7d014e4 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 23 Oct 2025 06:20:53 -0500 Subject: [PATCH 16/22] Unify device shared memory logic for fix-up pass and MLIR to LLVMRI translation --- .../include/mlir/Dialect/OpenMP/Utils/Utils.h | 53 +++++++++ mlir/lib/Dialect/OpenMP/CMakeLists.txt | 1 + .../Dialect/OpenMP/Transforms/CMakeLists.txt | 1 + .../OpenMP/Transforms/StackToShared.cpp | 103 +++--------------- mlir/lib/Dialect/OpenMP/Utils/CMakeLists.txt | 13 +++ mlir/lib/Dialect/OpenMP/Utils/Utils.cpp | 101 +++++++++++++++++ .../LLVMIR/Dialect/OpenMP/CMakeLists.txt | 1 + .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 102 +++-------------- 8 files changed, 199 insertions(+), 176 deletions(-) create mode 100644 mlir/include/mlir/Dialect/OpenMP/Utils/Utils.h create mode 100644 mlir/lib/Dialect/OpenMP/Utils/CMakeLists.txt create mode 100644 mlir/lib/Dialect/OpenMP/Utils/Utils.cpp diff --git a/mlir/include/mlir/Dialect/OpenMP/Utils/Utils.h b/mlir/include/mlir/Dialect/OpenMP/Utils/Utils.h new file mode 100644 index 0000000000000..ce625c7170efe --- /dev/null +++ b/mlir/include/mlir/Dialect/OpenMP/Utils/Utils.h @@ -0,0 +1,53 @@ +//===- Utils.h - OpenMP dialect utilities -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for various OpenMP utilities. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_OPENMP_UTILS_UTILS_H_ +#define MLIR_DIALECT_OPENMP_UTILS_UTILS_H_ + +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" + +namespace mlir { +namespace omp { + +/// Check whether the value representing an allocation, assumed to have been +/// defined in a shared device context, is used in a manner that would require +/// device shared memory for correctness. +/// +/// When a use takes place inside an omp.parallel region and it's not as a +/// private clause argument, or when it is a reduction argument passed to +/// omp.parallel or a function call argument, then the defining allocation is +/// eligible for replacement with shared memory. +/// +/// \see mlir::omp::opInSharedDeviceContext(). +bool allocaUsesRequireSharedMem(Value alloc); + +/// Check whether the given operation is located in a context where an +/// allocation to be used by multiple threads in a parallel region would have to +/// be placed in device shared memory to be accessible. +/// +/// That means that it is inside of a target device module, it is a non-SPMD +/// target region, is inside of one or it's located in a device function, and it +/// is not not inside of a parallel region. +/// +/// This represents a necessary but not sufficient set of conditions to use +/// device shared memory in place of regular allocas. For some variables, the +/// associated OpenMP construct or their uses might also need to be taken into +/// account. +/// +/// \see mlir::omp::allocaUsesRequireSharedMem(). 
+bool opInSharedDeviceContext(Operation &op); + +} // namespace omp +} // namespace mlir + +#endif // MLIR_DIALECT_OPENMP_UTILS_UTILS_H_ diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt index 9f57627c321fb..31167e6af908b 100644 --- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(IR) add_subdirectory(Transforms) +add_subdirectory(Utils) diff --git a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt index b00ca178dd9df..fa723239299a2 100644 --- a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt @@ -18,6 +18,7 @@ add_mlir_dialect_library(MLIROpenMPTransforms MLIRLLVMDialect MLIROpenACCMPCommon MLIROpenMPDialect + MLIROpenMPUtils MLIRPass MLIRSupport MLIRTransforms diff --git a/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp b/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp index 721e1f8dcf334..9a2ad304a9a73 100644 --- a/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp +++ b/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp @@ -15,7 +15,9 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Dialect/OpenMP/Utils/Utils.h" #include "mlir/Pass/Pass.h" +#include "llvm/ADT/STLExtras.h" namespace mlir { namespace omp { @@ -26,95 +28,22 @@ namespace omp { using namespace mlir; -/// When a use takes place inside an omp.parallel region and it's not as a -/// private clause argument, or when it is a reduction argument passed to -/// omp.parallel or a function call argument, then the defining allocation is -/// eligible for replacement with shared memory. 
-static bool allocaUseRequiresDeviceSharedMem(const OpOperand &use) { - Operation *owner = use.getOwner(); - if (auto parallelOp = dyn_cast(owner)) { - if (llvm::is_contained(parallelOp.getReductionVars(), use.get())) - return true; - } else if (auto callOp = dyn_cast(owner)) { - if (llvm::is_contained(callOp.getArgOperands(), use.get())) - return true; - } - - // If it is used directly inside of a parallel region, it has to be replaced - // unless the use is a private clause. - if (owner->getParentOfType()) { - if (auto argIface = dyn_cast(owner)) { - if (auto privateSyms = - cast_or_null(owner->getAttr("private_syms"))) { - for (auto [var, sym] : - llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) { - if (var != use.get()) - continue; - - auto moduleOp = owner->getParentOfType(); - auto privateOp = cast( - moduleOp.lookupSymbol(cast(sym))); - return privateOp.getDataSharingType() != - omp::DataSharingClauseType::Private; - } - } - } - return true; - } - return false; -} - -static bool shouldReplaceAllocaWithUses(const Operation::use_range &uses) { - // Check direct uses and also follow hlfir.declare/fir.convert uses. - for (const OpOperand &use : uses) { - Operation *owner = use.getOwner(); - if (llvm::isa(owner)) { - if (shouldReplaceAllocaWithUses(owner->getUses())) - return true; - } else if (allocaUseRequiresDeviceSharedMem(use)) { - return true; - } - } - - return false; -} - -// TODO: Refactor the logic in `shouldReplaceAllocaWithDeviceSharedMem`, -// `shouldReplaceAllocaWithUses` and `allocaUseRequiresDeviceSharedMem` to -// be reusable by the MLIR to LLVM IR translation stage, as something very -// similar is also implemented there to choose between allocas and device -// shared memory allocations when processing OpenMP reductions, mapping and -// privatization. 
-bool shouldReplaceAllocaWithDeviceSharedMem(Operation &op) { - auto offloadIface = op.getParentOfType(); - if (!offloadIface || !offloadIface.getIsTargetDevice()) - return false; - - auto targetOp = op.getParentOfType(); - - // It must be inside of a generic omp.target or in a target device function, - // and not inside of omp.parallel. - if (auto parallelOp = op.getParentOfType()) { - if (!targetOp || !targetOp->isProperAncestor(parallelOp)) - return false; - } - - if (targetOp) { - if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) != - omp::TargetExecMode::generic) - return false; - } else { - auto declTargetIface = op.getParentOfType(); - if (!declTargetIface || !declTargetIface.isDeclareTarget() || - declTargetIface.getDeclareTargetDeviceType() == - omp::DeclareTargetDeviceType::host) - return false; - } - - return shouldReplaceAllocaWithUses(op.getUses()); +/// Tell whether to replace an operation representing a stack allocation with a +/// device shared memory allocation/deallocation pair based on the location of +/// the allocation and its uses. +static bool shouldReplaceAllocaWithDeviceSharedMem(Operation &op) { + return omp::opInSharedDeviceContext(op) && + llvm::any_of(op.getResults(), [&](Value result) { + return omp::allocaUsesRequireSharedMem(result); + }); } -void insertDeviceSharedMemDeallocation(OpBuilder &builder, Value allocVal) { +/// Based on the location of the definition of the given value representing the +/// result of a device shared memory allocation, find the corresponding points +/// where its deallocation should be placed and introduce `omp.free_shared_mem` +/// ops at those points. 
+static void insertDeviceSharedMemDeallocation(OpBuilder &builder, + Value allocVal) { Block *allocaBlock = allocVal.getParentBlock(); DominanceInfo domInfo; for (Block &block : allocVal.getParentRegion()->getBlocks()) { diff --git a/mlir/lib/Dialect/OpenMP/Utils/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/Utils/CMakeLists.txt new file mode 100644 index 0000000000000..8fd8ba2622c68 --- /dev/null +++ b/mlir/lib/Dialect/OpenMP/Utils/CMakeLists.txt @@ -0,0 +1,13 @@ +add_mlir_dialect_library(MLIROpenMPUtils + Utils.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP + + LINK_LIBS PUBLIC + MLIRIR + MLIRLLVMDialect + MLIROpenACCMPCommon + MLIROpenMPDialect + MLIRSupport + ) diff --git a/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp b/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp new file mode 100644 index 0000000000000..26e89c78f8ec2 --- /dev/null +++ b/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp @@ -0,0 +1,101 @@ +//===- StackToShared.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements various OpenMP dialect utilities. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/OpenMP/Utils/Utils.h" + +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" + +using namespace mlir; + +static bool allocaUseRequiresSharedMem(const OpOperand &use) { + Operation *owner = use.getOwner(); + if (auto parallelOp = dyn_cast(owner)) { + if (llvm::is_contained(parallelOp.getReductionVars(), use.get())) + return true; + } else if (auto callOp = dyn_cast(owner)) { + if (llvm::is_contained(callOp.getArgOperands(), use.get())) + return true; + } + + // If it is used directly inside of a parallel region, it has to be replaced + // unless the use is a private clause. + if (owner->getParentOfType()) { + if (auto argIface = dyn_cast(owner)) { + if (auto privateSyms = + cast_or_null(owner->getAttr("private_syms"))) { + for (auto [var, sym] : + llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) { + if (var != use.get()) + continue; + + auto moduleOp = owner->getParentOfType(); + auto privateOp = cast( + moduleOp.lookupSymbol(cast(sym))); + return privateOp.getDataSharingType() != + omp::DataSharingClauseType::Private; + } + } + } + return true; + } + return false; +} + +bool mlir::omp::allocaUsesRequireSharedMem(Value alloc) { + for (const OpOperand &use : alloc.getUses()) { + Operation *owner = use.getOwner(); + if (isa(owner)) { + if (llvm::any_of(owner->getResults(), [&](Value result) { + return allocaUsesRequireSharedMem(result); + })) + return true; + } else if (allocaUseRequiresSharedMem(use)) { + return true; + } + } + return false; +} + +bool mlir::omp::opInSharedDeviceContext(Operation &op) { + auto offloadIface = op.getParentOfType(); + if (!offloadIface || !offloadIface.getIsTargetDevice()) + return false; + + auto targetOp = op.getParentOfType(); + + // It must be inside of a generic omp.target or in a target device function, + // and not inside of omp.parallel. 
+ if (auto parallelOp = op.getParentOfType()) { + if (!targetOp || !targetOp->isProperAncestor(parallelOp)) + return false; + } + + // The omp.target operation itself is considered in a shared device context in + // order to properly process its own allocation-defining entry block + // arguments. + if (!targetOp) + targetOp = dyn_cast(op); + + if (targetOp) { + if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) != + omp::TargetExecMode::generic) + return false; + } else { + auto declTargetIface = op.getParentOfType(); + if (!declTargetIface || !declTargetIface.isDeclareTarget() || + declTargetIface.getDeclareTargetDeviceType() == + omp::DeclareTargetDeviceType::host) + return false; + } + return true; +} diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt index 0a5d7c6e22058..eb748d8b43630 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt @@ -8,6 +8,7 @@ add_mlir_translation_library(MLIROpenMPToLLVMIRTranslation MLIRIR MLIRLLVMDialect MLIROpenMPDialect + MLIROpenMPUtils MLIRSupport MLIRTargetLLVMIRExport MLIRTransformUtils diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 4c8af9d90f6f8..d60aae2fc9162 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Dialect/OpenMP/OpenMPInterfaces.h" +#include "mlir/Dialect/OpenMP/Utils/Utils.h" #include "mlir/IR/Operation.h" #include "mlir/Support/LLVM.h" #include "mlir/Target/LLVMIR/Dialect/OpenMPCommon.h" @@ -1129,81 +1130,6 @@ struct DeferredStore { }; } // namespace -/// Check whether allocations for the given operation might potentially 
have to -/// be done in device shared memory. That means we're compiling for an -/// offloading target, the operation is neither an `omp::TargetOp` nor nested -/// inside of one, or it is and that target region represents a Generic -/// (non-SPMD) kernel. -/// -/// This represents a necessary but not sufficient set of conditions to use -/// device shared memory in place of regular allocas. For some variables, the -/// associated OpenMP construct or their uses might also need to be taken into -/// account. -static bool -mightAllocInDeviceSharedMemory(Operation &op, - const llvm::OpenMPIRBuilder &ompBuilder) { - if (!ompBuilder.Config.isTargetDevice()) - return false; - - auto targetOp = dyn_cast(op); - if (!targetOp) - targetOp = op.getParentOfType(); - - return !targetOp || - targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) == - omp::TargetExecMode::generic; -} - -/// Check whether the entry block argument representing the private copy of a -/// variable in an OpenMP construct must be allocated in device shared memory, -/// based on what the uses of that copy are. -/// -/// This must only be called if a previous call to -/// \c mightAllocInDeviceSharedMemory has already returned \c true for the -/// operation that owns the specified block argument. -static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) { - Operation *parentOp = value.getOwner()->getParentOp(); - auto moduleOp = parentOp->getParentOfType(); - for (auto *user : value.getUsers()) { - if (auto parallelOp = dyn_cast(user)) { - if (llvm::is_contained(parallelOp.getReductionVars(), value)) - return true; - } else if (auto callOp = dyn_cast(user)) { - if (llvm::is_contained(callOp.getArgOperands(), value)) - return true; - } - - if (auto parallelOp = user->getParentOfType()) { - if (parentOp->isProperAncestor(parallelOp)) { - // If it is used directly inside of a parallel region, skip private - // clause uses. 
- bool isPrivateClauseUse = false; - if (auto argIface = dyn_cast(user)) { - if (auto privateSyms = llvm::cast_or_null( - user->getAttr("private_syms"))) { - for (auto [var, sym] : - llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) { - if (var != value) - continue; - - auto privateOp = cast( - moduleOp.lookupSymbol(cast(sym))); - if (privateOp.getCopyRegion().empty()) { - isPrivateClauseUse = true; - break; - } - } - } - } - if (!isPrivateClauseUse) - return true; - } - } - } - - return false; -} - /// Allocate space for privatized reduction variables. /// `deferredStores` contains information to create store operations which needs /// to be inserted after all allocas @@ -1222,8 +1148,8 @@ allocReductionVars(T op, ArrayRef reductionArgs, builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - bool useDeviceSharedMem = isa(*op) && - mightAllocInDeviceSharedMemory(*op, *ompBuilder); + bool useDeviceSharedMem = + isa(*op) && omp::opInSharedDeviceContext(*op); // delay creating stores until after all allocas deferredStores.reserve(op.getNumReductionVars()); @@ -1344,8 +1270,8 @@ initReductionVars(OP op, ArrayRef reductionArgs, return success(); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - bool useDeviceSharedMem = isa(*op) && - mightAllocInDeviceSharedMemory(*op, *ompBuilder); + bool useDeviceSharedMem = + isa(*op) && omp::opInSharedDeviceContext(*op); llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init"); auto allocaIP = llvm::IRBuilderBase::InsertPoint( @@ -1561,8 +1487,8 @@ static LogicalResult createReductionsAndCleanup( reductionRegions, privateReductionVariables, moduleTranslation, builder, "omp.reduction.cleanup"); - bool useDeviceSharedMem = isa(*op) && - mightAllocInDeviceSharedMemory(*op, *ompBuilder); + bool useDeviceSharedMem = + isa(*op) && omp::opInSharedDeviceContext(*op); if (useDeviceSharedMem) { for (auto [var, 
reductionDecl] : llvm::zip_equal(privateReductionVariables, reductionDecls)) @@ -1743,7 +1669,7 @@ allocatePrivateVars(T op, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); bool mightUseDeviceSharedMem = isa(*op) && - mightAllocInDeviceSharedMemory(*op, *ompBuilder); + omp::opInSharedDeviceContext(*op); unsigned int allocaAS = moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace(); unsigned int defaultAS = moduleTranslation.getLLVMModule() @@ -1757,8 +1683,7 @@ allocatePrivateVars(T op, llvm::IRBuilderBase &builder, moduleTranslation.convertType(privDecl.getType()); builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); llvm::Value *llvmPrivateVar = nullptr; - if (mightUseDeviceSharedMem && - mustAllocPrivateVarInDeviceSharedMemory(blockArg)) { + if (mightUseDeviceSharedMem && omp::allocaUsesRequireSharedMem(blockArg)) { llvmPrivateVar = ompBuilder->createOMPAllocShared(builder, llvmAllocType); } else { llvmPrivateVar = builder.CreateAlloca( @@ -1861,12 +1786,11 @@ cleanupPrivateVars(T op, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); bool mightUseDeviceSharedMem = isa(*op) && - mightAllocInDeviceSharedMemory(*op, *ompBuilder); + omp::opInSharedDeviceContext(*op); for (auto [privDecl, llvmPrivVar, blockArg] : llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars, privateVarsInfo.blockArgs)) { - if (mightUseDeviceSharedMem && - mustAllocPrivateVarInDeviceSharedMemory(blockArg)) { + if (mightUseDeviceSharedMem && omp::allocaUsesRequireSharedMem(blockArg)) { ompBuilder->createOMPFreeShared( builder, llvmPrivVar, moduleTranslation.convertType(privDecl.getType())); @@ -5690,8 +5614,8 @@ static llvm::IRBuilderBase::InsertPoint createDeviceArgumentAccessor( // Create the allocation for the argument. 
llvm::Value *v = nullptr; - if (mightAllocInDeviceSharedMemory(*targetOp, ompBuilder) && - mustAllocPrivateVarInDeviceSharedMemory(mlirArg)) { + if (omp::opInSharedDeviceContext(*targetOp) && + omp::allocaUsesRequireSharedMem(mlirArg)) { // Use the beginning of the codeGenIP rather than the usual allocation point // for shared memory allocations because otherwise these would be done prior // to the target initialization call. Also, the exit block (where the From 4c1c46ae9cfc56b75762de28a418f37050424cff Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 24 Oct 2025 06:15:13 -0500 Subject: [PATCH 17/22] Simplify omp.alloc_shared_mem --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 38 +++++++++++---- .../mlir/Dialect/OpenMP/Transforms/Passes.td | 8 ++-- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 27 +++++++---- .../OpenMP/Transforms/StackToShared.cpp | 6 +-- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 46 ++++++++++++------- mlir/test/Dialect/OpenMP/invalid.mlir | 19 +++++--- mlir/test/Dialect/OpenMP/ops.mlir | 35 +++++++------- .../LLVMIR/omptarget-device-shared-mem.mlir | 42 +++++++++++++++++ 8 files changed, 153 insertions(+), 68 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/omptarget-device-shared-mem.mlir diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index a9ae260cb917d..9e9c7ed94b6a7 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2196,13 +2196,16 @@ def TargetFreeMemOp : OpenMP_Op<"target_freemem", // AllocSharedMemOp //===----------------------------------------------------------------------===// -// TODO: Update design to be used in place of llvm.alloca. 
def AllocSharedMemOp : OpenMP_Op<"alloc_shared_mem", traits = [ - AttrSizedOperandSegments - ], clauses = [ - OpenMP_HeapAllocClause + MemoryEffects<[MemAlloc]> ]> { - let summary = "allocate storage on shared memory for an object of a given type"; + let summary = "allocate storage on shared memory for objects of a given type"; + + let arguments = (ins + TypeAttr:$elem_type, + AnySignlessInteger:$array_size, + ConfinedAttr, [IntPositive]>:$alignment + ); let description = [{ Allocates memory shared across threads of a team for an object of the given @@ -2211,15 +2214,30 @@ def AllocSharedMemOp : OpenMP_Op<"alloc_shared_mem", traits = [ `omp.free_shared` to avoid memory leaks. ```mlir - // Allocate a static 3x3 integer vector. - %ptr_shared = omp.alloc_shared_mem vector<3x3xi32> : !llvm.ptr + // Allocate an i32 vector with %size elements and aligned to 8 bytes. + %ptr_shared = omp.alloc_shared_mem %size x i32 {alignment = 8} : (i64) -> (!llvm.ptr) // ... omp.free_shared_mem %ptr_shared : !llvm.ptr ``` - }] # clausesDescription; + + The `elem_type` is the type of the object for which memory is being + allocated. + + The `array_size` is the number of objects to allocate memory for. + + The optional `alignment` is used to specify the alignment for each element. + If not set, the `DataLayout` defaults will be used instead. + }]; let results = (outs OpenMP_PointerLikeType); - let assemblyFormat = clausesAssemblyFormat # " attr-dict `:` type(results)"; + let assemblyFormat = [{ + $array_size `x` $elem_type attr-dict `:` `(` type($array_size) `)` `->` type(results) + }]; + + let extraClassDeclaration = [{ + mlir::Type getAllocatedType() { return getElemTypeAttr().getValue(); } + }]; + let hasVerifier = 1; } //===----------------------------------------------------------------------===// @@ -2238,7 +2256,7 @@ def FreeSharedMemOp : OpenMP_Op<"free_shared_mem", [MemoryEffects<[MemFree]>]> { ```mlir // Example of allocating and freeing shared memory. 
- %ptr_shared = omp.alloc_shared_mem vector<3x3xi32> : !llvm.ptr + %ptr_shared = omp.alloc_shared_mem %size x i32 : (i64) -> (!llvm.ptr) // ... omp.free_shared_mem %ptr_shared : !llvm.ptr ``` diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td index 73ec455a3aef5..498b8a4812caa 100644 --- a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td @@ -27,11 +27,11 @@ def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prep def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::LLVM::LLVMFuncOp"> { let summary = "Replaces stack allocations target devices with shared memory."; let description = [{ - `llvm.mlir.alloca` operations defining values in a non-SPMD target region - and then potentially used inside of an `omp.parallel` region are replaced by + `llvm.alloca` operations defining values in a non-SPMD target region and + then potentially used inside of an `omp.parallel` region are replaced by this pass with `omp.alloc_shared_mem` and `omp.free_shared_mem`. This is - also done for top-level function `llvm.mlir.alloca`s used in the same way - when the parent function is a target device function. + also done for top-level function `llvm.alloca`s used in the same way when + the parent function is a target device function. 
This ensures that explicit private allocations, intended to be shared across threads, use the proper memory space on a target device while supporting the diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 8b92f942d05e5..7349db3a6b80a 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -4323,19 +4323,20 @@ LogicalResult ScanOp::verify() { } /// Verifies align clause in allocate directive - -LogicalResult AllocateDirOp::verify() { - std::optional align = this->getAlign(); - - if (align.has_value()) { - if ((align.value() > 0) && !llvm::has_single_bit(align.value())) - return emitError() << "ALIGN value : " << align.value() - << " must be power of 2"; +LogicalResult verifyAlignment(Operation &op, + std::optional alignment) { + if (alignment.has_value()) { + if ((alignment.value() != 0) && !llvm::has_single_bit(alignment.value())) + return op.emitError() + << "ALIGN value : " << alignment.value() << " must be power of 2"; } - return success(); } +LogicalResult AllocateDirOp::verify() { + return verifyAlignment(*getOperation(), getAlign()); +} + //===----------------------------------------------------------------------===// // TargetFreeMemOp //===----------------------------------------------------------------------===// @@ -4347,6 +4348,14 @@ LogicalResult TargetFreeMemOp::verify() { "'omp.target_allocmem' op"; } +//===----------------------------------------------------------------------===// +// AllocSharedMemOp +//===----------------------------------------------------------------------===// + +LogicalResult AllocSharedMemOp::verify() { + return verifyAlignment(*getOperation(), getAlignment()); +} + //===----------------------------------------------------------------------===// // FreeSharedMemOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp 
b/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp index 9a2ad304a9a73..61bf1401c3e67 100644 --- a/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp +++ b/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp @@ -89,10 +89,8 @@ class StackToSharedPass builder.setInsertionPoint(allocaOp); auto sharedAllocOp = omp::AllocSharedMemOp::create( builder, allocaOp->getLoc(), LLVM::LLVMPointerType::get(context), - allocaOp.getElemType(), - /*uniq_name=*/nullptr, - /*bindc_name=*/nullptr, /*typeparams=*/{allocaOp.getArraySize()}, - /*shape=*/{}); + allocaOp.getElemTypeAttr(), allocaOp.getArraySize(), + allocaOp.getAlignmentAttr()); if (nonDefaultAddrSpace) { assert(allocaOp->hasOneUse() && "alloca must have only one use"); auto asCastOp = diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index d60aae2fc9162..b0267c543a0a3 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -6669,14 +6669,14 @@ static llvm::Function *getOmpTargetAlloc(llvm::IRBuilderBase &builder, static llvm::Value * getAllocationSize(llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation, Type allocatedTy, - OperandRange typeparams, OperandRange shape) { + LLVM::ModuleTranslation &moduleTranslation, + omp::TargetAllocMemOp op) { llvm::DataLayout dataLayout = moduleTranslation.getLLVMModule()->getDataLayout(); - llvm::Type *llvmHeapTy = moduleTranslation.convertType(allocatedTy); - llvm::TypeSize typeSize = dataLayout.getTypeStoreSize(llvmHeapTy); + llvm::Type *llvmHeapTy = moduleTranslation.convertType(op.getAllocatedType()); + llvm::TypeSize typeSize = dataLayout.getTypeAllocSize(llvmHeapTy); llvm::Value *allocSize = builder.getInt64(typeSize.getFixedValue()); - for (auto typeParam : typeparams) { + for (auto typeParam : op.getTypeparams()) { allocSize = builder.CreateMul( 
allocSize, builder.CreateIntCast(moduleTranslation.lookupValue(typeParam), @@ -6686,6 +6686,27 @@ getAllocationSize(llvm::IRBuilderBase &builder, return allocSize; } +static llvm::Value * +getAllocationSize(llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, + omp::AllocSharedMemOp op) { + llvm::DataLayout dataLayout = + moduleTranslation.getLLVMModule()->getDataLayout(); + llvm::Type *llvmHeapTy = moduleTranslation.convertType(op.getAllocatedType()); + + auto alignment = op.getAlignment(); + llvm::TypeSize typeSize = llvm::alignTo( + dataLayout.getTypeStoreSize(llvmHeapTy), + alignment ? *alignment : dataLayout.getABITypeAlign(llvmHeapTy).value()); + + llvm::Value *allocSize = builder.getInt64(typeSize.getFixedValue()); + return builder.CreateMul( + allocSize, + builder.CreateIntCast(moduleTranslation.lookupValue(op.getArraySize()), + builder.getInt64Ty(), + /*isSigned=*/false)); +} + static LogicalResult convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { @@ -6700,9 +6721,8 @@ convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, mlir::Value deviceNum = allocMemOp.getDevice(); llvm::Value *llvmDeviceNum = moduleTranslation.lookupValue(deviceNum); // Get the allocation size. - llvm::Value *allocSize = getAllocationSize( - builder, moduleTranslation, allocMemOp.getAllocatedType(), - allocMemOp.getTypeparams(), allocMemOp.getShape()); + llvm::Value *allocSize = + getAllocationSize(builder, moduleTranslation, allocMemOp); // Create call to "omp_target_alloc" with the args as translated llvm values. llvm::CallInst *call = builder.CreateCall(ompTargetAllocFunc, {allocSize, llvmDeviceNum}); @@ -6713,16 +6733,12 @@ convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } -// TODO: Update after changing op. Currently shape will be ignored, which holds -// the original array size. 
static LogicalResult convertAllocSharedMemOp(omp::AllocSharedMemOp allocMemOp, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - llvm::Value *size = getAllocationSize( - builder, moduleTranslation, allocMemOp.getAllocatedType(), - allocMemOp.getTypeparams(), allocMemOp.getShape()); + llvm::Value *size = getAllocationSize(builder, moduleTranslation, allocMemOp); moduleTranslation.mapValue(allocMemOp.getResult(), ompBuilder->createOMPAllocShared(builder, size)); return success(); @@ -6770,9 +6786,7 @@ convertFreeSharedMemOp(omp::FreeSharedMemOp freeMemOp, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); auto allocMemOp = freeMemOp.getHeapref().getDefiningOp(); - llvm::Value *size = getAllocationSize( - builder, moduleTranslation, allocMemOp.getAllocatedType(), - allocMemOp.getTypeparams(), allocMemOp.getShape()); + llvm::Value *size = getAllocationSize(builder, moduleTranslation, allocMemOp); ompBuilder->createOMPFreeShared( builder, moduleTranslation.lookupValue(freeMemOp.getHeapref()), size); return success(); diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index c9d0a413aaa38..38dd8516dc41d 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -3156,16 +3156,23 @@ func.func @target_freemem_invalid_ptr(%device : i32, %ptr : i64) -> () { } // ----- -func.func @alloc_shared_mem_invalid_uniq_name() -> () { - // expected-error @below {{op attribute 'uniq_name' failed to satisfy constraint: string attribute}} - %0 = omp.alloc_shared_mem i64 {uniq_name=2} +func.func @alloc_shared_mem_invalid_alignment1(%n: i32) -> () { + // expected-error @below {{op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive}} + %0 = omp.alloc_shared_mem %n x i64 {alignment=-2} : (i32) -> !llvm.ptr return } // ----- -func.func 
@alloc_shared_mem_invalid_bindc_name() -> () { - // expected-error @below {{op attribute 'bindc_name' failed to satisfy constraint: string attribute}} - %0 = omp.alloc_shared_mem i64 {bindc_name=2} +func.func @alloc_shared_mem_invalid_alignment2(%n: i32) -> () { + // expected-error @below {{ALIGN value : 3 must be power of 2}} + %0 = omp.alloc_shared_mem %n x i64 {alignment=3} : (i32) -> !llvm.ptr + return +} + +// ----- +func.func @alloc_shared_mem_invalid_array_size(%n: f32) -> () { + // expected-error @below {{invalid kind of type specified: expected builtin.integer, but found 'f32'}} + %0 = omp.alloc_shared_mem %n x i64 : (f32) -> !llvm.ptr return } diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index b9bf8c3f39468..87ea1adfa9bdf 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -3395,25 +3395,22 @@ func.func @omp_target_freemem(%device : i32) { } // CHECK-LABEL: func.func @omp_alloc_shared_mem( -// CHECK-SAME: %[[X:.*]]: index, %[[Y:.*]]: index, %[[Z:.*]]: i32) { -func.func @omp_alloc_shared_mem(%x: index, %y: index, %z: i32) { - // CHECK: %{{.*}} = omp.alloc_shared_mem i64 : !llvm.ptr - %0 = omp.alloc_shared_mem i64 : !llvm.ptr - // CHECK: %{{.*}} = omp.alloc_shared_mem vector<16x16xf32> {bindc_name = "bindc", uniq_name = "uniq"} : !llvm.ptr - %1 = omp.alloc_shared_mem vector<16x16xf32> {uniq_name="uniq", bindc_name="bindc"} : !llvm.ptr - // CHECK: %{{.*}} = omp.alloc_shared_mem !llvm.ptr(%[[X]], %[[Y]], %[[Z]] : index, index, i32) : !llvm.ptr - %2 = omp.alloc_shared_mem !llvm.ptr(%x, %y, %z : index, index, i32) : !llvm.ptr - // CHECK: %{{.*}} = omp.alloc_shared_mem !llvm.ptr, %[[X]], %[[Y]] : !llvm.ptr - %3 = omp.alloc_shared_mem !llvm.ptr, %x, %y : !llvm.ptr - // CHECK: %{{.*}} = omp.alloc_shared_mem !llvm.ptr(%[[X]], %[[Y]], %[[Z]] : index, index, i32), %[[X]], %[[Y]] : !llvm.ptr - %4 = omp.alloc_shared_mem !llvm.ptr(%x, %y, %z : index, index, i32), %x, %y : !llvm.ptr - return -} - 
-// CHECK-LABEL: func.func @omp_free_shared_mem() { -func.func @omp_free_shared_mem() { - // CHECK: %[[PTR:.*]] = omp.alloc_shared_mem - %0 = omp.alloc_shared_mem i64 : !llvm.ptr +// CHECK-SAME: %[[N:.*]]: i32) { +func.func @omp_alloc_shared_mem(%n: i32) { + // CHECK: %{{.*}} = omp.alloc_shared_mem %[[N]] x i64 : (i32) -> !llvm.ptr + %0 = omp.alloc_shared_mem %n x i64 : (i32) -> !llvm.ptr + // CHECK: %{{.*}} = omp.alloc_shared_mem %[[N]] x vector<16x16xf32> : (i32) -> !llvm.ptr + %1 = omp.alloc_shared_mem %n x vector<16x16xf32> : (i32) -> !llvm.ptr + // CHECK: %{{.*}} = omp.alloc_shared_mem %[[N]] x !llvm.ptr {alignment = 16 : i64} : (i32) -> !llvm.ptr + %2 = omp.alloc_shared_mem %n x !llvm.ptr {alignment = 16} : (i32) -> !llvm.ptr + return +} + +// CHECK-LABEL: func.func @omp_free_shared_mem( +// CHECK-SAME: %[[N:.*]]: i64) { +func.func @omp_free_shared_mem(%n: i64) { + // CHECK: %[[PTR:.*]] = omp.alloc_shared_mem %[[N]] x f32 : (i64) -> !llvm.ptr + %0 = omp.alloc_shared_mem %n x f32 : (i64) -> !llvm.ptr // CHECK: omp.free_shared_mem %[[PTR]] : !llvm.ptr omp.free_shared_mem %0 : !llvm.ptr return diff --git a/mlir/test/Target/LLVMIR/omptarget-device-shared-mem.mlir b/mlir/test/Target/LLVMIR/omptarget-device-shared-mem.mlir new file mode 100644 index 0000000000000..72b0a2daadfc3 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-device-shared-mem.mlir @@ -0,0 +1,42 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + // CHECK-LABEL: define void @device_shared_mem( + // CHECK-SAME: i32 %[[N0:.*]], i64 %[[N1:.*]]) + llvm.func 
@device_shared_mem(%n0: i32, %n1: i64) attributes {omp.declare_target = #omp.declaretarget} { + // CHECK: %[[CAST_N0:.*]] = zext i32 %[[N0]] to i64 + // CHECK-NEXT: %[[ALLOC0_SZ:.*]] = mul i64 8, %[[CAST_N0]] + // CHECK-NEXT: %[[ALLOC0:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 %[[ALLOC0_SZ]]) + %0 = omp.alloc_shared_mem %n0 x i64 : (i32) -> !llvm.ptr + + // CHECK: %[[ALLOC1_SZ:.*]] = mul i64 8, %[[N1]] + // CHECK-NEXT: %[[ALLOC1:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 %[[ALLOC1_SZ]]) + %1 = omp.alloc_shared_mem %n1 x i64 : (i64) -> !llvm.ptr + + // CHECK: %[[ALLOC2_SZ:.*]] = mul i64 64, %[[N1]] + // CHECK-NEXT: %[[ALLOC2:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 %[[ALLOC2_SZ]]) + %2 = omp.alloc_shared_mem %n1 x vector<16xf32> : (i64) -> !llvm.ptr + + // CHECK: %[[ALLOC3_SZ:.*]] = mul i64 128, %[[N1]] + // CHECK-NEXT: %[[ALLOC3:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 %[[ALLOC3_SZ]]) + %3 = omp.alloc_shared_mem %n1 x vector<16xf32> {alignment = 128} : (i64) -> !llvm.ptr + + // CHECK: %[[CAST_N0_1:.*]] = zext i32 %[[N0]] to i64 + // CHECK-NEXT: %[[FREE0_SZ:.*]] = mul i64 8, %[[CAST_N0_1]] + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[ALLOC0]], i64 %[[FREE0_SZ]]) + omp.free_shared_mem %0 : !llvm.ptr + + // CHECK: %[[FREE1_SZ:.*]] = mul i64 8, %[[N1]] + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[ALLOC1]], i64 %[[FREE1_SZ]]) + omp.free_shared_mem %1 : !llvm.ptr + + // CHECK: %[[FREE2_SZ:.*]] = mul i64 64, %[[N1]] + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[ALLOC2]], i64 %[[FREE2_SZ]]) + omp.free_shared_mem %2 : !llvm.ptr + + // CHECK: %[[FREE3_SZ:.*]] = mul i64 128, %[[N1]] + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[ALLOC3]], i64 %[[FREE3_SZ]]) + omp.free_shared_mem %3 : !llvm.ptr + llvm.return + } +} From afd271b84e4ab4cc792c54209d31bd4e188a9576 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 24 Oct 2025 09:34:59 -0500 Subject: [PATCH 18/22] Make omp-stack-to-shared pass available to the 
compiler and move tests from Flang to MLIR --- .../include/flang/Optimizer/Support/InitFIR.h | 2 + flang/test/Fir/OpenMP/embox-to-shared-mem.fir | 29 --- .../Transforms/OpenMP/stack-to-shared.mlir | 222 ------------------ mlir/docs/Passes.md | 4 + mlir/lib/Dialect/OpenMP/Utils/Utils.cpp | 2 +- mlir/lib/RegisterAllPasses.cpp | 1 + mlir/test/Dialect/OpenMP/stack-to-shared.mlir | 149 ++++++++++++ 7 files changed, 157 insertions(+), 252 deletions(-) delete mode 100644 flang/test/Fir/OpenMP/embox-to-shared-mem.fir delete mode 100644 flang/test/Transforms/OpenMP/stack-to-shared.mlir create mode 100644 mlir/test/Dialect/OpenMP/stack-to-shared.mlir diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h index 67e9287ddad4f..a035437afcc07 100644 --- a/flang/include/flang/Optimizer/Support/InitFIR.h +++ b/flang/include/flang/Optimizer/Support/InitFIR.h @@ -34,6 +34,7 @@ #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Dialect/OpenACC/Transforms/Passes.h" +#include "mlir/Dialect/OpenMP/Transforms/Passes.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Transforms/Passes.h" #include "mlir/InitAllDialects.h" @@ -106,6 +107,7 @@ inline void loadDialects(mlir::MLIRContext &context) { /// but is a smaller set since we aren't using many of the passes found there. 
inline void registerMLIRPassesForFortranTools() { mlir::acc::registerOpenACCPasses(); + mlir::omp::registerOpenMPPasses(); mlir::registerCanonicalizerPass(); mlir::registerCSEPass(); mlir::affine::registerAffineLoopFusionPass(); diff --git a/flang/test/Fir/OpenMP/embox-to-shared-mem.fir b/flang/test/Fir/OpenMP/embox-to-shared-mem.fir deleted file mode 100644 index eaa5eb6bbb905..0000000000000 --- a/flang/test/Fir/OpenMP/embox-to-shared-mem.fir +++ /dev/null @@ -1,29 +0,0 @@ -// RUN: tco -o - %s | FileCheck %s - -module attributes {omp.is_target_device = true} { - // CHECK-LABEL: declare void @scalar(ptr) - func.func private @scalar(%x : !fir.box) - // CHECK-LABEL: declare void @array(ptr) - func.func private @array(%x : !fir.box>) - - // CHECK-LABEL: define void @embox - func.func @embox(%arg0 : !fir.ref, %arg1 : !fir.ref>) attributes {omp.declare_target = #omp.declaretarget} { - // CHECK: %[[DESC2:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 48) - // CHECK: %[[DESC1:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 24) - %a = fir.embox %arg0 : (!fir.ref) -> !fir.box - %c0 = arith.constant 0 : i64 - %b = fircg.ext_embox %arg1(%c0) origin %c0[%c0, %c0, %c0] : (!fir.ref>, i64, i64, i64, i64, i64) -> !fir.box> - - // CHECK: call void @scalar(ptr %[[DESC1]]) - fir.call @scalar(%a) : (!fir.box) -> () - - // CHECK: call void @array(ptr %[[DESC2]]) - fir.call @array(%b) : (!fir.box>) -> () - - // CHECK: call void @__kmpc_free_shared(ptr %[[DESC1]], i64 24) - // CHECK: call void @__kmpc_free_shared(ptr %[[DESC2]], i64 48) - // CHECK: ret void - return - } - -} diff --git a/flang/test/Transforms/OpenMP/stack-to-shared.mlir b/flang/test/Transforms/OpenMP/stack-to-shared.mlir deleted file mode 100644 index 269b3b28afe2c..0000000000000 --- a/flang/test/Transforms/OpenMP/stack-to-shared.mlir +++ /dev/null @@ -1,222 +0,0 @@ -// RUN: fir-opt --split-input-file --omp-stack-to-shared %s | FileCheck %s - -module attributes {omp.is_target_device = true} { - omp.declare_reduction 
@add_reduction_i32 : i32 init { - ^bb0(%arg0: i32): - %c0_i32 = arith.constant 0 : i32 - omp.yield(%c0_i32 : i32) - } combiner { - ^bb0(%arg0: i32, %arg1: i32): - %0 = arith.addi %arg0, %arg1 : i32 - omp.yield(%0 : i32) - } - - omp.private {type = private} @privatizer_i32 : i32 - omp.private {type = firstprivate} @firstprivatizer_i32 : i32 copy { - ^bb0(%arg0: i32, %arg1: i32): - omp.yield(%arg0 : i32) - } - - func.func private @foo(%b : !fir.ref) - - // Verify that target device functions are searched for allocas shared across - // threads of a parallel region. - // - // Also ensure that all fir.alloca information is adequately forwarded to the - // new allocation, that uses of the allocation through hlfir.declare are - // detected and that only the expected types of uses (parallel reduction, - // non-private uses inside of a parallel region and function calls) are - // replaced. - // CHECK-LABEL: func.func @standalone_func - func.func @standalone_func(%lb: i32, %ub: i32, %step: i32) attributes {omp.declare_target = #omp.declaretarget} { - // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref - %0 = fir.alloca i32 {uniq_name = "x"} - %c = arith.constant 1 : index - // CHECK: %[[ALLOC_1:.*]] = omp.alloc_shared_mem !fir.char<1,?>(%[[C:.*]] : index), %[[C]] {bindc_name = "y", uniq_name = "y"} : !fir.ref> - %1 = fir.alloca !fir.char<1,?>(%c : index), %c {bindc_name = "y", uniq_name = "y"} - // CHECK: %{{.*}}:2 = hlfir.declare %[[ALLOC_1]] typeparams %[[C]] {uniq_name = "y"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) - %decl:2 = hlfir.declare %1 typeparams %c {uniq_name = "y"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) - // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "z"} - %2 = fir.alloca i32 {uniq_name = "z"} - // CHECK: %[[ALLOC_2:.*]] = omp.alloc_shared_mem i32 {uniq_name = "a"} : !fir.ref - %3 = fir.alloca i32 {uniq_name = "a"} - // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "b"} - %4 = fir.alloca i32 {uniq_name = 
"b"} - // CHECK: %[[ALLOC_3:.*]] = omp.alloc_shared_mem i32 {uniq_name = "c"} : !fir.ref - %5 = fir.alloca i32 {uniq_name = "c"} - fir.call @foo(%5) : (!fir.ref) -> () - omp.parallel reduction(@add_reduction_i32 %0 -> %arg0 : !fir.ref) { - // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "d"} - %6 = fir.alloca i32 {uniq_name = "d"} - %7:2 = fir.unboxchar %decl#0 : (!fir.boxchar<1>) -> (!fir.ref>, index) - omp.wsloop private(@privatizer_i32 %2 -> %arg1, @firstprivatizer_i32 %3 -> %arg2 : !fir.ref, !fir.ref) { - omp.loop_nest (%arg3) : i32 = (%lb) to (%ub) inclusive step (%step) { - %8 = fir.load %6 : !fir.ref - omp.yield - } - } - omp.terminator - } - %9 = fir.load %4 : !fir.ref - // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref - // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref> - // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref - // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_3]] : !fir.ref - // CHECK-NEXT: return - return - } - - // Verify that generic target regions are searched for allocas shared across - // threads of a parallel region. 
- // CHECK-LABEL: func.func @target_generic - func.func @target_generic() { - // CHECK: omp.target - omp.target { - %c = arith.constant 0 : i32 - // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref - %0 = fir.alloca i32 {uniq_name = "x"} - // CHECK: omp.teams - omp.teams { - // CHECK: %[[ALLOC_1:.*]] = omp.alloc_shared_mem i32 {uniq_name = "y"} : !fir.ref - %1 = fir.alloca i32 {uniq_name = "y"} - // CHECK: omp.distribute - omp.distribute { - omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) { - // CHECK: %[[ALLOC_2:.*]] = omp.alloc_shared_mem i32 {uniq_name = "z"} : !fir.ref - %2 = fir.alloca i32 {uniq_name = "z"} - // CHECK: omp.parallel - omp.parallel { - %3 = fir.load %0 : !fir.ref - %4 = fir.load %1 : !fir.ref - %5 = fir.load %2 : !fir.ref - // CHECK: omp.terminator - omp.terminator - } - // CHECK: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref - // CHECK: omp.yield - omp.yield - } - } - // CHECK: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref - // CHECK: omp.terminator - omp.terminator - } - // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref - // CHECK: omp.terminator - omp.terminator - } - // CHECK: return - return - } - - // Make sure that uses not shared across threads on a parallel region inside - // of target are not incorrectly detected as such if there's another parallel - // region in the host wrapping the whole target region. 
- // CHECK-LABEL: func.func @target_generic_in_parallel - func.func @target_generic_in_parallel() { - // CHECK-NOT: omp.alloc_shared_mem - // CHECK-NOT: omp.free_shared_mem - omp.parallel { - omp.target { - %c = arith.constant 0 : i32 - %0 = fir.alloca i32 {uniq_name = "x"} - omp.teams { - %1 = fir.alloca i32 {uniq_name = "y"} - omp.distribute { - omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) { - %3 = fir.load %0 : !fir.ref - %4 = fir.load %1 : !fir.ref - omp.parallel { - omp.terminator - } - omp.yield - } - } - omp.terminator - } - omp.terminator - } - omp.terminator - } - // CHECK: return - return - } - - // Ensure that allocations within SPMD target regions are not replaced with - // device shared memory regardless of use. - // CHECK-LABEL: func.func @target_spmd - func.func @target_spmd() { - // CHECK-NOT: omp.alloc_shared_mem - // CHECK-NOT: omp.free_shared_mem - omp.target { - %c = arith.constant 0 : i32 - %0 = fir.alloca i32 {uniq_name = "x"} - omp.teams { - %1 = fir.alloca i32 {uniq_name = "y"} - omp.parallel { - %2 = fir.alloca i32 {uniq_name = "z"} - %3 = fir.load %0 : !fir.ref - %4 = fir.load %1 : !fir.ref - omp.distribute { - omp.wsloop { - omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) { - %5 = fir.load %2 : !fir.ref - omp.yield - } - } {omp.composite} - } {omp.composite} - omp.terminator - } {omp.composite} - omp.terminator - } - omp.terminator - } - // CHECK: return - return - } -} - -// ----- - -// No transformations must be done when targeting the host device. 
-// CHECK-LABEL: func.func @host_standalone -func.func @host_standalone() { - // CHECK-NOT: omp.alloc_shared_mem - // CHECK-NOT: omp.free_shared_mem - %0 = fir.alloca i32 {uniq_name = "x"} - omp.parallel { - %1 = fir.load %0 : !fir.ref - omp.terminator - } - // CHECK: return - return -} - -// CHECK-LABEL: func.func @host_target -func.func @host_target() { - // CHECK-NOT: omp.alloc_shared_mem - // CHECK-NOT: omp.free_shared_mem - omp.target { - %c = arith.constant 0 : i32 - %0 = fir.alloca i32 {uniq_name = "x"} - omp.teams { - %1 = fir.alloca i32 {uniq_name = "y"} - omp.distribute { - omp.loop_nest (%arg0) : i32 = (%c) to (%c) inclusive step (%c) { - %2 = fir.alloca i32 {uniq_name = "z"} - omp.parallel { - %3 = fir.load %0 : !fir.ref - %4 = fir.load %1 : !fir.ref - %5 = fir.load %2 : !fir.ref - omp.terminator - } - omp.yield - } - } - omp.terminator - } - omp.terminator - } - // CHECK: return - return -} diff --git a/mlir/docs/Passes.md b/mlir/docs/Passes.md index 9df32666415bb..f3d8a75c65840 100644 --- a/mlir/docs/Passes.md +++ b/mlir/docs/Passes.md @@ -72,6 +72,10 @@ This document describes the available MLIR passes and their contracts. [include "MemRefPasses.md"] +## 'omp' Dialect Passes + +[include "OpenMPPasses.md"] + ## 'shard' Dialect Passes [include "ShardPasses.md"] diff --git a/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp b/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp index 26e89c78f8ec2..6b2f4805c2c18 100644 --- a/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp +++ b/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp @@ -76,7 +76,7 @@ bool mlir::omp::opInSharedDeviceContext(Operation &op) { // It must be inside of a generic omp.target or in a target device function, // and not inside of omp.parallel. 
if (auto parallelOp = op.getParentOfType()) { - if (!targetOp || !targetOp->isProperAncestor(parallelOp)) + if (!targetOp || targetOp->isProperAncestor(parallelOp)) return false; } diff --git a/mlir/lib/RegisterAllPasses.cpp b/mlir/lib/RegisterAllPasses.cpp index d7e321a61d4ac..cdda72688bc58 100644 --- a/mlir/lib/RegisterAllPasses.cpp +++ b/mlir/lib/RegisterAllPasses.cpp @@ -79,6 +79,7 @@ void mlir::registerAllPasses() { LLVM::registerTargetLLVMIRTransformsPasses(); math::registerMathPasses(); memref::registerMemRefPasses(); + omp::registerOpenMPPasses(); shard::registerShardPasses(); ml_program::registerMLProgramPasses(); omp::registerOpenMPPasses(); diff --git a/mlir/test/Dialect/OpenMP/stack-to-shared.mlir b/mlir/test/Dialect/OpenMP/stack-to-shared.mlir new file mode 100644 index 0000000000000..81b03acd4d368 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/stack-to-shared.mlir @@ -0,0 +1,149 @@ +// RUN: mlir-opt --omp-stack-to-shared %s | FileCheck %s + +module attributes {omp.is_target_device = true} { + +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr -> f32 + llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 + omp.yield +} +omp.private {type = private} @privatizer_i32 : i32 +omp.private {type = firstprivate} @firstprivatizer_f32 : f32 copy { +^bb0(%arg0: f32, %arg1: f32): + omp.yield(%arg0 : f32) +} + +llvm.func @foo(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget} + +// CHECK-LABEL: llvm.func @device_func( +// CHECK-SAME: %[[N:.*]]: i64, %[[COND:.*]]: i1) +llvm.func @device_func(%arg0: i64, %cond: i1) attributes {omp.declare_target = #omp.declaretarget} { + // CHECK: %[[ALLOC0:.*]] = omp.alloc_shared_mem %[[N]] x i64 : (i64) -> !llvm.ptr + %0 = llvm.alloca %arg0 
x i64 : (i64) -> !llvm.ptr + // CHECK: %[[ALLOC1:.*]] = omp.alloc_shared_mem %[[N]] x f32 {alignment = 128 : i64} : (i64) -> !llvm.ptr + %1 = llvm.alloca %arg0 x f32 {alignment = 128} : (i64) -> !llvm.ptr + // CHECK: %[[ALLOC2:.*]] = omp.alloc_shared_mem %[[N]] x vector<16xf32> : (i64) -> !llvm.ptr + %2 = llvm.alloca %arg0 x vector<16xf32> : (i64) -> !llvm.ptr + // CHECK: %[[ALLOC3:.*]] = omp.alloc_shared_mem %[[N]] x i32 : (i64) -> !llvm.ptr + %3 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr<5> + %4 = llvm.addrspacecast %3 : !llvm.ptr<5> to !llvm.ptr + + // CHECK: %[[ALLOC4:.*]] = llvm.alloca %[[N]] x i32 : (i64) -> !llvm.ptr + %5 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr + // CHECK: %[[ALLOC5:.*]] = llvm.alloca %[[N]] x i32 : (i64) -> !llvm.ptr + %6 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr + // CHECK: llvm.cond_br %[[COND]], ^[[IF:.*]], ^[[ELSE:.*]] + llvm.cond_br %cond, ^if, ^else + +// CHECK: ^[[IF]]: +^if: + // CHECK: omp.parallel reduction(@add_f32 %[[ALLOC0]] -> %{{.*}} : !llvm.ptr) + omp.parallel reduction(@add_f32 %0 -> %arg1 : !llvm.ptr) { + // CHECK: %{{.*}} = llvm.load %[[ALLOC2]] + %7 = llvm.load %2 : !llvm.ptr -> vector<16xf32> + // CHECK: %{{.*}} = llvm.alloca + %8 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr + // CHECK: omp.wsloop private(@privatizer_i32 %[[ALLOC4]] -> %{{.*}}, @firstprivatizer_f32 %[[ALLOC1]] -> %{{.*}} : !llvm.ptr, !llvm.ptr) + omp.wsloop private(@privatizer_i32 %5 -> %arg2, @firstprivatizer_f32 %1 -> %arg3 : !llvm.ptr, !llvm.ptr) { + omp.loop_nest (%arg4) : i64 = (%arg0) to (%arg0) inclusive step (%arg0) { + llvm.call @foo(%arg1) : (!llvm.ptr) -> () + llvm.call @foo(%8) : (!llvm.ptr) -> () + llvm.call @foo(%arg2) : (!llvm.ptr) -> () + llvm.call @foo(%arg3) : (!llvm.ptr) -> () + omp.yield + } + } + omp.terminator + } + // CHECK: llvm.br ^[[EXIT:.*]] + llvm.br ^exit + +// CHECK: ^[[ELSE]]: +^else: + // CHECK: llvm.call @foo(%[[ALLOC3]]) : (!llvm.ptr) -> () + llvm.call @foo(%4) : (!llvm.ptr) -> () + // CHECK: %{{.*}} = 
llvm.load %[[ALLOC5]] + %8 = llvm.load %6 : !llvm.ptr -> i32 + // CHECK: llvm.br ^[[EXIT]] + llvm.br ^exit + +// CHECK: ^[[EXIT]]: +^exit: + // CHECK: omp.free_shared_mem %[[ALLOC0]] : !llvm.ptr + // CHECK: omp.free_shared_mem %[[ALLOC1]] : !llvm.ptr + // CHECK: omp.free_shared_mem %[[ALLOC2]] : !llvm.ptr + // CHECK: omp.free_shared_mem %[[ALLOC3]] : !llvm.ptr + // CHECK-NOT: omp.free_shared_mem + // CHECK: llvm.return + llvm.return +} + +// CHECK-LABEL: llvm.func @host_func( +// CHECK-SAME: %[[N:.*]]: i64) +llvm.func @host_func(%arg0: i64) { + // CHECK: %[[ALLOC0:.*]] = llvm.alloca %[[N]] x i32 : (i64) -> !llvm.ptr + %0 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr + // CHECK: omp.parallel + omp.parallel { + // CHECK: llvm.call @foo(%[[ALLOC0]]) : (!llvm.ptr) -> () + llvm.call @foo(%0) : (!llvm.ptr) -> () + // CHECK: omp.target + omp.target { + %c0 = llvm.mlir.constant(1 : i64) : i64 + // CHECK: %[[ALLOC1:.*]] = omp.alloc_shared_mem %{{.*}} + %1 = llvm.alloca %c0 x i32 : (i64) -> !llvm.ptr + // CHECK-NEXT: llvm.call @foo(%[[ALLOC1]]) : (!llvm.ptr) -> () + llvm.call @foo(%1) : (!llvm.ptr) -> () + // CHECK-NEXT: omp.free_shared_mem %[[ALLOC1]] : !llvm.ptr + // CHECK-NEXT: omp.terminator + omp.terminator + } + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: llvm.func @target_spmd( +llvm.func @target_spmd() { + // CHECK-NOT: omp.alloc_shared_mem + // CHECK-NOT: omp.free_shared_mem + omp.target { + %c = llvm.mlir.constant(1 : i64) : i64 + %0 = llvm.alloca %c x i32 : (i64) -> !llvm.ptr + omp.teams { + %1 = llvm.alloca %c x i32 : (i64) -> !llvm.ptr + omp.parallel { + %2 = llvm.alloca %c x i32 : (i64) -> !llvm.ptr + %3 = llvm.load %0 : !llvm.ptr -> i32 + %4 = llvm.load %1 : !llvm.ptr -> i32 + omp.distribute { + omp.wsloop { + omp.loop_nest (%arg0) : i64 = (%c) to (%c) inclusive step (%c) { + %5 = llvm.load %2 : !llvm.ptr -> i32 + omp.yield + } + } {omp.composite} + } {omp.composite} + omp.terminator + } {omp.composite} + omp.terminator + } + omp.terminator + } + 
// CHECK: return + llvm.return +} + +} From 9f113a445b60b73db96d70d2e70c580142614d1c Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Mon, 27 Oct 2025 07:29:09 -0500 Subject: [PATCH 19/22] remove spurious diffs --- flang/include/flang/Utils/OpenMP.h | 1 - flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp | 45 +++++++------------ flang/lib/Utils/OpenMP.cpp | 20 +++++---- flang/test/Fir/basic-program.fir | 2 +- mlir/lib/Dialect/OpenMP/Utils/Utils.cpp | 3 ++ .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 17 +++---- 6 files changed, 35 insertions(+), 53 deletions(-) diff --git a/flang/include/flang/Utils/OpenMP.h b/flang/include/flang/Utils/OpenMP.h index 334f8866fa560..bad0abb6f5788 100644 --- a/flang/include/flang/Utils/OpenMP.h +++ b/flang/include/flang/Utils/OpenMP.h @@ -59,7 +59,6 @@ mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder, /// maps. void cloneOrMapRegionOutsiders( fir::FirOpBuilder &firOpBuilder, mlir::omp::TargetOp targetOp); - } // namespace Fortran::utils::openmp #endif // FORTRAN_UTILS_OPENMP_H_ diff --git a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp index c4e9505e74094..3e9c6bc211650 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp @@ -222,47 +222,35 @@ static mlir::Type convertObjectType(const fir::LLVMTypeConverter &converter, return converter.convertType(firType); } -// FIR Op specific conversion for allocation operations -template -struct AllocMemOpConversion : public OpenMPFIROpConversion { - using OpenMPFIROpConversion::OpenMPFIROpConversion; +// FIR Op specific conversion for TargetAllocMemOp +struct TargetAllocMemOpConversion + : public OpenMPFIROpConversion { + using OpenMPFIROpConversion::OpenMPFIROpConversion; llvm::LogicalResult - matchAndRewrite(T allocmemOp, - typename OpenMPFIROpConversion::OpAdaptor adaptor, + matchAndRewrite(mlir::omp::TargetAllocMemOp allocmemOp, OpAdaptor adaptor, mlir::ConversionPatternRewriter 
&rewriter) const override { mlir::Type heapTy = allocmemOp.getAllocatedType(); mlir::Location loc = allocmemOp.getLoc(); - auto ity = OpenMPFIROpConversion::lowerTy().indexType(); + auto ity = lowerTy().indexType(); mlir::Type dataTy = fir::unwrapRefType(heapTy); - mlir::Type llvmObjectTy = - convertObjectType(OpenMPFIROpConversion::lowerTy(), dataTy); + mlir::Type llvmObjectTy = convertObjectType(lowerTy(), dataTy); if (fir::isRecordWithTypeParameters(fir::unwrapSequenceType(dataTy))) - TODO(loc, allocmemOp->getName().getStringRef() + - " codegen of derived type with length parameters"); + TODO(loc, "omp.target_allocmem codegen of derived type with length " + "parameters"); mlir::Value size = fir::computeElementDistance( - loc, llvmObjectTy, ity, rewriter, - OpenMPFIROpConversion::lowerTy().getDataLayout()); + loc, llvmObjectTy, ity, rewriter, lowerTy().getDataLayout()); if (auto scaleSize = fir::genAllocationScaleSize( loc, allocmemOp.getInType(), ity, rewriter)) size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size, scaleSize); - for (mlir::Value opnd : adaptor.getTypeparams()) - size = mlir::LLVM::MulOp::create( - rewriter, loc, ity, size, - integerCast(OpenMPFIROpConversion::lowerTy(), loc, rewriter, ity, - opnd)); - for (mlir::Value opnd : adaptor.getShape()) + for (mlir::Value opnd : adaptor.getOperands().drop_front()) size = mlir::LLVM::MulOp::create( - rewriter, loc, ity, size, - integerCast(OpenMPFIROpConversion::lowerTy(), loc, rewriter, ity, - opnd)); - auto mallocTyWidth = - OpenMPFIROpConversion::lowerTy().getIndexTypeBitwidth(); + rewriter, loc, ity, size, integerCast(lowerTy(), loc, rewriter, ity, opnd)); + auto mallocTyWidth = lowerTy().getIndexTypeBitwidth(); auto mallocTy = mlir::IntegerType::get(rewriter.getContext(), mallocTyWidth); if (mallocTyWidth != ity.getIntOrFloatBitWidth()) - size = integerCast(OpenMPFIROpConversion::lowerTy(), loc, rewriter, - mallocTy, size); + size = integerCast(lowerTy(), loc, rewriter, mallocTy, size); 
rewriter.modifyOpInPlace(allocmemOp, [&]() { allocmemOp.setInType(rewriter.getI8Type()); allocmemOp.getTypeparamsMutable().clear(); @@ -277,8 +265,5 @@ void fir::populateOpenMPFIRToLLVMConversionPatterns( const LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns) { patterns.add(converter); patterns.add(converter); - // TODO: Undo refactoring in previous commit here. - patterns.add/*, - AllocMemOpConversion*/>( - converter); + patterns.add(converter); } diff --git a/flang/lib/Utils/OpenMP.cpp b/flang/lib/Utils/OpenMP.cpp index c2801d97004c0..b07caf853191a 100644 --- a/flang/lib/Utils/OpenMP.cpp +++ b/flang/lib/Utils/OpenMP.cpp @@ -17,11 +17,12 @@ #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Transforms/RegionUtils.h" -mlir::omp::MapInfoOp Fortran::utils::openmp::createMapInfoOp( - mlir::OpBuilder &builder, mlir::Location loc, mlir::Value baseAddr, - mlir::Value varPtrPtr, llvm::StringRef name, - llvm::ArrayRef bounds, llvm::ArrayRef members, - mlir::ArrayAttr membersIndex, mlir::omp::ClauseMapFlags mapType, +namespace Fortran::utils::openmp { +mlir::omp::MapInfoOp createMapInfoOp(mlir::OpBuilder &builder, + mlir::Location loc, mlir::Value baseAddr, mlir::Value varPtrPtr, + llvm::StringRef name, llvm::ArrayRef bounds, + llvm::ArrayRef members, mlir::ArrayAttr membersIndex, + mlir::omp::ClauseMapFlags mapType, mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, bool partialMap, mlir::FlatSymbolRefAttr mapperId) { @@ -49,9 +50,8 @@ mlir::omp::MapInfoOp Fortran::utils::openmp::createMapInfoOp( return op; } -mlir::Value Fortran::utils::openmp::mapTemporaryValue( - fir::FirOpBuilder &firOpBuilder, mlir::omp::TargetOp targetOp, - mlir::Value val, llvm::StringRef name) { +mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder, + mlir::omp::TargetOp targetOp, mlir::Value val, llvm::StringRef name) { mlir::OpBuilder::InsertionGuard guard(firOpBuilder); mlir::Operation *valOp = val.getDefiningOp(); @@ -116,7 +116,7 @@ mlir::Value 
Fortran::utils::openmp::mapTemporaryValue( return loadOp.getResult(); } -void Fortran::utils::openmp::cloneOrMapRegionOutsiders( +void cloneOrMapRegionOutsiders( fir::FirOpBuilder &firOpBuilder, mlir::omp::TargetOp targetOp) { mlir::Region ®ion = targetOp.getRegion(); mlir::Block *entryBlock = ®ion.getBlocks().front(); @@ -155,3 +155,5 @@ void Fortran::utils::openmp::cloneOrMapRegionOutsiders( mlir::getUsedValuesDefinedAbove(region, valuesDefinedAbove); } } + +} // namespace Fortran::utils::openmp diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 8fbec2a9ac63d..8056fcf5a733c 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -161,7 +161,7 @@ func.func @_QQmain() { // PASSES-NEXT: LowerNontemporalPass // PASSES-NEXT: FIRToLLVMLowering // PASSES-NEXT: ReconcileUnrealizedCasts -// PASSES-NEXT: PrepareForOMPOffloadPrivatizationPass // PASSES-NEXT: 'llvm.func' Pipeline // PASSES-NEXT: StackToSharedPass +// PASSES-NEXT: PrepareForOMPOffloadPrivatizationPass // PASSES-NEXT: LLVMIRLoweringPass diff --git a/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp b/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp index 6b2f4805c2c18..f5b7aa7ca2e2c 100644 --- a/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp +++ b/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp @@ -67,6 +67,9 @@ bool mlir::omp::allocaUsesRequireSharedMem(Value alloc) { } bool mlir::omp::opInSharedDeviceContext(Operation &op) { + if (isa(op)) + return false; + auto offloadIface = op.getParentOfType(); if (!offloadIface || !offloadIface.getIsTargetDevice()) return false; diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index b0267c543a0a3..53bb20a0973b0 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1148,8 +1148,7 @@ allocReductionVars(T op, ArrayRef reductionArgs, 
builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - bool useDeviceSharedMem = - isa(*op) && omp::opInSharedDeviceContext(*op); + bool useDeviceSharedMem = omp::opInSharedDeviceContext(*op); // delay creating stores until after all allocas deferredStores.reserve(op.getNumReductionVars()); @@ -1270,8 +1269,7 @@ initReductionVars(OP op, ArrayRef reductionArgs, return success(); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - bool useDeviceSharedMem = - isa(*op) && omp::opInSharedDeviceContext(*op); + bool useDeviceSharedMem = omp::opInSharedDeviceContext(*op); llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init"); auto allocaIP = llvm::IRBuilderBase::InsertPoint( @@ -1487,8 +1485,7 @@ static LogicalResult createReductionsAndCleanup( reductionRegions, privateReductionVariables, moduleTranslation, builder, "omp.reduction.cleanup"); - bool useDeviceSharedMem = - isa(*op) && omp::opInSharedDeviceContext(*op); + bool useDeviceSharedMem = omp::opInSharedDeviceContext(*op); if (useDeviceSharedMem) { for (auto [var, reductionDecl] : llvm::zip_equal(privateReductionVariables, reductionDecls)) @@ -1667,9 +1664,7 @@ allocatePrivateVars(T op, llvm::IRBuilderBase &builder, llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - bool mightUseDeviceSharedMem = - isa(*op) && - omp::opInSharedDeviceContext(*op); + bool mightUseDeviceSharedMem = omp::opInSharedDeviceContext(*op); unsigned int allocaAS = moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace(); unsigned int defaultAS = moduleTranslation.getLLVMModule() @@ -1784,9 +1779,7 @@ cleanupPrivateVars(T op, llvm::IRBuilderBase &builder, "`omp.private` op in"); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - bool mightUseDeviceSharedMem = - isa(*op) && - 
omp::opInSharedDeviceContext(*op); + bool mightUseDeviceSharedMem = omp::opInSharedDeviceContext(*op); for (auto [privDecl, llvmPrivVar, blockArg] : llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars, privateVarsInfo.blockArgs)) { From 270a8891169a7da5ec2f24f6d5cb42e086d25874 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 28 Oct 2025 09:07:11 -0500 Subject: [PATCH 20/22] Remove overly restrictive verifier check for omp.target_freemem --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 1 - mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 11 ----------- mlir/test/Dialect/OpenMP/invalid.mlir | 7 ------- 3 files changed, 19 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 9e9c7ed94b6a7..c59bee75fa6c7 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2189,7 +2189,6 @@ def TargetFreeMemOp : OpenMP_Op<"target_freemem", Arg:$heapref ); let assemblyFormat = "$device `,` $heapref attr-dict `:` type($device) `,` qualified(type($heapref))"; - let hasVerifier = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 7349db3a6b80a..0d1af4198f1a9 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -4337,17 +4337,6 @@ LogicalResult AllocateDirOp::verify() { return verifyAlignment(*getOperation(), getAlign()); } -//===----------------------------------------------------------------------===// -// TargetFreeMemOp -//===----------------------------------------------------------------------===// - -LogicalResult TargetFreeMemOp::verify() { - return getHeapref().getDefiningOp() - ? 
success() - : emitOpError() << "'heapref' operand must be defined by an " - "'omp.target_allocmem' op"; -} - //===----------------------------------------------------------------------===// // AllocSharedMemOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 38dd8516dc41d..f6a2bbeadf0cf 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -3148,13 +3148,6 @@ func.func @target_allocmem_invalid_bindc_name(%device : i32) -> () { return } -// ----- -func.func @target_freemem_invalid_ptr(%device : i32, %ptr : i64) -> () { - // expected-error @below {{op 'heapref' operand must be defined by an 'omp.target_allocmem' op}} - omp.target_freemem %device, %ptr : i32, i64 - return -} - // ----- func.func @alloc_shared_mem_invalid_alignment1(%n: i32) -> () { // expected-error @below {{op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive}} From 609eda1575e4d3c35add88d4876a43c0dffbe49a Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 31 Oct 2025 10:07:08 -0500 Subject: [PATCH 21/22] Fix issue with non-pointer parallel region inputs --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 35 ++++++++++++++--------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 4f7ada8aa290d..742268b6dbd49 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1847,16 +1847,14 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); - auto OI = [&]() -> std::unique_ptr { - if (Config.isTargetDevice()) { - // If OuterFn is a Generic kernel, we need to use device shared memory to - // allocate argument structures. 
Otherwise, we use stack allocations as - // usual. - if (isGenericKernel(*OuterFn)) - return std::make_unique(*this); - } - return std::make_unique(); - }(); + // If OuterFn is a Generic kernel, we need to use device shared memory to + // allocate argument structures. Otherwise, we use stack allocations as usual. + bool UsesDeviceSharedMemory = + Config.isTargetDevice() && isGenericKernel(*OuterFn); + std::unique_ptr OI = + UsesDeviceSharedMemory + ? std::make_unique(*this) + : std::make_unique(); if (Config.isTargetDevice()) { // Generate OpenMP target specific runtime call @@ -1942,8 +1940,17 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n"); Builder.restoreIP(OuterAllocIP); - Value *Ptr = - Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded"); + Value *Ptr; + if (UsesDeviceSharedMemory) { + // Use device shared memory instead, if needed. + Ptr = createOMPAllocShared(OuterAllocIP, V.getType(), + V.getName() + ".reloaded"); + for (InsertPointTy DeallocIP : OuterDeallocIPs) + createOMPFreeShared(DeallocIP, Ptr, V.getType()); + } else { + Ptr = Builder.CreateAlloca(V.getType(), nullptr, + V.getName() + ".reloaded"); + } // Store to stack at end of the block that currently branches to the entry // block of the to-be-outlined region. 
@@ -10365,7 +10372,7 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, addOutlineInfo(std::move(OI)); - Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + Builder.SetInsertPoint(ExitBB); return Builder.saveIP(); } @@ -10410,7 +10417,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createDistribute( addOutlineInfo(std::move(OI)); } - Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + Builder.SetInsertPoint(ExitBB); return Builder.saveIP(); } From f264642effcb365a054428bbaa94a739e63bdf48 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Wed, 5 Nov 2025 04:49:08 -0600 Subject: [PATCH 22/22] Support other map-like clauses --- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 53bb20a0973b0..fb0d263dfd354 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -5586,6 +5586,9 @@ static llvm::IRBuilderBase::InsertPoint createDeviceArgumentAccessor( ompBuilder.M.getContext()); unsigned alignmentValue = 0; BlockArgument mlirArg; + SmallVector> blockArgsPairs; + cast(*targetOp).getBlockArgsPairs( + blockArgsPairs); // Find the associated MapInfoData entry for the current input for (size_t i = 0; i < mapData.MapClause.size(); ++i) { if (mapData.OriginalValue[i] == input) { @@ -5594,13 +5597,19 @@ static llvm::IRBuilderBase::InsertPoint createDeviceArgumentAccessor( // Get information of alignment of mapped object alignmentValue = typeToLLVMIRTranslator.getPreferredAlignment( mapOp.getVarType(), ompBuilder.M.getDataLayout()); - // Get the corresponding target entry block argument - mlirArg = - cast(*targetOp).getMapBlockArgs()[i]; + + // Find the corresponding entry block argument, which can be associated to + // a map, use_device* or 
has_device* clause. + for (auto &[val, arg] : blockArgsPairs) { + if (mapOp.getResult() == val) { + mlirArg = arg; + break; + } + } + assert(mlirArg && "expected to find entry block argument for map clause"); break; } } - unsigned int allocaAS = ompBuilder.M.getDataLayout().getAllocaAddrSpace(); unsigned int defaultAS = ompBuilder.M.getDataLayout().getProgramAddressSpace();