diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index a044168205d67..7cbe23e21f6c1 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11354,8 +11354,8 @@ void CGOpenMPRuntime::emitTargetDataCalls( llvm::OpenMPIRBuilder::LocationDescription OmpLoc(CodeGenIP); llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPBuilder.createTargetData( - OmpLoc, AllocaIP, CodeGenIP, DeviceID, IfCondVal, Info, GenMapInfoCB, - CustomMapperCB, + OmpLoc, AllocaIP, CodeGenIP, /*DeallocIPs=*/{}, DeviceID, IfCondVal, + Info, GenMapInfoCB, CustomMapperCB, /*MapperFunc=*/nullptr, BodyCB, DeviceAddrCB, RTLoc)); CGF.Builder.restoreIP(AfterIP); } diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index dd9a4b3fa076a..9a5104dc70fb1 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -2238,10 +2238,10 @@ void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { const CapturedStmt *CS = S.getCapturedStmt(OMPD_parallel); const Stmt *ParallelRegionBodyStmt = CS->getCapturedStmt(); - auto BodyGenCB = [&, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [&, this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { OMPBuilderCBHelpers::EmitOMPOutlinedRegionBody( - *this, ParallelRegionBodyStmt, AllocaIP, CodeGenIP, "parallel"); + *this, ParallelRegionBodyStmt, AllocIP, CodeGenIP, "parallel"); return llvm::Error::success(); }; @@ -2249,9 +2249,10 @@ void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI); llvm::OpenMPIRBuilder::InsertPointTy AllocaIP( AllocaInsertPt->getParent(), AllocaInsertPt->getIterator()); - llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail( - OMPBuilder.createParallel(Builder, AllocaIP, BodyGenCB, PrivCB, FiniCB, - IfCond, NumThreads, ProcBind, 
S.hasCancel())); + llvm::OpenMPIRBuilder::InsertPointTy AfterIP = + cantFail(OMPBuilder.createParallel( + Builder, AllocaIP, /*DeallocIPs=*/{}, BodyGenCB, PrivCB, FiniCB, + IfCond, NumThreads, ProcBind, S.hasCancel())); Builder.restoreIP(AfterIP); return; } @@ -4936,21 +4937,23 @@ void CodeGenFunction::EmitOMPSectionsDirective(const OMPSectionsDirective &S) { llvm::SmallVector SectionCBVector; if (CS) { for (const Stmt *SubStmt : CS->children()) { - auto SectionCB = [this, SubStmt](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { - OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, SubStmt, AllocaIP, CodeGenIP, "section"); + auto SectionCB = [this, SubStmt](InsertPointTy AllocIP, + InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + OMPBuilderCBHelpers::EmitOMPInlinedRegionBody(*this, SubStmt, AllocIP, + CodeGenIP, "section"); return llvm::Error::success(); }; SectionCBVector.push_back(SectionCB); } } else { - auto SectionCB = [this, CapturedStmt](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { - OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, CapturedStmt, AllocaIP, CodeGenIP, "section"); - return llvm::Error::success(); - }; + auto SectionCB = + [this, CapturedStmt](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( + *this, CapturedStmt, AllocIP, CodeGenIP, "section"); + return llvm::Error::success(); + }; SectionCBVector.push_back(SectionCB); } @@ -5004,10 +5007,11 @@ void CodeGenFunction::EmitOMPSectionDirective(const OMPSectionDirective &S) { return llvm::Error::success(); }; - auto BodyGenCB = [SectionRegionBodyStmt, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [SectionRegionBodyStmt, + this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, SectionRegionBodyStmt, AllocaIP, CodeGenIP, "section"); + *this, SectionRegionBodyStmt, AllocIP, CodeGenIP, 
"section"); return llvm::Error::success(); }; @@ -5089,10 +5093,11 @@ void CodeGenFunction::EmitOMPMasterDirective(const OMPMasterDirective &S) { return llvm::Error::success(); }; - auto BodyGenCB = [MasterRegionBodyStmt, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [MasterRegionBodyStmt, + this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, MasterRegionBodyStmt, AllocaIP, CodeGenIP, "master"); + *this, MasterRegionBodyStmt, AllocIP, CodeGenIP, "master"); return llvm::Error::success(); }; @@ -5139,10 +5144,11 @@ void CodeGenFunction::EmitOMPMaskedDirective(const OMPMaskedDirective &S) { return llvm::Error::success(); }; - auto BodyGenCB = [MaskedRegionBodyStmt, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [MaskedRegionBodyStmt, + this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, MaskedRegionBodyStmt, AllocaIP, CodeGenIP, "masked"); + *this, MaskedRegionBodyStmt, AllocIP, CodeGenIP, "masked"); return llvm::Error::success(); }; @@ -5182,10 +5188,11 @@ void CodeGenFunction::EmitOMPCriticalDirective(const OMPCriticalDirective &S) { return llvm::Error::success(); }; - auto BodyGenCB = [CriticalRegionBodyStmt, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [CriticalRegionBodyStmt, + this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, CriticalRegionBodyStmt, AllocaIP, CodeGenIP, "critical"); + *this, CriticalRegionBodyStmt, AllocIP, CodeGenIP, "critical"); return llvm::Error::success(); }; @@ -6152,8 +6159,8 @@ void CodeGenFunction::EmitOMPTaskgroupDirective( InsertPointTy AllocaIP(AllocaInsertPt->getParent(), AllocaInsertPt->getIterator()); - auto BodyGenCB = [&, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto 
BodyGenCB = [&, this](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); EmitStmt(S.getInnermostCapturedStmt()->getCapturedStmt()); return llvm::Error::success(); @@ -6162,7 +6169,8 @@ void CodeGenFunction::EmitOMPTaskgroupDirective( if (!CapturedStmtInfo) CapturedStmtInfo = &CapStmtInfo; llvm::OpenMPIRBuilder::InsertPointTy AfterIP = - cantFail(OMPBuilder.createTaskgroup(Builder, AllocaIP, BodyGenCB)); + cantFail(OMPBuilder.createTaskgroup(Builder, AllocaIP, + /*DeallocIPs=*/{}, BodyGenCB)); Builder.restoreIP(AfterIP); return; } @@ -6879,8 +6887,9 @@ void CodeGenFunction::EmitOMPOrderedDirective(const OMPOrderedDirective &S) { return llvm::Error::success(); }; - auto BodyGenCB = [&S, C, this](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) { + auto BodyGenCB = [&S, C, this](InsertPointTy AllocIP, + InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); const CapturedStmt *CS = S.getInnermostCapturedStmt(); @@ -6898,7 +6907,7 @@ void CodeGenFunction::EmitOMPOrderedDirective(const OMPOrderedDirective &S) { OutlinedFn, CapturedVars); } else { OMPBuilderCBHelpers::EmitOMPInlinedRegionBody( - *this, CS->getCapturedStmt(), AllocaIP, CodeGenIP, "ordered"); + *this, CS->getCapturedStmt(), AllocIP, CodeGenIP, "ordered"); } return llvm::Error::success(); }; diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h index 67e9287ddad4f..a035437afcc07 100644 --- a/flang/include/flang/Optimizer/Support/InitFIR.h +++ b/flang/include/flang/Optimizer/Support/InitFIR.h @@ -34,6 +34,7 @@ #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Dialect/OpenACC/Transforms/Passes.h" +#include "mlir/Dialect/OpenMP/Transforms/Passes.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Transforms/Passes.h" #include "mlir/InitAllDialects.h" @@ -106,6 +107,7 @@ inline void loadDialects(mlir::MLIRContext 
&context) { /// but is a smaller set since we aren't using many of the passes found there. inline void registerMLIRPassesForFortranTools() { mlir::acc::registerOpenACCPasses(); + mlir::omp::registerOpenMPPasses(); mlir::registerCanonicalizerPass(); mlir::registerCSEPass(); mlir::affine::registerAffineLoopFusionPass(); diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index d9b1287829cac..65f61439a1219 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -10,6 +10,7 @@ /// common to flang and the test tools. #include "flang/Optimizer/Passes/Pipelines.h" +#include "mlir/Dialect/OpenMP/Transforms/Passes.h" #include "llvm/Support/CommandLine.h" /// Force setting the no-alias attribute on fuction arguments when possible. 
@@ -408,6 +409,9 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, } fir::addFIRToLLVMPass(pm, config); + + if (config.EnableOpenMP && !config.EnableOpenMPSimd) + pm.addPass(mlir::omp::createStackToSharedPass()); } /// Create a pass pipeline for lowering from MLIR to LLVM IR diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 6d2beae4da1c8..8056fcf5a733c 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -161,5 +161,7 @@ func.func @_QQmain() { // PASSES-NEXT: LowerNontemporalPass // PASSES-NEXT: FIRToLLVMLowering // PASSES-NEXT: ReconcileUnrealizedCasts +// PASSES-NEXT: 'llvm.func' Pipeline +// PASSES-NEXT: StackToSharedPass // PASSES-NEXT: PrepareForOMPOffloadPrivatizationPass // PASSES-NEXT: LLVMIRLoweringPass diff --git a/flang/test/Integration/OpenMP/threadprivate-target-device.f90 b/flang/test/Integration/OpenMP/threadprivate-target-device.f90 index 662d6c6357af0..2d5d073520abe 100644 --- a/flang/test/Integration/OpenMP/threadprivate-target-device.f90 +++ b/flang/test/Integration/OpenMP/threadprivate-target-device.f90 @@ -14,16 +14,14 @@ ! target code in the same function. ! CHECK: define weak_odr protected amdgpu_kernel void @{{.*}}(ptr %{{.*}}, ptr %[[ARG1:.*]], ptr %[[ARG2:.*]]) #{{[0-9]+}} { -! CHECK: %[[ALLOCA_X:.*]] = alloca ptr, align 8, addrspace(5) -! CHECK: %[[ASCAST_X:.*]] = addrspacecast ptr addrspace(5) %[[ALLOCA_X]] to ptr -! CHECK: store ptr %[[ARG1]], ptr %[[ASCAST_X]], align 8 +! CHECK: %[[ALLOC_N:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) +! CHECK: store ptr %[[ARG2]], ptr %[[ALLOC_N]], align 8 -! CHECK: %[[ALLOCA_N:.*]] = alloca ptr, align 8, addrspace(5) -! CHECK: %[[ASCAST_N:.*]] = addrspacecast ptr addrspace(5) %[[ALLOCA_N]] to ptr -! CHECK: store ptr %[[ARG2]], ptr %[[ASCAST_N]], align 8 +! CHECK: %[[ALLOC_X:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) +! CHECK: store ptr %[[ARG1]], ptr %[[ALLOC_X]], align 8 -! 
CHECK: %[[LOAD_X:.*]] = load ptr, ptr %[[ASCAST_X]], align 8 -! CHECK: call void @bar_(ptr %[[LOAD_X]], ptr %[[ASCAST_N]]) +! CHECK: %[[LOAD_X:.*]] = load ptr, ptr %[[ALLOC_X]], align 8 +! CHECK: call void @bar_(ptr %[[LOAD_X]], ptr %[[ALLOC_N]]) module test implicit none diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index fb8563402528c..9769dff6c26f4 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -31,6 +31,7 @@ namespace llvm { class CanonicalLoopInfo; +class CodeExtractor; class ScanInfo; struct TargetRegionEntryInfo; class OffloadEntriesInfoManager; @@ -612,17 +613,19 @@ class OpenMPIRBuilder { /// such InsertPoints need to be preserved, it can split the block itself /// before calling the callback. /// - /// AllocaIP and CodeGenIP must not point to the same position. - /// - /// \param AllocaIP is the insertion point at which new alloca instructions - /// should be placed. The BasicBlock it is pointing to must - /// not be split. - /// \param CodeGenIP is the insertion point at which the body code should be - /// placed. + /// AllocIP and CodeGenIP must not point to the same position. /// + /// \param AllocIP is the insertion point at which new allocations should + /// be placed. The BasicBlock it is pointing to must not be + /// split. + /// \param CodeGenIP is the insertion point at which the body code should be + /// placed. + /// \param DeallocIPs is the list of insertion points where explicit + /// deallocations, if needed, should be placed. /// \return an error, if any were triggered during execution. 
using BodyGenCallbackTy = - function_ref; + function_ref DeallocIPs)>; // This is created primarily for sections construct as llvm::function_ref // (BodyGenCallbackTy) is not storable (as described in the comments of @@ -631,7 +634,8 @@ class OpenMPIRBuilder { /// /// \return an error, if any were triggered during execution. using StorableBodyGenCallbackTy = - std::function; + std::function DeallocIPs)>; /// Callback type for loop body code generation. /// @@ -725,7 +729,9 @@ class OpenMPIRBuilder { /// Generator for '#omp parallel' /// /// \param Loc The insert and source location description. - /// \param AllocaIP The insertion points to be used for alloca instructions. + /// \param AllocIP The insertion point to be used for allocations. + /// \param DeallocIPs The insertion points to be used for explicit + /// deallocations, if needed. /// \param BodyGenCB Callback that will generate the region code. /// \param PrivCB Callback to copy a given variable (think copy constructor). /// \param FiniCB Callback to finalize variable copies. @@ -736,10 +742,10 @@ class OpenMPIRBuilder { /// /// \returns The insertion position *after* the parallel. LLVM_ABI InsertPointOrErrorTy createParallel( - const LocationDescription &Loc, InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, - FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, - omp::ProcBindKind ProcBind, bool IsCancellable); + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB, + PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, + Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable); /// Generator for the control flow structure of an OpenMP canonical loop. /// @@ -1363,7 +1369,9 @@ class OpenMPIRBuilder { /// Generator for `#omp task` /// /// \param Loc The location where the task construct was encountered. 
- /// \param AllocaIP The insertion point to be used for alloca instructions. + /// \param AllocIP The insertion point to be used for allocations. + /// \param DeallocIPs The insertion points to be used for explicit + /// deallocations, if needed. /// \param BodyGenCB Callback that will generate the region code. /// \param Tied True if the task is tied, false if the task is untied. /// \param Final i1 value which is `true` if the task is final, `false` if the @@ -1379,21 +1387,23 @@ class OpenMPIRBuilder { /// \param Mergeable If the given task is `mergeable` /// \param priority `priority-value' specifies the execution order of the /// tasks that is generated by the construct - LLVM_ABI InsertPointOrErrorTy - createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB, bool Tied = true, - Value *Final = nullptr, Value *IfCondition = nullptr, - SmallVector Dependencies = {}, bool Mergeable = false, - Value *EventHandle = nullptr, Value *Priority = nullptr); + LLVM_ABI InsertPointOrErrorTy createTask( + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB, + bool Tied = true, Value *Final = nullptr, Value *IfCondition = nullptr, + SmallVector Dependencies = {}, bool Mergeable = false, + Value *EventHandle = nullptr, Value *Priority = nullptr); /// Generator for the taskgroup construct /// /// \param Loc The location where the taskgroup construct was encountered. - /// \param AllocaIP The insertion point to be used for alloca instructions. + /// \param AllocIP The insertion point to be used for allocations. + /// \param DeallocIPs The insertion points to be used for explicit + /// deallocations, if needed. /// \param BodyGenCB Callback that will generate the region code. 
- LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, - InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB); + LLVM_ABI InsertPointOrErrorTy createTaskgroup( + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB); using FileIdentifierInfoCallbackTy = std::function()>; @@ -2273,20 +2283,31 @@ class OpenMPIRBuilder { struct OutlineInfo { using PostOutlineCBTy = std::function; PostOutlineCBTy PostOutlineCB; - BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB; + BasicBlock *EntryBB, *ExitBB, *OuterAllocBB; + SmallVector OuterDeallocBBs; SmallVector ExcludeArgsFromAggregate; + LLVM_ABI virtual ~OutlineInfo() = default; + /// Collect all blocks in between EntryBB and ExitBB in both the given /// vector and set. LLVM_ABI void collectBlocks(SmallPtrSetImpl &BlockSet, SmallVectorImpl &BlockVector); + /// Create a CodeExtractor instance based on the information stored in this + /// structure, the list of collected blocks from a previous call to + /// \c collectBlocks and a flag stating whether arguments must be passed in + /// address space 0. + LLVM_ABI virtual std::unique_ptr + createCodeExtractor(ArrayRef Blocks, + bool ArgsInZeroAddressSpace, Twine Suffix = Twine("")); + /// Return the function that contains the region to be outlined. Function *getFunction() const { return EntryBB->getParent(); } }; /// Collection of regions that need to be outlined during finalization. - SmallVector OutlineInfos; + SmallVector, 16> OutlineInfos; /// A collection of candidate target functions that's constant allocas will /// attempt to be raised on a call of finalize after all currently enqueued @@ -2301,7 +2322,9 @@ class OpenMPIRBuilder { std::forward_list ScanInfos; /// Add a new region that will be outlined later. 
- void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); } + void addOutlineInfo(std::unique_ptr &&OI) { + OutlineInfos.emplace_back(std::move(OI)); + } /// An ordered map of auto-generated variables to their unique names. /// It stores variables with the following names: 1) ".gomp_critical_user_" + @@ -2334,7 +2357,8 @@ class OpenMPIRBuilder { /// \return an error, if any were triggered during execution. LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, - InsertPointTy AllocaIP = {}); + InsertPointTy AllocIP = {}, + ArrayRef DeallocIPs = {}); /// Create the global variable holding the offload mappings information. LLVM_ABI GlobalVariable * @@ -2889,11 +2913,13 @@ class OpenMPIRBuilder { /// Generator for `#omp distribute` /// /// \param Loc The location where the distribute construct was encountered. - /// \param AllocaIP The insertion points to be used for alloca instructions. + /// \param AllocIP The insertion point to be used for allocations. + /// \param DeallocIPs The insertion points to be used for explicit + /// deallocations, if needed. /// \param BodyGenCB Callback that will generate the region code. - LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, - InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB); + LLVM_ABI InsertPointOrErrorTy createDistribute( + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB); /// Generate conditional branch and relevant BasicBlocks through which private /// threads copy the 'copyin' variables from Master copy to threadprivate @@ -2935,6 +2961,52 @@ class OpenMPIRBuilder { LLVM_ABI CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name = ""); + /// Create a runtime call for kmpc_alloc_shared. + /// + /// \param Loc The insert and source location description. + /// \param Size Size of allocated memory space. 
+ /// \param Name Name of call Instruction. + /// + /// \returns CallInst to the kmpc_alloc_shared call. + LLVM_ABI CallInst *createOMPAllocShared(const LocationDescription &Loc, + Value *Size, + const Twine &Name = Twine("")); + + /// Create a runtime call for kmpc_alloc_shared. + /// + /// \param Loc The insert and source location description. + /// \param VarType Type of variable to be allocated. + /// \param Name Name of call Instruction. + /// + /// \returns CallInst to the kmpc_alloc_shared call. + LLVM_ABI CallInst *createOMPAllocShared(const LocationDescription &Loc, + Type *VarType, + const Twine &Name = Twine("")); + + /// Create a runtime call for kmpc_free_shared. + /// + /// \param Loc The insert and source location description. + /// \param Addr Value obtained from the corresponding kmpc_alloc_shared call. + /// \param Size Size of allocated memory space. + /// \param Name Name of call Instruction. + /// + /// \returns CallInst to the kmpc_free_shared call. + LLVM_ABI CallInst *createOMPFreeShared(const LocationDescription &Loc, + Value *Addr, Value *Size, + const Twine &Name = Twine("")); + + /// Create a runtime call for kmpc_free_shared. + /// + /// \param Loc The insert and source location description. + /// \param Addr Value obtained from the corresponding kmpc_alloc_shared call. + /// \param VarType Type of variable to be freed. + /// \param Name Name of call Instruction. + /// + /// \returns CallInst to the kmpc_free_shared call. + LLVM_ABI CallInst *createOMPFreeShared(const LocationDescription &Loc, + Value *Addr, Type *VarType, + const Twine &Name = Twine("")); + /// Create a runtime call for kmpc_threadprivate_cached /// /// \param Loc The insert and source location description. @@ -3198,9 +3270,11 @@ class OpenMPIRBuilder { /// Generator for '#omp target data' /// /// \param Loc The location where the target data construct was encountered. - /// \param AllocaIP The insertion points to be used for alloca instructions. 
+ /// \param AllocIP The insertion point to be used for allocations. /// \param CodeGenIP The insertion point at which the target directive code /// should be placed. + /// \param DeallocIPs The insertion points at which explicit deallocations + /// should be placed, if needed. /// \param IsBegin If true then emits begin mapper call otherwise emits /// end mapper call. /// \param DeviceID Stores the DeviceID from the device clause. @@ -3213,10 +3287,10 @@ class OpenMPIRBuilder { /// \param DeviceAddrCB Optional callback to generate code related to /// use_device_ptr and use_device_addr. LLVM_ABI InsertPointOrErrorTy createTargetData( - const LocationDescription &Loc, InsertPointTy AllocaIP, - InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, - TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, - CustomMapperCallbackTy CustomMapperCB, + const LocationDescription &Loc, InsertPointTy AllocIP, + InsertPointTy CodeGenIP, ArrayRef DeallocIPs, + Value *DeviceID, Value *IfCond, TargetDataInfo &Info, + GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc = nullptr, function_ref @@ -3225,11 +3299,12 @@ class OpenMPIRBuilder { Value *SrcLocInfo = nullptr); using TargetBodyGenCallbackTy = function_ref; + InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs)>; using TargetGenArgAccessorsCallbackTy = function_ref; + Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocIP, + InsertPointTy CodeGenIP, ArrayRef DeallocIPs)>; /// Generator for '#omp target' /// @@ -3237,6 +3312,8 @@ class OpenMPIRBuilder { /// \param IsOffloadEntry whether it is an offload entry. /// \param CodeGenIP The insertion point where the call to the outlined /// function should be emitted. + /// \param DeallocIPs The insertion points at which explicit deallocations + /// should be placed, if needed. /// \param Info Stores all information realted to the Target directive. 
/// \param EntryInfo The entry information about the function. /// \param DefaultAttrs Structure containing the default attributes, including @@ -3257,8 +3334,9 @@ class OpenMPIRBuilder { /// not. LLVM_ABI InsertPointOrErrorTy createTarget( const LocationDescription &Loc, bool IsOffloadEntry, - OpenMPIRBuilder::InsertPointTy AllocaIP, - OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, + OpenMPIRBuilder::InsertPointTy AllocIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef DeallocIPs, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h index 407eb50d2c7a3..7b1e3a759470f 100644 --- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h +++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h @@ -17,14 +17,15 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/Support/Compiler.h" #include namespace llvm { template class SmallPtrSetImpl; +class AddrSpaceCastInst; class AllocaInst; -class BasicBlock; class BlockFrequency; class BlockFrequencyInfo; class BranchProbabilityInfo; @@ -94,15 +95,23 @@ class CodeExtractorAnalysisCache { BranchProbabilityInfo *BPI; AssumptionCache *AC; - // A block outside of the extraction set where any intermediate - // allocations will be placed inside. If this is null, allocations - // will be placed in the entry block of the function. + /// A block outside of the extraction set where any intermediate + /// allocations will be placed inside. If this is null, allocations + /// will be placed in the entry block of the function. BasicBlock *AllocationBlock; - // If true, varargs functions can be extracted. 
+ /// A set of blocks outside of the extraction set where deallocations for + /// intermediate allocations should be placed. Not used for automatically + /// deallocated memory (e.g. `alloca`), which is the default. + /// + /// If it is empty and needed, the end of the replacement basic block will + /// be used to place deallocations. + SmallVector DeallocationBlocks; + + /// If true, varargs functions can be extracted. bool AllowVarArgs; - // Bits of intermediate state computed at various phases of extraction. + /// Bits of intermediate state computed at various phases of extraction. SetVector Blocks; /// Lists of blocks that are branched from the code region to be extracted, @@ -124,13 +133,13 @@ class CodeExtractorAnalysisCache { /// returns 1, etc. SmallVector ExtractedFuncRetVals; - // Suffix to use when creating extracted function (appended to the original - // function name + "."). If empty, the default is to use the entry block - // label, if non-empty, otherwise "extracted". + /// Suffix to use when creating extracted function (appended to the original + /// function name + "."). If empty, the default is to use the entry block + /// label, if non-empty, otherwise "extracted". std::string Suffix; - // If true, the outlined function has aggregate argument in zero address - // space. + /// If true, the outlined function has aggregate argument in zero address + /// space. bool ArgsInZeroAddressSpace; public: @@ -146,10 +155,12 @@ class CodeExtractorAnalysisCache { /// however code extractor won't validate whether extraction is legal. /// Any new allocations will be placed in the AllocationBlock, unless /// it is null, in which case it will be placed in the entry block of - /// the function from which the code is being extracted. - /// If ArgsInZeroAddressSpace param is set to true, then the aggregate - /// param pointer of the outlined function is declared in zero address - /// space. + /// the function from which the code is being extracted. 
Explicit + /// deallocations for the aforementioned allocations will be placed, if + /// needed, in all blocks in DeallocationBlocks or the end of the + /// replacement block. If ArgsInZeroAddressSpace param is set to true, then + /// the aggregate param pointer of the outlined function is declared in zero + /// address space. LLVM_ABI CodeExtractor(ArrayRef BBs, DominatorTree *DT = nullptr, bool AggregateArgs = false, BlockFrequencyInfo *BFI = nullptr, @@ -157,8 +168,11 @@ class CodeExtractorAnalysisCache { AssumptionCache *AC = nullptr, bool AllowVarArgs = false, bool AllowAlloca = false, BasicBlock *AllocationBlock = nullptr, + ArrayRef DeallocationBlocks = {}, std::string Suffix = "", bool ArgsInZeroAddressSpace = false); + LLVM_ABI virtual ~CodeExtractor() = default; + /// Perform the extraction, returning the new function. /// /// Returns zero when called on a CodeExtractor instance where isEligible @@ -243,6 +257,19 @@ class CodeExtractorAnalysisCache { /// region, passing it instead as a scalar. LLVM_ABI void excludeArgFromAggregate(Value *Arg); + protected: + /// Allocate an intermediate variable at the specified point. + LLVM_ABI virtual Instruction * + allocateVar(BasicBlock *BB, BasicBlock::iterator AllocIP, Type *VarType, + const Twine &Name = Twine(""), + AddrSpaceCastInst **CastedAlloc = nullptr); + + /// Deallocate a previously-allocated intermediate variable at the specified + /// point. 
+ LLVM_ABI virtual Instruction *deallocateVar(BasicBlock *BB, + BasicBlock::iterator DeallocIP, + Value *Var, Type *VarType); + private: struct LifetimeMarkerInfo { bool SinkLifeStart = false; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 10b2608d95a9c..742268b6dbd49 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -280,6 +280,44 @@ computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, return Result; } +/// Given a function, if it represents the entry point of a target kernel, this +/// returns the execution mode flags associated with that kernel. +static std::optional +getTargetKernelExecMode(Function &Kernel) { + CallInst *TargetInitCall = nullptr; + for (Instruction &Inst : Kernel.getEntryBlock()) { + auto *Call = dyn_cast<CallInst>(&Inst); + Function *Callee = Call ? Call->getCalledFunction() : nullptr; + if (Callee && Callee->getName() == "__kmpc_target_init") { + TargetInitCall = Call; + break; + } + } + + if (!TargetInitCall) + return std::nullopt; + + // Get the kernel mode information from the global variable associated to the + // first argument to the call to __kmpc_target_init. Refer to + // createTargetInit() to see how this is initialized. + Value *InitOperand = TargetInitCall->getArgOperand(0); + GlobalVariable *KernelEnv = nullptr; + if (auto *Cast = dyn_cast(InitOperand)) + KernelEnv = cast(Cast->getOperand(0)); + else + KernelEnv = cast(InitOperand); + auto *KernelEnvInit = cast(KernelEnv->getInitializer()); + auto *ConfigEnv = cast(KernelEnvInit->getOperand(0)); + auto *KernelMode = cast(ConfigEnv->getOperand(2)); + return static_cast(KernelMode->getZExtValue()); +} + +static bool isGenericKernel(Function &Fn) { + std::optional ExecMode = + getTargetKernelExecMode(Fn); + return !ExecMode || (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC); +} + /// Make \p Source branch to \p Target. 
/// /// Handles two situations: @@ -455,6 +493,65 @@ enum OpenMPOffloadingRequiresDirFlags { LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS) }; +class OMPCodeExtractor : public CodeExtractor { +public: + OMPCodeExtractor(OpenMPIRBuilder &OMPBuilder, ArrayRef BBs, + DominatorTree *DT = nullptr, bool AggregateArgs = false, + BlockFrequencyInfo *BFI = nullptr, + BranchProbabilityInfo *BPI = nullptr, + AssumptionCache *AC = nullptr, bool AllowVarArgs = false, + bool AllowAlloca = false, + BasicBlock *AllocationBlock = nullptr, + ArrayRef DeallocationBlocks = {}, + std::string Suffix = "", bool ArgsInZeroAddressSpace = false) + : CodeExtractor(BBs, DT, AggregateArgs, BFI, BPI, AC, AllowVarArgs, + AllowAlloca, AllocationBlock, DeallocationBlocks, Suffix, + ArgsInZeroAddressSpace), + OMPBuilder(OMPBuilder) {} + + virtual ~OMPCodeExtractor() = default; + +protected: + OpenMPIRBuilder &OMPBuilder; +}; + +class DeviceSharedMemCodeExtractor : public OMPCodeExtractor { +public: + using OMPCodeExtractor::OMPCodeExtractor; + virtual ~DeviceSharedMemCodeExtractor() = default; + +protected: + virtual Instruction * + allocateVar(BasicBlock *BB, BasicBlock::iterator AllocIP, Type *VarType, + const Twine &Name = Twine(""), + AddrSpaceCastInst **CastedAlloc = nullptr) override { + return OMPBuilder.createOMPAllocShared( + OpenMPIRBuilder::InsertPointTy(BB, AllocIP), VarType, Name); + } + + virtual Instruction *deallocateVar(BasicBlock *BB, + BasicBlock::iterator DeallocIP, Value *Var, + Type *VarType) override { + return OMPBuilder.createOMPFreeShared( + OpenMPIRBuilder::InsertPointTy(BB, DeallocIP), Var, VarType); + } +}; + +/// Helper storing information about regions to outline using device shared +/// memory for intermediate allocations. 
+struct DeviceSharedMemOutlineInfo : public OpenMPIRBuilder::OutlineInfo { + OpenMPIRBuilder &OMPBuilder; + + DeviceSharedMemOutlineInfo(OpenMPIRBuilder &OMPBuilder) + : OMPBuilder(OMPBuilder) {} + virtual ~DeviceSharedMemOutlineInfo() = default; + + virtual std::unique_ptr + createCodeExtractor(ArrayRef Blocks, + bool ArgsInZeroAddressSpace, + Twine Suffix = Twine("")) override; +}; + } // anonymous namespace OpenMPIRBuilderConfig::OpenMPIRBuilderConfig() @@ -734,20 +831,20 @@ static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, void OpenMPIRBuilder::finalize(Function *Fn) { SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; - SmallVector DeferredOutlines; - for (OutlineInfo &OI : OutlineInfos) { + SmallVector, 16> DeferredOutlines; + for (std::unique_ptr &OI : OutlineInfos) { // Skip functions that have not finalized yet; may happen with nested // function generation. - if (Fn && OI.getFunction() != Fn) { - DeferredOutlines.push_back(OI); + if (Fn && OI->getFunction() != Fn) { + DeferredOutlines.push_back(std::move(OI)); continue; } ParallelRegionBlockSet.clear(); Blocks.clear(); - OI.collectBlocks(ParallelRegionBlockSet, Blocks); + OI->collectBlocks(ParallelRegionBlockSet, Blocks); - Function *OuterFn = OI.getFunction(); + Function *OuterFn = OI->getFunction(); CodeExtractorAnalysisCache CEAC(*OuterFn); // If we generate code for the target device, we need to allocate // struct for aggregate params in the device default alloca address space. @@ -756,26 +853,19 @@ void OpenMPIRBuilder::finalize(Function *Fn) { // CodeExtractor generates correct code for extracted functions // which are used by OpenMP runtime. 
bool ArgsInZeroAddressSpace = Config.isTargetDevice(); - CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, - /* AggregateArgs */ true, - /* BlockFrequencyInfo */ nullptr, - /* BranchProbabilityInfo */ nullptr, - /* AssumptionCache */ nullptr, - /* AllowVarArgs */ true, - /* AllowAlloca */ true, - /* AllocaBlock*/ OI.OuterAllocaBB, - /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); + std::unique_ptr Extractor = + OI->createCodeExtractor(Blocks, ArgsInZeroAddressSpace, ".omp_par"); LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n"); - LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName() - << " Exit: " << OI.ExitBB->getName() << "\n"); - assert(Extractor.isEligible() && + LLVM_DEBUG(dbgs() << "Entry " << OI->EntryBB->getName() + << " Exit: " << OI->ExitBB->getName() << "\n"); + assert(Extractor->isEligible() && "Expected OpenMP outlining to be possible!"); - for (auto *V : OI.ExcludeArgsFromAggregate) - Extractor.excludeArgFromAggregate(V); + for (auto *V : OI->ExcludeArgsFromAggregate) + Extractor->excludeArgFromAggregate(V); - Function *OutlinedFn = Extractor.extractCodeRegion(CEAC); + Function *OutlinedFn = Extractor->extractCodeRegion(CEAC); if (Config.isGPU()) OutlinedFn->addFnAttr(Attribute::AlwaysInline); @@ -802,8 +892,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) { // made our own entry block after all. { BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock(); - assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB); - assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry); + assert(ArtificialEntry.getUniqueSuccessor() == OI->EntryBB); + assert(OI->EntryBB->getUniquePredecessor() == &ArtificialEntry); // Move instructions from the to-be-deleted ArtificialEntry to the entry // basic block of the parallel region. 
CodeExtractor generates // instructions to unwrap the aggregate argument and may sink @@ -819,24 +909,25 @@ void OpenMPIRBuilder::finalize(Function *Fn) { if (I.isTerminator()) { // Absorb any debug value that terminator may have - if (OI.EntryBB->getTerminator()) - OI.EntryBB->getTerminator()->adoptDbgRecords( + if (OI->EntryBB->getTerminator()) + OI->EntryBB->getTerminator()->adoptDbgRecords( &ArtificialEntry, I.getIterator(), false); continue; } - I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt()); + I.moveBeforePreserving(*OI->EntryBB, + OI->EntryBB->getFirstInsertionPt()); } - OI.EntryBB->moveBefore(&ArtificialEntry); + OI->EntryBB->moveBefore(&ArtificialEntry); ArtificialEntry.eraseFromParent(); } - assert(&OutlinedFn->getEntryBlock() == OI.EntryBB); + assert(&OutlinedFn->getEntryBlock() == OI->EntryBB); assert(OutlinedFn && OutlinedFn->hasNUses(1)); // Run a user callback, e.g. to add attributes. - if (OI.PostOutlineCB) - OI.PostOutlineCB(*OutlinedFn); + if (OI->PostOutlineCB) + OI->PostOutlineCB(*OutlinedFn); } // Remove work items that have been completed. @@ -1349,6 +1440,86 @@ Error OpenMPIRBuilder::emitCancelationCheckImpl( return Error::success(); } +/// Create wrapper function used to gather the outlined function's argument +/// structure from a shared buffer and to forward them to it when running in +/// Generic mode. +/// +/// The outlined function is expected to receive 2 integer arguments followed by +/// an optional pointer argument to an argument structure holding the rest. 
+static Function *createTargetParallelWrapper(OpenMPIRBuilder *OMPIRBuilder, + Function &OutlinedFn) { + size_t NumArgs = OutlinedFn.arg_size(); + assert((NumArgs == 2 || NumArgs == 3) && + "expected a 2-3 argument parallel outlined function"); + bool UseArgStruct = NumArgs == 3; + + IRBuilder<> &Builder = OMPIRBuilder->Builder; + IRBuilder<>::InsertPointGuard IPG(Builder); + auto *FnTy = FunctionType::get(Builder.getVoidTy(), + {Builder.getInt16Ty(), Builder.getInt32Ty()}, + /*isVarArg=*/false); + auto *WrapperFn = + Function::Create(FnTy, GlobalValue::InternalLinkage, + OutlinedFn.getName() + ".wrapper", OMPIRBuilder->M); + + WrapperFn->addParamAttr(0, Attribute::NoUndef); + WrapperFn->addParamAttr(0, Attribute::ZExt); + WrapperFn->addParamAttr(1, Attribute::NoUndef); + + BasicBlock *EntryBB = + BasicBlock::Create(OMPIRBuilder->M.getContext(), "entry", WrapperFn); + Builder.SetInsertPoint(EntryBB); + + // Allocation. + Value *AddrAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), + /*ArraySize=*/nullptr, "addr"); + AddrAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast( + AddrAlloca, Builder.getPtrTy(/*AddrSpace=*/0), + AddrAlloca->getName() + ".ascast"); + + Value *ZeroAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), + /*ArraySize=*/nullptr, "zero"); + ZeroAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast( + ZeroAlloca, Builder.getPtrTy(/*AddrSpace=*/0), + ZeroAlloca->getName() + ".ascast"); + + Value *ArgsAlloca = nullptr; + if (UseArgStruct) { + ArgsAlloca = Builder.CreateAlloca(Builder.getPtrTy(), + /*ArraySize=*/nullptr, "global_args"); + ArgsAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast( + ArgsAlloca, Builder.getPtrTy(/*AddrSpace=*/0), + ArgsAlloca->getName() + ".ascast"); + } + + // Initialization. 
+ Builder.CreateStore(WrapperFn->getArg(1), AddrAlloca); + Builder.CreateStore(Builder.getInt32(0), ZeroAlloca); + if (UseArgStruct) { + Builder.CreateCall( + OMPIRBuilder->getOrCreateRuntimeFunctionPtr( + llvm::omp::RuntimeFunction::OMPRTL___kmpc_get_shared_variables), + {ArgsAlloca}); + } + + SmallVector Args{AddrAlloca, ZeroAlloca}; + + // Load structArg from global_args. + if (UseArgStruct) { + Value *StructArg = Builder.CreateLoad(Builder.getPtrTy(), ArgsAlloca); + StructArg = Builder.CreateInBoundsGEP(Builder.getPtrTy(), StructArg, + {Builder.getInt64(0)}); + StructArg = Builder.CreateLoad(Builder.getPtrTy(), StructArg, "structArg"); + Args.push_back(StructArg); + } + + // Call the outlined function holding the parallel body. + Builder.CreateCall(&OutlinedFn, Args); + Builder.CreateRetVoid(); + + return WrapperFn; +} + // Callback used to create OpenMP runtime calls to support // omp parallel clause for the device. // We need to use this callback to replace call to the OutlinedFn in OuterFn @@ -1358,6 +1529,10 @@ static void targetParallelCallback( BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, Value *PrivTIDAddr, Value *ThreadID, const SmallVector &ToBeDeleted) { + assert(OutlinedFn.arg_size() >= 2 && + "Expected at least tid and bounded tid as arguments"); + unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2; + // Add some known attributes. 
IRBuilder<> &Builder = OMPIRBuilder->Builder; OutlinedFn.addParamAttr(0, Attribute::NoAlias); @@ -1366,17 +1541,12 @@ static void targetParallelCallback( OutlinedFn.addParamAttr(1, Attribute::NoUndef); OutlinedFn.addFnAttr(Attribute::NoUnwind); - assert(OutlinedFn.arg_size() >= 2 && - "Expected at least tid and bounded tid as arguments"); - unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2; - CallInst *CI = cast(OutlinedFn.user_back()); assert(CI && "Expected call instruction to outlined function"); CI->getParent()->setName("omp_parallel"); Builder.SetInsertPoint(CI); Type *PtrTy = OMPIRBuilder->VoidPtr; - Value *NullPtrValue = Constant::getNullValue(PtrTy); // Add alloca for kernel args OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP(); @@ -1402,6 +1572,13 @@ static void targetParallelCallback( IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32) : Builder.getInt32(1); + // If this is a Generic kernel, we can generate the wrapper. + Value *WrapperFn; + if (isGenericKernel(*OuterFn)) + WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn); + else + WrapperFn = Constant::getNullValue(PtrTy); + // Build kmpc_parallel_51 call Value *Parallel51CallArgs[] = { /* identifier*/ Ident, @@ -1410,7 +1587,7 @@ static void targetParallelCallback( /* number of threads */ NumThreads ? 
NumThreads : Builder.getInt32(-1), /* Proc bind */ Builder.getInt32(-1), /* outlined function */ &OutlinedFn, - /* wrapper function */ NullPtrValue, + /* wrapper function */ WrapperFn, /* arguments of the outlined funciton*/ Args, /* number of arguments */ Builder.getInt64(NumCapturedVars)}; @@ -1522,11 +1699,11 @@ hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, } OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( - const LocationDescription &Loc, InsertPointTy OuterAllocaIP, - BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, - FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, - omp::ProcBindKind ProcBind, bool IsCancellable) { - assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous"); + const LocationDescription &Loc, InsertPointTy OuterAllocIP, + ArrayRef OuterDeallocIPs, BodyGenCallbackTy BodyGenCB, + PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, + Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable) { + assert(!isConflictIP(Loc.IP, OuterAllocIP) && "IPs must not be ambiguous"); if (!updateToLocation(Loc)) return Loc.IP; @@ -1566,7 +1743,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( // Save the outer alloca block because the insertion iterator may get // invalidated and we still need this later. - BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock(); + BasicBlock *OuterAllocaBlock = OuterAllocIP.getBlock(); // Vector to remember instructions we used only during the modeling but which // we want to delete at the end. @@ -1664,15 +1841,24 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( // Let the caller create the body. 
assert(BodyGenCB && "Expected body generation callback!"); InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin()); - if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP)) + InsertPointTy DeallocIP(PRegExitBB, PRegExitBB->begin()); + if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP, DeallocIP)) return Err; LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); - OutlineInfo OI; + // If OuterFn is a Generic kernel, we need to use device shared memory to + // allocate argument structures. Otherwise, we use stack allocations as usual. + bool UsesDeviceSharedMemory = + Config.isTargetDevice() && isGenericKernel(*OuterFn); + std::unique_ptr OI = + UsesDeviceSharedMemory + ? std::make_unique(*this) + : std::make_unique(); + if (Config.isTargetDevice()) { // Generate OpenMP target specific runtime call - OI.PostOutlineCB = [=, ToBeDeletedVec = + OI->PostOutlineCB = [=, ToBeDeletedVec = std::move(ToBeDeleted)](Function &OutlinedFn) { targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident, IfCondition, NumThreads, PrivTID, PrivTIDAddrAcast, @@ -1680,20 +1866,23 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( }; } else { // Generate OpenMP host runtime call - OI.PostOutlineCB = [=, ToBeDeletedVec = - std::move(ToBeDeleted)](Function &OutlinedFn) { + OI->PostOutlineCB = [=, ToBeDeletedVec = + std::move(ToBeDeleted)](Function &OutlinedFn) { hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition, PrivTID, PrivTIDAddrAcast, ToBeDeletedVec); }; } - OI.OuterAllocaBB = OuterAllocaBlock; - OI.EntryBB = PRegEntryBB; - OI.ExitBB = PRegExitBB; + OI->OuterAllocBB = OuterAllocaBlock; + OI->EntryBB = PRegEntryBB; + OI->ExitBB = PRegExitBB; + OI->OuterDeallocBBs.reserve(OuterDeallocIPs.size()); + for (InsertPointTy DeallocIP : OuterDeallocIPs) + OI->OuterDeallocBBs.push_back(DeallocIP.getBlock()); SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; - OI.collectBlocks(ParallelRegionBlockSet, Blocks); + 
OI->collectBlocks(ParallelRegionBlockSet, Blocks); CodeExtractorAnalysisCache CEAC(*OuterFn); CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, @@ -1704,6 +1893,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocationBlock */ OuterAllocaBlock, + /* DeallocationBlocks */ {}, /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); // Find inputs to, outputs from the code region. @@ -1728,7 +1918,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( auto PrivHelper = [&](Value &V) -> Error { if (&V == TIDAddr || &V == ZeroAddr) { - OI.ExcludeArgsFromAggregate.push_back(&V); + OI->ExcludeArgsFromAggregate.push_back(&V); return Error::success(); } @@ -1749,9 +1939,18 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( IRBuilder<>::InsertPointGuard Guard(Builder); LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n"); - Builder.restoreIP(OuterAllocaIP); - Value *Ptr = - Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded"); + Builder.restoreIP(OuterAllocIP); + Value *Ptr; + if (UsesDeviceSharedMemory) { + // Use device shared memory instead, if needed. + Ptr = createOMPAllocShared(OuterAllocIP, V.getType(), + V.getName() + ".reloaded"); + for (InsertPointTy DeallocIP : OuterDeallocIPs) + createOMPFreeShared(DeallocIP, Ptr, V.getType()); + } else { + Ptr = Builder.CreateAlloca(V.getType(), nullptr, + V.getName() + ".reloaded"); + } // Store to stack at end of the block that currently branches to the entry // block of the to-be-outlined region. @@ -1801,7 +2000,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel( // Reset the outer alloca insertion point to the entry of the relevant block // in case it was invalidated. 
- OuterAllocaIP = IRBuilder<>::InsertPoint( + OuterAllocIP = IRBuilder<>::InsertPoint( OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt()); for (Value *Input : Inputs) { @@ -1967,10 +2166,10 @@ static Value *emitTaskDependencies( } OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( - const LocationDescription &Loc, InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition, - SmallVector Dependencies, bool Mergeable, Value *EventHandle, - Value *Priority) { + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB, bool Tied, + Value *Final, Value *IfCondition, SmallVector Dependencies, + bool Mergeable, Value *EventHandle, Value *Priority) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -2002,22 +2201,26 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( InsertPointTy TaskAllocaIP = InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin()); InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin()); - if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP)) + InsertPointTy TaskDeallocIP = InsertPointTy(TaskExitBB, TaskExitBB->begin()); + if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP, TaskDeallocIP)) return Err; - OutlineInfo OI; - OI.EntryBB = TaskAllocaBB; - OI.OuterAllocaBB = AllocaIP.getBlock(); - OI.ExitBB = TaskExitBB; + auto OI = std::make_unique(); + OI->EntryBB = TaskAllocaBB; + OI->OuterAllocBB = AllocIP.getBlock(); + OI->ExitBB = TaskExitBB; + OI->OuterDeallocBBs.reserve(DeallocIPs.size()); + for (InsertPointTy DeallocIP : DeallocIPs) + OI->OuterDeallocBBs.push_back(DeallocIP.getBlock()); // Add the thread ID argument. 
SmallVector ToBeDeleted; - OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( - Builder, M, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false)); + OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal( + Builder, M, AllocIP, ToBeDeleted, TaskAllocaIP, "global.tid", false)); - OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies, - Mergeable, Priority, EventHandle, TaskAllocaBB, - ToBeDeleted](Function &OutlinedFn) mutable { + OI->PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies, + Mergeable, Priority, EventHandle, TaskAllocaBB, + ToBeDeleted](Function &OutlinedFn) mutable { // Replace the Stale CI by appropriate RTL function call. assert(OutlinedFn.hasOneUse() && "there must be a single user for the outlined function"); @@ -2230,10 +2433,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( return Builder.saveIP(); } -OpenMPIRBuilder::InsertPointOrErrorTy -OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc, - InsertPointTy AllocaIP, - BodyGenCallbackTy BodyGenCB) { +OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskgroup( + const LocationDescription &Loc, InsertPointTy AllocIP, + ArrayRef DeallocIPs, BodyGenCallbackTy BodyGenCB) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -2248,7 +2450,7 @@ OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc, Builder.CreateCall(TaskgroupFn, {Ident, ThreadID}); BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit"); - if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP())) + if (Error Err = BodyGenCB(AllocIP, Builder.saveIP(), DeallocIPs)) return Err; Builder.SetInsertPoint(TaskgroupExitBB); @@ -2317,8 +2519,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections( SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB); Builder.SetInsertPoint(CaseBB); BranchInst *CaseEndBr = Builder.CreateBr(Continue); - if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(), - 
CaseEndBr->getIterator()})) + if (Error Err = + SectionCB(InsertPointTy(), + {CaseEndBr->getParent(), CaseEndBr->getIterator()}, {})) return Err; CaseNumber++; } @@ -4176,8 +4379,8 @@ Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR( } // Allocate temporary buffer by master thread - auto BodyGenCB = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> Error { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { Builder.restoreIP(CodeGenIP); Value *AllocSpan = Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1)); @@ -4216,8 +4419,8 @@ Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR( Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR( ArrayRef ReductionInfos, ScanInfo *ScanRedInfo) { - auto BodyGenCB = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> Error { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { Builder.restoreIP(CodeGenIP); for (ReductionInfo RedInfo : ReductionInfos) { Value *PrivateVar = RedInfo.PrivateVariable; @@ -4268,8 +4471,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction( if (!updateToLocation(Loc)) return Loc.IP; - auto BodyGenCB = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> Error { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { Builder.restoreIP(CodeGenIP); Function *CurFn = Builder.GetInsertBlock()->getParent(); // for (int k = 0; k <= ceil(log2(n)); ++k) @@ -5115,19 +5318,19 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); - OutlineInfo OI; - OI.OuterAllocaBB = CLI->getPreheader(); + auto OI = std::make_unique(); + OI->OuterAllocBB = CLI->getPreheader(); Function *OuterFn = CLI->getPreheader()->getParent(); // Instructions which need to be deleted at the end of 
code generation SmallVector ToBeDeleted; - OI.OuterAllocaBB = AllocaIP.getBlock(); + OI->OuterAllocBB = AllocaIP.getBlock(); // Mark the body loop as region which needs to be extracted - OI.EntryBB = CLI->getBody(); - OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(), - "omp.prelatch", true); + OI->EntryBB = CLI->getBody(); + OI->ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(), + "omp.prelatch", true); // Prepare loop body for extraction Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()}); @@ -5147,7 +5350,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( // loop body region. SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; - OI.collectBlocks(ParallelRegionBlockSet, Blocks); + OI->collectBlocks(ParallelRegionBlockSet, Blocks); CodeExtractorAnalysisCache CEAC(*OuterFn); CodeExtractor Extractor(Blocks, @@ -5159,6 +5362,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( /* AllowVarArgs */ true, /* AllowAlloca */ true, /* AllocationBlock */ CLI->getPreheader(), + /* DeallocationBlocks */ {}, /* Suffix */ ".omp_wsloop", /* AggrArgsIn0AddrSpace */ true); @@ -5183,15 +5387,15 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( } // Make sure that loop counter variable is not merged into loop body // function argument structure and it is passed as separate variable - OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad); + OI->ExcludeArgsFromAggregate.push_back(NewLoopCntLoad); // PostOutline CB is invoked when loop body function is outlined and // loop body is replaced by call to outlined function. We need to add // call to OpenMP device rtl inside loop preheader. OpenMP device rtl // function will handle loop control logic. 
// - OI.PostOutlineCB = [=, ToBeDeletedVec = - std::move(ToBeDeleted)](Function &OutlinedFn) { + OI->PostOutlineCB = [=, ToBeDeletedVec = + std::move(ToBeDeleted)](Function &OutlinedFn) { workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec, LoopType, NoLoop); }; @@ -6515,8 +6719,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion( emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional); // generate body - if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(), - /* CodeGenIP */ Builder.saveIP())) + if (Error Err = + BodyGenCB(/* AllocIP */ InsertPointTy(), + /* CodeGenIP */ Builder.saveIP(), /* DeallocIPs */ {})) return Err; // emit exit call and do any needed finalization. @@ -6693,6 +6898,46 @@ CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc, return Builder.CreateCall(Fn, Args, Name); } +CallInst *OpenMPIRBuilder::createOMPAllocShared(const LocationDescription &Loc, + Value *Size, + const Twine &Name) { + IRBuilder<>::InsertPointGuard IPG(Builder); + updateToLocation(Loc); + + Value *Args[] = {Size}; + Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc_shared); + CallInst *Call = Builder.CreateCall(Fn, Args, Name); + Call->addRetAttr(Attribute::getWithAlignment( + M.getContext(), M.getDataLayout().getPrefTypeAlign(Int64))); + return Call; +} + +CallInst *OpenMPIRBuilder::createOMPAllocShared(const LocationDescription &Loc, + Type *VarType, + const Twine &Name) { + return createOMPAllocShared( + Loc, Builder.getInt64(M.getDataLayout().getTypeStoreSize(VarType)), Name); +} + +CallInst *OpenMPIRBuilder::createOMPFreeShared(const LocationDescription &Loc, + Value *Addr, Value *Size, + const Twine &Name) { + IRBuilder<>::InsertPointGuard IPG(Builder); + updateToLocation(Loc); + + Value *Args[] = {Addr, Size}; + Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free_shared); + return Builder.CreateCall(Fn, Args, Name); +} + +CallInst 
*OpenMPIRBuilder::createOMPFreeShared(const LocationDescription &Loc, + Value *Addr, Type *VarType, + const Twine &Name) { + return createOMPFreeShared( + Loc, Addr, Builder.getInt64(M.getDataLayout().getTypeStoreSize(VarType)), + Name); +} + CallInst *OpenMPIRBuilder::createOMPInteropInit( const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, @@ -6811,7 +7056,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit( Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags); Constant *UseGenericStateMachineVal = ConstantInt::getSigned( - Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD); + Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD && + Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP); Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true); Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0); @@ -7132,10 +7378,11 @@ Constant *OpenMPIRBuilder::registerTargetRegionFunction( } OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( - const LocationDescription &Loc, InsertPointTy AllocaIP, - InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, - TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, - CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc, + const LocationDescription &Loc, InsertPointTy AllocIP, + InsertPointTy CodeGenIP, ArrayRef DeallocIPs, + Value *DeviceID, Value *IfCond, TargetDataInfo &Info, + GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, + omp::RuntimeFunction *MapperFunc, function_ref BodyGenCB, @@ -7160,11 +7407,11 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( // Generate the code for the opening of the data environment. Capture all the // arguments of the runtime call by reference because they are used in the // closing of the region. 
- auto BeginThenGen = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> Error { + auto BeginThenGen = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { MapInfo = &GenMapInfoCB(Builder.saveIP()); if (Error Err = emitOffloadingArrays( - AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB, + AllocIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB, /*IsNonContiguous=*/true, DeviceAddrCB)) return Err; @@ -7218,7 +7465,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr, /*TargetTaskAllocaIP=*/{})); else - cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP, + cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocIP, /*Dependencies=*/{}, RTArgs, Info.HasNoWait)); } else { Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr( @@ -7249,8 +7496,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( // If we need device pointer privatization, we need to emit the body of the // region with no privatization in the 'else' branch of the conditional. // Otherwise, we don't have to do anything. - auto BeginElseGen = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> Error { + auto BeginElseGen = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { InsertPointOrErrorTy AfterIP = BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv); if (!AfterIP) @@ -7260,7 +7507,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( }; // Generate code for the closing of the data region. 
- auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto EndThenGen = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { TargetDataRTArgs RTArgs; Info.EmitDebug = !MapInfo->Names.empty(); emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true); @@ -7289,7 +7537,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( // We don't have to do anything to close the region if the if clause evaluates // to false. - auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto EndElseGen = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; @@ -7297,8 +7546,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( if (BodyGenCB) { Error Err = [&]() { if (IfCond) - return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP); - return BeginThenGen(AllocaIP, Builder.saveIP()); + return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocIP); + return BeginThenGen(AllocIP, Builder.saveIP(), DeallocIPs); }(); if (Err) @@ -7313,12 +7562,12 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( restoreIPandDebugLoc(Builder, *AfterIP); if (IfCond) - return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP); - return EndThenGen(AllocaIP, Builder.saveIP()); + return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocIP); + return EndThenGen(AllocIP, Builder.saveIP(), DeallocIPs); } if (IfCond) - return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP); - return BeginThenGen(AllocaIP, Builder.saveIP()); + return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocIP); + return BeginThenGen(AllocIP, Builder.saveIP(), DeallocIPs); }(); if (Err) @@ -7596,15 +7845,18 @@ static Expected createOutlinedFunction( if (OMPBuilder.Config.isTargetDevice()) OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func); - // Insert target deinit call in the device compilation pass. 
+ BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "target.exit"); BasicBlock *OutlinedBodyBB = splitBB(Builder, /*CreateBranch=*/true, "outlined.body"); llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc( Builder.saveIP(), - OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin())); + OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()), + OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin())); if (!AfterIP) return AfterIP.takeError(); - Builder.restoreIP(*AfterIP); + Builder.SetInsertPoint(ExitBB); + + // Insert target deinit call in the device compilation pass. if (OMPBuilder.Config.isTargetDevice()) OMPBuilder.createTargetDeinit(Builder); @@ -7665,8 +7917,9 @@ static Expected createOutlinedFunction( Argument &Arg = std::get<1>(InArg); Value *InputCopy = nullptr; - llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = - ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = ArgAccessorFuncCB( + Arg, Input, InputCopy, AllocaIP, Builder.saveIP(), + OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin())); if (!AfterIP) return AfterIP.takeError(); Builder.restoreIP(*AfterIP); @@ -8051,13 +8304,13 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( TargetTaskAllocaBB->begin()); InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin()); - OutlineInfo OI; - OI.EntryBB = TargetTaskAllocaBB; - OI.OuterAllocaBB = AllocaIP.getBlock(); + auto OI = std::make_unique(); + OI->EntryBB = TargetTaskAllocaBB; + OI->OuterAllocBB = AllocaIP.getBlock(); // Add the thread ID argument. 
SmallVector ToBeDeleted; - OI.ExcludeArgsFromAggregate.push_back( + OI->ExcludeArgsFromAggregate.push_back( createFakeIntVal(Builder, M, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false)); @@ -8076,8 +8329,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // OI.ExitBlock is set to the single task body block and will get left out of // the outlining process. So, simply create a new empty block to which we // uncoditionally branch from where TaskBodyCB left off - OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont"); - emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(), + OI->ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont"); + emitBlock(OI->ExitBB, Builder.GetInsertBlock()->getParent(), /*IsFinished=*/true); SmallVector OffloadingArraysToPrivatize; @@ -8089,13 +8342,13 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( RTArgs.SizesArray}) { if (V && !isa(V)) { OffloadingArraysToPrivatize.push_back(V); - OI.ExcludeArgsFromAggregate.push_back(V); + OI->ExcludeArgsFromAggregate.push_back(V); } } } - OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask, - DeviceID, OffloadingArraysToPrivatize]( - Function &OutlinedFn) mutable { + OI->PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask, + DeviceID, OffloadingArraysToPrivatize]( + Function &OutlinedFn) mutable { assert(OutlinedFn.hasOneUse() && "there must be a single user for the outlined function"); @@ -8321,7 +8574,8 @@ Error OpenMPIRBuilder::emitOffloadingArraysAndArgs( static void emitTargetCall( OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, - OpenMPIRBuilder::InsertPointTy AllocaIP, + OpenMPIRBuilder::InsertPointTy AllocIP, + ArrayRef DeallocIPs, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, @@ -8378,8 +8632,9 @@ static void emitTargetCall( }; auto 
&&EmitTargetCallElse = - [&](OpenMPIRBuilder::InsertPointTy AllocaIP, - OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { + [&](OpenMPIRBuilder::InsertPointTy AllocIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { // Assume no error was returned because EmitTargetCallFallbackCB doesn't // produce any. OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() { @@ -8389,7 +8644,7 @@ static void emitTargetCall( // OutlinedFnID=nullptr results in that call not being done. OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs; return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr, - /*RTLoc=*/nullptr, AllocaIP, + /*RTLoc=*/nullptr, AllocIP, Dependencies, EmptyRTArgs, HasNoWait); } return EmitTargetCallFallbackCB(CodeGenIP); @@ -8400,13 +8655,14 @@ static void emitTargetCall( }; auto &&EmitTargetCallThen = - [&](OpenMPIRBuilder::InsertPointTy AllocaIP, - OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { + [&](OpenMPIRBuilder::InsertPointTy AllocIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> Error { Info.HasNoWait = HasNoWait; OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP()); OpenMPIRBuilder::TargetDataRTArgs RTArgs; if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs( - AllocaIP, CodeGenIP, Info, RTArgs, MapInfo, CustomMapperCB, + AllocIP, CodeGenIP, Info, RTArgs, MapInfo, CustomMapperCB, /*IsNonContiguous=*/true, /*ForEndCall=*/false)) return Err; @@ -8479,13 +8735,13 @@ static void emitTargetCall( // The presence of certain clauses on the target directive require the // explicit generation of the target task. 
if (RequiresOuterTargetTask) - return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP, + return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocIP, Dependencies, KArgs.RTArgs, Info.HasNoWait); return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, - DeviceID, RTLoc, AllocaIP); + DeviceID, RTLoc, AllocIP); }()); Builder.restoreIP(AfterIP); @@ -8496,24 +8752,24 @@ static void emitTargetCall( // wasn't created. In this case we just run the host fallback directly and // ignore any potential 'if' clauses. if (!OutlinedFnID) { - cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP())); + cantFail(EmitTargetCallElse(AllocIP, Builder.saveIP(), DeallocIPs)); return; } // If there's no 'if' clause, only generate the kernel launch code path. if (!IfCond) { - cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP())); + cantFail(EmitTargetCallThen(AllocIP, Builder.saveIP(), DeallocIPs)); return; } cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen, - EmitTargetCallElse, AllocaIP)); + EmitTargetCallElse, AllocIP)); } OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( - const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP, - InsertPointTy CodeGenIP, TargetDataInfo &Info, - TargetRegionEntryInfo &EntryInfo, + const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocIP, + InsertPointTy CodeGenIP, ArrayRef DeallocIPs, + TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl &Inputs, GenMapInfoCallbackTy GenMapInfoCB, @@ -8541,9 +8797,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( // to make a remote call (offload) to the previously outlined function // that represents the target region. Do that now. 
if (!Config.isTargetDevice()) - emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs, - IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB, - CustomMapperCB, Dependencies, HasNowait); + emitTargetCall(*this, Builder, AllocIP, DeallocIPs, Info, DefaultAttrs, + RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Inputs, + GenMapInfoCB, CustomMapperCB, Dependencies, HasNowait); return Builder.saveIP(); } @@ -9322,15 +9578,16 @@ void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn, Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, - InsertPointTy AllocaIP) { + InsertPointTy AllocIP, + ArrayRef DeallocIPs) { // If the condition constant folds and can be elided, try to avoid emitting // the condition and the dead arm of the if/else. if (auto *CI = dyn_cast(Cond)) { auto CondConstant = CI->getSExtValue(); if (CondConstant) - return ThenGen(AllocaIP, Builder.saveIP()); + return ThenGen(AllocIP, Builder.saveIP(), DeallocIPs); - return ElseGen(AllocaIP, Builder.saveIP()); + return ElseGen(AllocIP, Builder.saveIP(), DeallocIPs); } Function *CurFn = Builder.GetInsertBlock()->getParent(); @@ -9343,13 +9600,13 @@ Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, Builder.CreateCondBr(Cond, ThenBlock, ElseBlock); // Emit the 'then' code. emitBlock(ThenBlock, CurFn); - if (Error Err = ThenGen(AllocaIP, Builder.saveIP())) + if (Error Err = ThenGen(AllocIP, Builder.saveIP(), DeallocIPs)) return Err; emitBranch(ContBlock); // Emit the 'else' code if present. // There is no need to emit line number for unconditional branch. emitBlock(ElseBlock, CurFn); - if (Error Err = ElseGen(AllocaIP, Builder.saveIP())) + if (Error Err = ElseGen(AllocIP, Builder.saveIP(), DeallocIPs)) return Err; // There is no need to emit line number for unconditional branch. 
emitBranch(ContBlock); @@ -10057,20 +10314,21 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, // Generate the body of teams. InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin()); InsertPointTy CodeGenIP(BodyBB, BodyBB->begin()); - if (Error Err = BodyGenCB(AllocaIP, CodeGenIP)) + InsertPointTy DeallocIP(ExitBB, ExitBB->begin()); + if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, DeallocIP)) return Err; - OutlineInfo OI; - OI.EntryBB = AllocaBB; - OI.ExitBB = ExitBB; - OI.OuterAllocaBB = &OuterAllocaBB; + auto OI = std::make_unique(); + OI->EntryBB = AllocaBB; + OI->ExitBB = ExitBB; + OI->OuterAllocBB = &OuterAllocaBB; // Insert fake values for global tid and bound tid. SmallVector ToBeDeleted; InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin()); - OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( + OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal( Builder, M, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true)); - OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( + OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal( Builder, M, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true)); auto HostPostOutlineCB = [this, Ident, @@ -10110,23 +10368,22 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc, }; if (!Config.isTargetDevice()) - OI.PostOutlineCB = HostPostOutlineCB; + OI->PostOutlineCB = HostPostOutlineCB; addOutlineInfo(std::move(OI)); - Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + Builder.SetInsertPoint(ExitBB); return Builder.saveIP(); } -OpenMPIRBuilder::InsertPointOrErrorTy -OpenMPIRBuilder::createDistribute(const LocationDescription &Loc, - InsertPointTy OuterAllocaIP, - BodyGenCallbackTy BodyGenCB) { +OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createDistribute( + const LocationDescription &Loc, InsertPointTy OuterAllocIP, + ArrayRef OuterDeallocIPs, BodyGenCallbackTy BodyGenCB) { if (!updateToLocation(Loc)) return InsertPointTy(); - BasicBlock *OuterAllocaBB = 
OuterAllocaIP.getBlock(); + BasicBlock *OuterAllocaBB = OuterAllocIP.getBlock(); if (OuterAllocaBB == Builder.GetInsertBlock()) { BasicBlock *BodyBB = @@ -10143,20 +10400,24 @@ OpenMPIRBuilder::createDistribute(const LocationDescription &Loc, // Generate the body of distribute clause InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin()); InsertPointTy CodeGenIP(BodyBB, BodyBB->begin()); - if (Error Err = BodyGenCB(AllocaIP, CodeGenIP)) + InsertPointTy DeallocIP(ExitBB, ExitBB->begin()); + if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, DeallocIP)) return Err; // When using target we use different runtime functions which require a // callback. if (Config.isTargetDevice()) { - OutlineInfo OI; - OI.OuterAllocaBB = OuterAllocaIP.getBlock(); - OI.EntryBB = AllocaBB; - OI.ExitBB = ExitBB; + auto OI = std::make_unique(); + OI->OuterAllocBB = OuterAllocIP.getBlock(); + OI->EntryBB = AllocaBB; + OI->ExitBB = ExitBB; + OI->OuterDeallocBBs.reserve(OuterDeallocIPs.size()); + for (InsertPointTy DeallocIP : OuterDeallocIPs) + OI->OuterDeallocBBs.push_back(DeallocIP.getBlock()); addOutlineInfo(std::move(OI)); } - Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + Builder.SetInsertPoint(ExitBB); return Builder.saveIP(); } @@ -10215,6 +10476,40 @@ void OpenMPIRBuilder::OutlineInfo::collectBlocks( } } +std::unique_ptr +OpenMPIRBuilder::OutlineInfo::createCodeExtractor(ArrayRef Blocks, + bool ArgsInZeroAddressSpace, + Twine Suffix) { + return std::make_unique( + Blocks, /* DominatorTree */ nullptr, + /* AggregateArgs */ true, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ true, + /* AllowAlloca */ true, + /* AllocationBlock*/ OuterAllocBB, + /* DeallocationBlocks */ ArrayRef(), + /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace); +} + +std::unique_ptr DeviceSharedMemOutlineInfo::createCodeExtractor( + ArrayRef Blocks, bool ArgsInZeroAddressSpace, Twine Suffix) { + return std::make_unique( + OMPBuilder, 
Blocks, /* DominatorTree */ nullptr, + /* AggregateArgs */ true, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ true, + /* AllowAlloca */ true, + /* AllocationBlock*/ OuterAllocBB, + /* DeallocationBlocks */ OuterDeallocBBs.empty() + ? SmallVector{ExitBB} + : OuterDeallocBBs, + /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace); +} + void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index 3d8b7cbb59630..e8f3c68f90980 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -721,6 +721,7 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { SubRegion, &*DT, /* AggregateArgs */ false, /* BFI */ nullptr, /* BPI */ nullptr, AC, /* AllowVarArgs */ false, /* AllowAlloca */ false, /* AllocaBlock */ nullptr, + /* DeallocationBlocks */ {}, /* Suffix */ "cold." 
+ std::to_string(OutlinedFunctionID)); if (CE.isEligible() && isSplittingBeneficial(CE, SubRegion, TTI) && diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index e3e31befdbfd2..73c48db131b28 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -2829,7 +2829,7 @@ unsigned IROutliner::doOutline(Module &M) { OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, - false, nullptr, "outlined"); + false, nullptr, {}, "outlined"); findAddInputsOutputs(M, *OS, NotSame); if (!OS->IgnoreRegion) OutlinedRegions.push_back(OS); @@ -2940,7 +2940,7 @@ unsigned IROutliner::doOutline(Module &M) { OS->Candidate->getBasicBlocks(BlocksInRegion, BE); OS->CE = new (ExtractorAllocator.Allocate()) CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false, - false, nullptr, "outlined"); + false, nullptr, {}, "outlined"); bool FunctionOutlined = extractSection(*OS); if (FunctionOutlined) { unsigned StartIdx = OS->Candidate->getStartIdx(); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 8b1968343416e..12b61c27fc78b 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1086,7 +1086,8 @@ struct OpenMPOpt { SmallDenseMap> BB2PRMap; BasicBlock *StartBB = nullptr, *EndBB = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { BasicBlock *CGStartBB = CodeGenIP.getBlock(); BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); @@ -1126,7 +1127,8 @@ struct OpenMPOpt { const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc(); ParentBB->getTerminator()->eraseFromParent(); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto 
BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { BasicBlock *CGStartBB = CodeGenIP.getBlock(); BasicBlock *CGEndBB = SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI); @@ -1256,8 +1258,9 @@ struct OpenMPOpt { // avoid overriding binding settings, and without explicit cancellation. OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPInfoCache.OMPBuilder.createParallel( - Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr, - OMP_PROC_BIND_default, /* IsCancellable */ false)); + Loc, AllocaIP, /* DeallocIPs */ {}, BodyGenCB, PrivCB, FiniCB, + nullptr, nullptr, OMP_PROC_BIND_default, + /* IsCancellable */ false)); BranchInst::Create(AfterBB, AfterIP.getBlock()); // Perform the actual outlining. @@ -5047,6 +5050,29 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_free_shared: // Return without setting a fixpoint, to be resolved in updateImpl. return; + case OMPRTL___kmpc_distribute_static_loop_4: + case OMPRTL___kmpc_distribute_static_loop_4u: + case OMPRTL___kmpc_distribute_static_loop_8: + case OMPRTL___kmpc_distribute_static_loop_8u: + case OMPRTL___kmpc_distribute_for_static_loop_4: + case OMPRTL___kmpc_distribute_for_static_loop_4u: + case OMPRTL___kmpc_distribute_for_static_loop_8: + case OMPRTL___kmpc_distribute_for_static_loop_8u: + case OMPRTL___kmpc_for_static_loop_4: + case OMPRTL___kmpc_for_static_loop_4u: + case OMPRTL___kmpc_for_static_loop_8: + case OMPRTL___kmpc_for_static_loop_8u: + // Parallel regions might be reached by these calls, as they take a + // callback argument potentially containing arbitrary user-provided + // code. + ReachedUnknownParallelRegions.insert(&CB); + // TODO: The presence of these calls on their own does not prevent a + // kernel from being SPMD-izable. We mark it as such because we need + // further changes in order to also consider the contents of the + // callbacks passed to them. 
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint(); + SPMDCompatibilityTracker.insert(&CB); + break; default: // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, // generally. However, they do not hide parallel regions. diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 40de78a1d6e31..675413a963fd8 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -25,7 +25,6 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" -#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -264,11 +263,12 @@ CodeExtractor::CodeExtractor(ArrayRef BBs, DominatorTree *DT, bool AggregateArgs, BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI, AssumptionCache *AC, bool AllowVarArgs, bool AllowAlloca, - BasicBlock *AllocationBlock, std::string Suffix, - bool ArgsInZeroAddressSpace) + BasicBlock *AllocationBlock, + ArrayRef DeallocationBlocks, + std::string Suffix, bool ArgsInZeroAddressSpace) : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI), BPI(BPI), AC(AC), AllocationBlock(AllocationBlock), - AllowVarArgs(AllowVarArgs), + DeallocationBlocks(DeallocationBlocks), AllowVarArgs(AllowVarArgs), Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)), Suffix(Suffix), ArgsInZeroAddressSpace(ArgsInZeroAddressSpace) {} @@ -444,6 +444,27 @@ CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) { return CommonExitBlock; } +Instruction *CodeExtractor::allocateVar(BasicBlock *BB, + BasicBlock::iterator AllocIP, + Type *VarType, const Twine &Name, + AddrSpaceCastInst **CastedAlloc) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + Instruction *Alloca = + new AllocaInst(VarType, DL.getAllocaAddrSpace(), nullptr, Name, AllocIP); + + if (CastedAlloc && ArgsInZeroAddressSpace && 
DL.getAllocaAddrSpace() != 0) { + *CastedAlloc = new AddrSpaceCastInst( + Alloca, PointerType::get(BB->getContext(), 0), Name + ".ascast"); + (*CastedAlloc)->insertAfter(Alloca->getIterator()); + } + return Alloca; +} + +Instruction *CodeExtractor::deallocateVar(BasicBlock *, BasicBlock::iterator, + Value *, Type *) { + return nullptr; +} + // Find the pair of life time markers for address 'Addr' that are either // defined inside the outline region or can legally be shrinkwrapped into the // outline region. If there are not other untracked uses of the address, return @@ -1821,7 +1842,6 @@ CallInst *CodeExtractor::emitReplacerCall( std::vector &Reloads) { LLVMContext &Context = oldFunction->getContext(); Module *M = oldFunction->getParent(); - const DataLayout &DL = M->getDataLayout(); // This takes place of the original loop BasicBlock *codeReplacer = @@ -1852,25 +1872,22 @@ CallInst *CodeExtractor::emitReplacerCall( if (StructValues.contains(output)) continue; - AllocaInst *alloca = new AllocaInst( - output->getType(), DL.getAllocaAddrSpace(), nullptr, - output->getName() + ".loc", AllocaBlock->getFirstInsertionPt()); - params.push_back(alloca); - ReloadOutputs.push_back(alloca); + Value *OutAlloc = + allocateVar(AllocaBlock, AllocaBlock->getFirstInsertionPt(), + output->getType(), output->getName() + ".loc"); + params.push_back(OutAlloc); + ReloadOutputs.push_back(OutAlloc); } - AllocaInst *Struct = nullptr; + Instruction *Struct = nullptr; if (!StructValues.empty()) { - Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr, - "structArg", AllocaBlock->getFirstInsertionPt()); - if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) { - auto *StructSpaceCast = new AddrSpaceCastInst( - Struct, PointerType ::get(Context, 0), "structArg.ascast"); - StructSpaceCast->insertAfter(Struct->getIterator()); + AddrSpaceCastInst *StructSpaceCast = nullptr; + Struct = allocateVar(AllocaBlock, AllocaBlock->getFirstInsertionPt(), + StructArgTy, "structArg", 
&StructSpaceCast); + if (StructSpaceCast) params.push_back(StructSpaceCast); - } else { + else params.push_back(Struct); - } unsigned AggIdx = 0; for (Value *input : inputs) { @@ -2013,6 +2030,27 @@ CallInst *CodeExtractor::emitReplacerCall( insertLifetimeMarkersSurroundingCall(oldFunction->getParent(), LifetimesStart, {}, call); + // Deallocate intermediate variables if they need explicit deallocation. + auto deallocVars = [&](BasicBlock *DeallocBlock, + BasicBlock::iterator DeallocIP) { + int Index = 0; + for (Value *Output : outputs) { + if (!StructValues.contains(Output)) + deallocateVar(DeallocBlock, DeallocIP, ReloadOutputs[Index++], + Output->getType()); + } + + if (Struct) + deallocateVar(DeallocBlock, DeallocIP, Struct, StructArgTy); + }; + + if (DeallocationBlocks.empty()) { + deallocVars(codeReplacer, codeReplacer->end()); + } else { + for (BasicBlock *DeallocationBlock : DeallocationBlocks) + deallocVars(DeallocationBlock, DeallocationBlock->getFirstInsertionPt()); + } + return call; } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 5b22ac31b572e..6c100619d04d7 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -56,8 +56,9 @@ using namespace omp; } #define BODYGENCB_WRAPPER(cb) \ - [&cb](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) -> Error { \ - cb(AllocaIP, CodeGenIP); \ + [&cb](InsertPointTy AllocIP, InsertPointTy CodeGenIP, \ + ArrayRef DeallocIPs) -> Error { \ + cb(AllocIP, CodeGenIP, DeallocIPs); \ return Error::success(); \ } @@ -666,10 +667,11 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) { unsigned NumPrivatizedVars = 0; unsigned NumFinalizationPoints = 0; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumBodiesGenerated; - Builder.restoreIP(AllocaIP); + 
Builder.restoreIP(AllocIP); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); Builder.CreateStore(F->arg_begin(), PrivAI); @@ -717,8 +719,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createParallel( - Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, - nullptr, OMP_PROC_BIND_default, false)); + Loc, AllocaIP, {}, BodyGenCB, PrivCB, FiniCB, + nullptr, nullptr, OMP_PROC_BIND_default, false)); EXPECT_EQ(NumBodiesGenerated, 1U); EXPECT_EQ(NumPrivatizedVars, 1U); @@ -745,8 +747,10 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) { EXPECT_EQ(OutlinedFn->getArg(2)->getType(), PointerType::get(M->getContext(), 0)); EXPECT_EQ(&OutlinedFn->getEntryBlock(), PrivAI->getParent()); - EXPECT_TRUE(OutlinedFn->hasOneUse()); - User *Usr = OutlinedFn->user_back(); + EXPECT_TRUE(OutlinedFn->hasNUses(2)); + User *Usr = *OutlinedFn->users().begin(); + User *WrapperUsr = *++OutlinedFn->users().begin(); + ASSERT_TRUE(isa(Usr)); CallInst *Parallel51CI = dyn_cast(Usr); ASSERT_NE(Parallel51CI, nullptr); @@ -757,6 +761,20 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) { EXPECT_TRUE( isa(Parallel51CI->getArgOperand(0)->stripPointerCasts())); EXPECT_EQ(Parallel51CI, Usr); + + ASSERT_TRUE(isa(WrapperUsr)); + CallInst *OutlinedCI = dyn_cast(WrapperUsr); + ASSERT_NE(OutlinedCI, nullptr); + EXPECT_EQ(OutlinedCI->getCalledFunction(), OutlinedFn); + + Function *WrapperFn = OutlinedCI->getFunction(); + EXPECT_TRUE(WrapperFn->hasInternalLinkage()); + EXPECT_EQ(WrapperFn->arg_size(), 2U); + EXPECT_EQ(WrapperFn->getArg(0)->getType(), + IntegerType::getInt16Ty(M->getContext())); + EXPECT_EQ(WrapperFn->getArg(1)->getType(), + IntegerType::getInt32Ty(M->getContext())); + M->setDataLayout(oldDLStr); } @@ -780,10 +798,11 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimple) { unsigned NumPrivatizedVars = 0; unsigned NumFinalizationPoints = 0; - auto BodyGenCB = [&](InsertPointTy 
AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumBodiesGenerated; - Builder.restoreIP(AllocaIP); + Builder.restoreIP(AllocIP); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); Builder.CreateStore(F->arg_begin(), PrivAI); @@ -831,8 +850,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimple) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createParallel( - Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, - nullptr, OMP_PROC_BIND_default, false)); + Loc, AllocaIP, {}, BodyGenCB, PrivCB, FiniCB, + nullptr, nullptr, OMP_PROC_BIND_default, false)); EXPECT_EQ(NumBodiesGenerated, 1U); EXPECT_EQ(NumPrivatizedVars, 1U); EXPECT_EQ(NumFinalizationPoints, 1U); @@ -889,7 +908,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) { unsigned NumOuterBodiesGenerated = 0; unsigned NumFinalizationPoints = 0; - auto InnerBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto InnerBodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumInnerBodiesGenerated; return Error::success(); }; @@ -912,7 +932,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) { return Error::success(); }; - auto OuterBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto OuterBodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumOuterBodiesGenerated; Builder.restoreIP(CodeGenIP); BasicBlock *CGBB = CodeGenIP.getBlock(); @@ -921,7 +942,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createParallel(InsertPointTy(CGBB, CGBB->end()), AllocaIP, + OMPBuilder.createParallel(InsertPointTy(CGBB, CGBB->end()), AllocIP, {}, InnerBodyGenCB, PrivCB, FiniCB, nullptr, nullptr, OMP_PROC_BIND_default, false)); @@ -933,7 +954,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested) { 
F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createParallel( - Loc, AllocaIP, BODYGENCB_WRAPPER(OuterBodyGenCB), + Loc, AllocaIP, {}, BODYGENCB_WRAPPER(OuterBodyGenCB), PrivCB, FiniCB, nullptr, nullptr, OMP_PROC_BIND_default, false)); @@ -991,7 +1012,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { unsigned NumOuterBodiesGenerated = 0; unsigned NumFinalizationPoints = 0; - auto InnerBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto InnerBodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumInnerBodiesGenerated; return Error::success(); }; @@ -1014,7 +1036,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { return Error::success(); }; - auto OuterBodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto OuterBodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumOuterBodiesGenerated; Builder.restoreIP(CodeGenIP); BasicBlock *CGBB = CodeGenIP.getBlock(); @@ -1027,18 +1050,18 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP1, - OMPBuilder.createParallel(InsertPointTy(CGBB, CGBB->end()), AllocaIP, + OMPBuilder.createParallel(InsertPointTy(CGBB, CGBB->end()), AllocIP, {}, InnerBodyGenCB, PrivCB, FiniCB, nullptr, nullptr, OMP_PROC_BIND_default, false)); Builder.restoreIP(AfterIP1); Builder.CreateBr(NewBB1); - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP2, - OMPBuilder.createParallel( - InsertPointTy(NewBB1, NewBB1->end()), AllocaIP, - InnerBodyGenCB, PrivCB, FiniCB, nullptr, nullptr, - OMP_PROC_BIND_default, false)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP2, + OMPBuilder.createParallel(InsertPointTy(NewBB1, NewBB1->end()), AllocIP, + {}, InnerBodyGenCB, PrivCB, FiniCB, nullptr, + nullptr, OMP_PROC_BIND_default, false)); Builder.restoreIP(AfterIP2); 
Builder.CreateBr(NewBB2); @@ -1048,7 +1071,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelNested2Inner) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createParallel( - Loc, AllocaIP, BODYGENCB_WRAPPER(OuterBodyGenCB), + Loc, AllocaIP, {}, BODYGENCB_WRAPPER(OuterBodyGenCB), PrivCB, FiniCB, nullptr, nullptr, OMP_PROC_BIND_default, false)); @@ -1113,10 +1136,11 @@ TEST_F(OpenMPIRBuilderTest, ParallelIfCond) { unsigned NumPrivatizedVars = 0; unsigned NumFinalizationPoints = 0; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumBodiesGenerated; - Builder.restoreIP(AllocaIP); + Builder.restoreIP(AllocIP); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); Builder.CreateStore(F->arg_begin(), PrivAI); @@ -1165,7 +1189,7 @@ TEST_F(OpenMPIRBuilderTest, ParallelIfCond) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createParallel(Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, + OMPBuilder.createParallel(Loc, AllocaIP, {}, BodyGenCB, PrivCB, FiniCB, Builder.CreateIsNotNull(F->arg_begin()), nullptr, OMP_PROC_BIND_default, false)); @@ -1221,7 +1245,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelCancelBarrier) { unsigned NumFinalizationPoints = 0; CallInst *CheckedBarrier = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumBodiesGenerated; Builder.restoreIP(CodeGenIP); @@ -1289,11 +1314,12 @@ TEST_F(OpenMPIRBuilderTest, ParallelCancelBarrier) { IRBuilder<>::InsertPoint AllocaIP(&F->getEntryBlock(), F->getEntryBlock().getFirstInsertionPt()); - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createParallel( - Loc, AllocaIP, BODYGENCB_WRAPPER(BodyGenCB), PrivCB, - 
FiniCB, Builder.CreateIsNotNull(F->arg_begin()), - nullptr, OMP_PROC_BIND_default, true)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createParallel(Loc, AllocaIP, {}, BODYGENCB_WRAPPER(BodyGenCB), + PrivCB, FiniCB, + Builder.CreateIsNotNull(F->arg_begin()), + nullptr, OMP_PROC_BIND_default, true)); EXPECT_EQ(NumBodiesGenerated, 1U); EXPECT_EQ(NumPrivatizedVars, 0U); @@ -1359,7 +1385,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelForwardAsPointers) { Value *StructPtrVal = Builder.CreateCall(RetStructPtrFunc); Instruction *Internal; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { IRBuilder<>::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); Internal = Builder.CreateCall(TakeI32Func, I32Val); @@ -1379,8 +1406,8 @@ TEST_F(OpenMPIRBuilderTest, ParallelForwardAsPointers) { F->getEntryBlock().getFirstInsertionPt()); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createParallel( - Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, - nullptr, OMP_PROC_BIND_default, false)); + Loc, AllocaIP, {}, BodyGenCB, PrivCB, FiniCB, + nullptr, nullptr, OMP_PROC_BIND_default, false)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -2893,9 +2920,10 @@ TEST_F(OpenMPIRBuilderTest, MasterDirective) { BasicBlock *EntryBB = nullptr; BasicBlock *ThenBB = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - if (AllocaIP.isSet()) - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + if (AllocIP.isSet()) + Builder.restoreIP(AllocIP); else Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); @@ -2974,9 +3002,10 @@ TEST_F(OpenMPIRBuilderTest, MaskedDirective) { BasicBlock *EntryBB = nullptr; BasicBlock *ThenBB = nullptr; 
- auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - if (AllocaIP.isSet()) - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + if (AllocIP.isSet()) + Builder.restoreIP(AllocIP); else Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); @@ -3053,7 +3082,8 @@ TEST_F(OpenMPIRBuilderTest, CriticalDirective) { AllocaInst *PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { // actual start for bodyCB llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock(); llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint(); @@ -3304,7 +3334,8 @@ TEST_F(OpenMPIRBuilderTest, OrderedDirectiveThreads) { AllocaInst *PrivAI = Builder.CreateAlloca(F->arg_begin()->getType(), nullptr, "priv.inst"); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock(); llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint(); EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst); @@ -3378,7 +3409,8 @@ TEST_F(OpenMPIRBuilderTest, OrderedDirectiveSimd) { AllocaInst *PrivAI = Builder.CreateAlloca(F->arg_begin()->getType(), nullptr, "priv.inst"); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock(); llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint(); EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst); @@ -3485,9 +3517,10 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) { BasicBlock *EntryBB = nullptr; BasicBlock *ThenBB = 
nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - if (AllocaIP.isSet()) - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + if (AllocIP.isSet()) + Builder.restoreIP(AllocIP); else Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); @@ -3578,9 +3611,10 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) { BasicBlock *EntryBB = nullptr; BasicBlock *ThenBB = nullptr; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - if (AllocaIP.isSet()) - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + if (AllocIP.isSet()) + Builder.restoreIP(AllocIP); else Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); @@ -3699,9 +3733,10 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) { Function *CopyFunc = Function::Create(CopyFuncTy, Function::PrivateLinkage, "copy_var", *M); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - if (AllocaIP.isSet()) - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + if (AllocIP.isSet()) + Builder.restoreIP(AllocIP); else Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); @@ -4564,8 +4599,9 @@ TEST_F(OpenMPIRBuilderTest, CreateTeams) { AllocaInst *ValPtr128 = Builder.CreateAlloca(Builder.getInt128Ty()); Value *Val128 = Builder.CreateLoad(Builder.getInt128Ty(), ValPtr128, "load"); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + Builder.restoreIP(AllocIP); 
AllocaInst *Local128 = Builder.CreateAlloca(Builder.getInt128Ty(), nullptr, "bodygen.alloca128"); @@ -4646,7 +4682,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithThreadLimit) { Function::Create(FunctionType::get(Builder.getVoidTy(), false), GlobalValue::ExternalLinkage, "fakeFunction", M.get()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -4703,7 +4740,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsUpper) { Function::Create(FunctionType::get(Builder.getVoidTy(), false), GlobalValue::ExternalLinkage, "fakeFunction", M.get()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -4766,7 +4804,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsBoth) { Value *NumTeamsUpper = Builder.CreateAdd(F->arg_begin(), Builder.getInt32(10), "numTeamsUpper"); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -4834,7 +4873,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsAndThreadLimit) { Function::Create(FunctionType::get(Builder.getVoidTy(), false), GlobalValue::ExternalLinkage, "fakeFunction", M.get()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -4892,7 +4932,8 @@ TEST_F(OpenMPIRBuilderTest, 
CreateTeamsWithIfCondition) { Function::Create(FunctionType::get(Builder.getVoidTy(), false), GlobalValue::ExternalLinkage, "fakeFunction", M.get()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -4960,7 +5001,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTeamsWithIfConditionAndNumTeams) { Function::Create(FunctionType::get(Builder.getVoidTy(), false), GlobalValue::ExternalLinkage, "fakeFunction", M.get()); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); Builder.CreateCall(FakeFunction, {}); return Error::success(); @@ -5179,7 +5221,8 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) { // xor of thread-id; // and store the result in global variables. InsertPointTy BodyIP, BodyAllocaIP; - auto BodyGenCB = [&](InsertPointTy InnerAllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy InnerAllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); @@ -5197,7 +5240,7 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) { Builder.CreateStore(Xor, XorReduced); BodyIP = Builder.saveIP(); - BodyAllocaIP = InnerAllocaIP; + BodyAllocaIP = InnerAllocIP; return Error::success(); }; @@ -5233,12 +5276,12 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) { // Do nothing in finalization. 
auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); }; - ASSERT_EXPECTED_INIT( - OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createParallel(Loc, OuterAllocaIP, BodyGenCB, PrivCB, FiniCB, - /* IfCondition */ nullptr, - /* NumThreads */ nullptr, OMP_PROC_BIND_default, - /* IsCancellable */ false)); + ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createParallel( + Loc, OuterAllocaIP, {}, BodyGenCB, PrivCB, FiniCB, + /* IfCondition */ nullptr, + /* NumThreads */ nullptr, OMP_PROC_BIND_default, + /* IsCancellable */ false)); Builder.restoreIP(AfterIP); OpenMPIRBuilder::ReductionInfo ReductionInfos[] = { @@ -5558,8 +5601,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { Builder.CreateStore(Builder.getInt32(1), XorReduced); InsertPointTy FirstBodyIP, FirstBodyAllocaIP; - auto FirstBodyGenCB = [&](InsertPointTy InnerAllocaIP, - InsertPointTy CodeGenIP) { + auto FirstBodyGenCB = [&](InsertPointTy InnerAllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); @@ -5574,13 +5617,14 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { Builder.CreateStore(Sum, SumReduced); FirstBodyIP = Builder.saveIP(); - FirstBodyAllocaIP = InnerAllocaIP; + FirstBodyAllocaIP = InnerAllocIP; return Error::success(); }; InsertPointTy SecondBodyIP, SecondBodyAllocaIP; - auto SecondBodyGenCB = [&](InsertPointTy InnerAllocaIP, - InsertPointTy CodeGenIP) { + auto SecondBodyGenCB = [&](InsertPointTy InnerAllocIP, + InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard Guard(Builder); Builder.restoreIP(CodeGenIP); @@ -5593,7 +5637,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { Builder.CreateStore(Xor, XorReduced); SecondBodyIP = Builder.saveIP(); - SecondBodyAllocaIP = InnerAllocaIP; + SecondBodyAllocaIP = InnerAllocIP; return Error::success(); }; @@ -5633,14 +5677,14 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) { 
ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP1, - OMPBuilder.createParallel(Loc, OuterAllocaIP, FirstBodyGenCB, PrivCB, + OMPBuilder.createParallel(Loc, OuterAllocaIP, {}, FirstBodyGenCB, PrivCB, FiniCB, /* IfCondition */ nullptr, /* NumThreads */ nullptr, OMP_PROC_BIND_default, /* IsCancellable */ false)); Builder.restoreIP(AfterIP1); ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP2, - OMPBuilder.createParallel({Builder.saveIP(), DL}, OuterAllocaIP, + OMPBuilder.createParallel({Builder.saveIP(), DL}, OuterAllocaIP, {}, SecondBodyGenCB, PrivCB, FiniCB, /* IfCondition */ nullptr, /* NumThreads */ nullptr, OMP_PROC_BIND_default, @@ -5734,7 +5778,8 @@ TEST_F(OpenMPIRBuilderTest, CreateSectionsSimple) { llvm::SmallVector SectionCBVector; auto FiniCB = [&](InsertPointTy IP) { return Error::success(); }; - auto SectionCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto SectionCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; SectionCBVector.push_back(SectionCB); @@ -5779,7 +5824,8 @@ TEST_F(OpenMPIRBuilderTest, CreateSections) { EXPECT_NE(IPBB->end(), IP.getPoint()); }; - auto SectionCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto SectionCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { ++NumBodiesGenerated; CaseBBs.push_back(CodeGenIP.getBlock()); SwitchBB = CodeGenIP.getBlock()->getSinglePredecessor(); @@ -6119,7 +6165,7 @@ TEST_F(OpenMPIRBuilderTest, TargetEnterData) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTargetData( - Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(DeviceID), + Loc, AllocaIP, Builder.saveIP(), {}, Builder.getInt64(DeviceID), /* IfCond= */ nullptr, Info, GenMapInfoCB, CustomMapperCB, &RTLFunc)); Builder.restoreIP(AfterIP); @@ -6182,7 +6228,7 @@ TEST_F(OpenMPIRBuilderTest, TargetExitData) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, 
OMPBuilder.createTargetData( - Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(DeviceID), + Loc, AllocaIP, Builder.saveIP(), {}, Builder.getInt64(DeviceID), /* IfCond= */ nullptr, Info, GenMapInfoCB, CustomMapperCB, &RTLFunc)); Builder.restoreIP(AfterIP); @@ -6293,7 +6339,7 @@ TEST_F(OpenMPIRBuilderTest, TargetDataRegion) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, TargetDataIP1, - OMPBuilder.createTargetData(Loc, AllocaIP, Builder.saveIP(), + OMPBuilder.createTargetData(Loc, AllocaIP, Builder.saveIP(), {}, Builder.getInt64(DeviceID), /* IfCond= */ nullptr, Info, GenMapInfoCB, CustomMapperCB, nullptr, BodyCB)); @@ -6322,7 +6368,7 @@ TEST_F(OpenMPIRBuilderTest, TargetDataRegion) { }; ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, TargetDataIP2, - OMPBuilder.createTargetData(Loc, AllocaIP, Builder.saveIP(), + OMPBuilder.createTargetData(Loc, AllocaIP, Builder.saveIP(), {}, Builder.getInt64(DeviceID), /* IfCond= */ nullptr, Info, GenMapInfoCB, CustomMapperCB, nullptr, BodyTargetCB)); @@ -6373,8 +6419,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { Builder.CreateStore(Builder.getInt32(10), APtr); Builder.CreateStore(Builder.getInt32(20), BPtr); - auto BodyGenCB = [&](InsertPointTy AllocaIP, - InsertPointTy CodeGenIP) -> InsertPointTy { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> InsertPointTy { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); Builder.restoreIP(CodeGenIP); @@ -6393,7 +6439,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { auto SimpleArgAccessorCB = [&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal, llvm::OpenMPIRBuilder::InsertPointTy AllocaIP, - llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) { + llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP, + llvm::ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); if (!OMPBuilder.Config.isTargetDevice()) { 
@@ -6444,10 +6491,10 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), - Builder.saveIP(), Info, EntryInfo, DefaultAttrs, - RuntimeAttrs, /*IfCond=*/nullptr, Inputs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, - CustomMapperCB, {}, false)); + Builder.saveIP(), {}, Info, EntryInfo, + DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, + Inputs, GenMapInfoCB, BodyGenCB, + SimpleArgAccessorCB, CustomMapperCB, {}, false)); EXPECT_EQ(DL, Builder.getCurrentDebugLocation()); Builder.restoreIP(AfterIP); @@ -6564,7 +6611,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { auto SimpleArgAccessorCB = [&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal, llvm::OpenMPIRBuilder::InsertPointTy AllocaIP, - llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) { + llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP, + llvm::ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); if (!OMPBuilder.Config.isTargetDevice()) { @@ -6598,8 +6646,9 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { }; auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; - auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, - OpenMPIRBuilder::InsertPointTy CodeGenIP) + auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> OpenMPIRBuilder::InsertPointTy { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); @@ -6624,7 +6673,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - Info, EntryInfo, DefaultAttrs, RuntimeAttrs, + {}, Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, BodyGenCB, 
SimpleArgAccessorCB, CustomMapperCB, {}, false)); @@ -6710,7 +6759,14 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { Instruction *Value1 = &*OutlinedBlock->getFirstNonPHIIt(); EXPECT_EQ(Value1, Value); EXPECT_EQ(Value1->getNextNode(), TargetStore); - auto *Deinit = TargetStore->getNextNode(); + + auto *TargetExitBlockBr = TargetStore->getNextNode(); + EXPECT_TRUE(isa(TargetExitBlockBr)); + + auto *TargetExitBlock = TargetExitBlockBr->getSuccessor(0); + EXPECT_EQ(TargetExitBlock->getName(), "target.exit"); + + Instruction *Deinit = &*TargetExitBlock->getFirstNonPHIIt(); EXPECT_NE(Deinit, nullptr); auto *DeinitCall = dyn_cast(Deinit); @@ -6758,18 +6814,19 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) { IRBuilder<> Builder(BB); auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; - auto BodyGenCB = [&](InsertPointTy, - InsertPointTy CodeGenIP) -> InsertPointTy { + auto BodyGenCB = [&](InsertPointTy, InsertPointTy CodeGenIP, + ArrayRef) -> InsertPointTy { Builder.restoreIP(CodeGenIP); return Builder.saveIP(); }; - auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&, - OpenMPIRBuilder::InsertPointTy, - OpenMPIRBuilder::InsertPointTy CodeGenIP) { - Builder.restoreIP(CodeGenIP); - return Builder.saveIP(); - }; + auto SimpleArgAccessorCB = + [&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + llvm::ArrayRef) { + Builder.restoreIP(CodeGenIP); + return Builder.saveIP(); + }; SmallVector Inputs; OpenMPIRBuilder::MapInfosTy CombinedInfos; @@ -6792,10 +6849,10 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), - Builder.saveIP(), Info, EntryInfo, DefaultAttrs, - RuntimeAttrs, /*IfCond=*/nullptr, Inputs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, - CustomMapperCB, {})); + Builder.saveIP(), {}, Info, EntryInfo, + DefaultAttrs, RuntimeAttrs, 
/*IfCond=*/nullptr, + Inputs, GenMapInfoCB, BodyGenCB, + SimpleArgAccessorCB, CustomMapperCB, {})); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -6864,12 +6921,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) { Function *OutlinedFn = nullptr; SmallVector CapturedArgs; - auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&, - OpenMPIRBuilder::InsertPointTy, - OpenMPIRBuilder::InsertPointTy CodeGenIP) { - Builder.restoreIP(CodeGenIP); - return Builder.saveIP(); - }; + auto SimpleArgAccessorCB = + [&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + llvm::ArrayRef) { + Builder.restoreIP(CodeGenIP); + return Builder.saveIP(); + }; OpenMPIRBuilder::MapInfosTy CombinedInfos; auto GenMapInfoCB = @@ -6879,7 +6937,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) { auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy, - OpenMPIRBuilder::InsertPointTy CodeGenIP) + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef) -> OpenMPIRBuilder::InsertPointTy { Builder.restoreIP(CodeGenIP); OutlinedFn = CodeGenIP.getBlock()->getParent(); @@ -6900,8 +6959,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) { ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget( - Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, Info, - EntryInfo, DefaultAttrs, RuntimeAttrs, + Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, {}, + Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, CustomMapperCB, {})); Builder.restoreIP(AfterIP); @@ -6968,7 +7027,8 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { auto SimpleArgAccessorCB = [&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal, llvm::OpenMPIRBuilder::InsertPointTy AllocaIP, - llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) { + llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP, + 
llvm::ArrayRef DeallocIPs) { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); if (!OMPBuilder.Config.isTargetDevice()) { @@ -7004,8 +7064,9 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { llvm::Value *RaiseAlloca = nullptr; auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; - auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, - OpenMPIRBuilder::InsertPointTy CodeGenIP) + auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) -> OpenMPIRBuilder::InsertPointTy { IRBuilderBase::InsertPointGuard guard(Builder); Builder.SetCurrentDebugLocation(llvm::DebugLoc()); @@ -7031,7 +7092,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - Info, EntryInfo, DefaultAttrs, RuntimeAttrs, + {}, Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, CustomMapperCB, {}, false)); @@ -7110,7 +7171,14 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { EXPECT_TRUE(isa(Load2)); EXPECT_EQ(Load2, Value); EXPECT_EQ(Load2->getNextNode(), TargetStore); - auto *Deinit = TargetStore->getNextNode(); + + auto *TargetExitBlockBr = TargetStore->getNextNode(); + EXPECT_TRUE(isa(TargetExitBlockBr)); + + auto *TargetExitBlock = TargetExitBlockBr->getSuccessor(0); + EXPECT_EQ(TargetExitBlock->getName(), "target.exit"); + + Instruction *Deinit = &*TargetExitBlock->getFirstNonPHIIt(); EXPECT_NE(Deinit, nullptr); auto *DeinitCall = dyn_cast(Deinit); @@ -7141,8 +7209,9 @@ TEST_F(OpenMPIRBuilderTest, CreateTask) { Value *Val128 = Builder.CreateLoad(Builder.getInt128Ty(), ValPtr128, "bodygen.load"); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, 
InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + Builder.restoreIP(AllocIP); AllocaInst *Local128 = Builder.CreateAlloca(Builder.getInt128Ty(), nullptr, "bodygen.alloca128"); @@ -7170,7 +7239,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTask) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTask( Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), - BodyGenCB)); + /*DeallocIPs=*/{}, BodyGenCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); Builder.CreateRetVoid(); @@ -7270,7 +7339,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskNoArgs) { F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; @@ -7282,7 +7352,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskNoArgs) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTask( Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), - BodyGenCB)); + /*DeallocIPs=*/{}, BodyGenCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); Builder.CreateRetVoid(); @@ -7306,7 +7376,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskUntied) { OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; BasicBlock *AllocaBB = Builder.GetInsertBlock(); @@ -7317,7 +7388,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskUntied) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTask( Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), - BodyGenCB, + /*DeallocIPs=*/{}, BodyGenCB, /*Tied=*/false)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -7343,7 +7414,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskDepend) { OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = 
[&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; BasicBlock *AllocaBB = Builder.GetInsertBlock(); @@ -7361,7 +7433,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskDepend) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTask( Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), - BodyGenCB, + /*DeallocIPs=*/{}, BodyGenCB, /*Tied=*/false, /*Final*/ nullptr, /*IfCondition*/ nullptr, DDS)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -7424,7 +7496,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskFinal) { OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "alloca.split"); @@ -7435,7 +7508,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskFinal) { ConstantInt::get(Type::getInt32Ty(M->getContext()), 0U)); OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL); ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createTask(Loc, AllocaIP, BodyGenCB, + OMPBuilder.createTask(Loc, AllocaIP, /*DeallocIPs=*/{}, + BodyGenCB, /*Tied=*/false, Final)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -7483,7 +7557,8 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskIfCondition) { OMPBuilder.initialize(); F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { return Error::success(); }; BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "alloca.split"); @@ -7493,10 +7568,10 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskIfCondition) { CmpInst::Predicate::ICMP_EQ, F->getArg(0), 
ConstantInt::get(Type::getInt32Ty(M->getContext()), 0U)); OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL); - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createTask(Loc, AllocaIP, BodyGenCB, - /*Tied=*/false, /*Final=*/nullptr, - IfCondition)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createTask(Loc, AllocaIP, /*DeallocIPs=*/{}, BodyGenCB, + /*Tied=*/false, /*Final=*/nullptr, IfCondition)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); Builder.CreateRetVoid(); @@ -7562,8 +7637,9 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) { Value *InternalStoreInst, *InternalLoad32, *InternalLoad128, *InternalIfCmp; - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + Builder.restoreIP(AllocIP); AllocaInst *Local128 = Builder.CreateAlloca(Builder.getInt128Ty(), nullptr, "bodygen.alloca128"); @@ -7591,7 +7667,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTaskgroup( - Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), + Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), {}, BodyGenCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -7654,14 +7730,16 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroupWithTasks) { F->setName("func"); IRBuilder<> Builder(BB); - auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { - Builder.restoreIP(AllocaIP); + auto BodyGenCB = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { + Builder.restoreIP(AllocIP); AllocaInst *Alloca32 = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, "bodygen.alloca32"); AllocaInst *Alloca64 = Builder.CreateAlloca(Builder.getInt64Ty(), nullptr, "bodygen.alloca64"); Builder.restoreIP(CodeGenIP); - auto TaskBodyGenCB1 = 
[&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto TaskBodyGenCB1 = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); LoadInst *LoadValue = Builder.CreateLoad(Alloca64->getAllocatedType(), Alloca64); @@ -7670,11 +7748,13 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroupWithTasks) { return Error::success(); }; OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL); - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, TaskIP1, - OMPBuilder.createTask(Loc, AllocaIP, TaskBodyGenCB1)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, TaskIP1, + OMPBuilder.createTask(Loc, AllocIP, DeallocIPs, TaskBodyGenCB1)); Builder.restoreIP(TaskIP1); - auto TaskBodyGenCB2 = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + auto TaskBodyGenCB2 = [&](InsertPointTy AllocIP, InsertPointTy CodeGenIP, + ArrayRef DeallocIPs) { Builder.restoreIP(CodeGenIP); LoadInst *LoadValue = Builder.CreateLoad(Alloca32->getAllocatedType(), Alloca32); @@ -7683,8 +7763,9 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroupWithTasks) { return Error::success(); }; OpenMPIRBuilder::LocationDescription Loc2(Builder.saveIP(), DL); - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, TaskIP2, - OMPBuilder.createTask(Loc2, AllocaIP, TaskBodyGenCB2)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, TaskIP2, + OMPBuilder.createTask(Loc2, AllocIP, DeallocIPs, TaskBodyGenCB2)); Builder.restoreIP(TaskIP2); }; @@ -7695,7 +7776,7 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskgroupWithTasks) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTaskgroup( - Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), + Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()), {}, BODYGENCB_WRAPPER(BodyGenCB))); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); diff --git a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp index 
9ea8de3da1e5b..d63e346e31a1d 100644 --- a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp +++ b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp @@ -711,7 +711,8 @@ TEST(CodeExtractor, OpenMPAggregateArgs) { /* AssumptionCache */ nullptr, /* AllowVarArgs */ true, /* AllowAlloca */ true, - /* AllocaBlock*/ &Func->getEntryBlock(), + /* AllocationBlock*/ &Func->getEntryBlock(), + /* DeallocationBlocks */ {}, /* Suffix */ ".outlined", /* ArgsInZeroAddressSpace */ true); diff --git a/mlir/docs/Passes.md b/mlir/docs/Passes.md index 9df32666415bb..f3d8a75c65840 100644 --- a/mlir/docs/Passes.md +++ b/mlir/docs/Passes.md @@ -72,6 +72,10 @@ This document describes the available MLIR passes and their contracts. [include "MemRefPasses.md"] +## 'omp' Dialect Passes + +[include "OpenMPPasses.md"] + ## 'shard' Dialect Passes [include "ShardPasses.md"] diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td index 8e43c4284d078..bfee763290757 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td @@ -20,6 +20,7 @@ #define OPENMP_CLAUSES include "mlir/Dialect/OpenMP/OpenMPOpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/SymbolInterfaces.td" //===----------------------------------------------------------------------===// @@ -547,6 +548,58 @@ class OpenMP_HasDeviceAddrClauseSkip< def OpenMP_HasDeviceAddrClause : OpenMP_HasDeviceAddrClauseSkip<>; +//===----------------------------------------------------------------------===// +// Not in the spec: Clause-like structure to hold heap allocation information. 
+//===----------------------------------------------------------------------===// + +class OpenMP_HeapAllocClauseSkip< + bit traits = false, bit arguments = false, bit assemblyFormat = false, + bit description = false, bit extraClassDeclaration = false + > : OpenMP_Clause { + let traits = [ + MemoryEffects<[MemAlloc]> + ]; + + let arguments = (ins + TypeAttr:$in_type, + OptionalAttr:$uniq_name, + OptionalAttr:$bindc_name, + Variadic:$typeparams, + Variadic:$shape + ); + + // The custom parser doesn't parse `uniq_name` and `bindc_name`. This is + // handled by the attr-dict, which must be present in the operation's + // `assemblyFormat`. + let reqAssemblyFormat = [{ + custom($in_type, $typeparams, type($typeparams), $shape, + type($shape)) + }]; + + let extraClassDeclaration = [{ + mlir::Type getAllocatedType() { return getInTypeAttr().getValue(); } + }]; + + let description = [{ + The `in_type` is the type of the object for which memory is being allocated. + For arrays, this can be a static or dynamic array type. + + The optional `uniq_name` is a unique name for the allocated memory. + + The optional `bindc_name` is a name used for C interoperability. + + The `typeparams` are runtime type parameters for polymorphic or + parameterized types. These are typically integer values that define aspects + of a type not fixed at compile time. + + The `shape` holds runtime shape operands for dynamic arrays. Each operand is + an integer value representing the extent of a specific dimension. 
+ }]; +} + +def OpenMP_HeapAllocClause : OpenMP_HeapAllocClauseSkip<>; + //===----------------------------------------------------------------------===// // V5.2: [5.4.7] `inclusive` clause //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td index ada3a3edd8a30..309135f0c729c 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td @@ -281,31 +281,21 @@ def ScheduleModifier : OpenMP_I32EnumAttr< def ScheduleModifierAttr : OpenMP_EnumAttr; //===----------------------------------------------------------------------===// -// target_region_flags enum. +// target_exec_mode enum. //===----------------------------------------------------------------------===// -def TargetRegionFlagsNone : I32BitEnumAttrCaseNone<"none">; -def TargetRegionFlagsGeneric : I32BitEnumAttrCaseBit<"generic", 0>; -def TargetRegionFlagsSpmd : I32BitEnumAttrCaseBit<"spmd", 1>; -def TargetRegionFlagsTripCount : I32BitEnumAttrCaseBit<"trip_count", 2>; -def TargetRegionFlagsNoLoop : I32BitEnumAttrCaseBit<"no_loop", 3>; - -def TargetRegionFlags : OpenMP_BitEnumAttr< - "TargetRegionFlags", - "These flags describe properties of the target kernel. " - "TargetRegionFlagsGeneric - denotes generic kernel. " - "TargetRegionFlagsSpmd - denotes SPMD kernel. " - "TargetRegionFlagsNoLoop - denotes kernel where " - "num_teams * num_threads >= loop_trip_count. It allows the conversion " - "of loops into sequential code by ensuring that each team/thread " - "executes at most one iteration. 
" - "TargetRegionFlagsTripCount - checks if the loop trip count should be " - "calculated.", [ - TargetRegionFlagsNone, - TargetRegionFlagsGeneric, - TargetRegionFlagsSpmd, - TargetRegionFlagsTripCount, - TargetRegionFlagsNoLoop +def TargetExecModeBare : I32EnumAttrCase<"bare", 0>; +def TargetExecModeGeneric : I32EnumAttrCase<"generic", 1>; +def TargetExecModeSpmd : I32EnumAttrCase<"spmd", 2>; +def TargetExecModeSpmdNoLoop : I32EnumAttrCase<"no_loop", 3>; + +def TargetExecMode : OpenMP_I32EnumAttr< + "TargetExecMode", + "target execution mode, mirroring the `OMPTgtExecModeFlags` LLVM enum", [ + TargetExecModeBare, + TargetExecModeGeneric, + TargetExecModeSpmd, + TargetExecModeSpmdNoLoop, ]>; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 377f1febf6b8f..c59bee75fa6c7 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1509,13 +1509,17 @@ def TargetOp : OpenMP_Op<"target", traits = [ /// operations, the top level one will be the one captured. Operation *getInnermostCapturedOmpOp(); - /// Infers the kernel type (Generic, SPMD or Generic-SPMD) based on the - /// contents of the target region. + /// Infers the kernel type (Bare, Generic or SPMD) based on the contents of + /// the target region. /// /// \param capturedOp result of a still valid (no modifications made to any /// nested operations) previous call to `getInnermostCapturedOmpOp()`. - static ::mlir::omp::TargetRegionFlags - getKernelExecFlags(Operation *capturedOp); + /// \param hostEvalTripCount output argument to store whether this kernel + /// wraps a loop whose bounds must be evaluated on the host prior to + /// launching it. 
+ static ::mlir::omp::TargetExecMode + getKernelExecFlags(Operation *capturedOp, + bool *hostEvalTripCount = nullptr); }] # clausesExtraClassDeclaration; let assemblyFormat = clausesAssemblyFormat # [{ @@ -2111,59 +2115,45 @@ def AllocateDirOp : OpenMP_Op<"allocate_dir", [AttrSizedOperandSegments], clause // TargetAllocMemOp //===----------------------------------------------------------------------===// -def TargetAllocMemOp : OpenMP_Op<"target_allocmem", - [MemoryEffects<[MemAlloc]>, AttrSizedOperandSegments]> { +def TargetAllocMemOp : OpenMP_Op<"target_allocmem", traits = [ + AttrSizedOperandSegments + ], clauses = [ + OpenMP_HeapAllocClause + ]> { let summary = "allocate storage on an openmp device for an object of a given type"; let description = [{ - Allocates memory on the specified OpenMP device for an object of the given type. - Returns an integer value representing the device pointer to the allocated memory. - The memory is uninitialized after allocation. Operations must be paired with - `omp.target_freemem` to avoid memory leaks. - - * `$device`: The integer ID of the OpenMP device where the memory will be allocated. - * `$in_type`: The type of the object for which memory is being allocated. - For arrays, this can be a static or dynamic array type. - * `$uniq_name`: An optional unique name for the allocated memory. - * `$bindc_name`: An optional name used for C interoperability. - * `$typeparams`: Runtime type parameters for polymorphic or parameterized types. - These are typically integer values that define aspects of a type not fixed at compile time. - * `$shape`: Runtime shape operands for dynamic arrays. - Each operand is an integer value representing the extent of a specific dimension. - - ```mlir - // Allocate a static 3x3 integer vector on device 0 - %device_0 = arith.constant 0 : i32 - %ptr_static = omp.target_allocmem %device_0 : i32, vector<3x3xi32> - // ... use %ptr_static ... 
- omp.target_freemem %device_0, %ptr_static : i32, i64 - - // Allocate a dynamic 2D Fortran array (fir.array) on device 1 - %device_1 = arith.constant 1 : i32 - %rows = arith.constant 10 : index - %cols = arith.constant 20 : index - %ptr_dynamic = omp.target_allocmem %device_1 : i32, !fir.array, %rows, %cols : index, index - // ... use %ptr_dynamic ... - omp.target_freemem %device_1, %ptr_dynamic : i32, i64 - ``` - }]; + Allocates memory on the specified OpenMP device for an object of the given + type. Returns an integer value representing the device pointer to the + allocated memory. The memory is uninitialized after allocation. Operations + must be paired with `omp.target_freemem` to avoid memory leaks. - let arguments = (ins - Arg:$device, - TypeAttr:$in_type, - OptionalAttr:$uniq_name, - OptionalAttr:$bindc_name, - Variadic:$typeparams, - Variadic:$shape - ); - let results = (outs I64); + ```mlir + // Allocate a static 3x3 integer vector on device 0 + %device_0 = arith.constant 0 : i32 + %ptr_static = omp.target_allocmem %device_0 : i32, vector<3x3xi32> + // ... use %ptr_static ... + omp.target_freemem %device_0, %ptr_static : i32, i64 + + // Allocate a dynamic 2D Fortran array (fir.array) on device 1 + %device_1 = arith.constant 1 : i32 + %rows = arith.constant 10 : index + %cols = arith.constant 20 : index + %ptr_dynamic = omp.target_allocmem %device_1 : i32, !fir.array, %rows, %cols : index, index + // ... use %ptr_dynamic ... + omp.target_freemem %device_1, %ptr_dynamic : i32, i64 + ``` - let hasCustomAssemblyFormat = 1; - let hasVerifier = 1; + The `device` is an integer ID of the OpenMP device where the memory will be + allocated. + }] # clausesDescription; - let extraClassDeclaration = [{ - mlir::Type getAllocatedType(); - }]; + let arguments = !con((ins Arg:$device), clausesArgs); + let results = (outs I64); + + // Override inherited assembly format to include `device`. 
+ let assemblyFormat = " $device `:` type($device) `,` " + # clausesReqAssemblyFormat # " attr-dict"; } //===----------------------------------------------------------------------===// @@ -2201,6 +2191,86 @@ def TargetFreeMemOp : OpenMP_Op<"target_freemem", let assemblyFormat = "$device `,` $heapref attr-dict `:` type($device) `,` qualified(type($heapref))"; } +//===----------------------------------------------------------------------===// +// AllocSharedMemOp +//===----------------------------------------------------------------------===// + +def AllocSharedMemOp : OpenMP_Op<"alloc_shared_mem", traits = [ + MemoryEffects<[MemAlloc]> + ]> { + let summary = "allocate storage on shared memory for objects of a given type"; + + let arguments = (ins + TypeAttr:$elem_type, + AnySignlessInteger:$array_size, + ConfinedAttr, [IntPositive]>:$alignment + ); + + let description = [{ + Allocates memory shared across threads of a team for an object of the given + type. Returns a pointer representing the allocated memory. The memory is + uninitialized after allocation. Operations must be paired with + `omp.free_shared_mem` to avoid memory leaks. + + ```mlir + // Allocate an i32 vector with %size elements and aligned to 8 bytes. + %ptr_shared = omp.alloc_shared_mem %size x i32 {alignment = 8} : (i64) -> (!llvm.ptr) + // ... + omp.free_shared_mem %ptr_shared : !llvm.ptr + ``` + + The `elem_type` is the type of the object for which memory is being + allocated. + + The `array_size` is the number of objects to allocate memory for. + + The optional `alignment` is used to specify the alignment for each element. + If not set, the `DataLayout` defaults will be used instead. 
+ }]; + + let results = (outs OpenMP_PointerLikeType); + let assemblyFormat = [{ + $array_size `x` $elem_type attr-dict `:` `(` type($array_size) `)` `->` type(results) + }]; + + let extraClassDeclaration = [{ + mlir::Type getAllocatedType() { return getElemTypeAttr().getValue(); } + }]; + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// FreeSharedMemOp +//===----------------------------------------------------------------------===// + +def FreeSharedMemOp : OpenMP_Op<"free_shared_mem", [MemoryEffects<[MemFree]>]> { + let summary = "free shared memory"; + + let description = [{ + Deallocates shared memory that was previously allocated by an + `omp.alloc_shared_mem` operation. After this operation, the deallocated + memory is in an undefined state and should not be accessed. + It is crucial to ensure that all accesses to the memory region are completed + before `omp.free_shared_mem` is called to avoid undefined behavior. + + ```mlir + // Example of allocating and freeing shared memory. + %ptr_shared = omp.alloc_shared_mem %size x i32 : (i64) -> (!llvm.ptr) + // ... + omp.free_shared_mem %ptr_shared : !llvm.ptr + ``` + + The `heapref` operand represents the pointer to shared memory to be + deallocated, previously returned by `omp.alloc_shared_mem`. 
+ }]; + + let arguments = (ins + Arg:$heapref + ); + let assemblyFormat = "$heapref attr-dict `:` type($heapref)"; + let hasVerifier = 1; +} + //===----------------------------------------------------------------------===// // workdistribute Construct //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt index 22f0d92ea4cbf..9c6a607ce6f2a 100644 --- a/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_TARGET_DEFINITIONS Passes.td) mlir_tablegen(Passes.h.inc -gen-pass-decls -name OpenMP) -add_public_tablegen_target(MLIROpenMPPassIncGen) +add_mlir_dialect_tablegen_target(MLIROpenMPPassIncGen) add_mlir_doc(Passes OpenMPPasses ./ -gen-pass-doc) diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h index 21b6d1f466558..ddbe662be69fc 100644 --- a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h @@ -13,6 +13,10 @@ namespace mlir { +namespace LLVM { +class LLVMFuncOp; +} // namespace LLVM + namespace omp { /// Generate the code for registering conversion passes. 
@@ -23,4 +27,4 @@ namespace omp { } // namespace omp } // namespace mlir -#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_PASSES_H +#endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td index 1fde7e08ab433..498b8a4812caa 100644 --- a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td @@ -23,4 +23,22 @@ def PrepareForOMPOffloadPrivatizationPass : Pass<"omp-offload-privatization-prep }]; let dependentDialects = ["LLVM::LLVMDialect"]; } + +def StackToSharedPass : Pass<"omp-stack-to-shared", "mlir::LLVM::LLVMFuncOp"> { + let summary = "Replaces stack allocations on target devices with shared memory."; + let description = [{ + `llvm.alloca` operations defining values in a non-SPMD target region and + then potentially used inside of an `omp.parallel` region are replaced by + this pass with `omp.alloc_shared_mem` and `omp.free_shared_mem`. This is + also done for top-level function `llvm.alloca`s used in the same way when + the parent function is a target device function. + + This ensures that explicit private allocations, intended to be shared across + threads, use the proper memory space on a target device while supporting the + case of parallel regions indirectly reached from within a target region via + function calls. + }]; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + #endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES diff --git a/mlir/include/mlir/Dialect/OpenMP/Utils/Utils.h b/mlir/include/mlir/Dialect/OpenMP/Utils/Utils.h new file mode 100644 index 0000000000000..ce625c7170efe --- /dev/null +++ b/mlir/include/mlir/Dialect/OpenMP/Utils/Utils.h @@ -0,0 +1,53 @@ +//===- Utils.h - OpenMP dialect utilities -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for various OpenMP utilities. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_OPENMP_UTILS_UTILS_H_ +#define MLIR_DIALECT_OPENMP_UTILS_UTILS_H_ + +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" + +namespace mlir { +namespace omp { + +/// Check whether the value representing an allocation, assumed to have been +/// defined in a shared device context, is used in a manner that would require +/// device shared memory for correctness. +/// +/// When a use takes place inside an omp.parallel region and it's not as a +/// private clause argument, or when it is a reduction argument passed to +/// omp.parallel or a function call argument, then the defining allocation is +/// eligible for replacement with shared memory. +/// +/// \see mlir::omp::opInSharedDeviceContext(). +bool allocaUsesRequireSharedMem(Value alloc); + +/// Check whether the given operation is located in a context where an +/// allocation to be used by multiple threads in a parallel region would have to +/// be placed in device shared memory to be accessible. +/// +/// That means that it is inside of a target device module, it is a non-SPMD +/// target region, is inside of one or it's located in a device function, and it +/// is not inside of a parallel region. +/// +/// This represents a necessary but not sufficient set of conditions to use +/// device shared memory in place of regular allocas. For some variables, the +/// associated OpenMP construct or their uses might also need to be taken into +/// account. +/// +/// \see mlir::omp::allocaUsesRequireSharedMem(). 
+bool opInSharedDeviceContext(Operation &op); + +} // namespace omp +} // namespace mlir + +#endif // MLIR_DIALECT_OPENMP_UTILS_UTILS_H_ diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt index f3c02da458508..31167e6af908b 100644 --- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt @@ -1,20 +1,3 @@ +add_subdirectory(IR) add_subdirectory(Transforms) - -add_mlir_dialect_library(MLIROpenMPDialect - IR/OpenMPDialect.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP - - DEPENDS - omp_gen - MLIROpenMPOpsIncGen - MLIROpenMPOpsInterfacesIncGen - MLIROpenMPTypeInterfacesIncGen - - LINK_LIBS PUBLIC - MLIRIR - MLIRLLVMDialect - MLIRFuncDialect - MLIROpenACCMPCommon - ) +add_subdirectory(Utils) diff --git a/mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt new file mode 100644 index 0000000000000..05923032d9077 --- /dev/null +++ b/mlir/lib/Dialect/OpenMP/IR/CMakeLists.txt @@ -0,0 +1,18 @@ +add_mlir_dialect_library(MLIROpenMPDialect + OpenMPDialect.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP + + DEPENDS + omp_gen + MLIROpenMPOpsIncGen + MLIROpenMPOpsInterfacesIncGen + MLIROpenMPTypeInterfacesIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRLLVMDialect + MLIRFuncDialect + MLIROpenACCMPCommon + ) diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 172f21ff1779e..0d1af4198f1a9 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -797,6 +797,58 @@ static void printNumTasksClause(OpAsmPrinter &p, Operation *op, p, op, numTasksMod, numTasks, numTasksType, &stringifyClauseNumTasksType); } +//===----------------------------------------------------------------------===// +// Parser and printer for Heap Alloc Clause +//===----------------------------------------------------------------------===// + +/// 
operation ::= $in_type ( `(` $typeparams `)` )? ( `,` $shape )? +static ParseResult parseHeapAllocClause( + OpAsmParser &parser, TypeAttr &inTypeAttr, + SmallVectorImpl &typeparams, + SmallVectorImpl &typeparamsTypes, + SmallVectorImpl &shape, + SmallVectorImpl &shapeTypes) { + mlir::Type inType; + if (parser.parseType(inType)) + return mlir::failure(); + inTypeAttr = TypeAttr::get(inType); + + if (!parser.parseOptionalLParen()) { + // parse the LEN params of the derived type. ( : ) + if (parser.parseOperandList(typeparams, OpAsmParser::Delimiter::None) || + parser.parseColonTypeList(typeparamsTypes) || parser.parseRParen()) + return failure(); + } + + if (!parser.parseOptionalComma()) { + // parse size to scale by, vector of n dimensions of type index + if (parser.parseOperandList(shape, OpAsmParser::Delimiter::None)) + return failure(); + + // TODO: This overrides the actual types of the operands, which might cause + // issues when they don't match. At the moment this is done in place of + // making the corresponding operand type `Variadic` because index + // types are lowered to I64 prior to LLVM IR translation. + shapeTypes.append(shape.size(), IndexType::get(parser.getContext())); + } + + return success(); +} + +static void printHeapAllocClause(OpAsmPrinter &p, Operation *op, + TypeAttr inType, ValueRange typeparams, + TypeRange typeparamsTypes, ValueRange shape, + TypeRange shapeTypes) { + p << inType; + if (!typeparams.empty()) { + p << '(' << typeparams << " : " << typeparamsTypes << ')'; + } + for (auto sh : shape) { + p << ", "; + p.printOperand(sh); + } +} + //===----------------------------------------------------------------------===// // Parsers for operations including clauses that define entry block arguments. 
//===----------------------------------------------------------------------===// @@ -2234,8 +2286,9 @@ LogicalResult TargetOp::verifyRegions() { return emitError("target containing multiple 'omp.teams' nested ops"); // Check that host_eval values are only used in legal ways. + bool hostEvalTripCount; Operation *capturedOp = getInnermostCapturedOmpOp(); - TargetRegionFlags execFlags = getKernelExecFlags(capturedOp); + TargetExecMode execMode = getKernelExecFlags(capturedOp, &hostEvalTripCount); for (Value hostEvalArg : cast(getOperation()).getHostEvalBlockArgs()) { for (Operation *user : hostEvalArg.getUsers()) { @@ -2250,7 +2303,7 @@ LogicalResult TargetOp::verifyRegions() { "and 'thread_limit' in 'omp.teams'"; } if (auto parallelOp = dyn_cast(user)) { - if (bitEnumContainsAny(execFlags, TargetRegionFlags::spmd) && + if (execMode == TargetExecMode::spmd && parallelOp->isAncestor(capturedOp) && hostEvalArg == parallelOp.getNumThreads()) continue; @@ -2260,8 +2313,7 @@ LogicalResult TargetOp::verifyRegions() { "'omp.parallel' when representing target SPMD"; } if (auto loopNestOp = dyn_cast(user)) { - if (bitEnumContainsAny(execFlags, TargetRegionFlags::trip_count) && - loopNestOp.getOperation() == capturedOp && + if (hostEvalTripCount && loopNestOp.getOperation() == capturedOp && (llvm::is_contained(loopNestOp.getLoopLowerBounds(), hostEvalArg) || llvm::is_contained(loopNestOp.getLoopUpperBounds(), hostEvalArg) || llvm::is_contained(loopNestOp.getLoopSteps(), hostEvalArg))) @@ -2387,7 +2439,9 @@ static bool canPromoteToNoLoop(Operation *capturedOp, TeamsOp teamsOp, ompFlags.getAssumeThreadsOversubscription(); } -TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { +TargetExecMode TargetOp::getKernelExecFlags(Operation *capturedOp, + bool *hostEvalTripCount) { + // TODO: Support detection of bare kernel mode. // A non-null captured op is only valid if it resides inside of a TargetOp // and is the result of calling getInnermostCapturedOmpOp() on it. 
TargetOp targetOp = @@ -2396,9 +2450,12 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { (targetOp && targetOp.getInnermostCapturedOmpOp() == capturedOp)) && "unexpected captured op"); + if (hostEvalTripCount) + *hostEvalTripCount = false; + // If it's not capturing a loop, it's a default target region. if (!isa_and_present(capturedOp)) - return TargetRegionFlags::generic; + return TargetExecMode::generic; // Get the innermost non-simd loop wrapper. SmallVector loopWrappers; @@ -2411,31 +2468,32 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { auto numWrappers = std::distance(innermostWrapper, loopWrappers.end()); if (numWrappers != 1 && numWrappers != 2) - return TargetRegionFlags::generic; + return TargetExecMode::generic; // Detect target-teams-distribute-parallel-wsloop[-simd]. if (numWrappers == 2) { WsloopOp *wsloopOp = dyn_cast(innermostWrapper); if (!wsloopOp) - return TargetRegionFlags::generic; + return TargetExecMode::generic; innermostWrapper = std::next(innermostWrapper); if (!isa(innermostWrapper)) - return TargetRegionFlags::generic; + return TargetExecMode::generic; Operation *parallelOp = (*innermostWrapper)->getParentOp(); if (!isa_and_present(parallelOp)) - return TargetRegionFlags::generic; + return TargetExecMode::generic; TeamsOp teamsOp = dyn_cast(parallelOp->getParentOp()); if (!teamsOp) - return TargetRegionFlags::generic; + return TargetExecMode::generic; if (teamsOp->getParentOp() == targetOp.getOperation()) { - TargetRegionFlags result = - TargetRegionFlags::spmd | TargetRegionFlags::trip_count; + TargetExecMode result = TargetExecMode::spmd; if (canPromoteToNoLoop(capturedOp, teamsOp, wsloopOp)) - result = result | TargetRegionFlags::no_loop; + result = TargetExecMode::no_loop; + if (hostEvalTripCount) + *hostEvalTripCount = true; return result; } } @@ -2443,43 +2501,30 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { else if (isa(innermostWrapper)) { 
Operation *teamsOp = (*innermostWrapper)->getParentOp(); if (!isa_and_present(teamsOp)) - return TargetRegionFlags::generic; + return TargetExecMode::generic; if (teamsOp->getParentOp() != targetOp.getOperation()) - return TargetRegionFlags::generic; + return TargetExecMode::generic; + + if (hostEvalTripCount) + *hostEvalTripCount = true; if (isa(innermostWrapper)) - return TargetRegionFlags::spmd | TargetRegionFlags::trip_count; - - // Add spmd flag if there's a nested omp.parallel (generic-spmd case). - // - // TODO: This shouldn't have to be done here, as it is too easy to break. - // The openmp-opt pass should be updated to be able to promote kernels like - // this from "Generic" to "Generic-SPMD". However, the use of the - // `kmpc_distribute_static_loop` family of functions produced by the - // OMPIRBuilder for these kernels prevents that from working. - bool hasParallel = capturedOp - ->walk([](ParallelOp) { - return WalkResult::interrupt(); - }) - .wasInterrupted(); - - TargetRegionFlags result = - TargetRegionFlags::generic | TargetRegionFlags::trip_count; - - return hasParallel ? result | TargetRegionFlags::spmd : result; + return TargetExecMode::spmd; + + return TargetExecMode::generic; } // Detect target-parallel-wsloop[-simd]. 
else if (isa(innermostWrapper)) { Operation *parallelOp = (*innermostWrapper)->getParentOp(); if (!isa_and_present(parallelOp)) - return TargetRegionFlags::generic; + return TargetExecMode::generic; if (parallelOp->getParentOp() == targetOp.getOperation()) - return TargetRegionFlags::spmd; + return TargetExecMode::spmd; } - return TargetRegionFlags::generic; + return TargetExecMode::generic; } //===----------------------------------------------------------------------===// @@ -4278,118 +4323,37 @@ LogicalResult ScanOp::verify() { } /// Verifies align clause in allocate directive - -LogicalResult AllocateDirOp::verify() { - std::optional align = this->getAlign(); - - if (align.has_value()) { - if ((align.value() > 0) && !llvm::has_single_bit(align.value())) - return emitError() << "ALIGN value : " << align.value() - << " must be power of 2"; +LogicalResult verifyAlignment(Operation &op, + std::optional alignment) { + if (alignment.has_value()) { + if ((alignment.value() != 0) && !llvm::has_single_bit(alignment.value())) + return op.emitError() + << "ALIGN value : " << alignment.value() << " must be power of 2"; } - return success(); } -//===----------------------------------------------------------------------===// -// TargetAllocMemOp -//===----------------------------------------------------------------------===// - -mlir::Type omp::TargetAllocMemOp::getAllocatedType() { - return getInTypeAttr().getValue(); +LogicalResult AllocateDirOp::verify() { + return verifyAlignment(*getOperation(), getAlign()); } -/// operation ::= %res = (`omp.target_alloc_mem`) $device : devicetype, -/// $in_type ( `(` $typeparams `)` )? ( `,` $shape )? 
-/// attr-dict-without-keyword -static mlir::ParseResult parseTargetAllocMemOp(mlir::OpAsmParser &parser, - mlir::OperationState &result) { - auto &builder = parser.getBuilder(); - bool hasOperands = false; - std::int32_t typeparamsSize = 0; - - // Parse device number as a new operand - mlir::OpAsmParser::UnresolvedOperand deviceOperand; - mlir::Type deviceType; - if (parser.parseOperand(deviceOperand) || parser.parseColonType(deviceType)) - return mlir::failure(); - if (parser.resolveOperand(deviceOperand, deviceType, result.operands)) - return mlir::failure(); - if (parser.parseComma()) - return mlir::failure(); - - mlir::Type intype; - if (parser.parseType(intype)) - return mlir::failure(); - result.addAttribute("in_type", mlir::TypeAttr::get(intype)); - llvm::SmallVector operands; - llvm::SmallVector typeVec; - if (!parser.parseOptionalLParen()) { - // parse the LEN params of the derived type. ( : ) - if (parser.parseOperandList(operands, mlir::OpAsmParser::Delimiter::None) || - parser.parseColonTypeList(typeVec) || parser.parseRParen()) - return mlir::failure(); - typeparamsSize = operands.size(); - hasOperands = true; - } - std::int32_t shapeSize = 0; - if (!parser.parseOptionalComma()) { - // parse size to scale by, vector of n dimensions of type index - if (parser.parseOperandList(operands, mlir::OpAsmParser::Delimiter::None)) - return mlir::failure(); - shapeSize = operands.size() - typeparamsSize; - auto idxTy = builder.getIndexType(); - for (std::int32_t i = typeparamsSize, end = operands.size(); i != end; ++i) - typeVec.push_back(idxTy); - hasOperands = true; - } - if (hasOperands && - parser.resolveOperands(operands, typeVec, parser.getNameLoc(), - result.operands)) - return mlir::failure(); - - mlir::Type restype = builder.getIntegerType(64); - if (!restype) { - parser.emitError(parser.getNameLoc(), "invalid allocate type: ") << intype; - return mlir::failure(); - } - llvm::SmallVector segmentSizes{1, typeparamsSize, shapeSize}; - 
result.addAttribute("operandSegmentSizes", - builder.getDenseI32ArrayAttr(segmentSizes)); - if (parser.parseOptionalAttrDict(result.attributes) || - parser.addTypeToList(restype, result.types)) - return mlir::failure(); - return mlir::success(); -} +//===----------------------------------------------------------------------===// +// AllocSharedMemOp +//===----------------------------------------------------------------------===// -mlir::ParseResult omp::TargetAllocMemOp::parse(mlir::OpAsmParser &parser, - mlir::OperationState &result) { - return parseTargetAllocMemOp(parser, result); +LogicalResult AllocSharedMemOp::verify() { + return verifyAlignment(*getOperation(), getAlignment()); } -void omp::TargetAllocMemOp::print(mlir::OpAsmPrinter &p) { - p << " "; - p.printOperand(getDevice()); - p << " : "; - p << getDevice().getType(); - p << ", "; - p << getInType(); - if (!getTypeparams().empty()) { - p << '(' << getTypeparams() << " : " << getTypeparams().getTypes() << ')'; - } - for (auto sh : getShape()) { - p << ", "; - p.printOperand(sh); - } - p.printOptionalAttrDict((*this)->getAttrs(), - {"in_type", "operandSegmentSizes"}); -} +//===----------------------------------------------------------------------===// +// FreeSharedMemOp +//===----------------------------------------------------------------------===// -llvm::LogicalResult omp::TargetAllocMemOp::verify() { - mlir::Type outType = getType(); - if (!mlir::dyn_cast(outType)) - return emitOpError("must be a integer type"); - return mlir::success(); +LogicalResult FreeSharedMemOp::verify() { + return getHeapref().getDefiningOp() + ? 
success() + : emitOpError() << "'heapref' operand must be defined by an " + "'omp.alloc_shared_memory' op"; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt index b9b8eda9ed51b..fa723239299a2 100644 --- a/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/OpenMP/Transforms/CMakeLists.txt @@ -1,14 +1,25 @@ add_mlir_dialect_library(MLIROpenMPTransforms OpenMPOffloadPrivatizationPrepare.cpp + StackToShared.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP DEPENDS + omp_gen MLIROpenMPPassIncGen + MLIROpenMPOpsIncGen + MLIROpenMPOpsInterfacesIncGen + MLIROpenMPTypeInterfacesIncGen LINK_LIBS PUBLIC MLIRIR MLIRFuncDialect MLIRLLVMDialect + MLIROpenACCMPCommon MLIROpenMPDialect + MLIROpenMPUtils MLIRPass + MLIRSupport MLIRTransforms ) diff --git a/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp b/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp new file mode 100644 index 0000000000000..61bf1401c3e67 --- /dev/null +++ b/mlir/lib/Dialect/OpenMP/Transforms/StackToShared.cpp @@ -0,0 +1,117 @@ +//===- StackToShared.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements transforms to swap stack allocations on the target +// device with device shared memory where applicable. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/OpenMP/Transforms/Passes.h" + +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Dialect/OpenMP/Utils/Utils.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/STLExtras.h" + +namespace mlir { +namespace omp { +#define GEN_PASS_DEF_STACKTOSHAREDPASS +#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc" +} // namespace omp +} // namespace mlir + +using namespace mlir; + +/// Tell whether to replace an operation representing a stack allocation with a +/// device shared memory allocation/deallocation pair based on the location of +/// the allocation and its uses. +static bool shouldReplaceAllocaWithDeviceSharedMem(Operation &op) { + return omp::opInSharedDeviceContext(op) && + llvm::any_of(op.getResults(), [&](Value result) { + return omp::allocaUsesRequireSharedMem(result); + }); +} + +/// Based on the location of the definition of the given value representing the +/// result of a device shared memory allocation, find the corresponding points +/// where its deallocation should be placed and introduce `omp.free_shared_mem` +/// ops at those points. 
+static void insertDeviceSharedMemDeallocation(OpBuilder &builder, + Value allocVal) { + Block *allocaBlock = allocVal.getParentBlock(); + DominanceInfo domInfo; + for (Block &block : allocVal.getParentRegion()->getBlocks()) { + Operation *terminator = block.getTerminator(); + if (!terminator->hasSuccessors() && + domInfo.dominates(allocaBlock, &block)) { + builder.setInsertionPoint(terminator); + omp::FreeSharedMemOp::create(builder, allocVal.getLoc(), allocVal); + } + } +} + +namespace { +class StackToSharedPass + : public omp::impl::StackToSharedPassBase { +public: + StackToSharedPass() = default; + + void runOnOperation() override { + MLIRContext *context = &getContext(); + OpBuilder builder(context); + + LLVM::LLVMFuncOp funcOp = getOperation(); + auto offloadIface = funcOp->getParentOfType(); + if (!offloadIface || !offloadIface.getIsTargetDevice()) + return; + + llvm::SmallVector toBeDeleted; + funcOp->walk([&](LLVM::AllocaOp allocaOp) { + if (!shouldReplaceAllocaWithDeviceSharedMem(*allocaOp)) + return; + // Replace llvm.alloca with omp.alloc_shared_mem. + Type resultType = allocaOp.getResult().getType(); + + // TODO: The handling of non-default address spaces might need to be + // improved. This currently only handles the case where an alloca to + // non-default address space must only be used by a single addrspacecast + // to default address space. 
+ bool nonDefaultAddrSpace = false; + if (auto llvmPtrType = dyn_cast(resultType)) + nonDefaultAddrSpace = llvmPtrType.getAddressSpace() != 0; + + builder.setInsertionPoint(allocaOp); + auto sharedAllocOp = omp::AllocSharedMemOp::create( + builder, allocaOp->getLoc(), LLVM::LLVMPointerType::get(context), + allocaOp.getElemTypeAttr(), allocaOp.getArraySize(), + allocaOp.getAlignmentAttr()); + if (nonDefaultAddrSpace) { + assert(allocaOp->hasOneUse() && "alloca must have only one use"); + auto asCastOp = + cast(*allocaOp->getUsers().begin()); + asCastOp.replaceAllUsesWith(sharedAllocOp.getOperation()); + // Delete later because we can't delete the cast op before the top-level + // iteration visits it. Also, the alloca can't be deleted before because + // it's used by it. + toBeDeleted.push_back(asCastOp); + toBeDeleted.push_back(allocaOp); + } else { + allocaOp.replaceAllUsesWith(sharedAllocOp.getOperation()); + allocaOp.erase(); + } + + // Create a new omp.free_shared_mem for the allocated buffer prior to + // exiting the region. 
+ insertDeviceSharedMemDeallocation(builder, sharedAllocOp.getResult()); + }); + for (Operation *op : toBeDeleted) + op->erase(); + } +}; +} // namespace diff --git a/mlir/lib/Dialect/OpenMP/Utils/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/Utils/CMakeLists.txt new file mode 100644 index 0000000000000..8fd8ba2622c68 --- /dev/null +++ b/mlir/lib/Dialect/OpenMP/Utils/CMakeLists.txt @@ -0,0 +1,13 @@ +add_mlir_dialect_library(MLIROpenMPUtils + Utils.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP + + LINK_LIBS PUBLIC + MLIRIR + MLIRLLVMDialect + MLIROpenACCMPCommon + MLIROpenMPDialect + MLIRSupport + ) diff --git a/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp b/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp new file mode 100644 index 0000000000000..f5b7aa7ca2e2c --- /dev/null +++ b/mlir/lib/Dialect/OpenMP/Utils/Utils.cpp @@ -0,0 +1,104 @@ +//===- Utils.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements various OpenMP dialect utilities.
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/OpenMP/Utils/Utils.h" + +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" + +using namespace mlir; + +static bool allocaUseRequiresSharedMem(const OpOperand &use) { + Operation *owner = use.getOwner(); + if (auto parallelOp = dyn_cast(owner)) { + if (llvm::is_contained(parallelOp.getReductionVars(), use.get())) + return true; + } else if (auto callOp = dyn_cast(owner)) { + if (llvm::is_contained(callOp.getArgOperands(), use.get())) + return true; + } + + // If it is used directly inside of a parallel region, it has to be replaced + // unless the use is a private clause. + if (owner->getParentOfType()) { + if (auto argIface = dyn_cast(owner)) { + if (auto privateSyms = + cast_or_null(owner->getAttr("private_syms"))) { + for (auto [var, sym] : + llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) { + if (var != use.get()) + continue; + + auto moduleOp = owner->getParentOfType(); + auto privateOp = cast( + moduleOp.lookupSymbol(cast(sym))); + return privateOp.getDataSharingType() != + omp::DataSharingClauseType::Private; + } + } + } + return true; + } + return false; +} + +bool mlir::omp::allocaUsesRequireSharedMem(Value alloc) { + for (const OpOperand &use : alloc.getUses()) { + Operation *owner = use.getOwner(); + if (isa(owner)) { + if (llvm::any_of(owner->getResults(), [&](Value result) { + return allocaUsesRequireSharedMem(result); + })) + return true; + } else if (allocaUseRequiresSharedMem(use)) { + return true; + } + } + return false; +} + +bool mlir::omp::opInSharedDeviceContext(Operation &op) { + if (isa(op)) + return false; + + auto offloadIface = op.getParentOfType(); + if (!offloadIface || !offloadIface.getIsTargetDevice()) + return false; + + auto targetOp = op.getParentOfType(); + + // It must be inside of a generic omp.target or in a target device function, + // and not inside of omp.parallel. 
+ if (auto parallelOp = op.getParentOfType()) { + if (!targetOp || targetOp->isProperAncestor(parallelOp)) + return false; + } + + // The omp.target operation itself is considered in a shared device context in + // order to properly process its own allocation-defining entry block + // arguments. + if (!targetOp) + targetOp = dyn_cast(op); + + if (targetOp) { + if (targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) != + omp::TargetExecMode::generic) + return false; + } else { + auto declTargetIface = op.getParentOfType(); + if (!declTargetIface || !declTargetIface.isDeclareTarget() || + declTargetIface.getDeclareTargetDeviceType() == + omp::DeclareTargetDeviceType::host) + return false; + } + return true; +} diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt index 0a5d7c6e22058..eb748d8b43630 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt @@ -8,6 +8,7 @@ add_mlir_translation_library(MLIROpenMPToLLVMIRTranslation MLIRIR MLIRLLVMDialect MLIROpenMPDialect + MLIROpenMPUtils MLIRSupport MLIRTargetLLVMIRExport MLIRTransformUtils diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8344332c9063f..fb0d263dfd354 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++
b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -16,6 +16,7 @@ #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Dialect/OpenMP/OpenMPInterfaces.h" +#include "mlir/Dialect/OpenMP/Utils/Utils.h" #include "mlir/IR/Operation.h" #include "mlir/Support/LLVM.h" #include "mlir/Target/LLVMIR/Dialect/OpenMPCommon.h" @@ -71,14 +72,17 @@ convertToScheduleKind(std::optional schedKind) { /// ModuleTranslation stack frame for OpenMP operations. This keeps track of the /// insertion points for allocas. -class OpenMPAllocaStackFrame - : public StateStackFrameBase { +class OpenMPAllocStackFrame + : public StateStackFrameBase { public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OpenMPAllocaStackFrame) - - explicit OpenMPAllocaStackFrame(llvm::OpenMPIRBuilder::InsertPointTy allocaIP) - : allocaInsertPoint(allocaIP) {} - llvm::OpenMPIRBuilder::InsertPointTy allocaInsertPoint; + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OpenMPAllocStackFrame) + + explicit OpenMPAllocStackFrame( + llvm::OpenMPIRBuilder::InsertPointTy allocIP, + llvm::ArrayRef deallocIPs) + : allocInsertPoint(allocIP), deallocInsertPoints(deallocIPs) {} + llvm::OpenMPIRBuilder::InsertPointTy allocInsertPoint; + llvm::SmallVector deallocInsertPoints; }; /// Stack frame to hold a \see llvm::CanonicalLoopInfo representing the @@ -485,26 +489,33 @@ static LogicalResult handleError(llvm::Expected &result, Operation &op) { /// Find the insertion point for allocas given the current insertion point for /// normal operations in the builder. -static llvm::OpenMPIRBuilder::InsertPointTy -findAllocaInsertPoint(llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation) { - // If there is an alloca insertion point on stack, i.e. 
we are in a nested +static llvm::OpenMPIRBuilder::InsertPointTy findAllocInsertPoints( + llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, + llvm::SmallVectorImpl *deallocIPs = + nullptr) { + // If there is an allocation insertion point on stack, i.e. we are in a nested // operation and a specific point was provided by some surrounding operation, // use it. - llvm::OpenMPIRBuilder::InsertPointTy allocaInsertPoint; - WalkResult walkResult = moduleTranslation.stackWalk( - [&](OpenMPAllocaStackFrame &frame) { - allocaInsertPoint = frame.allocaInsertPoint; + llvm::OpenMPIRBuilder::InsertPointTy allocInsertPoint; + llvm::ArrayRef deallocInsertPoints; + WalkResult walkResult = moduleTranslation.stackWalk( + [&](OpenMPAllocStackFrame &frame) { + allocInsertPoint = frame.allocInsertPoint; + deallocInsertPoints = frame.deallocInsertPoints; return WalkResult::interrupt(); }); // In cases with multiple levels of outlining, the tree walk might find an - // alloca insertion point that is inside the original function while the - // builder insertion point is inside the outlined function. We need to make - // sure that we do not use it in those cases. + // insertion point that is inside the original function while the builder + // insertion point is inside the outlined function. We need to make sure that + // we do not use it in those cases. if (walkResult.wasInterrupted() && - allocaInsertPoint.getBlock()->getParent() == - builder.GetInsertBlock()->getParent()) - return allocaInsertPoint; + allocInsertPoint.getBlock()->getParent() == + builder.GetInsertBlock()->getParent()) { + if (deallocIPs) + deallocIPs->insert(deallocIPs->end(), deallocInsertPoints.begin(), + deallocInsertPoints.end()); + return allocInsertPoint; + } // Otherwise, insert to the entry block of the surrounding function. // If the current IRBuilder InsertPoint is the function's entry, it cannot @@ -512,7 +523,7 @@ findAllocaInsertPoint(llvm::IRBuilderBase &builder, // confusion. 
Create a new BasicBlock for the Builder and use the entry block // for the allocs. // TODO: Create a dedicated alloca BasicBlock at function creation such that - // we do not need to move the current InertPoint here. + // we do not need to move the current InsertPoint here. if (builder.GetInsertBlock() == &builder.GetInsertBlock()->getParent()->getEntryBlock()) { assert(builder.GetInsertPoint() == builder.GetInsertBlock()->end() && @@ -524,6 +535,16 @@ findAllocaInsertPoint(llvm::IRBuilderBase &builder, builder.SetInsertPoint(entryBB); } + // Collect exit blocks, which is where explicit deallocations should happen in + // this case. + if (deallocIPs) { + for (llvm::BasicBlock &block : *builder.GetInsertBlock()->getParent()) { + llvm::Instruction *terminator = block.getTerminator(); + if (isa_and_present(terminator)) + deallocIPs->emplace_back(&block, terminator->getIterator()); + } + } + llvm::BasicBlock &funcEntryBlock = builder.GetInsertBlock()->getParent()->getEntryBlock(); return llvm::OpenMPIRBuilder::InsertPointTy( @@ -711,7 +732,8 @@ convertOmpMasked(Operation &opInst, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(opInst))) return failure(); - auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { + auto bodyGenCB = [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) { // MaskedOp has only one region associated with it. auto &region = maskedOp.getRegion(); builder.restoreIP(codeGenIP); @@ -755,7 +777,8 @@ convertOmpMaster(Operation &opInst, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(opInst))) return failure(); - auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { + auto bodyGenCB = [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) { // MasterOp has only one region associated with it.
auto &region = masterOp.getRegion(); builder.restoreIP(codeGenIP); @@ -790,7 +813,8 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(opInst))) return failure(); - auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { + auto bodyGenCB = [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) { // CriticalOp has only one region associated with it. auto &region = cast(opInst).getRegion(); builder.restoreIP(codeGenIP); @@ -1050,7 +1074,7 @@ convertOmpOrdered(Operation &opInst, llvm::IRBuilderBase &builder, indexVecValues++; } llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createOrderedDepend( ompLoc, allocaIP, numLoops, storeValues, ".cnt.addr", isDependSource)); @@ -1069,7 +1093,8 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(opInst))) return failure(); - auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { + auto bodyGenCB = [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) { // OrderedOp has only one region associated with it.
auto ®ion = cast(opInst).getRegion(); builder.restoreIP(codeGenIP); @@ -1110,7 +1135,7 @@ struct DeferredStore { /// to be inserted after all allocas template static LogicalResult -allocReductionVars(T loop, ArrayRef reductionArgs, +allocReductionVars(T op, ArrayRef reductionArgs, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, @@ -1122,10 +1147,13 @@ allocReductionVars(T loop, ArrayRef reductionArgs, llvm::IRBuilderBase::InsertPointGuard guard(builder); builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool useDeviceSharedMem = omp::opInSharedDeviceContext(*op); + // delay creating stores until after all allocas - deferredStores.reserve(loop.getNumReductionVars()); + deferredStores.reserve(op.getNumReductionVars()); - for (std::size_t i = 0; i < loop.getNumReductionVars(); ++i) { + for (std::size_t i = 0; i < op.getNumReductionVars(); ++i) { Region &allocRegion = reductionDecls[i].getAllocRegion(); if (isByRefs[i]) { if (allocRegion.empty()) @@ -1134,7 +1162,7 @@ allocReductionVars(T loop, ArrayRef reductionArgs, SmallVector phis; if (failed(inlineConvertOmpRegions(allocRegion, "omp.reduction.alloc", builder, moduleTranslation, &phis))) - return loop.emitError( + return op.emitError( "failed to inline `alloc` region of `omp.declare_reduction`"); assert(phis.size() == 1 && "expected one allocation to be yielded"); @@ -1142,32 +1170,43 @@ allocReductionVars(T loop, ArrayRef reductionArgs, // Allocate reduction variable (which is a pointer to the real reduction // variable allocated in the inlined region) - llvm::Value *var = builder.CreateAlloca( - moduleTranslation.convertType(reductionDecls[i].getType())); - llvm::Type *ptrTy = builder.getPtrTy(); - llvm::Value *castVar = - builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); + llvm::Type *varTy = + 
moduleTranslation.convertType(reductionDecls[i].getType()); + llvm::Value *var; + if (useDeviceSharedMem) { + var = ompBuilder->createOMPAllocShared(builder, varTy); + } else { + var = builder.CreateAlloca(varTy); + var = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); + } + llvm::Value *castPhi = builder.CreatePointerBitCastOrAddrSpaceCast(phis[0], ptrTy); - deferredStores.emplace_back(castPhi, castVar); + deferredStores.emplace_back(castPhi, var); - privateReductionVariables[i] = castVar; + privateReductionVariables[i] = var; moduleTranslation.mapValue(reductionArgs[i], castPhi); - reductionVariableMap.try_emplace(loop.getReductionVars()[i], castPhi); + reductionVariableMap.try_emplace(op.getReductionVars()[i], castPhi); } else { assert(allocRegion.empty() && "allocaction is implicit for by-val reduction"); - llvm::Value *var = builder.CreateAlloca( - moduleTranslation.convertType(reductionDecls[i].getType())); + llvm::Type *ptrTy = builder.getPtrTy(); - llvm::Value *castVar = - builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); + llvm::Type *varTy = + moduleTranslation.convertType(reductionDecls[i].getType()); + llvm::Value *var; + if (useDeviceSharedMem) { + var = ompBuilder->createOMPAllocShared(builder, varTy); + } else { + var = builder.CreateAlloca(varTy); + var = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy); + } - moduleTranslation.mapValue(reductionArgs[i], castVar); - privateReductionVariables[i] = castVar; - reductionVariableMap.try_emplace(loop.getReductionVars()[i], castVar); + moduleTranslation.mapValue(reductionArgs[i], var); + privateReductionVariables[i] = var; + reductionVariableMap.try_emplace(op.getReductionVars()[i], var); } } @@ -1229,6 +1268,9 @@ initReductionVars(OP op, ArrayRef reductionArgs, if (op.getNumReductionVars() == 0) return success(); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool useDeviceSharedMem = omp::opInSharedDeviceContext(*op); + llvm::BasicBlock *initBlock = 
splitBB(builder, true, "omp.reduction.init"); auto allocaIP = llvm::IRBuilderBase::InsertPoint( latestAllocaBlock, latestAllocaBlock->getTerminator()->getIterator()); @@ -1243,8 +1285,12 @@ initReductionVars(OP op, ArrayRef reductionArgs, // TODO: remove after all users of by-ref are updated to use the alloc // region: Allocate reduction variable (which is a pointer to the real // reduciton variable allocated in the inlined region) - byRefVars[i] = builder.CreateAlloca( - moduleTranslation.convertType(reductionDecls[i].getType())); + llvm::Type *varTy = + moduleTranslation.convertType(reductionDecls[i].getType()); + if (useDeviceSharedMem) + byRefVars[i] = ompBuilder->createOMPAllocShared(builder, varTy); + else + byRefVars[i] = builder.CreateAlloca(varTy); } } @@ -1435,9 +1481,19 @@ static LogicalResult createReductionsAndCleanup( [](omp::DeclareReductionOp reductionDecl) { return &reductionDecl.getCleanupRegion(); }); - return inlineOmpRegionCleanup(reductionRegions, privateReductionVariables, - moduleTranslation, builder, - "omp.reduction.cleanup"); + LogicalResult result = inlineOmpRegionCleanup( + reductionRegions, privateReductionVariables, moduleTranslation, builder, + "omp.reduction.cleanup"); + + bool useDeviceSharedMem = omp::opInSharedDeviceContext(*op); + if (useDeviceSharedMem) { + for (auto [var, reductionDecl] : + llvm::zip_equal(privateReductionVariables, reductionDecls)) + ompBuilder->createOMPFreeShared( + builder, var, moduleTranslation.convertType(reductionDecl.getType())); + } + + return result; } static ArrayRef getIsByRef(std::optional> attr) { @@ -1582,8 +1638,9 @@ initPrivateVars(llvm::IRBuilderBase &builder, /// Allocate and initialize delayed private variables. Returns the basic block /// which comes after all of these allocations. llvm::Value * for each of these /// private variables are populated in llvmPrivateVars. 
+template static llvm::Expected -allocatePrivateVars(llvm::IRBuilderBase &builder, +allocatePrivateVars(T op, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, PrivateVarsInfo &privateVarsInfo, const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, @@ -1606,6 +1663,8 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, llvm::DataLayout dataLayout = builder.GetInsertBlock()->getDataLayout(); llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool mightUseDeviceSharedMem = omp::opInSharedDeviceContext(*op); unsigned int allocaAS = moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace(); unsigned int defaultAS = moduleTranslation.getLLVMModule() @@ -1618,11 +1677,16 @@ allocatePrivateVars(llvm::IRBuilderBase &builder, llvm::Type *llvmAllocType = moduleTranslation.convertType(privDecl.getType()); builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); - llvm::Value *llvmPrivateVar = builder.CreateAlloca( - llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc"); - if (allocaAS != defaultAS) - llvmPrivateVar = builder.CreateAddrSpaceCast(llvmPrivateVar, - builder.getPtrTy(defaultAS)); + llvm::Value *llvmPrivateVar = nullptr; + if (mightUseDeviceSharedMem && omp::allocaUsesRequireSharedMem(blockArg)) { + llvmPrivateVar = ompBuilder->createOMPAllocShared(builder, llvmAllocType); + } else { + llvmPrivateVar = builder.CreateAlloca( + llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc"); + if (allocaAS != defaultAS) + llvmPrivateVar = builder.CreateAddrSpaceCast( + llvmPrivateVar, builder.getPtrTy(defaultAS)); + } privateVarsInfo.llvmVars.push_back(llvmPrivateVar); } @@ -1694,24 +1758,38 @@ static LogicalResult copyFirstPrivateVars( return success(); } +template static LogicalResult -cleanupPrivateVars(llvm::IRBuilderBase &builder, +cleanupPrivateVars(T op, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation 
&moduleTranslation, Location loc, - SmallVectorImpl &llvmPrivateVars, - SmallVectorImpl &privateDecls) { + PrivateVarsInfo &privateVarsInfo) { // private variable deallocation SmallVector privateCleanupRegions; - llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions), + llvm::transform(privateVarsInfo.privatizers, + std::back_inserter(privateCleanupRegions), [](omp::PrivateClauseOp privatizer) { return &privatizer.getDeallocRegion(); }); - if (failed(inlineOmpRegionCleanup( - privateCleanupRegions, llvmPrivateVars, moduleTranslation, builder, - "omp.private.dealloc", /*shouldLoadCleanupRegionArg=*/false))) + if (failed(inlineOmpRegionCleanup(privateCleanupRegions, + privateVarsInfo.llvmVars, moduleTranslation, + builder, "omp.private.dealloc", + /*shouldLoadCleanupRegionArg=*/false))) return mlir::emitError(loc, "failed to inline `dealloc` region of an " "`omp.private` op in"); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + bool mightUseDeviceSharedMem = omp::opInSharedDeviceContext(*op); + for (auto [privDecl, llvmPrivVar, blockArg] : + llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars, + privateVarsInfo.blockArgs)) { + if (mightUseDeviceSharedMem && omp::allocaUsesRequireSharedMem(blockArg)) { + ompBuilder->createOMPFreeShared( + builder, llvmPrivVar, + moduleTranslation.convertType(privDecl.getType())); + } + } + return success(); } @@ -1747,7 +1825,7 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, SmallVector reductionDecls; collectReductionDecls(sectionsOp, reductionDecls); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); SmallVector privateReductionVariables( sectionsOp.getNumReductionVars()); @@ -1771,7 +1849,8 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, Region ®ion = sectionOp.getRegion(); auto sectionCB = [§ionsOp, ®ion, &builder, 
&moduleTranslation]( - InsertPointTy allocaIP, InsertPointTy codeGenIP) { + InsertPointTy allocIP, InsertPointTy codeGenIP, + ArrayRef deallocIPs) { builder.restoreIP(codeGenIP); // map the omp.section reduction block argument to the omp.sections block @@ -1816,7 +1895,7 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder, // called for variables which have destructors/finalizers. auto finiCB = [&](InsertPointTy codeGenIP) { return llvm::Error::success(); }; - allocaIP = findAllocaInsertPoint(builder, moduleTranslation); + allocaIP = findAllocInsertPoints(builder, moduleTranslation); bool isCancellable = constructIsCancellable(sectionsOp); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = @@ -1845,7 +1924,8 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(*singleOp))) return failure(); - auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { + auto bodyCB = [&](InsertPointTy allocIP, InsertPointTy codegenIP, + llvm::ArrayRef deallocIPs) { builder.restoreIP(codegenIP); return convertOmpOpRegions(singleOp.getRegion(), "omp.single.region", builder, moduleTranslation) @@ -1928,7 +2008,7 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, SmallVector privateReductionVariables(numReductionVars); llvm::ArrayRef isByRef; llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); // Only do teams reduction if there is no distribute op that captures the // reduction instead. 
@@ -1950,9 +2030,10 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, return failure(); } - auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { - LLVM::ModuleTranslation::SaveStack frame( - moduleTranslation, allocaIP); + auto bodyCB = [&](InsertPointTy allocIP, InsertPointTy codegenIP, + llvm::ArrayRef deallocIPs) { + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); builder.restoreIP(codegenIP); return convertOmpOpRegions(op.getRegion(), "omp.teams.region", builder, moduleTranslation) @@ -2209,9 +2290,9 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, // code outside of the outlined task region, which is what we want because // this way the initialization and copy regions are executed immediately while // the host variable data are still live. - - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); // Not using splitBB() because that requires the current block to have a // terminator. @@ -2241,8 +2322,8 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, // Save the alloca insertion point on ModuleTranslation stack for use in // nested regions. 
- LLVM::ModuleTranslation::SaveStack frame( - moduleTranslation, allocaIP); + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); // Allocate and initialize private variables builder.SetInsertPoint(initBlock->getTerminator()); @@ -2306,12 +2387,12 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, // Set up for call to createTask() builder.SetInsertPoint(taskStartBlock); - auto bodyCB = [&](InsertPointTy allocaIP, - InsertPointTy codegenIP) -> llvm::Error { + auto bodyCB = [&](InsertPointTy allocIP, InsertPointTy codegenIP, + llvm::ArrayRef deallocIPs) -> llvm::Error { // Save the alloca insertion point on ModuleTranslation stack for use in // nested regions. - LLVM::ModuleTranslation::SaveStack frame( - moduleTranslation, allocaIP); + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); // translate the body of the task: builder.restoreIP(codegenIP); @@ -2329,7 +2410,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, llvm::IRBuilderBase::InsertPointGuard guard(builder); llvm::Type *llvmAllocType = moduleTranslation.convertType(privDecl.getType()); - builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); + builder.SetInsertPoint(allocIP.getBlock()->getTerminator()); llvm::Value *llvmPrivateVar = builder.CreateAlloca( llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc"); @@ -2378,9 +2459,8 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator()); - if (failed(cleanupPrivateVars(builder, moduleTranslation, taskOp.getLoc(), - privateVarsInfo.llvmVars, - privateVarsInfo.privatizers))) + if (failed(cleanupPrivateVars(taskOp, builder, moduleTranslation, + taskOp.getLoc(), privateVarsInfo))) return llvm::make_error(); // Free heap allocated task context structure at the end of the task. 
@@ -2404,7 +2484,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = moduleTranslation.getOpenMPBuilder()->createTask( - ompLoc, allocaIP, bodyCB, !taskOp.getUntied(), + ompLoc, allocIP, deallocIPs, bodyCB, !taskOp.getUntied(), moduleTranslation.lookupValue(taskOp.getFinal()), moduleTranslation.lookupValue(taskOp.getIfExpr()), dds, taskOp.getMergeable(), @@ -2429,18 +2509,21 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(*tgOp))) return failure(); - auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { + auto bodyCB = [&](InsertPointTy allocIP, InsertPointTy codegenIP, + llvm::ArrayRef deallocIPs) { builder.restoreIP(codegenIP); return convertOmpOpRegions(tgOp.getRegion(), "omp.taskgroup.region", builder, moduleTranslation) .takeError(); }; - InsertPointTy allocaIP = findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = - moduleTranslation.getOpenMPBuilder()->createTaskgroup(ompLoc, allocaIP, - bodyCB); + moduleTranslation.getOpenMPBuilder()->createTaskgroup(ompLoc, allocIP, + deallocIPs, bodyCB); if (failed(handleError(afterIP, *tgOp))) return failure(); @@ -2490,14 +2573,15 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, SmallVector reductionDecls; collectReductionDecls(wsloopOp, reductionDecls); + llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); SmallVector privateReductionVariables( wsloopOp.getNumReductionVars()); llvm::Expected afterAllocas = allocatePrivateVars( - builder, moduleTranslation, 
privateVarsInfo, allocaIP); + wsloopOp, builder, moduleTranslation, privateVarsInfo, allocaIP); if (handleError(afterAllocas, opInst).failed()) return failure(); @@ -2596,13 +2680,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, // for every omp.wsloop nested inside a no-loop SPMD target region, even if // that loop is not the top-level SPMD one. if (loopOp == targetCapturedOp) { - omp::TargetRegionFlags kernelFlags = - targetOp.getKernelExecFlags(targetCapturedOp); - if (omp::bitEnumContainsAll(kernelFlags, - omp::TargetRegionFlags::spmd | - omp::TargetRegionFlags::no_loop) && - !omp::bitEnumContainsAny(kernelFlags, - omp::TargetRegionFlags::generic)) + if (targetOp.getKernelExecFlags(targetCapturedOp) == + omp::TargetExecMode::no_loop) noLoopMode = true; } } @@ -2644,9 +2723,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, /*isTeamsReduction=*/false))) return failure(); - return cleanupPrivateVars(builder, moduleTranslation, wsloopOp.getLoc(), - privateVarsInfo.llvmVars, - privateVarsInfo.privatizers); + return cleanupPrivateVars(wsloopOp, builder, moduleTranslation, + wsloopOp.getLoc(), privateVarsInfo); } /// Converts the OpenMP parallel operation to LLVM IR. 
@@ -2670,10 +2748,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, opInst.getNumReductionVars()); SmallVector deferredStores; - auto bodyGenCB = [&](InsertPointTy allocaIP, - InsertPointTy codeGenIP) -> llvm::Error { + auto bodyGenCB = + [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) -> llvm::Error { llvm::Expected afterAllocas = allocatePrivateVars( - builder, moduleTranslation, privateVarsInfo, allocaIP); + opInst, builder, moduleTranslation, privateVarsInfo, allocIP); if (handleError(afterAllocas, *opInst).failed()) return llvm::make_error(); @@ -2683,12 +2762,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, MutableArrayRef reductionArgs = cast(*opInst).getReductionBlockArgs(); - allocaIP = - InsertPointTy(allocaIP.getBlock(), - allocaIP.getBlock()->getTerminator()->getIterator()); + allocIP = InsertPointTy(allocIP.getBlock(), + allocIP.getBlock()->getTerminator()->getIterator()); if (failed(allocReductionVars( - opInst, reductionArgs, builder, moduleTranslation, allocaIP, + opInst, reductionArgs, builder, moduleTranslation, allocIP, reductionDecls, privateReductionVariables, reductionVariableMap, deferredStores, isByRef))) return llvm::make_error(); @@ -2717,8 +2795,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, // Save the alloca insertion point on ModuleTranslation stack for use in // nested regions. - LLVM::ModuleTranslation::SaveStack frame( - moduleTranslation, allocaIP); + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); // ParallelOp has only one region associated with it. 
llvm::Expected regionBlock = convertOmpOpRegions( @@ -2745,7 +2823,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointOrErrorTy contInsertPoint = ompBuilder->createReductions( - builder.saveIP(), allocaIP, reductionInfos, isByRef, + builder.saveIP(), allocIP, reductionInfos, isByRef, /*IsNoWait=*/false, /*IsTeamsReduction=*/false); if (!contInsertPoint) return contInsertPoint.takeError(); @@ -2787,9 +2865,8 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, return llvm::createStringError( "failed to inline `cleanup` region of `omp.declare_reduction`"); - if (failed(cleanupPrivateVars(builder, moduleTranslation, opInst.getLoc(), - privateVarsInfo.llvmVars, - privateVarsInfo.privatizers))) + if (failed(cleanupPrivateVars(opInst, builder, moduleTranslation, + opInst.getLoc(), privateVarsInfo))) return llvm::make_error(); builder.restoreIP(oldIP); @@ -2807,13 +2884,15 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, pbKind = getProcBindKind(*bind); bool isCancellable = constructIsCancellable(opInst); - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + llvm::OpenMPIRBuilder::InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = - ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB, - ifCond, numThreads, pbKind, isCancellable); + ompBuilder->createParallel(ompLoc, allocIP, deallocIPs, bodyGenCB, privCB, + finiCB, ifCond, numThreads, pbKind, + isCancellable); if (failed(handleError(afterIP, *opInst))) return failure(); @@ -2858,10 +2937,10 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, assert(isByRef.size() == simdOp.getNumReductionVars()); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - 
findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); llvm::Expected afterAllocas = allocatePrivateVars( - builder, moduleTranslation, privateVarsInfo, allocaIP); + simdOp, builder, moduleTranslation, privateVarsInfo, allocaIP); if (handleError(afterAllocas, opInst).failed()) return failure(); @@ -2980,9 +3059,8 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, "omp.reduction.cleanup"))) return failure(); - return cleanupPrivateVars(builder, moduleTranslation, simdOp.getLoc(), - privateVarsInfo.llvmVars, - privateVarsInfo.privatizers); + return cleanupPrivateVars(simdOp, builder, moduleTranslation, simdOp.getLoc(), + privateVarsInfo); } /// Converts an OpenMP loop nest into LLVM IR using OpenMPIRBuilder. @@ -3239,7 +3317,7 @@ convertOmpAtomicRead(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); @@ -3266,7 +3344,7 @@ convertOmpAtomicWrite(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::AtomicOrdering ao = convertAtomicOrdering(writeOp.getMemoryOrder()); @@ -3383,7 +3461,7 @@ convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst, extractAtomicControlFlags(opInst, isIgnoreDenormalMode, isFineGrainedMemory, isRemoteMemory); // Handle ambiguous alloca, if any. 
- auto allocaIP = findAllocaInsertPoint(builder, moduleTranslation); + auto allocaIP = findAllocInsertPoints(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = ompBuilder->createAtomicUpdate(ompLoc, allocaIP, llvmAtomicX, llvmExpr, @@ -3484,7 +3562,7 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp, extractAtomicControlFlags(atomicUpdateOp, isIgnoreDenormalMode, isFineGrainedMemory, isRemoteMemory); // Handle ambiguous alloca, if any. - auto allocaIP = findAllocaInsertPoint(builder, moduleTranslation); + auto allocaIP = findAllocInsertPoints(builder, moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = ompBuilder->createAtomicCapture( @@ -4777,7 +4855,7 @@ createAlteredByCaptureMap(MapInfoData &mapData, if (!isPtrTy) { auto curInsert = builder.saveIP(); llvm::DebugLoc DbgLoc = builder.getCurrentDebugLocation(); - builder.restoreIP(findAllocaInsertPoint(builder, moduleTranslation)); + builder.restoreIP(findAllocInsertPoints(builder, moduleTranslation)); auto *memTempAlloc = builder.CreateAlloca(builder.getPtrTy(), nullptr, ".casted"); builder.SetCurrentDebugLocation(DbgLoc); @@ -5169,18 +5247,21 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, }; llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + llvm::OpenMPIRBuilder::InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = [&]() { if (isa(op)) - return ompBuilder->createTargetData(ompLoc, allocaIP, builder.saveIP(), + return ompBuilder->createTargetData(ompLoc, allocIP, builder.saveIP(), + deallocIPs, builder.getInt64(deviceID), ifCond, info, genMapInfoCB, customMapperCB, 
/*MapperFunc=*/nullptr, bodyGenCB, /*DeviceAddrCB=*/nullptr); - return ompBuilder->createTargetData( - ompLoc, allocaIP, builder.saveIP(), builder.getInt64(deviceID), ifCond, - info, genMapInfoCB, customMapperCB, &RTLFn); + return ompBuilder->createTargetData(ompLoc, allocIP, builder.saveIP(), + deallocIPs, builder.getInt64(deviceID), + ifCond, info, genMapInfoCB, + customMapperCB, &RTLFn); }(); if (failed(handleError(afterIP, *op))) @@ -5216,7 +5297,7 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, collectReductionDecls(teamsOp, reductionDecls); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + findAllocInsertPoints(builder, moduleTranslation); MutableArrayRef reductionArgs = llvm::cast(*teamsOp) @@ -5230,19 +5311,20 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, } using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; - auto bodyGenCB = [&](InsertPointTy allocaIP, - InsertPointTy codeGenIP) -> llvm::Error { + auto bodyGenCB = + [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) -> llvm::Error { // Save the alloca insertion point on ModuleTranslation stack for use in // nested regions. - LLVM::ModuleTranslation::SaveStack frame( - moduleTranslation, allocaIP); + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); // DistributeOp has only one region associated with it. 
builder.restoreIP(codeGenIP); PrivateVarsInfo privVarsInfo(distributeOp); - llvm::Expected afterAllocas = - allocatePrivateVars(builder, moduleTranslation, privVarsInfo, allocaIP); + llvm::Expected afterAllocas = allocatePrivateVars( + distributeOp, builder, moduleTranslation, privVarsInfo, allocIP); if (handleError(afterAllocas, opInst).failed()) return llvm::make_error(); @@ -5285,7 +5367,7 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, findCurrentLoopInfo(moduleTranslation); llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP = ompBuilder->applyWorkshareLoop( - ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier, + ompLoc.DL, loopInfo, allocIP, loopNeedsBarrier, convertToScheduleKind(schedule), chunk, isSimd, scheduleMod == omp::ScheduleModifier::monotonic, scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered, @@ -5295,19 +5377,19 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, return wsloopIP.takeError(); } - if (failed(cleanupPrivateVars(builder, moduleTranslation, - distributeOp.getLoc(), privVarsInfo.llvmVars, - privVarsInfo.privatizers))) + if (failed(cleanupPrivateVars(distributeOp, builder, moduleTranslation, + distributeOp.getLoc(), privVarsInfo))) return llvm::make_error(); return llvm::Error::success(); }; - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + llvm::OpenMPIRBuilder::InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = - ompBuilder->createDistribute(ompLoc, allocaIP, bodyGenCB); + ompBuilder->createDistribute(ompLoc, allocIP, deallocIPs, bodyGenCB); if (failed(handleError(afterIP, opInst))) return failure(); @@ -5317,7 +5399,7 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, if (doDistributeReduction) { // Process the reductions 
if required. return createReductionsAndCleanup( - teamsOp, builder, moduleTranslation, allocaIP, reductionDecls, + teamsOp, builder, moduleTranslation, allocIP, reductionDecls, privateReductionVariables, isByRef, /*isNoWait*/ false, /*isTeamsReduction*/ true); } @@ -5487,42 +5569,77 @@ handleDeclareTargetMapVar(MapInfoData &mapData, // a store of the kernel argument into this allocated memory which // will then be loaded from, ByCopy will use the allocated memory // directly. -static llvm::IRBuilderBase::InsertPoint -createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg, - llvm::Value *input, llvm::Value *&retVal, - llvm::IRBuilderBase &builder, - llvm::OpenMPIRBuilder &ompBuilder, - LLVM::ModuleTranslation &moduleTranslation, - llvm::IRBuilderBase::InsertPoint allocaIP, - llvm::IRBuilderBase::InsertPoint codeGenIP) { +static llvm::IRBuilderBase::InsertPoint createDeviceArgumentAccessor( + omp::TargetOp targetOp, MapInfoData &mapData, llvm::Argument &arg, + llvm::Value *input, llvm::Value *&retVal, llvm::IRBuilderBase &builder, + llvm::OpenMPIRBuilder &ompBuilder, + LLVM::ModuleTranslation &moduleTranslation, + llvm::IRBuilderBase::InsertPoint allocIP, + llvm::IRBuilderBase::InsertPoint codeGenIP, + llvm::ArrayRef deallocIPs) { assert(ompBuilder.Config.isTargetDevice() && "function only supported for target device codegen"); - builder.restoreIP(allocaIP); + builder.restoreIP(allocIP); omp::VariableCaptureKind capture = omp::VariableCaptureKind::ByRef; LLVM::TypeToLLVMIRTranslator typeToLLVMIRTranslator( ompBuilder.M.getContext()); unsigned alignmentValue = 0; + BlockArgument mlirArg; + SmallVector> blockArgsPairs; + cast(*targetOp).getBlockArgsPairs( + blockArgsPairs); // Find the associated MapInfoData entry for the current input - for (size_t i = 0; i < mapData.MapClause.size(); ++i) + for (size_t i = 0; i < mapData.MapClause.size(); ++i) { if (mapData.OriginalValue[i] == input) { auto mapOp = cast(mapData.MapClause[i]); capture = 
mapOp.getMapCaptureType(); // Get information of alignment of mapped object alignmentValue = typeToLLVMIRTranslator.getPreferredAlignment( mapOp.getVarType(), ompBuilder.M.getDataLayout()); + + // Find the corresponding entry block argument, which can be associated to + // a map, use_device* or has_device* clause. + for (auto &[val, arg] : blockArgsPairs) { + if (mapOp.getResult() == val) { + mlirArg = arg; + break; + } + } + assert(mlirArg && "expected to find entry block argument for map clause"); break; } - + } unsigned int allocaAS = ompBuilder.M.getDataLayout().getAllocaAddrSpace(); unsigned int defaultAS = ompBuilder.M.getDataLayout().getProgramAddressSpace(); - // Create the alloca for the argument the current point. - llvm::Value *v = builder.CreateAlloca(arg.getType(), allocaAS, nullptr); + // Create the allocation for the argument. + llvm::Value *v = nullptr; + if (omp::opInSharedDeviceContext(*targetOp) && + omp::allocaUsesRequireSharedMem(mlirArg)) { + // Use the beginning of the codeGenIP rather than the usual allocation point + // for shared memory allocations because otherwise these would be done prior + // to the target initialization call. Also, the exit block (where the + // deallocation is placed) is only executed if the initialization call + // succeeds. + builder.SetInsertPoint(codeGenIP.getBlock()->getFirstInsertionPt()); + v = ompBuilder.createOMPAllocShared(builder, arg.getType()); + + // Create deallocations in all provided deallocation points and then restore + // the insertion point to right after the new allocations. + llvm::IRBuilderBase::InsertPointGuard guard(builder); + for (auto deallocIP : deallocIPs) { + builder.SetInsertPoint(deallocIP.getBlock(), deallocIP.getPoint()); + ompBuilder.createOMPFreeShared(builder, v, arg.getType()); + } + } else { + // Use the current point, which was previously set to allocIP. 
+ v = builder.CreateAlloca(arg.getType(), allocaAS, nullptr); - if (allocaAS != defaultAS && arg.getType()->isPointerTy()) - v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS)); + if (allocaAS != defaultAS && arg.getType()->isPointerTy()) + v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS)); + } builder.CreateStore(&arg, v); @@ -5797,23 +5914,21 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, } // Update kernel bounds structure for the `OpenMPIRBuilder` to use. - omp::TargetRegionFlags kernelFlags = targetOp.getKernelExecFlags(capturedOp); - assert( - omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic | - omp::TargetRegionFlags::spmd) && - "invalid kernel flags"); - attrs.ExecFlags = - omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic) - ? omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::spmd) - ? llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD - : llvm::omp::OMP_TGT_EXEC_MODE_GENERIC - : llvm::omp::OMP_TGT_EXEC_MODE_SPMD; - if (omp::bitEnumContainsAll(kernelFlags, - omp::TargetRegionFlags::spmd | - omp::TargetRegionFlags::no_loop) && - !omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic)) + omp::TargetExecMode execMode = targetOp.getKernelExecFlags(capturedOp); + switch (execMode) { + case omp::TargetExecMode::bare: + attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_BARE; + break; + case omp::TargetExecMode::generic: + attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_GENERIC; + break; + case omp::TargetExecMode::spmd: + attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_SPMD; + break; + case omp::TargetExecMode::no_loop: attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP; - + break; + } attrs.MinTeams = minTeamsVal; attrs.MaxTeams.front() = maxTeamsVal; attrs.MinThreads = 1; @@ -5863,8 +5978,9 @@ initTargetRuntimeAttrs(llvm::IRBuilderBase &builder, if (numThreads) attrs.MaxThreads = moduleTranslation.lookupValue(numThreads); - if 
(omp::bitEnumContainsAny(targetOp.getKernelExecFlags(capturedOp), - omp::TargetRegionFlags::trip_count)) { + bool hostEvalTripCount; + targetOp.getKernelExecFlags(capturedOp, &hostEvalTripCount); + if (hostEvalTripCount) { llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); attrs.LoopTripCount = nullptr; @@ -6001,7 +6117,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, } using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; - auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) + auto bodyCB = [&](InsertPointTy allocIP, InsertPointTy codeGenIP, + ArrayRef deallocIPs) -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy { llvm::IRBuilderBase::InsertPointGuard guard(builder); builder.SetCurrentDebugLocation(llvm::DebugLoc()); @@ -6042,8 +6159,8 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, PrivateVarsInfo privateVarsInfo(targetOp); llvm::Expected afterAllocas = - allocatePrivateVars(builder, moduleTranslation, privateVarsInfo, - allocaIP, &mappedPrivateVars); + allocatePrivateVars(targetOp, builder, moduleTranslation, + privateVarsInfo, allocIP, &mappedPrivateVars); if (failed(handleError(afterAllocas, *targetOp))) return llvm::make_error(); @@ -6061,33 +6178,21 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, targetOp.getPrivateNeedsBarrier(), &mappedPrivateVars))) return llvm::make_error(); - SmallVector privateCleanupRegions; - llvm::transform(privateVarsInfo.privatizers, - std::back_inserter(privateCleanupRegions), - [](omp::PrivateClauseOp privatizer) { - return &privatizer.getDeallocRegion(); - }); - + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocIP, deallocIPs); llvm::Expected exitBlock = convertOmpOpRegions( targetRegion, "omp.target", builder, moduleTranslation); - if (!exitBlock) - return exitBlock.takeError(); - - builder.SetInsertPoint(*exitBlock); - if (!privateCleanupRegions.empty()) { - if (failed(inlineOmpRegionCleanup( - 
privateCleanupRegions, privateVarsInfo.llvmVars, - moduleTranslation, builder, "omp.targetop.private.cleanup", - /*shouldLoadCleanupRegionArg=*/false))) { - return llvm::createStringError( - "failed to inline `dealloc` region of `omp.private` " - "op in the target region"); - } - return builder.saveIP(); - } + if (failed(handleError(exitBlock, *targetOp))) + return llvm::make_error(); + + builder.SetInsertPoint(exitBlock.get()->getTerminator()); + + if (failed(cleanupPrivateVars(targetOp, builder, moduleTranslation, + targetOp.getLoc(), privateVarsInfo))) + return llvm::make_error(); - return InsertPointTy(exitBlock.get(), exitBlock.get()->end()); + return builder.saveIP(); }; StringRef parentName = parentFn.getName(); @@ -6111,8 +6216,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, }; auto argAccessorCB = [&](llvm::Argument &arg, llvm::Value *input, - llvm::Value *&retVal, InsertPointTy allocaIP, - InsertPointTy codeGenIP) + llvm::Value *&retVal, InsertPointTy allocIP, + InsertPointTy codeGenIP, + llvm::ArrayRef deallocIPs) -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy { llvm::IRBuilderBase::InsertPointGuard guard(builder); builder.SetCurrentDebugLocation(llvm::DebugLoc()); @@ -6126,9 +6232,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, return codeGenIP; } - return createDeviceArgumentAccessor(mapData, arg, input, retVal, builder, - *ompBuilder, moduleTranslation, - allocaIP, codeGenIP); + return createDeviceArgumentAccessor(targetOp, mapData, arg, input, retVal, + builder, *ompBuilder, moduleTranslation, + allocIP, codeGenIP, deallocIPs); }; llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs runtimeAttrs; @@ -6172,8 +6278,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, buildDependData(targetOp.getDependKinds(), targetOp.getDependVars(), moduleTranslation, dds); - llvm::OpenMPIRBuilder::InsertPointTy allocaIP = - findAllocaInsertPoint(builder, moduleTranslation); + llvm::SmallVector deallocIPs; + 
llvm::OpenMPIRBuilder::InsertPointTy allocIP = + findAllocInsertPoints(builder, moduleTranslation, &deallocIPs); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::TargetDataInfo info( @@ -6195,9 +6302,10 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = moduleTranslation.getOpenMPBuilder()->createTarget( - ompLoc, isOffloadEntry, allocaIP, builder.saveIP(), info, entryInfo, - defaultAttrs, runtimeAttrs, ifCond, kernelInput, genMapInfoCB, bodyCB, - argAccessorCB, customMapperCB, dds, targetOp.getNowait()); + ompLoc, isOffloadEntry, allocIP, builder.saveIP(), deallocIPs, info, + entryInfo, defaultAttrs, runtimeAttrs, ifCond, kernelInput, + genMapInfoCB, bodyCB, argAccessorCB, customMapperCB, dds, + targetOp.getNowait()); if (failed(handleError(afterIP, opInst))) return failure(); @@ -6287,6 +6395,7 @@ static void updateDebugInfoForDeclareTargetFunctions( static LogicalResult convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute, + llvm::OpenMPIRBuilder *ompBuilder, LLVM::ModuleTranslation &moduleTranslation) { // Amend omp.declare_target by deleting the IR of the outlined functions // created for target regions. They cannot be filtered out from MLIR earlier @@ -6309,8 +6418,14 @@ convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute, if (declareType == omp::DeclareTargetDeviceType::host) { llvmFunc->dropAllReferences(); llvmFunc->eraseFromParent(); - } else + + // Invalidate the builder's current insertion point, as it now points to + // a deleted block. 
+ ompBuilder->Builder.ClearInsertionPoint(); + ompBuilder->Builder.SetCurrentDebugLocation(llvm::DebugLoc()); + } else { updateDebugInfoForDeclareTargetFunctions(llvmFunc, moduleTranslation); + } } return success(); } @@ -6465,9 +6580,12 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation( .Case("omp.declare_target", [&](Attribute attr) { if (auto declareTargetAttr = - dyn_cast(attr)) + dyn_cast(attr)) { + llvm::OpenMPIRBuilder *ompBuilder = + moduleTranslation.getOpenMPBuilder(); return convertDeclareTargetAttr(op, declareTargetAttr, - moduleTranslation); + ompBuilder, moduleTranslation); + } return failure(); }) .Case("omp.requires", @@ -6521,8 +6639,7 @@ static bool isHostDeviceOp(Operation *op) { if (op->getParentOfType()) return false; - if (mlir::isa(op) || - mlir::isa(op)) + if (mlir::isa(op)) return false; if (auto parentFn = op->getParentOfType()) { @@ -6552,6 +6669,46 @@ static llvm::Function *getOmpTargetAlloc(llvm::IRBuilderBase &builder, return func; } +static llvm::Value * +getAllocationSize(llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, + omp::TargetAllocMemOp op) { + llvm::DataLayout dataLayout = + moduleTranslation.getLLVMModule()->getDataLayout(); + llvm::Type *llvmHeapTy = moduleTranslation.convertType(op.getAllocatedType()); + llvm::TypeSize typeSize = dataLayout.getTypeAllocSize(llvmHeapTy); + llvm::Value *allocSize = builder.getInt64(typeSize.getFixedValue()); + for (auto typeParam : op.getTypeparams()) { + allocSize = builder.CreateMul( + allocSize, + builder.CreateIntCast(moduleTranslation.lookupValue(typeParam), + builder.getInt64Ty(), + /*isSigned=*/false)); + } + return allocSize; +} + +static llvm::Value * +getAllocationSize(llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, + omp::AllocSharedMemOp op) { + llvm::DataLayout dataLayout = + moduleTranslation.getLLVMModule()->getDataLayout(); + llvm::Type *llvmHeapTy = 
moduleTranslation.convertType(op.getAllocatedType()); + + auto alignment = op.getAlignment(); + llvm::TypeSize typeSize = llvm::alignTo( + dataLayout.getTypeStoreSize(llvmHeapTy), + alignment ? *alignment : dataLayout.getABITypeAlign(llvmHeapTy).value()); + + llvm::Value *allocSize = builder.getInt64(typeSize.getFixedValue()); + return builder.CreateMul( + allocSize, + builder.CreateIntCast(moduleTranslation.lookupValue(op.getArraySize()), + builder.getInt64Ty(), + /*isSigned=*/false)); +} + static LogicalResult convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { @@ -6566,14 +6723,8 @@ convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, mlir::Value deviceNum = allocMemOp.getDevice(); llvm::Value *llvmDeviceNum = moduleTranslation.lookupValue(deviceNum); // Get the allocation size. - llvm::DataLayout dataLayout = llvmModule->getDataLayout(); - mlir::Type heapTy = allocMemOp.getAllocatedType(); - llvm::Type *llvmHeapTy = moduleTranslation.convertType(heapTy); - llvm::TypeSize typeSize = dataLayout.getTypeStoreSize(llvmHeapTy); - llvm::Value *allocSize = builder.getInt64(typeSize.getFixedValue()); - for (auto typeParam : allocMemOp.getTypeparams()) - allocSize = - builder.CreateMul(allocSize, moduleTranslation.lookupValue(typeParam)); + llvm::Value *allocSize = + getAllocationSize(builder, moduleTranslation, allocMemOp); // Create call to "omp_target_alloc" with the args as translated llvm values. 
llvm::CallInst *call = builder.CreateCall(ompTargetAllocFunc, {allocSize, llvmDeviceNum}); @@ -6584,6 +6735,17 @@ convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } +static LogicalResult +convertAllocSharedMemOp(omp::AllocSharedMemOp allocMemOp, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + llvm::Value *size = getAllocationSize(builder, moduleTranslation, allocMemOp); + moduleTranslation.mapValue(allocMemOp.getResult(), + ompBuilder->createOMPAllocShared(builder, size)); + return success(); +} + static llvm::Function *getOmpTargetFree(llvm::IRBuilderBase &builder, llvm::Module *llvmModule) { llvm::Type *ptrTy = builder.getPtrTy(0); @@ -6619,6 +6781,19 @@ convertTargetFreeMemOp(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } +static LogicalResult +convertFreeSharedMemOp(omp::FreeSharedMemOp freeMemOp, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + auto allocMemOp = + freeMemOp.getHeapref().getDefiningOp(); + llvm::Value *size = getAllocationSize(builder, moduleTranslation, allocMemOp); + ompBuilder->createOMPFreeShared( + builder, moduleTranslation.lookupValue(freeMemOp.getHeapref()), size); + return success(); +} + /// Given an OpenMP MLIR operation, create the corresponding LLVM IR /// (including OpenMP runtime calls). 
LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation( @@ -6809,6 +6984,12 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation( .Case([&](omp::TargetFreeMemOp) { return convertTargetFreeMemOp(*op, builder, moduleTranslation); }) + .Case([&](omp::AllocSharedMemOp op) { + return convertAllocSharedMemOp(op, builder, moduleTranslation); + }) + .Case([&](omp::FreeSharedMemOp op) { + return convertFreeSharedMemOp(op, builder, moduleTranslation); + }) .Default([&](Operation *inst) { return inst->emitError() << "not yet implemented: " << inst->getName(); diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 6777a03a4f026..f6a2bbeadf0cf 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -3133,3 +3133,45 @@ func.func @invalid_workdistribute() -> () { } return } + +// ----- +func.func @target_allocmem_invalid_uniq_name(%device : i32) -> () { +// expected-error @below {{op attribute 'uniq_name' failed to satisfy constraint: string attribute}} + %0 = omp.target_allocmem %device : i32, i64 {uniq_name=2} + return +} + +// ----- +func.func @target_allocmem_invalid_bindc_name(%device : i32) -> () { +// expected-error @below {{op attribute 'bindc_name' failed to satisfy constraint: string attribute}} + %0 = omp.target_allocmem %device : i32, i64 {bindc_name=2} + return +} + +// ----- +func.func @alloc_shared_mem_invalid_alignment1(%n: i32) -> () { + // expected-error @below {{op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive}} + %0 = omp.alloc_shared_mem %n x i64 {alignment=-2} : (i32) -> !llvm.ptr + return +} + +// ----- +func.func @alloc_shared_mem_invalid_alignment2(%n: i32) -> () { + // expected-error @below {{ALIGN value : 3 must be power of 2}} + %0 = omp.alloc_shared_mem %n x i64 {alignment=3} : (i32) -> !llvm.ptr + return +} + +// ----- +func.func 
@alloc_shared_mem_invalid_array_size(%n: f32) -> () { + // expected-error @below {{invalid kind of type specified: expected builtin.integer, but found 'f32'}} + %0 = omp.alloc_shared_mem %n x i64 : (f32) -> !llvm.ptr + return +} + +// ----- +func.func @free_shared_mem_invalid_ptr(%ptr : !llvm.ptr) -> () { + // expected-error @below {{op 'heapref' operand must be defined by an 'omp.alloc_shared_memory' op}} + omp.free_shared_mem %ptr : !llvm.ptr + return +} diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index ac29e20907b55..87ea1adfa9bdf 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -3367,3 +3367,51 @@ func.func @omp_target_map_clause_type_test(%arg0 : memref) -> () { return } + +// CHECK-LABEL: func.func @omp_target_allocmem( +// CHECK-SAME: %[[DEVICE:.*]]: i32, %[[X:.*]]: index, %[[Y:.*]]: index, %[[Z:.*]]: i32) { +func.func @omp_target_allocmem(%device: i32, %x: index, %y: index, %z: i32) { + // CHECK: %{{.*}} = omp.target_allocmem %[[DEVICE]] : i32, i64 + %0 = omp.target_allocmem %device : i32, i64 + // CHECK: %{{.*}} = omp.target_allocmem %[[DEVICE]] : i32, vector<16x16xf32> {bindc_name = "bindc", uniq_name = "uniq"} + %1 = omp.target_allocmem %device : i32, vector<16x16xf32> {uniq_name="uniq", bindc_name="bindc"} + // CHECK: %{{.*}} = omp.target_allocmem %[[DEVICE]] : i32, !llvm.ptr(%[[X]], %[[Y]], %[[Z]] : index, index, i32) + %2 = omp.target_allocmem %device : i32, !llvm.ptr(%x, %y, %z : index, index, i32) + // CHECK: %{{.*}} = omp.target_allocmem %[[DEVICE]] : i32, !llvm.ptr, %[[X]], %[[Y]] + %3 = omp.target_allocmem %device : i32, !llvm.ptr, %x, %y + // CHECK: %{{.*}} = omp.target_allocmem %[[DEVICE]] : i32, !llvm.ptr(%[[X]], %[[Y]], %[[Z]] : index, index, i32), %[[X]], %[[Y]] + %4 = omp.target_allocmem %device : i32, !llvm.ptr(%x, %y, %z : index, index, i32), %x, %y + return +} + +// CHECK-LABEL: func.func @omp_target_freemem( +// CHECK-SAME: %[[DEVICE:.*]]: i32) { +func.func 
@omp_target_freemem(%device : i32) { + // CHECK: %[[PTR:.*]] = omp.target_allocmem + %ptr = omp.target_allocmem %device : i32, i64 + // CHECK: omp.target_freemem %[[DEVICE]], %[[PTR]] : i32, i64 + omp.target_freemem %device, %ptr : i32, i64 + return +} + +// CHECK-LABEL: func.func @omp_alloc_shared_mem( +// CHECK-SAME: %[[N:.*]]: i32) { +func.func @omp_alloc_shared_mem(%n: i32) { + // CHECK: %{{.*}} = omp.alloc_shared_mem %[[N]] x i64 : (i32) -> !llvm.ptr + %0 = omp.alloc_shared_mem %n x i64 : (i32) -> !llvm.ptr + // CHECK: %{{.*}} = omp.alloc_shared_mem %[[N]] x vector<16x16xf32> : (i32) -> !llvm.ptr + %1 = omp.alloc_shared_mem %n x vector<16x16xf32> : (i32) -> !llvm.ptr + // CHECK: %{{.*}} = omp.alloc_shared_mem %[[N]] x !llvm.ptr {alignment = 16 : i64} : (i32) -> !llvm.ptr + %2 = omp.alloc_shared_mem %n x !llvm.ptr {alignment = 16} : (i32) -> !llvm.ptr + return +} + +// CHECK-LABEL: func.func @omp_free_shared_mem( +// CHECK-SAME: %[[N:.*]]: i64) { +func.func @omp_free_shared_mem(%n: i64) { + // CHECK: %[[PTR:.*]] = omp.alloc_shared_mem %[[N]] x f32 : (i64) -> !llvm.ptr + %0 = omp.alloc_shared_mem %n x f32 : (i64) -> !llvm.ptr + // CHECK: omp.free_shared_mem %[[PTR]] : !llvm.ptr + omp.free_shared_mem %0 : !llvm.ptr + return +} diff --git a/mlir/test/Dialect/OpenMP/stack-to-shared.mlir b/mlir/test/Dialect/OpenMP/stack-to-shared.mlir new file mode 100644 index 0000000000000..81b03acd4d368 --- /dev/null +++ b/mlir/test/Dialect/OpenMP/stack-to-shared.mlir @@ -0,0 +1,149 @@ +// RUN: mlir-opt --omp-stack-to-shared %s | FileCheck %s + +module attributes {omp.is_target_device = true} { + +omp.declare_reduction @add_f32 : f32 +init { +^bb0(%arg: f32): + %0 = llvm.mlir.constant(0.0 : f32) : f32 + omp.yield (%0 : f32) +} +combiner { +^bb1(%arg0: f32, %arg1: f32): + %1 = llvm.fadd %arg0, %arg1 : f32 + omp.yield (%1 : f32) +} +atomic { +^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): + %2 = llvm.load %arg3 : !llvm.ptr -> f32 + llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, 
f32 + omp.yield +} +omp.private {type = private} @privatizer_i32 : i32 +omp.private {type = firstprivate} @firstprivatizer_f32 : f32 copy { +^bb0(%arg0: f32, %arg1: f32): + omp.yield(%arg0 : f32) +} + +llvm.func @foo(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget} + +// CHECK-LABEL: llvm.func @device_func( +// CHECK-SAME: %[[N:.*]]: i64, %[[COND:.*]]: i1) +llvm.func @device_func(%arg0: i64, %cond: i1) attributes {omp.declare_target = #omp.declaretarget} { + // CHECK: %[[ALLOC0:.*]] = omp.alloc_shared_mem %[[N]] x i64 : (i64) -> !llvm.ptr + %0 = llvm.alloca %arg0 x i64 : (i64) -> !llvm.ptr + // CHECK: %[[ALLOC1:.*]] = omp.alloc_shared_mem %[[N]] x f32 {alignment = 128 : i64} : (i64) -> !llvm.ptr + %1 = llvm.alloca %arg0 x f32 {alignment = 128} : (i64) -> !llvm.ptr + // CHECK: %[[ALLOC2:.*]] = omp.alloc_shared_mem %[[N]] x vector<16xf32> : (i64) -> !llvm.ptr + %2 = llvm.alloca %arg0 x vector<16xf32> : (i64) -> !llvm.ptr + // CHECK: %[[ALLOC3:.*]] = omp.alloc_shared_mem %[[N]] x i32 : (i64) -> !llvm.ptr + %3 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr<5> + %4 = llvm.addrspacecast %3 : !llvm.ptr<5> to !llvm.ptr + + // CHECK: %[[ALLOC4:.*]] = llvm.alloca %[[N]] x i32 : (i64) -> !llvm.ptr + %5 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr + // CHECK: %[[ALLOC5:.*]] = llvm.alloca %[[N]] x i32 : (i64) -> !llvm.ptr + %6 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr + // CHECK: llvm.cond_br %[[COND]], ^[[IF:.*]], ^[[ELSE:.*]] + llvm.cond_br %cond, ^if, ^else + +// CHECK: ^[[IF]]: +^if: + // CHECK: omp.parallel reduction(@add_f32 %[[ALLOC0]] -> %{{.*}} : !llvm.ptr) + omp.parallel reduction(@add_f32 %0 -> %arg1 : !llvm.ptr) { + // CHECK: %{{.*}} = llvm.load %[[ALLOC2]] + %7 = llvm.load %2 : !llvm.ptr -> vector<16xf32> + // CHECK: %{{.*}} = llvm.alloca + %8 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr + // CHECK: omp.wsloop private(@privatizer_i32 %[[ALLOC4]] -> %{{.*}}, @firstprivatizer_f32 %[[ALLOC1]] -> %{{.*}} : !llvm.ptr, !llvm.ptr) + 
omp.wsloop private(@privatizer_i32 %5 -> %arg2, @firstprivatizer_f32 %1 -> %arg3 : !llvm.ptr, !llvm.ptr) { + omp.loop_nest (%arg4) : i64 = (%arg0) to (%arg0) inclusive step (%arg0) { + llvm.call @foo(%arg1) : (!llvm.ptr) -> () + llvm.call @foo(%8) : (!llvm.ptr) -> () + llvm.call @foo(%arg2) : (!llvm.ptr) -> () + llvm.call @foo(%arg3) : (!llvm.ptr) -> () + omp.yield + } + } + omp.terminator + } + // CHECK: llvm.br ^[[EXIT:.*]] + llvm.br ^exit + +// CHECK: ^[[ELSE]]: +^else: + // CHECK: llvm.call @foo(%[[ALLOC3]]) : (!llvm.ptr) -> () + llvm.call @foo(%4) : (!llvm.ptr) -> () + // CHECK: %{{.*}} = llvm.load %[[ALLOC5]] + %8 = llvm.load %6 : !llvm.ptr -> i32 + // CHECK: llvm.br ^[[EXIT]] + llvm.br ^exit + +// CHECK: ^[[EXIT]]: +^exit: + // CHECK: omp.free_shared_mem %[[ALLOC0]] : !llvm.ptr + // CHECK: omp.free_shared_mem %[[ALLOC1]] : !llvm.ptr + // CHECK: omp.free_shared_mem %[[ALLOC2]] : !llvm.ptr + // CHECK: omp.free_shared_mem %[[ALLOC3]] : !llvm.ptr + // CHECK-NOT: omp.free_shared_mem + // CHECK: llvm.return + llvm.return +} + +// CHECK-LABEL: llvm.func @host_func( +// CHECK-SAME: %[[N:.*]]: i64) +llvm.func @host_func(%arg0: i64) { + // CHECK: %[[ALLOC0:.*]] = llvm.alloca %[[N]] x i32 : (i64) -> !llvm.ptr + %0 = llvm.alloca %arg0 x i32 : (i64) -> !llvm.ptr + // CHECK: omp.parallel + omp.parallel { + // CHECK: llvm.call @foo(%[[ALLOC0]]) : (!llvm.ptr) -> () + llvm.call @foo(%0) : (!llvm.ptr) -> () + // CHECK: omp.target + omp.target { + %c0 = llvm.mlir.constant(1 : i64) : i64 + // CHECK: %[[ALLOC1:.*]] = omp.alloc_shared_mem %{{.*}} + %1 = llvm.alloca %c0 x i32 : (i64) -> !llvm.ptr + // CHECK-NEXT: llvm.call @foo(%[[ALLOC1]]) : (!llvm.ptr) -> () + llvm.call @foo(%1) : (!llvm.ptr) -> () + // CHECK-NEXT: omp.free_shared_mem %[[ALLOC1]] : !llvm.ptr + // CHECK-NEXT: omp.terminator + omp.terminator + } + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: llvm.func @target_spmd( +llvm.func @target_spmd() { + // CHECK-NOT: omp.alloc_shared_mem + // CHECK-NOT: 
omp.free_shared_mem + omp.target { + %c = llvm.mlir.constant(1 : i64) : i64 + %0 = llvm.alloca %c x i32 : (i64) -> !llvm.ptr + omp.teams { + %1 = llvm.alloca %c x i32 : (i64) -> !llvm.ptr + omp.parallel { + %2 = llvm.alloca %c x i32 : (i64) -> !llvm.ptr + %3 = llvm.load %0 : !llvm.ptr -> i32 + %4 = llvm.load %1 : !llvm.ptr -> i32 + omp.distribute { + omp.wsloop { + omp.loop_nest (%arg0) : i64 = (%c) to (%c) inclusive step (%c) { + %5 = llvm.load %2 : !llvm.ptr -> i32 + omp.yield + } + } {omp.composite} + } {omp.composite} + omp.terminator + } {omp.composite} + omp.terminator + } + omp.terminator + } + // CHECK: return + llvm.return +} + +} diff --git a/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir b/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir index 724e03885d146..fc6f80e1970fc 100644 --- a/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir @@ -39,6 +39,5 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK-NEXT: entry: // CHECK-NEXT: %[[MOVED_ALLOCA1:.*]] = alloca { ptr }, align 8 // CHECK-NEXT: %[[MOVED_ALLOCA2:.*]] = alloca i32, i64 1, align 4 -// CHECK-NEXT: %[[MAP_ARG_ALLOCA:.*]] = alloca ptr, align 8 - // CHECK: user_code.entry: ; preds = %entry +// CHECK-NEXT: %[[MAP_ARG_ALLOC:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) diff --git a/mlir/test/Target/LLVMIR/omptarget-device-shared-mem.mlir b/mlir/test/Target/LLVMIR/omptarget-device-shared-mem.mlir new file mode 100644 index 0000000000000..72b0a2daadfc3 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-device-shared-mem.mlir @@ -0,0 +1,42 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + // CHECK-LABEL: define void @device_shared_mem( + // CHECK-SAME: i32 %[[N0:.*]], i64 %[[N1:.*]]) + llvm.func @device_shared_mem(%n0: i32, %n1: i64) attributes {omp.declare_target = #omp.declaretarget} { + // CHECK: %[[CAST_N0:.*]] = zext i32 %[[N0]] to i64 + // CHECK-NEXT: %[[ALLOC0_SZ:.*]] = mul i64 8, %[[CAST_N0]] + // CHECK-NEXT: %[[ALLOC0:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 %[[ALLOC0_SZ]]) + %0 = omp.alloc_shared_mem %n0 x i64 : (i32) -> !llvm.ptr + + // CHECK: %[[ALLOC1_SZ:.*]] = mul i64 8, %[[N1]] + // CHECK-NEXT: %[[ALLOC1:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 %[[ALLOC1_SZ]]) + %1 = omp.alloc_shared_mem %n1 x i64 : (i64) -> !llvm.ptr + + // CHECK: %[[ALLOC2_SZ:.*]] = mul i64 64, %[[N1]] + // CHECK-NEXT: %[[ALLOC2:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 %[[ALLOC2_SZ]]) + %2 = omp.alloc_shared_mem %n1 x vector<16xf32> : (i64) -> !llvm.ptr + + // CHECK: %[[ALLOC3_SZ:.*]] = mul i64 128, %[[N1]] + // CHECK-NEXT: %[[ALLOC3:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 %[[ALLOC3_SZ]]) + %3 = omp.alloc_shared_mem %n1 x vector<16xf32> {alignment = 128} : (i64) -> !llvm.ptr + + // CHECK: %[[CAST_N0_1:.*]] = zext i32 %[[N0]] to i64 + // CHECK-NEXT: %[[FREE0_SZ:.*]] = mul i64 8, %[[CAST_N0_1]] + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[ALLOC0]], i64 %[[FREE0_SZ]]) + omp.free_shared_mem %0 : !llvm.ptr + + // CHECK: %[[FREE1_SZ:.*]] = mul i64 8, %[[N1]] + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[ALLOC1]], i64 %[[FREE1_SZ]]) + omp.free_shared_mem %1 : !llvm.ptr + + // CHECK: %[[FREE2_SZ:.*]] = mul i64 64, %[[N1]] + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[ALLOC2]], i64 %[[FREE2_SZ]]) + 
omp.free_shared_mem %2 : !llvm.ptr + + // CHECK: %[[FREE3_SZ:.*]] = mul i64 128, %[[N1]] + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[ALLOC3]], i64 %[[FREE3_SZ]]) + omp.free_shared_mem %3 : !llvm.ptr + llvm.return + } +} diff --git a/mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir b/mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir new file mode 100644 index 0000000000000..9f57255d564b3 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-device-shared-memory.mlir @@ -0,0 +1,83 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// This test checks that, when compiling for an offloading target, device shared +// memory will be used in place of allocas for certain private variables. + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + omp.private {type = private} @privatizer : i32 + omp.declare_reduction @reduction : i32 init { + ^bb0(%arg0: i32): + %0 = llvm.mlir.constant(0 : i32) : i32 + omp.yield(%0 : i32) + } combiner { + ^bb0(%arg0: i32, %arg1: i32): + %0 = llvm.add %arg0, %arg1 : i32 + omp.yield(%0 : i32) + } + llvm.func @main() { + %c0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %c0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + %3 = llvm.alloca %c0 x i32 {bindc_name = "y"} : (i64) -> !llvm.ptr<5> + %4 = llvm.addrspacecast %3 : !llvm.ptr<5> to !llvm.ptr + %5 = llvm.alloca %c0 x i32 {bindc_name = "z"} : (i64) -> !llvm.ptr<5> + %6 = llvm.addrspacecast %5 : !llvm.ptr<5> to !llvm.ptr + %7 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> 
!llvm.ptr {name = "x"} + %8 = omp.map.info var_ptr(%4 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "y"} + %9 = omp.map.info var_ptr(%6 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "z"} + omp.target map_entries(%7 -> %arg0, %8 -> %arg1, %9 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %11 = llvm.mlir.constant(10000 : i32) : i32 + %12 = llvm.mlir.constant(1 : i32) : i32 + omp.teams reduction(@reduction %arg0 -> %arg3 : !llvm.ptr) { + omp.distribute private(@privatizer %arg1 -> %arg4, @privatizer %arg2 -> %arg5 : !llvm.ptr, !llvm.ptr) { + omp.loop_nest (%arg6) : i32 = (%12) to (%11) inclusive step (%12) { + llvm.store %arg6, %arg4 : i32, !llvm.ptr + %13 = llvm.load %arg3 : !llvm.ptr -> i32 + %14 = llvm.add %13, %12 : i32 + llvm.store %14, %arg3 : i32, !llvm.ptr + omp.parallel reduction(@reduction %arg5 -> %arg7 : !llvm.ptr) { + %15 = llvm.load %arg4 : !llvm.ptr -> i32 + %16 = llvm.load %arg7 : !llvm.ptr -> i32 + %17 = llvm.add %15, %16 : i32 + llvm.store %17, %arg7 : i32, !llvm.ptr + omp.terminator + } + omp.yield + } + } + omp.terminator + } + omp.terminator + } + // CHECK: call i32 @__kmpc_target_init + // CHECK: call void @[[OUTLINED_TARGET:__omp_offloading_[A-Za-z0-9_.]*]] + + // CHECK: define internal void @[[OUTLINED_TARGET]] + // CHECK: %[[X_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + // CHECK: %[[GEP_X:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg + // CHECK-NEXT: store ptr %[[X_PRIV]], ptr addrspace(5) %[[GEP_X]] + // CHECK-NEXT: call void @[[OUTLINED_TEAMS:__omp_offloading_[A-Za-z0-9_.]*]](ptr %structArg.ascast) + + // CHECK: [[REDUCE_FINALIZE_BB:reduce\.finalize.*]]: + // CHECK-NEXT: %{{.*}} = call i32 @__kmpc_global_thread_num + // CHECK-NEXT: call void @__kmpc_barrier + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[X_PRIV]], i64 4) + + // CHECK: define internal void @[[OUTLINED_TEAMS]] + // CHECK: %[[Y_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + 
// CHECK: %[[Z_PRIV:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + + // %[[GEP_Y:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg + // store ptr %[[Y_PRIV]], ptr addrspace(5) %[[GEP_Y]], align 8 + // %[[GEP_Z:.*]] = getelementptr { {{.*}} }, ptr addrspace(5) %structArg + // store ptr %[[Z_PRIV]], ptr addrspace(5) %[[GEP_Z]], align 8 + + // CHECK: call void @__kmpc_free_shared(ptr %[[Y_PRIV]], i64 4) + // CHECK-NEXT: call void @__kmpc_free_shared(ptr %[[Z_PRIV]], i64 4) + // CHECK-NEXT: br label %[[EXIT_BB:.*]] + + // CHECK: [[EXIT_BB]]: + // CHECK-NEXT: ret void + llvm.return + } +} diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir index 424e948fac750..c6eba6553fe54 100644 --- a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir @@ -1,5 +1,4 @@ // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s -// XFAIL: * // The aim of the test is to check the LLVM IR codegen for the device // for omp target parallel construct @@ -55,22 +54,22 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: define weak_odr protected amdgpu_kernel void @[[FUNC0:.*]]( // CHECK-SAME: ptr %[[TMP:.*]], ptr %[[TMP0:.*]]) #{{[0-9]+}} { // CHECK: %[[TMP1:.*]] = alloca [1 x ptr], align 8, addrspace(5) -// CHECK: %[[TMP2:.*]] = addrspacecast ptr addrspace(5) %[[TMP1]] to ptr -// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8, addrspace(5) -// CHECK: %[[STRUCTARG_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[STRUCTARG]] to ptr -// CHECK: %[[TMP3:.*]] = alloca ptr, align 8, addrspace(5) -// CHECK: %[[TMP4:.*]] = addrspacecast ptr addrspace(5) %[[TMP3]] to ptr -// CHECK: store ptr %[[TMP0]], ptr %[[TMP4]], align 8 -// CHECK: %[[TMP5:.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{.*}} to ptr), ptr %[[TMP]]) -// CHECK: %[[EXEC_USER_CODE:.*]] = icmp eq i32 %[[TMP5]], -1 +// 
CHECK: %[[TMP4:.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{.*}} to ptr), ptr %[[TMP]]) +// CHECK: %[[EXEC_USER_CODE:.*]] = icmp eq i32 %[[TMP4]], -1 // CHECK: br i1 %[[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[WORKER_EXIT:.*]] -// CHECK: %[[TMP6:.*]] = load ptr, ptr %[[TMP4]], align 8 +// CHECK: %[[TMP5:.*]] = addrspacecast ptr addrspace(5) %[[TMP1]] to ptr +// CHECK: %[[STRUCTARG:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) +// CHECK: %[[TMP2:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) +// CHECK: store ptr %[[TMP0]], ptr %[[TMP2]], align 8 +// CHECK: %[[TMP6:.*]] = load ptr, ptr %[[TMP2]], align 8 // CHECK: %[[OMP_GLOBAL_THREAD_NUM:.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr)) -// CHECK: %[[GEP_:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG_ASCAST]], i32 0, i32 0 +// CHECK: %[[GEP_:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0 // CHECK: store ptr %[[TMP6]], ptr %[[GEP_]], align 8 -// CHECK: %[[TMP7:.*]] = getelementptr inbounds [1 x ptr], ptr %[[TMP2]], i64 0, i64 0 -// CHECK: store ptr %[[STRUCTARG_ASCAST]], ptr %[[TMP7]], align 8 -// CHECK: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 1, i32 -1, i32 -1, ptr @[[FUNC1:.*]], ptr null, ptr %[[TMP2]], i64 1) +// CHECK: %[[TMP7:.*]] = getelementptr inbounds [1 x ptr], ptr %[[TMP5]], i64 0, i64 0 +// CHECK: store ptr %[[STRUCTARG]], ptr %[[TMP7]], align 8 +// CHECK: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 1, i32 -1, i32 -1, ptr @[[FUNC1:.*]], ptr @[[FUNC1_WRAPPER:.*]], ptr %[[TMP5]], i64 1) +// CHECK: call void @__kmpc_free_shared(ptr %[[STRUCTARG]], i64 8) +// CHECK: call void @__kmpc_free_shared(ptr %[[TMP2]], i64 8) // CHECK: call void @__kmpc_target_deinit() // CHECK: define internal void @[[FUNC1]]( @@ -84,7 +83,7 @@ 
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: call void @__kmpc_parallel_51(ptr addrspacecast ( // CHECK-SAME: ptr addrspace(1) @[[NUM_THREADS_GLOB:[0-9]+]] to ptr), // CHECK-SAME: i32 [[NUM_THREADS_TMP0:%.*]], i32 1, i32 156, -// CHECK-SAME: i32 -1, ptr [[FUNC_NUM_THREADS1:@.*]], ptr null, ptr [[NUM_THREADS_TMP1:%.*]], i64 1) +// CHECK-SAME: i32 -1, ptr @[[FUNC_NUM_THREADS1:.*]], ptr @[[FUNC2_WRAPPER:.*]], ptr [[NUM_THREADS_TMP1:%.*]], i64 1) // One of the arguments of kmpc_parallel_51 function is responsible for handling if clause // of omp parallel construct for target region. If this argument is nonzero, @@ -105,4 +104,23 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: call void @__kmpc_parallel_51(ptr addrspacecast ( // CHECK-SAME: ptr addrspace(1) {{.*}} to ptr), // CHECK-SAME: i32 {{.*}}, i32 %[[IFCOND_TMP4]], i32 -1, -// CHECK-SAME: i32 -1, ptr {{.*}}, ptr null, ptr {{.*}}, i64 1) +// CHECK-SAME: i32 -1, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i64 1) + +// CHECK: define internal void @[[FUNC1_WRAPPER]](i16 noundef zeroext %{{.*}}, i32 noundef %[[ADDR:.*]]) +// CHECK: %[[ADDR_ALLOCA:.*]] = alloca i32, align 4, addrspace(5) +// CHECK: %[[ADDR_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[ADDR_ALLOCA]] to ptr +// CHECK: %[[ZERO_ALLOCA:.*]] = alloca i32, align 4, addrspace(5) +// CHECK: %[[ZERO_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[ZERO_ALLOCA]] to ptr +// CHECK: %[[ARGS_ALLOCA:.*]] = alloca ptr, align 8, addrspace(5) +// CHECK: %[[ARGS_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[ARGS_ALLOCA]] to ptr +// CHECK: store i32 %[[ADDR]], ptr %[[ADDR_ASCAST]] +// CHECK: store i32 0, ptr %[[ZERO_ASCAST]] +// CHECK: call void @__kmpc_get_shared_variables(ptr %[[ARGS_ASCAST]]) +// CHECK: %[[LOAD_ARGS:.*]] = load ptr, ptr %[[ARGS_ASCAST]], align 8 +// CHECK: %[[FIRST_ARG:.*]] = getelementptr inbounds ptr, ptr %[[LOAD_ARGS]], i64 0 +// CHECK: %[[STRUCTARG:.*]] = load ptr, 
ptr %[[FIRST_ARG]], align 8 +// CHECK: call void @[[FUNC1]](ptr %[[ADDR_ASCAST]], ptr %[[ZERO_ASCAST]], ptr %[[STRUCTARG]]) + +// CHECK: define internal void @[[FUNC2_WRAPPER]](i16 noundef zeroext %{{.*}}, i32 noundef %{{.*}}) +// CHECK-NOT: define +// CHECK: call void @[[FUNC_NUM_THREADS1]]({{.*}}) diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir index 2df2b8db0e5f7..98db59c288dc8 100644 --- a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir @@ -30,7 +30,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: call void @__kmpc_parallel_51(ptr addrspacecast // CHECK-SAME: (ptr addrspace(1) @[[GLOB:[0-9]+]] to ptr), // CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 1, i32 -1, i32 -1, -// CHECK-SAME: ptr @[[PARALLEL_FUNC:.*]], ptr null, ptr %[[PARALLEL_ARGS:.*]], i64 1) +// CHECK-SAME: ptr @[[PARALLEL_FUNC:.*]], ptr @[[PARALLEL_WRAPPER:.*]], ptr %[[PARALLEL_ARGS:.*]], i64 1) // CHECK: define internal void @[[PARALLEL_FUNC]] // CHECK-SAME: (ptr noalias noundef %[[TID_ADDR:.*]], ptr noalias noundef %[[ZERO_ADDR:.*]], @@ -42,6 +42,11 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) #[[ATTRS2:[0-9]+]] { +// CHECK: define internal void @[[PARALLEL_WRAPPER]](i16 {{.*}}, i32 {{.*}}) { +// CHECK-NOT: ret {{.*}} +// CHECK: call void @[[PARALLEL_FUNC]]({{.*}}) +// CHECK-NEXT: ret void + // CHECK: attributes #[[ATTRS1]] = { // CHECK-SAME: "target-cpu"="gfx90a" // CHECK-SAME: "target-features"="+gfx9-insts,+wavefrontsize64" diff --git a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir index 5a76871c180ab..3ebb79fef7474 100644 --- a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir +++ 
b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir @@ -56,7 +56,9 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo // CHECK: %[[B:.*]] = load i32, ptr %[[PTR_B]], align 4 // CHECK: %[[C:.*]] = add i32 %[[A]], %[[B]] // CHECK: store i32 %[[C]], ptr %[[PTR_C]], align 4 -// CHECK: br label %[[LABEL_DEINIT:.*]] +// CHECK: br label %[[LABEL_TARGET_EXIT:.*]] +// CHECK: [[LABEL_TARGET_EXIT]]: +// CHECK-NEXT: br label %[[LABEL_DEINIT:.*]] // CHECK: [[LABEL_DEINIT]]: // CHECK-NEXT: call void @__kmpc_target_deinit() // CHECK-NEXT: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir b/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir index 504d91b1f6198..6084a33fac8aa 100644 --- a/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir @@ -84,7 +84,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo } } -// DEVICE: @[[KERNEL_NAME:.*]]_exec_mode = weak protected constant i8 [[EXEC_MODE:3]] +// DEVICE: @[[KERNEL_NAME:.*]]_exec_mode = weak protected constant i8 [[EXEC_MODE:1]] // DEVICE: @llvm.compiler.used = appending global [1 x ptr] [ptr @[[KERNEL_NAME]]_exec_mode], section "llvm.metadata" // DEVICE: @[[KERNEL_NAME]]_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { // DEVICE-SAME: %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 [[EXEC_MODE]], {{.*}}}, diff --git a/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir b/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir index 0ee9230b5af0e..2aa11f3a1aa34 100644 --- a/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-private-allocatable.mlir @@ -70,4 +70,6 @@ llvm.func @_FortranAAssign(!llvm.ptr, !llvm.ptr, !llvm.ptr, i32) -> !llvm.struct // CHECK: call void @dealloc_foo_1(ptr %[[DESC_TO_DEALLOC]]) // CHECK-NEXT: br label %[[CONT_BLOCK:.*]] // 
CHECK: [[CONT_BLOCK]]: +// CHECK-NEXT: br label %[[EXIT_BLOCK:.*]] +// CHECK: [[EXIT_BLOCK]]: // CHECK-NEXT: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-target-private-shared-mem.mlir b/mlir/test/Target/LLVMIR/openmp-target-private-shared-mem.mlir new file mode 100644 index 0000000000000..bfa679f769c46 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-private-shared-mem.mlir @@ -0,0 +1,76 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {omp.is_target_device = true, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, dlti.dl_spec = #dlti.dl_spec : vector<4xi64>, !llvm.ptr<1> = dense<64> : vector<4xi64>, !llvm.ptr<2> = dense<32> : vector<4xi64>, !llvm.ptr<3> = dense<32> : vector<4xi64>, !llvm.ptr<4> = dense<64> : vector<4xi64>, !llvm.ptr<5> = dense<32> : vector<4xi64>, !llvm.ptr<6> = dense<32> : vector<4xi64>, !llvm.ptr<7> = dense<[160, 256, 256, 32]> : vector<4xi64>, !llvm.ptr<8> = dense<[128, 128, 128, 48]> : vector<4xi64>, !llvm.ptr<9> = dense<[192, 256, 256, 32]> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.legal_int_widths" = array, "dlti.stack_alignment" = 32 : i64, "dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>} { + omp.private {type = private} @simple_var.privatizer : i32 + omp.declare_reduction @simple_var.reducer : i32 init { + ^bb0(%arg0: i32): + %0 = llvm.mlir.constant(0 : i32) : i32 + omp.yield(%0 : i32) + } combiner { + ^bb0(%arg0: i32, %arg1: i32): + %0 = llvm.add %arg0, %arg1 : i32 + omp.yield(%0 : i32) + } + + // CHECK-LABEL: declare void @device_func(ptr) + llvm.func @device_func(!llvm.ptr) attributes {omp.declare_target = #omp.declaretarget} + + // CHECK-NOT: define {{.*}} void 
@target_map_single_shared_mem_private + llvm.func @target_map_single_shared_mem_private() attributes {omp.declare_target = #omp.declaretarget} { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + + // CHECK-LABEL: define {{.*}} void @__omp_offloading_{{.*}}target_map_single_shared_mem_private{{.*}}({{.*}}) + // CHECK: call i32 @__kmpc_target_init + // CHECK: %[[ALLOC0:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + // CHECK: call void @device_func(ptr %[[ALLOC0]]) + // CHECK: call void @__kmpc_free_shared(ptr %[[ALLOC0]], i64 4) + // CHECK: call void @__kmpc_target_deinit + omp.target private(@simple_var.privatizer %2 -> %arg0 : !llvm.ptr) { + llvm.call @device_func(%arg0) : (!llvm.ptr) -> () + omp.terminator + } + + // CHECK-LABEL: define {{.*}} void @__omp_offloading_{{.*}}target_map_single_shared_mem_private{{.*}}({{.*}}) + // CHECK: call i32 @__kmpc_target_init + // CHECK: %[[ALLOC_ARGS0:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) + // CHECK: %[[ALLOC1:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + // CHECK: %[[GEP0:.*]] = getelementptr { ptr }, ptr %[[ALLOC_ARGS0]], i32 0, i32 0 + // CHECK: store ptr %[[ALLOC1]], ptr %[[GEP0]], align 8 + // CHECK: %[[GEP1:.*]] = getelementptr inbounds [1 x ptr], ptr %[[PAR_ARGS0:.*]], i64 0, i64 0 + // CHECK: store ptr %[[ALLOC_ARGS0]], ptr %[[GEP1]], align 8 + // CHECK: call void @__kmpc_parallel_51(ptr {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr %[[PAR_ARGS0]], i64 1) + // CHECK: call void @__kmpc_free_shared(ptr %[[ALLOC1]], i64 4) + // CHECK: call void @__kmpc_free_shared(ptr %[[ALLOC_ARGS0]], i64 8) + // CHECK: call void @__kmpc_target_deinit + omp.target private(@simple_var.privatizer %2 -> %arg0 : !llvm.ptr) { + omp.parallel reduction(@simple_var.reducer %arg0 -> %arg1 : !llvm.ptr) { + %3 = llvm.load %arg1 : !llvm.ptr -> i32 + omp.terminator + } + 
omp.terminator + } + + // CHECK-LABEL: define {{.*}} void @__omp_offloading_{{.*}}target_map_single_shared_mem_private{{.*}}({{.*}}) + // CHECK: call i32 @__kmpc_target_init + // CHECK: %[[ALLOC_ARGS1:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8) + // CHECK: %[[ALLOC2:.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4) + // CHECK: %[[GEP2:.*]] = getelementptr { ptr }, ptr %[[ALLOC_ARGS1]], i32 0, i32 0 + // CHECK: store ptr %[[ALLOC2]], ptr %[[GEP2]], align 8 + // CHECK: %[[GEP3:.*]] = getelementptr inbounds [1 x ptr], ptr %[[PAR_ARGS1:.*]], i64 0, i64 0 + // CHECK: store ptr %[[ALLOC_ARGS1]], ptr %[[GEP3]], align 8 + // CHECK: call void @__kmpc_parallel_51(ptr {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr %[[PAR_ARGS1]], i64 1) + // CHECK: call void @__kmpc_free_shared(ptr %[[ALLOC2]], i64 4) + // CHECK: call void @__kmpc_free_shared(ptr %[[ALLOC_ARGS1]], i64 8) + // CHECK: call void @__kmpc_target_deinit + omp.target private(@simple_var.privatizer %2 -> %arg0 : !llvm.ptr) { + omp.parallel { + %4 = llvm.load %arg0 : !llvm.ptr -> i32 + omp.terminator + } + omp.terminator + } + llvm.return + } +} diff --git a/offload/test/offloading/fortran/target-generic-loops.f90 b/offload/test/offloading/fortran/target-generic-loops.f90 new file mode 100644 index 0000000000000..07bcbfd2c8752 --- /dev/null +++ b/offload/test/offloading/fortran/target-generic-loops.f90 @@ -0,0 +1,130 @@ +! Offloading test for generic target regions containing different kinds of +! loop constructs inside. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + integer :: i1, i2, n1, n2, counter + + n1 = 100 + n2 = 50 + + counter = 0 + !$omp target map(tofrom:counter) + !$omp teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! 
CHECK: 1 100 + print '(I2" "I0)', 1, counter + + counter = 0 + !$omp target map(tofrom:counter) + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 2 200 + print '(I2" "I0)', 2, counter + + counter = 0 + !$omp target map(tofrom:counter) + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp end target + + ! CHECK: 3 203 + print '(I2" "I0)', 3, counter + + counter = 0 + !$omp target map(tofrom: counter) + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + counter = counter + 1 + !$omp end target + + ! CHECK: 4 102 + print '(I2" "I0)', 4, counter + + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + end do + + ! CHECK: 5 5000 + print '(I2" "I0)', 5, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + end do + + ! CHECK: 6 5200 + print '(I2" "I0)', 6, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + end do + + ! 
CHECK: 7 10000 + print '(I2" "I0)', 7, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + !$omp parallel do reduction(+:counter) + do i2=1, n2 + counter = counter + 1 + end do + counter = counter + 1 + end do + + ! CHECK: 8 10300 + print '(I2" "I0)', 8, counter +end program diff --git a/offload/test/offloading/fortran/target-generic-outlined-loops.f90 b/offload/test/offloading/fortran/target-generic-outlined-loops.f90 new file mode 100644 index 0000000000000..594809027e115 --- /dev/null +++ b/offload/test/offloading/fortran/target-generic-outlined-loops.f90 @@ -0,0 +1,109 @@ +! Offloading test for generic target regions containing different kinds of +! loop constructs inside, moving parallel regions into a separate subroutine. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +subroutine parallel_loop(n, counter) + implicit none + integer, intent(in) :: n + integer, intent(inout) :: counter + integer :: i + + !$omp parallel do reduction(+:counter) + do i=1, n + counter = counter + 1 + end do +end subroutine + +program main + integer :: i1, i2, n1, n2, counter + + n1 = 100 + n2 = 50 + + counter = 0 + !$omp target map(tofrom:counter) + !$omp teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 1 100 + print '(I2" "I0)', 1, counter + + counter = 0 + !$omp target map(tofrom:counter) + call parallel_loop(n1, counter) + call parallel_loop(n1, counter) + !$omp end target + + ! CHECK: 2 200 + print '(I2" "I0)', 2, counter + + counter = 0 + !$omp target map(tofrom:counter) + counter = counter + 1 + call parallel_loop(n1, counter) + counter = counter + 1 + call parallel_loop(n1, counter) + counter = counter + 1 + !$omp end target + + ! 
CHECK: 3 203 + print '(I2" "I0)', 3, counter + + counter = 0 + !$omp target map(tofrom: counter) + counter = counter + 1 + call parallel_loop(n1, counter) + counter = counter + 1 + !$omp end target + + ! CHECK: 4 102 + print '(I2" "I0)', 4, counter + + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + call parallel_loop(n2, counter) + end do + + ! CHECK: 5 5000 + print '(I2" "I0)', 5, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + call parallel_loop(n2, counter) + counter = counter + 1 + end do + + ! CHECK: 6 5200 + print '(I2" "I0)', 6, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + call parallel_loop(n2, counter) + call parallel_loop(n2, counter) + end do + + ! CHECK: 7 10000 + print '(I2" "I0)', 7, counter + + counter = 0 + !$omp target teams distribute reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + call parallel_loop(n2, counter) + counter = counter + 1 + call parallel_loop(n2, counter) + counter = counter + 1 + end do + + ! CHECK: 8 10300 + print '(I2" "I0)', 8, counter +end program diff --git a/offload/test/offloading/fortran/target-spmd-loops.f90 b/offload/test/offloading/fortran/target-spmd-loops.f90 new file mode 100644 index 0000000000000..7407f0c0768cb --- /dev/null +++ b/offload/test/offloading/fortran/target-spmd-loops.f90 @@ -0,0 +1,39 @@ +! Offloading test for SPMD target regions containing different kinds of +! loop constructs inside. +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-run-and-check-generic +program main + integer :: i1, n1, counter + + n1 = 100 + + counter = 0 + !$omp target parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + + !
CHECK: 1 100 + print '(I2" "I0)', 1, counter + + counter = 0 + !$omp target map(tofrom:counter) + !$omp parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + !$omp end target + + ! CHECK: 2 100 + print '(I2" "I0)', 2, counter + + counter = 0 + !$omp target teams distribute parallel do reduction(+:counter) + do i1=1, n1 + counter = counter + 1 + end do + + ! CHECK: 3 100 + print '(I2" "I0)', 3, counter +end program